In [69]:
#all imports are here

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random

import re
import string

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.linear_model import LogisticRegression
    

In [5]:
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/muzammil/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/muzammil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from nltk.corpus import twitter_samples,stopwords

In [11]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [13]:
tokenizer = TweetTokenizer(preserve_case=False,strip_handles=True,reduce_len=True)
stemmer = PorterStemmer()

In [17]:
def preprocess_tweets(tweet):
    #removing old style retweet text RT
    tweet = re.sub(r'RT[\s]+','',tweet)
    #removing hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    
    tweet_tokens = tokenizer.tokenize(tweet) #tokenizing tweets
    tweets_clean = []
    for word in tweet_tokens:
        if word in stopwords.words('english') or word in string.punctuation:
            continue
        else:
            tweets_clean.append(word)
    tweets_stem = []
    for word in tweets_clean:
        stem_word  = stemmer.stem(word)
        tweets_stem.append(stem_word)
    return tweets_stem
    

In [36]:
def build_frequency_dictionary(X,Y):
    frequency_dictionary = {}
    for label,tweet in zip(Y,X):
        for word in tweet:
            if (word,label) in frequency_dictionary:
                frequency_dictionary[(word,label)]+=1
            else:
                frequency_dictionary[(word,label)]=1
    return frequency_dictionary
        

In [55]:
def build_features(X,Y,frequency_dictionary):
    first_column = np.ones(len(X))
    second_column = []
    third_column = []
    positive_label = Y[0]
    negative_label = Y[len(Y)-1]
    for tweet in X:
        pos = 0
        neg = 0
        
        for word in tweet:
            if (word,positive_label) in frequency_dictionary:
                pos+=frequency_dictionary[(word,positive_label)]
            if (word,negative_label) in frequency_dictionary:
                neg+=frequency_dictionary[(word,negative_label)]
        second_column.append(pos)
        third_column.append(neg)
    return np.column_stack((first_column,second_column,third_column))    
        
        
    

In [28]:
X = []
Y = np.concatenate((np.ones(len(all_positive_tweets)),np.zeros(len(all_negative_tweets))))
all_tweets = all_positive_tweets+all_negative_tweets

In [29]:
for tweet in all_tweets:
    X.append(preprocess_tweets(tweet))

In [37]:
frequency_dictionary = build_frequency_dictionary(X,Y)

In [56]:
data_X = build_features(X,Y,frequency_dictionary)

In [89]:
train_X = np.concatenate((data_X[:4000] , data_X[5000:9000])) 
test_X  = np.concatenate((data_X[4000:5000],data_X[9000:]))
train_Y = np.concatenate((np.ones(4000),np.zeros(4000)))
test_Y = np.concatenate((np.ones(1000),np.zeros(1000)))

In [91]:
len(train_X)

8000

In [92]:
clf = LogisticRegression()
clf.fit(train_X,train_Y)

LogisticRegression()

In [94]:
Y_predicted = clf.predict(test_X)

In [95]:
clf.score(test_X,test_Y)

0.992