# Download Data

In [1]:
import nltk 
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [2]:
from nltk.corpus import twitter_samples

## Read Data

In [3]:
positive_tweets=twitter_samples.strings('positive_tweets.json')
negative_tweets=twitter_samples.strings('negative_tweets.json')

## Convert to pandas dataframe 

In [4]:
import pandas as pd
pos_df = pd.DataFrame(positive_tweets, columns=["tweet"])
pos_df["target"] = 1
neg_df = pd.DataFrame(negative_tweets, columns=["tweet"])
neg_df["target"] = 0

In [5]:
# Combine both dataframes
df = pd.concat([pos_df, neg_df])
df.shape

(10000, 2)

In [6]:
df.sample(5) 

Unnamed: 0,tweet,target
2188,@NefariousBella9 @laurenkatebooks @Fallen_Seri...,1
3343,I got my half day and I'm 10/10 excited for se...,1
1787,true : ((( https://t.co/hfGyVJQ5RA,0
3207,@minewsexual oh :( lol how it is? XD,0
4314,@LusciousLyndee1 hahaha...and thus...I sneak a...,1


## Text Preprocessing

In [7]:
import string
import re

from nltk.corpus import stopwords 
nltk.download('stopwords')
stopwords_english = stopwords.words('english')

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

from nltk.tokenize import TweetTokenizer

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [8]:
import html
def remove_special_chars(text):
    re1 = re.compile(r'  +')
    x1 = text.lower().replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
        ' @-@ ', '-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x1))

import unicodedata
def remove_non_ascii(text):
    """Remove non-ASCII characters from list of tokenized words"""
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')


In [9]:
def clean_tweets(tweet):
    #to_lowercase
    tweet=tweet.lower().strip()

    # remove special chars
    tweet=remove_special_chars(tweet)

    # remove_non_ascii
    tweet=remove_non_ascii(tweet)

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)

    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

    #replace all interger occurrences in list of tokenized words with textual representation"""
    tweet=re.sub(r'\d+', '', tweet)

    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    

    # tokenize tweets
    
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
   
   
     
    tweets_clean = []   
    for word in tweet_tokens:
        if (word not in stopwords_english and # remove stopwords
              word not in string.punctuation): # remove punctuation
           
            stem_word = stemmer.stem(word) # stemming word

            #lemmatizer = WordNetLemmatizer()
            #lem_word=lemmatizer.lemmatize(word)
            #lem_word=lemmatizer.lemmatize(lem_word, pos='v') 
            tweets_clean.append(stem_word)
            

    return tweets_clean

In [10]:
df["tweet"] = df.tweet.apply(clean_tweets)
df.sample(5)

Unnamed: 0,tweet,target
3253,"[stat, day, arriv, new, follow, unfollow, :), ...",1
3728,"[hii, vin, plss, rpli, tweet, :(]",0
3699,"[appreci, proprieti, :-)]",1
1658,"[rt, free, follow, mbf, need, mutual, kpop, ac...",0
1549,"[iren, :-(]",0


## Split the data into train and test set

In [11]:
from sklearn.model_selection import train_test_split
train_df,test_df = train_test_split(df, shuffle=True)
print("Shape of train and test set:", train_df.shape, test_df.shape)
 

Shape of train and test set: (7500, 2) (2500, 2)


## Building frequency dictionary using training set 

In [12]:
def build_frequency_dict(df):
    freqs = {}
    for i in range(len(df)):
        row = df.iloc[i]
        y = row.target
        for word in row.tweet :
            pair = (word, y)
            
            if pair in freqs:
              freqs[pair] += 1
            else:
              freqs[pair] = 1
            
    return freqs

In [13]:
freqs = build_frequency_dict(train_df)

In [14]:
freqs

{('worst', 0): 12,
 ('part', 0): 11,
 ('still', 0): 99,
 ('feel', 0): 110,
 ('bad', 0): 48,
 (':(', 0): 3422,
 ('stat', 1): 43,
 ('day', 1): 189,
 ('arriv', 1): 49,
 ('new', 1): 104,
 ('follow', 1): 280,
 ('unfollow', 1): 46,
 (':)', 1): 2671,
 ('via', 1): 51,
 ('word', 0): 14,
 ('like', 0): 167,
 ('knive', 0): 1,
 ('damnit', 0): 1,
 ('omg', 0): 47,
 ('alli', 0): 2,
 ('hug', 0): 21,
 ('mani', 0): 21,
 ('wrap', 0): 1,
 ('arm', 0): 3,
 ('around', 0): 19,
 ('neck', 0): 1,
 ('pull', 0): 4,
 ('closer', 0): 2,
 (':-(', 0): 395,
 ('add', 0): 14,
 ('kik', 0): 47,
 ('nothaveld', 0): 1,
 ('hornykik', 0): 8,
 ('edm', 0): 1,
 ('sexi', 0): 11,
 ('likeforfollow', 0): 1,
 ('hannib', 0): 3,
 ('camsex', 0): 2,
 ('ye', 1): 57,
 ('one', 1): 88,
 ('team', 1): 21,
 ('impress', 1): 3,
 ('...', 1): 244,
 ('feel', 1): 28,
 ('bad', 1): 13,
 ('welcom', 1): 62,
 ('nairobi', 1): 1,
 ('hrdstellobama', 1): 1,
 ('region', 1): 2,
 ('civil', 1): 1,
 ('societi', 1): 2,
 ('dialogu', 1): 2,
 ('human', 1): 4,
 ('right', 1

## we extract three features from the frequencies.
1-Bias: 1 for all tweets

2- positive frequencies count

3- negative frequencies count 

In [15]:
import numpy as np
def extract_features(tweet, freqs):
    # Initialize a zeros array with size 3
    feats = np.zeros(3, dtype=int)
    # set bias to 1
    feats[0] = 1
    
    for word in tweet :
        # Set positive frequencies count
        if (word, 1) in freqs.keys(): feats[1] += freqs[(word, 1)]
        # Set negative frequencies count
        if (word, 0) in freqs.keys(): feats[2] += freqs[(word, 0)]
    
    assert(feats.shape == (3,))
    return feats

In [16]:
# Test extract_features function
sample = df.tweet.iloc[10]
sample_feats = extract_features(sample, freqs)
print("tweet:", sample)
print("features:", sample_feats)

tweet: ['followfriday', 'top', 'influenc', 'commun', 'week', ':)']
features: [   1 2826   52]


## extract features from train and test set

In [17]:
from functools import partial
from sklearn.linear_model import LogisticRegression
X_train = train_df.tweet.apply(partial(extract_features, freqs=freqs))
X_train = np.stack(X_train.values)
y_train = train_df.target.values

X_test = test_df.tweet.apply(partial(extract_features, freqs=freqs))
X_test = np.stack(X_test.values)
y_test = test_df.target.values

print(X_train.shape, y_train.shape,X_test.shape, y_test.shape)

(7500, 3) (7500,) (2500, 3) (2500,)


In [18]:
X_train

array([[   1,   93, 3702],
       [   1, 3433,  374],
       [   1,  195, 3604],
       ...,
       [   1,   24, 3458],
       [   1,  300, 3739],
       [   1,  565,   75]])

In [19]:
clf = LogisticRegression()
clf.fit(X_train, y_train)
print("Train accuracy:", clf.score(X_train, y_train))

Train accuracy: 0.9902666666666666


In [20]:
y_pred = clf.predict(X_test)
from sklearn.metrics import accuracy_score
print("test accuracy:", accuracy_score(y_test,y_pred))

test accuracy: 0.9892


## test our model with new samples

In [21]:
def test(tweet):
    tweet =  clean_tweets(tweet)
    test_feats = extract_features(tweet, freqs)
    y_pred = clf.predict(test_feats.reshape(1,-1))
    if y_pred[0] == 1: return "positive"
    elif y_pred[0] == 0: return "negative"
    else: return None

In [22]:
test("@seanactual You mean you're not offering? :(")

'negative'

In [23]:
test("This movie is really good")

'positive'

END
