## Sentiment analysis using Logistic Regression
Given a tweet, classify it as having either a positive or a negative sentiment.

In [2]:
import re
import string
import numpy as np 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

Define function to process a tweet into cleaned text:

In [29]:
def process_tweet(tweet):

    '''
    Clean a raw tweet and reduce it to a list of stemmed tokens.

    Steps, in order: strip `$`-prefixed tokens, a leading "RT" marker,
    hyperlinks and `#` signs; tokenize; drop English stop words and
    punctuation; stem every remaining token with the Porter stemmer.
    Input:
        tweet: str
    Output:
        processed_tweet: list of processed strs
    '''

    porter = PorterStemmer()
    english_stop_words = set(stopwords.words('english'))

    # Patterns are removed one after another, in exactly this order
    removal_patterns = (
        r'\$\w*',                # "$" followed by word characters (e.g. tickers)
        r'^RT[\s]+',             # retweet marker at the very start of the text
        r'https?://[^\s\n\r]+',  # hyperlinks
        r'#',                    # the hash sign only; the tag text is kept
    )
    for pattern in removal_patterns:
        tweet = re.sub(pattern, '', tweet)

    # Tokenizer options: lower-case, strip handles, reduce lengthened words
    tokens = TweetTokenizer(
        preserve_case=False,
        strip_handles=True,
        reduce_len=True,
    ).tokenize(tweet)

    # `token not in string.punctuation` is a substring test against the
    # punctuation string, so emoticons such as ':)' are kept.
    processed_tweet = [
        porter.stem(token)
        for token in tokens
        if token not in english_stop_words and token not in string.punctuation
    ]

    return processed_tweet


Define function to build a frequency table of words appearing in positive and negative tweets:

In [30]:
def build_freqs(tweets,ys):

    ''' 
    Function to build table of word frequencies
    Input:
        tweets: list of tweets (strs)
        ys: mx1 array with sentiment label of tweets (0,1)
    Output:
        freqs: dictionary mapping (word,sentiment) pairs to frequencies 
    '''

    # squeeze() turns a single label (m == 1) into a 0-d array whose tolist()
    # is a bare float that zip() cannot iterate; atleast_1d keeps it a
    # one-element list while leaving every other shape untouched.
    yslist = np.atleast_1d(np.squeeze(ys)).tolist()

    freqs = {}
    for y, tweet in zip(yslist,tweets):
        for word in process_tweet(tweet):
            pair = (word,y)
            # Count occurrences of each (word, label) combination
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

In [5]:
import nltk

# Fetch the two NLTK corpora this notebook reads: the labelled sample tweets
# and the stop-word lists. Each call returns True, shown as the cell result.
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/bretthagan/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bretthagan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
import pandas as pd

from nltk.corpus import twitter_samples

Split into training and testing sets:

In [15]:
# load the labelled positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# first 4000 tweets of each class go to training, the remainder to testing
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# positives first, then negatives, in both sets
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# matching column vectors of labels: 1 for positive, 0 for negative
train_y = np.concatenate((np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))), axis=0)
test_y = np.concatenate((np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))), axis=0)

In [16]:
# create frequency dictionary: (word, label) -> count, built from the
# training tweets only
freqs = build_freqs(train_x, train_y)

In [18]:
# Sanity check: show the first training tweet before and after process_tweet
print(train_x[0])
print(process_tweet(train_x[0]))

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [19]:
def sigmoid(z):
    
    '''
    Logistic function 1 / (1 + exp(-z)), evaluated element-wise in a
    numerically stable way (no overflow for large negative z).
    Input:
        z: scalar or array input (array-likes are converted to float arrays)
    Output:
        h: sigmoid of input z; a NumPy float scalar for scalar input,
           otherwise an array with the same shape as z
    '''

    z = np.asarray(z, dtype=float)

    # exp() is only ever called on non-positive values, so it cannot overflow:
    #   z >= 0: 1 / (1 + exp(-z))
    #   z <  0: exp(z) / (1 + exp(z))   (algebraically the same quantity)
    e = np.exp(-np.abs(z))
    h = np.where(z >= 0, 1 / (1 + e), e / (1 + e))

    # Indexing with () unwraps a 0-d result into a NumPy scalar and returns
    # arrays of any other rank unchanged.
    return h[()]

In [21]:
def gradient_desecent(x,y,theta,alpha,num_iters):

    ''' 
    Batch gradient descent for logistic regression.
    Input:
        x: matrix of features
        y: corresponding labels of x
        theta: weight vector
        alpha: learning rate
        num_iters: number of training iterations
    Output:
        J: cost as a Python float, evaluated with the weights that were in
           effect before the last update (for num_iters <= 0: the cost of
           the unchanged input theta)
        theta: final weight vector
    '''

    m = len(x)

    # Only the activations of the last iteration are needed for the returned
    # cost, so the cost itself is computed once after the loop.
    h = None
    for _ in range(num_iters):
        h = sigmoid(np.dot(x,theta))
        theta = theta - (alpha/m) * np.dot(x.T,(h-y))

    if h is None:
        # No update ran; report the cost of the weights as given instead of
        # failing on an unbound variable.
        h = sigmoid(np.dot(x,theta))

    # Average cross-entropy cost
    J = (-1/m) * (np.dot(y.T,np.log(h)) + np.dot((1-y).T,np.log(1-h)))

    # With column-vector inputs J is a 1x1 array; squeeze it to 0-d first,
    # because float() on an array with ndim > 0 is deprecated in NumPy >= 1.25.
    return float(np.squeeze(J)),theta

In [22]:
def extract_features(tweet,freqs,process_tweet=process_tweet):

    ''' 
    Turn one tweet into a [bias, positive count, negative count] row vector.
    Input:
        tweet: str
        freqs: dict of frequencies of (word,label) tuples
        process_tweet: callable that converts the tweet into a list of words
    Output:
        x: feature vector of shape (1,3)
    '''

    pos_total = 0
    neg_total = 0

    # Sum, over the tweet's words, how often each word was counted under the
    # positive (1.0) and negative (0.0) label; unseen pairs contribute 0.
    for token in process_tweet(tweet):
        pos_total += freqs.get((token,1.0),0)
        neg_total += freqs.get((token,0.0),0)

    # Leading 1 is the bias term
    x = np.array([[1, pos_total, neg_total]], dtype=float)
    assert(x.shape==(1,3))
    return x


In [23]:
# feature matrix: one [bias, pos, neg] row per training tweet
X = np.concatenate([extract_features(tweet, freqs) for tweet in train_x], axis=0)

Y = train_y

# gradient descent from all-zero weights
J,theta = gradient_desecent(x=X, y=Y, theta=np.zeros((3,1)), alpha=1e-9, num_iters=1500)

  return float(J),theta


In [24]:
def predict_tweet(tweet,freqs,theta):

    ''' 
    Score a single tweet with the given logistic-regression weights.
    Input:
        tweet: str
        freqs: dictionary of tuples
        theta: weight vector
    Output:
        y_pred: sigmoid of the tweet's feature vector times theta
                (probability of a tweet being pos or neg)
    '''

    # features . theta, squashed through the sigmoid
    return sigmoid(np.dot(extract_features(tweet,freqs), theta))

In [25]:
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    # predict_tweet returns a 1x1 array; .item() extracts the plain float,
    # since formatting an ndim > 0 array with %f is deprecated in NumPy >= 1.25.
    print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta).item()))

I am happy -> 0.519259
I am bad -> 0.494338
this movie should have been great. -> 0.515962
great -> 0.516052
great great -> 0.532070
great great great -> 0.548023
great great great great -> 0.563877


  print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))


In [26]:
def test_logistic_regression(test_x,test_y,freqs,theta,predict_tweet=predict_tweet):

    ''' 
    Measure how often the thresholded predictions match the labels.
    Input:
        test_x: list of tweets
        test_y: corresponding labels
        freqs: dictionary of freq counts
        theta: weight vector
        predict_tweet: callable returning the score of one tweet
    Output:
        accuracy: number of matching predictions divided by len(test_x)
    '''

    # Hard label per tweet: 1.0 when the score exceeds 0.5, otherwise 0.0
    y_hat = np.array([
        1.0 if predict_tweet(tweet,freqs,theta) > 0.5 else 0.0
        for tweet in test_x
    ])

    # Labels are squeezed to 1-D so they line up with the predictions
    n_correct = (y_hat == np.squeeze(test_y)).sum()

    return n_correct / len(test_x)

In [27]:
# Evaluate the trained weights on the held-out test tweets
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9965


Classify example tweets as either positive or negative:

In [28]:
# try the classifier on a few hand-written example tweets
tweet_1 = 'This movie is awful. I was bored out of my mind the whole time!'
tweet_2 = 'Best movie of the year, incredible!'
tweet_3 = 'This movie is awfully hard to watch, but still an incredible experience.'

tweets = [tweet_1,tweet_2,tweet_3]
for tweet in tweets:
    y_hat = predict_tweet(tweet,freqs,theta)
    # scores above 0.5 are reported as positive, everything else as negative
    print('Positive tweet' if y_hat > 0.5 else 'Negative tweet')

Negative tweet
Positive tweet
Negative tweet
