In [1]:

from os import getcwd
import numpy as np
import pandas as pd

import re
import string
import nltk
from nltk.corpus import twitter_samples 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


## Download the dataset and English stopwords

In [2]:
# One-time setup: uncomment the two calls below to download the corpora
# through NLTK.
#nltk.download('twitter_samples')
#nltk.download('stopwords')
# Alternatively, download them manually from http://www.nltk.org/nltk_data/

In [3]:
# Add <current working directory>/data/ to the list of locations NLTK
# searches for its data (presumably where the corpora were placed — confirm).
filePath = f"{getcwd()}/data/"
nltk.data.path.append(filePath)

In [4]:
# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')


In [5]:
# Sanity check: how many tweets were loaded for each class.
len(all_positive_tweets),len(all_negative_tweets)

(5000, 5000)

In [6]:
# The first 4000 tweets of each class are used for training; the rest are
# held out for testing.
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# Positives come first in both splits; the label vectors below rely on
# exactly this ordering.
train = train_pos + train_neg
test = test_pos + test_neg

# Column-vector labels: 1 for every positive tweet, then 0 for every
# negative tweet.
train_y = np.concatenate(
    (np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))), axis=0)
test_y = np.concatenate(
    (np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))), axis=0)

In [7]:
# Confirm the label arrays are column vectors sized like the two splits.
train_y.shape, test_y.shape

((8000, 1), (2000, 1))

In [8]:
# Flatten the (m, 1) label arrays into plain Python lists so that each label
# is a scalar that can be used inside the (word, label) dictionary keys.
train_y = np.squeeze(train_y).tolist()
test_y = np.squeeze(test_y).tolist()

## clean text

In [9]:
def process_tweet(tweet):
    """Clean, tokenize, and stem a single tweet.

    Steps, in order: remove stock tickers (``$GE``), a leading old-style
    ``RT`` marker, hyperlinks, and ``#`` signs (the hashtag word itself is
    kept); tokenize with ``TweetTokenizer`` (configured with
    ``preserve_case=False``, ``strip_handles=True``, ``reduce_len=True``);
    drop English stopwords and punctuation tokens; stem the remaining
    tokens with ``PorterStemmer``.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of stemmed tokens; stems of length 1 are dropped
    """
    stemmer = PorterStemmer()
    # A set gives constant-time membership tests; the list returned by
    # stopwords.words() would otherwise be scanned for every token.
    stopwords_english = frozenset(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    # NOTE: `.*` also consumes any text that follows the URL on the same line.
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove only the hash sign, keeping the hashtag word
    tweet = re.sub(r'#', '', tweet)

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        # NOTE: `in string.punctuation` is a substring test against a single
        # string, so it drops single punctuation characters (and any token
        # that happens to be a contiguous slice of string.punctuation),
        # while emoticons such as ':(' are kept.
        if word in stopwords_english or word in string.punctuation:
            continue
        stem_word = stemmer.stem(word)
        if len(stem_word) > 1:
            tweets_clean.append(stem_word)

    return tweets_clean

In [10]:
# Show one random training tweet next to its processed tokens.
# randint's upper bound is exclusive, so len(train) (not a hard-coded 7999)
# is needed for the last tweet to be eligible too.
index = np.random.randint(0, len(train))
train[index],process_tweet(train[index])

('Last time I was here, was a funeral and a again funeral. Modimo ho tseba wena fela. :( — feeling emotional at... http://t.co/mQYsswdot7',
 ['last',
  'time',
  'funer',
  'funer',
  'modimo',
  'ho',
  'tseba',
  'wena',
  'fela',
  ':(',
  'feel',
  'emot',
  '...'])

## Build a dictionary counting each (word, label) pair in the text

In [11]:
def build_freqs(tweets, ys):
    """Count how often each processed word occurs under each sentiment label.

    Input:
        tweets: a list of raw tweet strings
        ys: the sentiment label (either 0 or 1) of each tweet, given as a
            flat sequence or as a NumPy array of shape (m,) or (m, 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Rows of an (m, 1) array are themselves arrays, which are unhashable
    # inside the (word, label) key; flatten arrays to plain Python scalars.
    if isinstance(ys, np.ndarray):
        ys = ys.ravel().tolist()

    freqs = {}
    for label, tweet in zip(ys, tweets):
        for word in process_tweet(tweet):
            pair = (word, label)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

In [12]:
# Count (word, label) occurrences over the training split only.
freqs = build_freqs(train, train_y)

In [13]:
# Number of distinct (word, label) pairs found in the training split.
len(freqs.keys())

11094

## Implement logistic regression

### sigmoid function

In [14]:
def sigmoid(z):
    '''
    Logistic function 1 / (1 + e^(-z)), applied element-wise.
    Input:
        z: a scalar or a NumPy array
    Output:
        the sigmoid of z, with the same shape as z
    '''
    exp_neg_z = np.exp(-z)
    return 1 / (1 + exp_neg_z)

### Gradient descent

In [31]:
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Fit logistic-regression weights with batch gradient descent.

    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: initial weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the final cost as a Python float. It is evaluated with the weights
           in effect at the start of the last iteration (i.e. before the last
           update); when num_iters is 0 it is the cost of the initial theta.
        theta: your final weight vector
    '''
    m = x.shape[0]

    def cost(h):
        # Average binary cross-entropy of predictions h against labels y.
        return -1./m * (np.dot(y.T, np.log(h)) + np.dot((1-y).T, np.log(1-h)))

    J = None
    for _ in range(num_iters):
        h = sigmoid(np.dot(x, theta))

        # Cost is recorded before the weights are updated.
        J = cost(h)

        # Step against the gradient of the cost with respect to theta.
        theta = theta - (alpha/m) * np.dot(x.T, (h-y))

    if J is None:
        # No iteration ran: report the cost of the unchanged weights instead
        # of failing on an unbound J.
        J = cost(sigmoid(np.dot(x, theta)))

    # J is a 1x1 array here; squeeze it first because implicit conversion of
    # arrays with ndim > 0 to a scalar is deprecated in recent NumPy.
    return float(np.squeeze(J)), theta

In [32]:
#examin our function
np.random.seed(1)
# X input is 10 x 3 with ones for the bias terms
tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
# Y Labels are 10 x 1
tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)

# Apply gradient descent
tmp_J, tmp_theta = gradientDescent(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 700)
print(f"The cost after training is {tmp_J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(tmp_theta)]}")

The cost after training is 0.67094970.
The resulting vector of weights is [4.1e-07, 0.00035658, 7.309e-05]


## Extract text features

### Given a list of tweets, extract the features and store them in a matrix. You will extract two features.
#### The first feature is the number of positive words in a tweet.
#### The second feature is the number of negative words in a tweet.
#### Then train your logistic regression classifier on these features.

In [24]:
def extract_features(tweet, freqs):
    '''
    Turn one raw tweet into a [bias, positive count, negative count] row.
    Input:
        tweet: the raw tweet text (a string); it is passed through
               process_tweet here
        freqs: a dictionary mapping each (word, label) tuple to its frequency
    Output:
        x: a feature vector of dimension (1,3)
    '''
    positive_total = 0
    negative_total = 0
    for token in process_tweet(tweet):
        # Frequency of this token under label 1 (positive) and label 0
        # (negative); unseen pairs contribute nothing.
        positive_total += freqs.get((token, 1.0), 0)
        negative_total += freqs.get((token, 0.0), 0)

    # Column 0 is the constant bias term.
    return np.array([[1, positive_total, negative_total]], dtype=float)

In [25]:
# Show one random training tweet and its feature row.
# randint's upper bound is exclusive, so len(train) (not a hard-coded 7999)
# is needed for the last tweet to be eligible too.
index = np.random.randint(0, len(train))
print(train[index])
print(extract_features(train[index],freqs))

An apartment makeover? http://t.co/ctLa1jppdb great ideas for factory living :)
[[1. 3. 7.]]


In [26]:
# Feature matrix: one [bias, positive count, negative count] row per
# training tweet.
X = np.zeros((len(train), 3))
for i, tweet in enumerate(train):
    X[i, :] = extract_features(tweet, freqs)

In [27]:
# Expect one row per training tweet and three feature columns.
X.shape

(8000, 3)

In [35]:
# Labels as an (m, 1) column vector, the shape gradientDescent documents for y.
Y = np.array(train_y).reshape(-1,1)
# Apply gradient descent from zero-initialized weights
# (learning rate 1e-9, 1500 iterations).
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {J}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.2425343780638846.
The resulting vector of weights is [7e-08, 0.00052301, -0.00055635]


## Test model


In [36]:
def prediction(tweet,freqs,theta):
    """Score a raw tweet with the trained logistic-regression weights.

    Input:
        tweet: the raw tweet text (a string)
        freqs: a dictionary mapping each (word, label) tuple to its frequency
        theta: weight vector, (3,1) to match the (1,3) feature row
    Output:
        the sigmoid of the feature row times theta (a 1x1 array for a
        (3,1) theta)
    """
    features = extract_features(tweet, freqs)
    logit = np.dot(features, theta)
    return sigmoid(logit)

In [38]:
# Score a few hand-written examples with the trained weights.
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    # prediction() returns a 1x1 array; reduce it to a Python float before
    # %f formatting, since implicit conversion of arrays with ndim > 0 to a
    # scalar is deprecated in recent NumPy.
    print( '%s -> %f' % (tweet, float(np.squeeze(prediction(tweet, freqs, theta)))))

I am happy -> 0.518539
I am bad -> 0.494320
this movie should have been great. -> 0.515288
great -> 0.515430
great great -> 0.530830
great great great -> 0.546172
great great great great -> 0.561427


In [41]:
## Test model
def test_logistic_regression(test_x, test_y, freqs, theta):
    y_hat = []
    
    for tweet in test_x:
        # get the label prediction for the tweet
        y_pred = prediction(tweet, freqs, theta)
        
        if y_pred > 0.5:
            # append 1.0 to the list
            y_hat.append(1)
        else:
            # append 0 to the list
            y_hat.append(0)

    accuracy = (y_hat==np.squeeze(test_y)).sum()/len(test_x)

    
    return accuracy

In [42]:
# Evaluate on the held-out split (tweets from index 4000 onward of each
# class), using the frequencies and weights obtained from the training split.
tmp_accuracy = test_logistic_regression(test, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9950
