In [1]:
#Importing dependencies
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples 
import re                                  
import string                              
from nltk.corpus import stopwords          
from nltk.stem import PorterStemmer        
from nltk.tokenize import TweetTokenizer

In [2]:
# Load the labelled example tweets from NLTK's twitter_samples corpus
# (positive file first, negative file second).
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings(file_name)
    for file_name in ('positive_tweets.json', 'negative_tweets.json')
)

In [3]:
# Split the data into two pieces, one for training and one for testing (validation set).
# The first TRAIN_SIZE_PER_CLASS tweets of each class are used for training;
# whatever remains in each class is held out for testing.
TRAIN_SIZE_PER_CLASS = 4000

train_pos = all_positive_tweets[:TRAIN_SIZE_PER_CLASS]
test_pos = all_positive_tweets[TRAIN_SIZE_PER_CLASS:]
train_neg = all_negative_tweets[:TRAIN_SIZE_PER_CLASS]
test_neg = all_negative_tweets[TRAIN_SIZE_PER_CLASS:]

# Positive tweets are placed before negative tweets in both combined lists.
train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [4]:
# Build the label column vectors: a 1 for every positive tweet stacked on top
# of a 0 for every negative tweet.
train_y = np.concatenate(
    [np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))],
    axis=0,
)
test_y = np.concatenate(
    [np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))],
    axis=0,
)

In [5]:
# Report the shapes of the training and test label arrays
print(f"train_y.shape = {train_y.shape}")
print(f"test_y.shape = {test_y.shape}")

train_y.shape = (8000, 1)
test_y.shape = (2000, 1)


In [6]:
# performs preprocessing on a tweet and outputs a list of words containing the processed tweet
def process_tweet(tweet):
    """Clean, tokenize, filter and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list with the stemmed tokens of the tweet, in their
            original order, after stopwords and punctuation tokens have been
            dropped

    """
    stemmer = PorterStemmer()
    # The stopword corpus comes back as a list; a set makes the per-token
    # membership test below constant-time instead of a linear scan.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # remove hashtags: only the hash sign is removed, the word itself is kept
    tweet = re.sub(r'#', '', tweet)

    # tokenize the tweet (lower-cased, @handles stripped, long character
    # repetitions shortened)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # NOTE: `word not in string.punctuation` is a substring test against the
    # punctuation string, so single punctuation characters are dropped while
    # tokens such as ":)" or "..." are kept.
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean

In [7]:
# takes a list of tweets and builds a dict that maps (word, label) -> count
def build_freqs(tweets, ys):
    """Count how often each processed word occurs under each label.

    Input:
        tweets: a list of tweet strings
        ys: the labels aligned with `tweets`, e.g. an (m, 1) array or a flat
            sequence with one label per tweet
    Output:
        freqs: a dict mapping (word, label) -> number of times that processed
            word appeared in tweets carrying that label
    """
    # squeeze() turns a single label into a 0-d array whose tolist() is a bare
    # scalar that zip() cannot iterate; atleast_1d keeps it a 1-element list.
    ys_list = np.atleast_1d(np.squeeze(ys)).tolist()
    freqs = {}
    for tweet, y in zip(tweets, ys_list):
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs

In [8]:
# takes a tweet string and returns a 1x3 nested list:
# [[bias, sum of positive counts of its words, sum of negative counts of its words]]
def extract_features(tweet, freqs):
    """Turn a tweet into the bias / positive / negative feature row.

    Input:
        tweet: a string containing a tweet
        freqs: a dict mapping (word, label) -> count
    Output:
        a list holding one row [1, pos_sum, neg_sum], where the sums add up
        the counts stored under (word, 1) and (word, 0) for every processed
        word of the tweet; words missing from freqs contribute nothing
    """
    positive_total = 0.0
    negative_total = 0.0
    for token in process_tweet(tweet):
        negative_total += freqs.get((token, 0), 0)
        positive_total += freqs.get((token, 1), 0)
    return [[1, positive_total, negative_total]]
            

In [17]:
# logistic (sigmoid) function
def sigmoid(z):
    '''
    Apply the logistic function 1 / (1 + e^(-z)) element-wise.

    Input:
        z: the input (a scalar or a NumPy array)
    Output:
        the sigmoid of z, with the same shape as z
    '''
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [10]:
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Fit logistic-regression weights with batch gradient descent.

    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the cost as a Python float. It is evaluated inside the last
           iteration, i.e. for the weights in effect just before that
           iteration's update; when num_iters is 0 it is the cost of the
           initial theta.
        theta: your final weight vector
    '''
    # get 'm', the number of rows in matrix x
    m = len(x)

    def cost(h):
        # mean binary cross-entropy of the predictions h against the labels y
        return (-1/m)*(np.dot(y.T, np.log(h)) + np.dot((1-y).T, np.log(1-h)))

    J = None
    for _ in range(num_iters):
        # predictions for the current weights
        h = sigmoid(np.dot(x, theta))

        # cost of the current weights (before they are updated below)
        J = cost(h)

        # update the weights theta
        theta = theta - (alpha/m) * (np.dot(x.T, h-y))

    if J is None:
        # No iteration ran, so the loop never assigned a cost; report the
        # cost of the untouched initial weights instead of failing.
        J = cost(sigmoid(np.dot(x, theta)))

    # J is a (1,1) array; squeeze it to 0-d first because float() on an
    # array with ndim > 0 is deprecated in recent NumPy releases.
    return float(np.squeeze(J)), theta

In [11]:
def train(X, y, freqs, alpha=1e-9, num_iters=1500):
    """Fit the sentiment classifier's weights on a list of tweets.

    Input:
        X: a list of tweet strings
        y: the labels for X, dimensions (m,1)
        freqs: a dict mapping (word, label) -> count
        alpha: learning rate passed to gradientDescent (default 1e-9)
        num_iters: number of gradient-descent iterations (default 1500)
    Output:
        a list with the three learned weights, in the order
        [bias, positive-sum weight, negative-sum weight], each rounded to
        8 decimal places
    """
    # one feature row [bias, pos_sum, neg_sum] per tweet
    features = np.zeros((len(X), 3))
    for row, tweet in enumerate(X):
        features[row, :] = extract_features(tweet, freqs)[0]

    # apply gradient descent starting from all-zero weights; the returned
    # cost is not needed here
    _, theta = gradientDescent(features, y, np.zeros((3, 1)), alpha, num_iters)
    return [round(t, 8) for t in np.squeeze(theta)]

In [12]:
# Count (word, label) occurrences over the training tweets, then fit the
# classifier weights on the features derived from those counts.
freqs = build_freqs(tweets=train_x, ys=train_y)
theta = train(X=train_x, y=train_y, freqs=freqs)

In [13]:
# Show the learned weights: [bias, positive-sum weight, negative-sum weight]
print(theta)

[6e-08, 0.0005382, -0.0005583]


In [14]:
def predict_tweet(tweet, freqs, theta):
    """Score a tweet with the trained logistic-regression weights.

    Input:
        tweet: a string containing a tweet
        freqs: a dict mapping (word, label) -> count
        theta: the three weights for the features [bias, pos_sum, neg_sum]
    Output:
        the sigmoid of the dot product between the tweet's feature row and
        theta, as a NumPy array
    """
    features = extract_features(tweet, freqs)
    return sigmoid(np.dot(features, theta))
    

In [16]:
# Ask for a tweet and report its predicted sentiment
# (positive when the score is at least 0.5, negative otherwise).
tweet = input("Enter any tweet: ")
result = predict_tweet(tweet, freqs, theta)
print("Positive Sentiment" if result >= 0.5 else "Negative Sentiment")

Enter any tweet: I am crying because I am so happy today
Positive Sentiment
