### importing libraries

In [16]:
import numpy as np
import nltk
import pandas as pd
import re
import string

#### download corpus that we need

In [2]:
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to C:\Users\Ahmed
[nltk_data]     Medra\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Ahmed
[nltk_data]     Medra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

### prepare the data
twitter sample data contains 10000 tweet; 5000 positive and 5000 nigative

In [5]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

#### split the data to train and test data

In [6]:
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg 
test_x = test_pos + test_neg

In [8]:
# combine positive and negative labels
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

print("train_y shape = " + str(train_y.shape))
print("test_y shape = " + str(test_y.shape))

train_y shape = (8000, 1)
test_y shape = (2000, 1)


### Preprocessing our data
* remove stock market tickers like dollar signs
* remove old style retweet text RT
* remove hyperlinks
* remove hashtags
* tokenize tweets
* remove stopwords
* remove punctuation

In [18]:
def process_tweet(tweet):
    
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

### Build a frequency table so that we can extract features from our data

In [19]:
# genetate frequency tabel for {(word,sentimet) : frequency}
def build_freqs(tweets, labels):
    yslist = np.squeeze(labels).tolist()

    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs

In [20]:
# create frequency dictionary
freqs = build_freqs(train_x, train_y)

# check the output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 11340


In [21]:
# test the function below
print('This is an example: \n', train_x[101])
print('\nThis is an example of the processed version: \n', process_tweet(train_x[101]))

This is an example: 
 @AxeRade haw phela if am not looking like Mom obviously am looking like him :)

This is an example of the processed version: 
 ['haw', 'phela', 'look', 'like', 'mom', 'obvious', 'look', 'like', ':)']


## Logistic Regression
### Sigmoid function

In [22]:
def sigmoid(z):     
    h = 1 / (1 + np.exp(-z))
    return h

### Gradient Decent and Cost function

In [23]:
def gradientDescent(x, y, theta, alpha, num_iters):
   
    # the number of rows in matrix x
    m = x.shape[0]     
    for i in range(0, num_iters):
        
        # the dot product of x and theta
        z = np.dot(x,theta)
        
        # sigmoid of h
        h = sigmoid(z)
        
        # calculate the cost function
        J = -1./m * (np.dot(y.transpose(), np.log(h)) + np.dot((1-y).transpose(),np.log(1-h)))                                                    

        # update the weights theta
        theta = theta - (alpha/m) * np.dot(x.transpose(),(h-y))
        
    J = float(J)
    return J, theta

## Extract Features from the data
### extract_features function 

In [24]:
def extract_features(tweet, freqs):
    
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)
    # 3 elements in the form of a 1 x 3 vector
    # The first feature is the number of positive words in a tweet.
    # The second feature is the number of negative words in a tweet.
    x = np.zeros((1, 3)) 
    
    #bias term is set to 1
    x[0,0] = 1 
    
    # loop through each word in the list of words
    for word in word_l:
        
        # increment the word count for the positive label 1
        x[0,1] += freqs.get((word, 1.0),0)
        
        # increment the word count for the negative label 0
        x[0,2] += freqs.get((word, 0.0),0)
        
    assert(x.shape == (1, 3))
    return x

### Train the model

In [26]:
# collect the features x and into a matrix X
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :]= extract_features(train_x[i], freqs)

# training labels corresponding to X
Y = train_y

# Apply gradient descent (features, labels, theta, learning_rate, num_of_itrations)
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)

### Test the model

In [27]:
def predict_tweet(tweet, freqs, theta):
   
    # extract the features of the tweet and store it into x
    x = extract_features(tweet,freqs)
    
    # make the prediction using x and theta
    y_pred = sigmoid(np.dot(x,theta))
    
    return y_pred

In [28]:
def test_logistic_regression(test_x, test_y, freqs, theta):
       
    # the list for storing predictions
    y_hat = []
    
    for tweet in test_x:
        # get the label prediction for the tweet
        y_pred = predict_tweet(tweet, freqs, theta)
        if y_pred > 0.5:
            # append 1.0 to the list
            y_hat.append(1)
        else:
            # append 0 to the list
            y_hat.append(0)

    # convert y_hat & test_y to one-dimensional arrays to compare them    
    accuracy = (y_hat==np.squeeze(test_y)).sum()/len(test_x)
    
    return accuracy

In [29]:
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"model's accuracy = {tmp_accuracy:.4f}")

model's accuracy = 0.9950


## Using sklearn library

In [30]:
from sklearn.linear_model import LogisticRegression

lg = LogisticRegression()

In [44]:
tx = []
for tweet in test_x:
        # make a numpy array of test tweets
        tx.append(extract_features(tweet,freqs))
tx = np.array(tx).squeeze()

In [46]:
lg.fit(X,Y)
lg.score(tx,test_y)

  return f(**kwargs)


0.9915