Importing all the classes :-

In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import re
import string
import matplotlib.pyplot as plt
%matplotlib inline


Defining the cleaning/preprocessing method :-

In [3]:
def process_tweet(tweet):
    tweet1 = re.sub(r'https?:\/\/.*[\r\n]*','',tweet) #remove hyperlinks present
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True) #initiate tokenize to convert the string into list of words.
    tokens = tokenizer.tokenize(tweet1)
    stopwords_english = stopwords.words('english') #stopwords like the, is, etc that are irrelevant
    clean_tweets = []
    for word in tokens:
        if word not in stopwords_english and word not in string.punctuation:
            clean_tweets.append(word)
        
    stemmer = PorterStemmer() #stemmer for stemming words
    stemmed_tweets = []
    for word in clean_tweets:
        stem_word = stemmer.stem(word)
        stemmed_tweets.append(stem_word)

    return stemmed_tweets


Generating a word frequency dictionary based on the classification :-

In [4]:
def freq_dict(tweets,ys):
    yslist = np.squeeze(ys).tolist() #to prevent the list from ending up with one element
    freq = {}
    for y,tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word,y)
            freq[pair] = freq.get(pair,0)+1
    return freq


Cost function and Gradient Descent :-

In [5]:
'''x is matrix of features                                 (m,n+1)
   y is corresponding labels (positive and negative)       (m,1)
   theta is weight vector                                  (n+1,1)
   alp is learning rate            
   iter is number of interations'''

def gradient(x,y,theta,alp,iter):
    m = len(x)
    for i in range(0,iter):
        z = np.dot(x,theta)
        h = 1/(1+np.exp(-z))   #sigmoid function
        J = (-1/m)*(np.dot(y.T,np.log(h)) + np.dot((1-y).T,np.log(1-h)))
        theta = theta - (alp/m)*np.dot(x.T,h-y)
    plt.plot(z,h)
    return J, theta

Extract features from the tweets using freq dictionary :-

In [6]:

#Output as  x: a feature vector of dimension (1,3)

def extract_features(tweet,freq):
    words = process_tweet(tweet)
    x = np.zeros((1,3))   #3 elements in for of a 1X3 vector
    x[0,0] = 1            #bias term set to 1
    for word in words:
        x[0,1] += freq.get((word,1),0)    #increment count for positive label 1
        x[0,2] += freq.get((word,0),0)    #increment count for negative label 0
    assert(x.shape==(1,3))
    return x

Loading dataset :-

In [8]:
df = pd.read_csv("train_new.csv")
train_x = df["selected_text"].values
train_y = df["sentiment"].values
freqs = freq_dict(train_x,train_y)
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):       #creating a feature vector x for each tweet``
    X[i, :]= extract_features(train_x[i], freqs)
# training labels corresponding to X
Y = train_y

Applying Gradient descent :-

In [9]:
# Apply gradient descent
J, theta = gradient(X, Y, np.zeros((3, 1)), 1e-7, 1500)
print(f"The cost after training is {J}.")

Prediction method and testing the model :-

In [16]:
def predict_tweet(tweet, freqs, theta):
    # extract the features of the tweet and store it into x
    x = extract_features(tweet, freqs)
    
    # make the prediction using x and theta
    z = np.dot(x,theta)
    y_pred = 1/(1+np.exp(-z))
    return y_pred
    
def test_logistic_regression(test_x, test_y, freqs, theta):
   
    # the list for storing predictions
    y_hat = []
    
    for tweet in test_x:
        # get the label prediction for the tweet
        y_pred = predict_tweet(tweet, freqs, theta)

        if y_pred.any > 0.5:
            # append 1.0 to the list
            y_hat.append(1)
        else:
            # append 0 to the list
            y_hat.append(0)
# With the above implementation, y_hat is a list, but test_y is (m,1) array
    # convert both to one-dimensional arrays in order to compare them using the '==' operator
    y_hat = np.array(y_hat)
    test_y = test_y.reshape(-1)
    accuracy = np.sum((test_y == y_hat).astype(int))/len(test_x)
    
    return accuracy

In [17]:
test = pd.read_csv("test_new.csv")
test_x = test["selected_text"].values
test_y = test["sentiment"].values
freq1 = freq_dict(test_x,test_y)
accuracy = test_logistic_regression(test_x, test_y, freq1, theta)
print("Accuracy of the model is " + str(accuracy*100))

TypeError: '>' not supported between instances of 'builtin_function_or_method' and 'float'