# Implement Logistic Regression from scratch and apply it to Sentiment Analysis

Reference: https://medium.com/swlh/sentiment-analysis-from-scratch-with-logistic-regression-ca6f119256ab

### Text processing

In [1]:
import nltk
import numpy as np
from nltk.corpus import twitter_samples
positive_tweets =twitter_samples.strings('positive_tweets.json')
negative_tweets =twitter_samples.strings('negative_tweets.json')
example_postive_tweet=positive_tweets[0]
example_negative_tweet=negative_tweets[0]
test_pos = positive_tweets[4000:]
train_pos = positive_tweets[:4000]
test_neg = negative_tweets[4000:]
train_neg = negative_tweets[:4000]
train_x = train_pos + train_neg 
test_x = test_pos + test_neg
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [2]:
import re                                  
import string
from nltk.corpus import stopwords          
from nltk.stem import PorterStemmer        
from nltk.tokenize import TweetTokenizer

In [3]:
def text_process(tweet):
    tweet = re.sub(r'^RT[\s]+','',tweet) # remove retweet signs
    tweet = re.sub(r'https?:\/\/.*[\r\n]','',tweet) # remove https
    tweet = re.sub(r'#','',tweet) # remove #s
    tweet = re.sub(r'@\w*\s*','',tweet) # remove all names being ated.
    tweet = re.sub(r'[0-9]*','',tweet) # remove all numbers 
    tockenizer = TweetTokenizer()
    tweet_tokenized = tockenizer.tokenize(tweet)
    stopwords_english = [word for word in stopwords.words('english') if word not in ['i','we','you','no']] 
    # remove stopwords but keep i, we and you (first and second pronouns), and keep 'no'
    tweet_processed = [word for word in tweet_tokenized 
                       if word not in stopwords_english and 
                       word not in string.punctuation[1:]] # remove punctuations except !
    stemmer = PorterStemmer()
    tweet_after_stem = []
    for word in tweet_processed:
        word = stemmer.stem(word)
        tweet_after_stem.append(word)
    return tweet_after_stem

### Features extraction

#### Build a dictionary containing the frequency of the words in the positive tweets and the second dictionary will contain the frequency of the words in the negative tweets.

In [4]:
pos_words = []
for tweet in positive_tweets:
    tweet = text_process(tweet)
    for word in tweet:
        pos_words.append(word)

In [5]:
freq_pos={}
for word in pos_words:
    if (word,1) not in freq_pos:
        freq_pos[(word,1)]=1
    else:
        freq_pos[(word,1)]+=1

In [69]:
items = dict(sorted(freq_pos.items(), key=lambda item: item[1],reverse=True)).items()
n=0
for i in items:
    print(i)
    n+=1
    if n>20:
        break

((':)', 1), 3691)
(('!', 1), 1844)
(('you', 1), 1464)
(('i', 1), 1093)
((':-)', 1), 701)
((':d', 1), 659)
(('thank', 1), 643)
(('follow', 1), 447)
(('love', 1), 400)
(('...', 1), 290)
(('day', 1), 246)
(('u', 1), 245)
(('good', 1), 238)
(('like', 1), 232)
(('we', 1), 228)
(('happi', 1), 212)
(('get', 1), 209)
(('see', 1), 186)
(("i'm", 1), 183)
(('hi', 1), 176)
(('great', 1), 172)


In [7]:
neg_words = []
for tweet in negative_tweets:
    tweet = text_process(tweet)
    for word in tweet:
        neg_words.append(word)

In [8]:
freq_neg = {}
for word in neg_words:
    if (word,0) not in freq_neg:
        freq_neg[(word,0)]=1
    else:
        freq_neg[(word,0)]+=1

In [67]:
items = dict(sorted(freq_neg.items(), key=lambda item: item[1],reverse=True)).items()
n=0
for i in items:
    print(i)
    n+=1
    if n>20:
        break

((':(', 0), 4585)
(('i', 0), 2207)
(('!', 0), 829)
(('you', 0), 744)
((':-(', 0), 501)
(("i'm", 0), 343)
(('...', 0), 332)
(('miss', 0), 301)
(('no', 0), 285)
(('pleas', 0), 275)
(('follow', 0), 263)
(('want', 0), 246)
(('get', 0), 233)
(('go', 0), 223)
(('like', 0), 223)
(('♛', 0), 210)
(('》', 0), 210)
(('u', 0), 193)
(("can't", 0), 180)
(('me', 0), 174)
(('time', 0), 166)


In [10]:
import numpy as np
def feature_extraction(tweet,freq_pos,freq_neg):
    word_l=text_process(tweet)
    x = np.zeros((1,7))
    x[0,0] = 1
    for word in word_l:
        try:
            x[0,1]+=freq_pos[(word,1)]
        except:
            x[0,1]+=0
        try:
            x[0,2]+=freq_neg[(word,0)]
        except:
            x[0,2]+=0
        if word in ['i','we','u','you']:
            x[0,4] += 1

    if 'no' in word_l:
        x[0,3] = 1
    if '!' in word_l:
        x[0,5] = 1
    x[0,6] = np.log(len(word_l))
    
    assert x.shape == (1,7),'Shape is not correct'
    return x


In [11]:
X_train = np.zeros((len(train_x),7))
for i in range(len(train_x)):
    X_train[i,:] = feature_extraction(train_x[i],freq_pos,freq_neg)

In [12]:
print(X_train)
print(train_y)

[[1.00000000e+00 3.88900000e+03 7.40000000e+01 ... 0.00000000e+00
  0.00000000e+00 1.94591015e+00]
 [1.00000000e+00 1.00160000e+04 3.10600000e+03 ... 2.00000000e+00
  1.00000000e+00 2.89037176e+00]
 [1.00000000e+00 8.90200000e+03 2.68000000e+03 ... 3.00000000e+00
  1.00000000e+00 2.63905733e+00]
 ...
 [1.00000000e+00 2.37000000e+03 5.41300000e+03 ... 2.00000000e+00
  0.00000000e+00 2.19722458e+00]
 [1.00000000e+00 1.35100000e+03 7.07600000e+03 ... 1.00000000e+00
  0.00000000e+00 2.07944154e+00]
 [1.00000000e+00 1.33500000e+03 7.18800000e+03 ... 1.00000000e+00
  0.00000000e+00 2.39789527e+00]]
[[1.]
 [1.]
 [1.]
 ...
 [0.]
 [0.]
 [0.]]


## Write the model mathematically

### i)

There are 6 features in the model  
feature 1: sum of count of words that occur in positive tweets  
feature 2: sum of count of words that occur in negative tweets  
feature 3: binary; if 'no' is in tweet  
feature 4: sum of count of 1st and 2nd pronouns in the tweet  
feature 5: binary; if '!' is in tweet  
feature 6: log of number of words in tweet  

$$ log(\frac{p}{1-p}) = \beta_0 + \beta_{1}*\#positivity + \beta_{2}*\#negativity + \beta_{3} * no\_in\_doc + \beta_{4} * 1st\_2nd\_pronoun +  \beta_{5} * !\_in\_doc + \beta_{6} * log(\#words)$$

### ii) Write the likelihood function for your logistic regression model obtained in (i).

$$\prod^{n}_{i=1} p(x_i)^{y_i} * (1-p(x_i))^{1-y_i} $$
where $$p(x_i) = \frac{1}{1+e^{-\sum(\beta_{j}*x_{ij})}}$$ where $$x_{i0} = 1$$ and $$j \in \{0,1,2,3,4,5,6\} $$

### Train the logistic regression classifier using a black-box implementation and evaluate its performance on the test dataset.

In [13]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [14]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
black_box = LogisticRegression(penalty='none')
black_box.fit(X_train,train_y.ravel())

LogisticRegression(penalty='none')

In [15]:
X_test = np.zeros((len(test_x),7))
for i in range(len(test_x)):
    X_test[i,:] = feature_extraction(test_x[i],freq_pos,freq_neg)
len(X_test)

2000

In [16]:
y_pred = black_box.predict(scaler.transform(X_test))

In [17]:
accuracy_score(test_y.ravel(),y_pred)

0.9755

In [18]:
print(black_box.intercept_)
print(black_box.coef_[0][1:])

[-0.45121617]
[ 6.57621086 -6.86634029 -0.16300439  0.76632607 -1.15679155 -0.11777955]


## train the logistic regression classifier by minimizing the negative log-likelihood function using a numerical optimization procedure: gradient descent or stochastic gradient descent. Compare with the coefficients obtained in step (iii).

In [19]:
def sigmoid(Xbeta):
    return 1/(1+np.exp(-Xbeta))

In [20]:
X_train[:,0] = np.ones(X_train.shape[0])
X_train

array([[ 1.        ,  0.30545102, -1.11510595, ..., -0.70881191,
        -0.46713197,  0.04191112],
       [ 1.        ,  2.46057506, -0.1536601 , ...,  1.52059454,
         2.14072269,  1.5865076 ],
       [ 1.        ,  2.06873433, -0.28874451, ...,  2.63529777,
         2.14072269,  1.1755016 ],
       ...,
       [ 1.        , -0.22884528,  0.57788857, ...,  1.52059454,
        -0.46713197,  0.45291711],
       [ 1.        , -0.58727051,  1.10522513, ...,  0.40589131,
        -0.46713197,  0.26029175],
       [ 1.        , -0.59289838,  1.14074028, ...,  0.40589131,
        -0.46713197,  0.78109906]])

### Gradient Descent

In [21]:
gamma = 0.0000001 # learning rate
beta = np.zeros((7,1)) # initialize the 7 coefficients
iters = 1000000000
negLog_L_old=0
precision = 0.001
for k in range(iters):
    negLog_L =  - (train_y.T @ np.log(sigmoid(X_train @ beta)) + (1-train_y.T) @ np.log(1-sigmoid(X_train @ beta)))
    beta = beta - gamma * -1 * X_train.T @ (train_y-sigmoid(X_train @ beta))
    if np.abs(negLog_L_old-negLog_L) < precision:
        break
    else:
        negLog_L_old = negLog_L 
    #print(negLog_L_old)

In [22]:
print(beta)

[[-0.14933244]
 [ 2.83777927]
 [-3.4955372 ]
 [-0.15035754]
 [ 0.39991644]
 [-0.25972685]
 [-0.02214746]]


### Gradient Descent with exponentialy decaying learning rate

In [70]:
gamma = 0.0001 # exponentialy decaying rate
beta = np.zeros((7,1)) # initialize the 7 coefficients
iters = 1000000000
negLog_L_old=0
precision = 0.00001
for k in range(iters):
    negLog_L =  - (train_y.T @ np.log(sigmoid(X_train @ beta)) + (1-train_y.T) @ np.log(1.0001-sigmoid(X_train @ beta)))
    if pd.isna(negLog_L[0][0]):
        break
    beta = beta - gamma * np.exp(-0.00001*(k+1)) * -1 * X_train.T @ (train_y-sigmoid(X_train @ beta))
    if np.abs(negLog_L_old-negLog_L) < precision:
        break
    else:
        negLog_L_old = negLog_L 
    #print(negLog_L_old)

In [71]:
beta

array([[-0.45013639],
       [ 6.56400638],
       [-6.85476236],
       [-0.16305368],
       [ 0.7649779 ],
       [-1.15408355],
       [-0.11770085]])

In [72]:
beta_result = beta

### Stochastic GD

In [26]:
gamma = 0.01 # exponentialy decaying learning rate
beta = np.zeros((7,1)) # initialize the 7 coefficients
iters = 1000000000
negLog_L_old=0
precision = 0.0001
for k in range(iters):
    negLog_L =  - (train_y.T @ np.log(sigmoid(X_train @ beta)) + (1-train_y.T) @ np.log(1-sigmoid(X_train @ beta)))
    if pd.isna(negLog_L[0][0]):
        break
    #beta = beta - gamma * np.exp(-0.00001*(k+1)) * -1 * X_train.T @ (train_y-sigmoid(X_train @ beta))
    idx = np.random.randint(8000)
    beta = beta - gamma * np.exp(-0.0001*(k+1)) * -1 * X_train.T[:,idx].reshape((7,1)) @ (train_y[idx] - sigmoid(X_train[idx,:]@beta)).reshape((1,1))
    if np.abs(negLog_L_old-negLog_L) < precision:
        break
    else:
        negLog_L_old = negLog_L 
    #print(negLog_L_old)

In [27]:
print(beta)

[[-0.05639091]
 [ 1.59062501]
 [-2.14329671]
 [-0.09604767]
 [ 0.1417191 ]
 [-0.01338122]
 [ 0.10488665]]


### Minibatch GD

In [28]:
gamma = 0.01 # exponentialy decaying learning rate
beta = np.zeros((7,1)) # initialize the 7 coefficients
iters = 1000000000
negLog_L_old=0
precision = 0.0001
for k in range(iters):
    negLog_L =  - (train_y.T @ np.log(sigmoid(X_train @ beta)) + (1-train_y.T) @ np.log(1.00001-sigmoid(X_train @ beta)))
    if pd.isna(negLog_L[0][0]):
        break
    #beta = beta - gamma * np.exp(-0.00001*(k+1)) * -1 * X_train.T @ (train_y-sigmoid(X_train @ beta))
    idx = np.random.randint(8000,size=30)
    beta = beta - gamma * np.exp(-0.0001*(k+1)) * -1 * X_train.T[:,idx] @ (train_y[idx] - sigmoid(X_train[idx,:]@beta))
    if np.abs(negLog_L_old-negLog_L) < precision:
        break
    else:
        negLog_L_old = negLog_L 
    #print(negLog_L_old)

In [29]:
print(beta)

[[-0.33402575]
 [ 4.64684221]
 [-5.34238567]
 [-0.1872796 ]
 [ 0.60112884]
 [-0.87487977]
 [-0.08791036]]


### Use Gradient Descent Result to predict for test data

In [33]:
X_test = scaler.transform(X_test)
X_test[:,0] = np.ones(X_test.shape[0])

In [34]:
y_pred = sigmoid(X_test@beta_result)

In [39]:
y_category = []
for i in range(len(y_pred)):
    if y_pred[i]>0.5:
        y_category.append(1)
    else:
        y_category.append(0)

In [44]:
match = 0
for i in range(2000):
    if y_category[i]==test_y[i]:
        match +=1

In [45]:
print('accuracy: ', match/2000)

accuracy:  0.9755


### Our model achieves similar accuracy as the black box implementation (i.e., Logistic Regression with no penalty). 

In [48]:
print(black_box.intercept_)
print(black_box.coef_[0][1:])

[-0.45121617]
[ 6.57621086 -6.86634029 -0.16300439  0.76632607 -1.15679155 -0.11777955]


In [49]:

print(beta_result)

[[-0.45013639]
 [ 6.56400638]
 [-6.85476236]
 [-0.16305368]
 [ 0.7649779 ]
 [-1.15408355]
 [-0.11770085]]


### We used the default solver of LogisticRegression, which is lbfgs, and set penalty = None.  Before feeding into the black box, we scale the data to help the result converge and more stable. The black box result puts a higher weight on the first two features, which are the counts of positive and negative words. The remaining features have a relatively small effect.

### For our manual computation, we can see the prediction is as excellent as the black box version with an accuracy of 0.9755. Furthermore, the coefficients we obtained manually are roughly the same as the black box's.