# Sentiment Analysis using Logistic Regression

Case study: Indonesian-language tweets

## Import Libraries

In [1]:
# run this cell to import nltk
import nltk
from os import getcwd

In [2]:
# Register the "tmp2" folder, located one level above the current working directory,
# as an additional entry in NLTK's data search path (nltk.data.path).
# NOTE(review): presumably tmp2 holds pre-downloaded corpora so they do not have to be
# downloaded again after the workspace is refreshed -- confirm the folder exists there.

filePath = f"{getcwd()}/../tmp2/"
nltk.data.path.append(filePath)

## Import Sample Data

In [3]:
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples 
from utils_ind import process_tweet, build_freqs

In [4]:
# Read the labelled sample tweets and discard the 'id' column in one step,
# then show the resulting frame as the cell output.
df = pd.read_excel("sentiment_sample.xlsx").drop(columns=['id'])
df

Unnamed: 0,text,label
0,yuk sini kita musuhan biar aku teror km :(,-1
1,lu mah sukanga om om :(,-1
2,kannnn :( target a tapi bila dah macam ni dah ...,-1
3,tp klopun msk gw pribadipun blm yakin bisa non...,-1
4,2 personil mejile besok mau pulang tiati ya ka...,-1
...,...,...
195,"simple aja sih. jangan mainin perasaan cewek, ...",1
196,telkomsel: mhmdaliwafa jika log in instagram d...,1
197,telkomsel: lanwsp [3] info lengkap friday movi...,1
198,halobca: rizkyrinjani_ mhn bpk rizky infokan n...,1


In [5]:
# Partition the rows by sentiment label (1 = positive, -1 = negative)
# and keep each partition as a raw NumPy array of rows.
all_positive_tweets = df.loc[df['label'] == 1].values
all_negative_tweets = df.loc[df['label'] == -1].values

In [6]:
# Per class: the first 80 rows are used for training, every later row for testing.
# Only column 0 of the selected rows is kept.
train_pos, test_pos = all_positive_tweets[:80, 0], all_positive_tweets[80:, 0]
train_neg, test_neg = all_negative_tweets[:80, 0], all_negative_tweets[80:, 0]

# Positives come first, negatives second -- the label arrays must be built in the same order.
train_x = np.concatenate((train_pos, train_neg))
test_x = np.concatenate((test_pos, test_neg))

## Arrange Train and Test

In [7]:
# Label columns: a block of 1.0 for the positive tweets followed by a block of 0.0 for the negative ones
train_y = np.concatenate((np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))), axis=0)
test_y = np.concatenate((np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))), axis=0)

In [8]:
# Report the shapes of the training and test label arrays
print(f"train_y.shape = {train_y.shape}")
print(f"test_y.shape = {test_y.shape}")

train_y.shape = (160, 1)
test_y.shape = (40, 1)


In [9]:
# Build the frequency dictionary from the training tweets and their labels
freqs = build_freqs(train_x, train_y)

# Sanity check: container type and number of distinct keys
print(f"type(freqs) = {type(freqs)}")
print(f"len(freqs) = {len(freqs)}")

type(freqs) = <class 'dict'>
len(freqs) = 593


#### Removing punctuation and stopwords, stemming, and cleaning the text

In [10]:
# Show one raw training tweet next to the token list that process_tweet returns for it.
# train_x is built with the positive training tweets first, so index 0 is a positive example.
print('This is an example of a positive tweet: \n', train_x[0])
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(train_x[0]))

This is an example of a positive tweet: 
 lagi berjuang:)

This is an example of the processed version of the tweet: 
 ['juang', '']


## Logistic Regression Process

### Sigmoid Function

In [11]:
# UNQ_C1 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def sigmoid(z):
    '''
    Logistic function 1 / (1 + e^(-z)).

    Input:
        z: the input (a scalar or a NumPy array)
    Output:
        h: the sigmoid of z (element-wise when z is an array)
    '''
    # e^(-z) first, then the reciprocal of (1 + e^(-z))
    exp_neg_z = np.exp(-z)
    return 1 / (1 + exp_neg_z)

In [12]:
# Testing function
# sigmoid(0) = 1 / (1 + 1) is exactly representable, so an equality check is safe here.
if (sigmoid(0) == 0.5):
    print('SUCCESS!')
else:
    print('Oops!')

# The reference value is a 16-digit float; compare with a tight tolerance instead of '==',
# because the last bit of np.exp can differ between NumPy builds and platforms.
if np.isclose(sigmoid(4.92), 0.9927537604041685, rtol=0, atol=1e-12):
    print('CORRECT!')
else:
    print('Oops again!')

SUCCESS!
CORRECT!


### Gradient Descent and Cost Function

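In this section I use binary cross-entropy as the cost function, $J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log h^{(i)} + \left(1-y^{(i)}\right)\log\left(1-h^{(i)}\right)\right]$ with $h = \sigma(x\theta)$, and minimise it with gradient descent.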

In [13]:
# UNQ_C2 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Fit logistic-regression weights with batch gradient descent on the
    binary cross-entropy cost.

    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: initial weight vector of dimension (n+1,1); it is not modified in place
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the cost as a Python float. It is evaluated at the start of the last
           iteration, i.e. for the weights before the final update. When
           num_iters <= 0 it is the cost of the initial theta.
        theta: your final weight vector
    Hint: you might want to print the cost to make sure that it is going down.
    '''
    # get 'm', the number of rows in matrix x
    m = x.shape[0]

    # Smallest positive normal float: used as a floor inside the logarithms so that a
    # saturated sigmoid (h exactly 0 or 1) cannot turn the cost into 0 * log(0) = nan.
    log_floor = np.finfo(float).tiny

    def cost(h):
        # binary cross-entropy averaged over the m examples
        log_h = np.log(np.maximum(h, log_floor))
        log_one_minus_h = np.log(np.maximum(1 - h, log_floor))
        return -1. / m * (np.dot(y.T, log_h) + np.dot((1 - y).T, log_one_minus_h))

    J = None
    for _ in range(num_iters):

        # forward pass: sigmoid of the dot product of x and theta
        h = sigmoid(np.dot(x, theta))

        # cost for the current weights (before they are updated below)
        J = cost(h)

        # update the weights theta
        theta = theta - (alpha / m) * np.dot(x.T, (h - y))

    if J is None:
        # no iteration ran: report the cost of the unchanged initial weights
        J = cost(sigmoid(np.dot(x, theta)))

    # J is a (1,1) array for (m,1) labels; squeeze it to 0-d before converting, because
    # float() on arrays with ndim > 0 is deprecated since NumPy 1.25
    return float(np.squeeze(J)), theta

In [14]:
# Check the function on a small synthetic problem; the seed makes the draws reproducible
np.random.seed(1)
# X input is 10 x 3: a leading column of ones for the bias plus two uniform features scaled by 2000
tmp_X = np.concatenate((np.ones((10, 1)), np.random.rand(10, 2) * 2000), axis=1)
# Y labels are 10 x 1: 1.0 where a uniform draw exceeds 0.35, otherwise 0.0
tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)

# Train from all-zero weights and report the result
tmp_J, tmp_theta = gradientDescent(tmp_X, tmp_Y, np.zeros((3, 1)), alpha=1e-8, num_iters=700)
print(f"The cost after training is {tmp_J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(tmp_theta)]}")

The cost after training is 0.67094970.
The resulting vector of weights is [4.1e-07, 0.00035658, 7.309e-05]


### Extract Features

In [15]:
# UNQ_C3 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def extract_features(tweet, freqs):
    '''
    Turn one tweet into the three-element feature row used by the classifier.

    Input:
        tweet: the tweet to featurize (callers in this notebook pass a string);
               it is forwarded unchanged to process_tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output:
        x: a feature vector of dimension (1,3):
           [bias = 1, summed counts under label 1.0, summed counts under label 0.0]
    '''
    positive_total = 0
    negative_total = 0

    # Every token occurrence contributes its stored counts (a repeated word counts
    # again); (word, label) pairs missing from freqs contribute 0.
    for token in process_tweet(tweet):
        positive_total += freqs.get((token, 1.0), 0)
        negative_total += freqs.get((token, 0.0), 0)

    x = np.zeros((1, 3))
    x[0, :] = (1, positive_total, negative_total)

    assert(x.shape == (1, 3))
    return x

In [16]:
# Check your function

# test 1
# test on training data: featurize the first training tweet with the training frequencies
# NOTE(review): the process_tweet output printed earlier for this tweet contains an
# empty-string token; check whether that token is what drives the counts shown here.
tmp1 = extract_features(train_x[0], freqs)
print(tmp1)

[[ 1. 97. 88.]]


In [17]:
# test 2:
# check for when the words are not in the freqs dictionary
# (freqs.get falls back to 0, so only the bias entry should be non-zero)
tmp2 = extract_features('blorb bleeeeb bloooob', freqs)
print(tmp2)

[[1. 0. 0.]]


### Training the Model

In [18]:
# Build the training matrix X: one extract_features row (bias, positive sum, negative sum) per tweet
X = np.zeros((len(train_x), 3))
for row, tweet in enumerate(train_x):
    X[row, :] = extract_features(tweet, freqs)

# training labels corresponding to X
Y = train_y

# Train from all-zero weights with learning rate 1e-9 for 1500 iterations.
# NOTE(review): with zero initial weights the starting cost is ln(2) ~= 0.6931; the recorded
# cost after training (0.69226653) is barely lower, so these settings leave the weights
# almost untrained -- consider revisiting the learning rate / iteration count for this data.
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.69226653.
The resulting vector of weights is [-0.0, 3.636e-05, -3.2e-07]


### Test the Logistic Regression

In [19]:
# UNQ_C4 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def predict_tweet(tweet, freqs, theta):
    '''
    Score a single tweet with the logistic-regression weights.

    Input:
        tweet: a string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        theta: (3,1) vector of weights
    Output:
        y_pred: sigmoid of the dot product between the tweet's feature row and theta,
                i.e. the model's probability that the tweet is positive
    '''
    # (1,3) feature row for this tweet
    features = extract_features(tweet, freqs)

    # squash the linear score through the sigmoid
    return sigmoid(features.dot(theta))

In [20]:
# Run this cell to test your function
for tweet in ['Saya senang', 'Saya sedih', 'Film ini bagus', 'Luar biasa', 'Bagus', 'Bagus sekali']:
    # predict_tweet returns a (1, 1) array; .item() extracts the scalar explicitly because
    # letting '%f' convert an array with ndim > 0 implicitly is deprecated since NumPy 1.25
    print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta).item()))

Saya senang -> 0.500000
Saya sedih -> 0.500000
Film ini bagus -> 0.500000
Luar biasa -> 0.500000
Bagus -> 0.500000
Bagus sekali -> 0.500000


In [21]:
# Test
# Feel free to check the sentiment of your own tweet below
# The bare last expression makes the notebook display the returned (1, 1) probability array.

my_tweet = 'Saya pandai :)'
predict_tweet(my_tweet, freqs, theta)

array([[0.50086538]])

### Test Data Accuracy

In [22]:
# UNQ_C5 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def test_logistic_regression(test_x, test_y, freqs, theta):
    """
    Input: 
        test_x: a list of tweets
        test_y: (m, 1) vector with the corresponding labels for the list of tweets
        freqs: a dictionary with the frequency of each pair (or tuple)
        theta: weight vector of dimension (3, 1)
    Output: 
        accuracy: (# of tweets classified correctly) / (total # of tweets)
    """
    
    # the list for storing predictions
    y_hat = []
    
    for tweet in test_x:
        # get the label prediction for the tweet
        y_pred = predict_tweet(tweet, freqs, theta)
        
        if y_pred > 0.5:
            # append 1.0 to the list
            y_hat.append(1)
        else:
            # append 0 to the list
            y_hat.append(0)

    # With the above implementation, y_hat is a list, but test_y is (m,1) array
    # convert both to one-dimensional arrays in order to compare them using the '==' operator
    
    accuracy = (y_hat==np.squeeze(test_y)).sum()/len(test_x)
    
    return accuracy

In [23]:
# Evaluate the trained weights on the held-out test tweets.
# NOTE(review): the recorded result of 0.5000 is chance level if the test set is
# class-balanced -- check whether the model is predicting a single class for every tweet.
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.5000


## Error Analysis

In [24]:
# Error analysis: for every test tweet, report whether the thresholded prediction
# (probability > 0.5 means positive) disagrees with its label.
print('Label Predicted Tweet')
for x,y in zip(test_x,test_y):
    y_hat = predict_tweet(x, freqs, theta)

    if np.abs(y - (y_hat > 0.5)) > 0:
        # tokenize once and reuse the result for both printouts
        processed = process_tweet(x)
        print('THE TWEET IS:', x)
        print('THE PROCESSED TWEET IS:', processed)
        # y is a one-element row of test_y and y_hat a (1, 1) array; .item() extracts the
        # scalars explicitly because implicit conversion of arrays with ndim > 0 by
        # '%d' / '%f' is deprecated since NumPy 1.25
        print('%d\t%0.8f\t%s' % (y.item(), y_hat.item(), ' '.join(processed).encode('ascii', 'ignore')))
    else:
        print('NO ERROR')

Label Predicted Tweet
NO ERROR
NO ERROR
NO ERROR
NO ERROR
NO ERROR
NO ERROR
NO ERROR
NO ERROR
NO ERROR
NO ERROR
NO ERROR
NO ERROR
NO ERROR
NO ERROR
NO ERROR
NO ERROR
NO ERROR
NO ERROR
NO ERROR
NO ERROR
THE TWEET IS: bom akutuh sibuk bom sbl :(
THE PROCESSED TWEET IS: ['bom', 'akutuh', 'sibuk', 'bom', 'sbl', '']
0	0.50086538	b'bom akutuh sibuk bom sbl '
THE TWEET IS: yah sedih :(
THE PROCESSED TWEET IS: ['yah', 'sedih', '']
0	0.50086498	b'yah sedih '
THE TWEET IS: yu  , maap yah kmrn ga bsa dateng :( maap yaaahh. nasi tumpeng msh ada?? sarapan yes, martabak keju nih enak
THE PROCESSED TWEET IS: ['yu', 'maap', 'yah', 'kmrn', 'ga', 'bsa', 'dateng', '', 'maap', 'yaaahh', 'nasi', 'tumpeng', 'msh', 'sarap', 'yes', 'martabak', 'keju', 'nih', 'enak']
0	0.50087334	b'yu maap yah kmrn ga bsa dateng  maap yaaahh nasi tumpeng msh sarap yes martabak keju nih enak'
THE TWEET IS: sabar banget ya lu hyung :(
THE PROCESSED TWEET IS: ['sabar', 'banget', 'ya', 'lu', 'hyung', '']
0	0.50090068	b'sabar bange

## Test by using New Data

In [25]:
# Feel free to change the tweet below
my_tweet = 'Saya tidak senang belajar bahasa'
print(process_tweet(my_tweet))
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)
# Only probabilities strictly above 0.5 are reported as positive; exactly 0.5 counts as negative
print('Positive sentiment' if y_hat > 0.5 else 'Negative sentiment')

['senang', 'ajar', 'bahasa']
[[0.5]]
Negative sentiment
