# Logistic Regression for Sentiment Analysis

Adapted from http://nbviewer.jupyter.org/github/rasbt/pattern_classification/blob/master/machine_learning/scikit-learn/outofcore_modelpersistence.ipynb

## 1. Preprocessing data

### 1.1 Importing libraries 

In [1]:
import numpy as np
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

### 1.2 Functions for pre-processing

#### 1.2.1 Tokenizing function 

The first important step is convert the comments into a list of keywords that we can analize later.

In [None]:
stop = stopwords.words('english') # Common words
porter = PorterStemmer() # Getting root of words
char3=stop[:17] # Getting 1st and 2nd person pronouns
stop=stop[17:116]+stop[118:] # Deliting 'no' and 'not' from stop list
# Te tokenizer function is used to get transform a comment into a list in order to be processed later.
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    text = [w for w in text.split() if w not in stop]
    tokenized = [porter.stem(w) for w in text]
    return tokenized

Let's give it at try:

In [2]:
tokenizer('This :) is no a <a> test! :-)</br>')

['no', 'test', ':)', ':)']

#### 1.2.2 CSV into yelds

First, we define a generator that returns the document body and the corresponding class label:

In [3]:
def stream_docs(path):
    with open(path, 'r') as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [7]:
doc_stream=stream_docs(path='shuffled_movie_data.csv')
docs, y = [], []
for _ in range(50000):
    text_aux, label =next(doc_stream)
    text=tokenizer(text_aux)
    docs.append(text)
    y.append(label)
    #print('\n',tokenizer(text))

#### 1.2.3 Processing first positive-negative words dataset

When we apply porter.stem we have repeated words in both lists, so we need deleting thouse repeated elements.

In [4]:
def removeDuplicates(listofElements):
    # Create an empty list to store unique elements
    uniqueList = []
    
    # Iterate over the original list and for each element
    # add it to uniqueList, if its not already there.
    for elem in listofElements:
        if elem not in uniqueList:
            uniqueList.append(elem)
    
    # Return the list of unique elements        
    return uniqueList

Now we process the positive-negative words dataset

In [5]:
positives=[line.strip() for line in open('opinion-lexicon-English/positive-words.txt')]
positives = [porter.stem(w) for w in positives]
print(len(positives))
positives=removeDuplicates(positives)
print(len(positives))
negatives=[line.strip() for line in open('opinion-lexicon-English/negative-words.txt')]
negatives = [porter.stem(w) for w in negatives]
print(len(negatives))
negatives=removeDuplicates(negatives)
print(len(negatives))
pron12=stop[:17]

2007
1391
4783
3160


### 1.3 Getting parameters referenced in links above

In [8]:
N=50000
X=np.zeros((N,6))
idx=0
for com in docs:
    if idx%1000==0:
        print(idx)
    X[idx,5]+=np.log(len(com)) #len: char 6
    for word in com:
        X[idx,3]+=char3.count(word) #pronoun: char 4
        if word=='!':
            X[idx,4]=1 #! simbol: char 5
        if (word=='no' or word=='not'):
            X[idx,2]=1 #! simbol: char 5
        X[idx,0]+=positives.count(word) #positive words : char 1
        X[idx,1]+=negatives.count(word) #negative words : char 2
    idx+=1

#print(X)
print(X.shape)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
(50000, 6)


We save data data in order to avoid repeat the processing time

In [86]:
import pandas as pd 
df = pd.DataFrame(X)
df.to_csv("X.csv")

### 1.4 Using positive-negative list base in histogram of the current dataset

We'll use a new list of words based in the current comments and create an input paramenter in order to the number its presence in each comment.

In [169]:
P2=100 # Number of features

#We separate common words
positive2 = pd.read_csv('data/positive.csv', index_col=0)
positive2_i = positive2.index.values[:1000]
stop2 = stopwords.words('english')
positive2_i=set(positive2_i).difference(stop2)


negative2 = pd.read_csv('data/negative.csv', index_col=0)
negative2_i = negative2.index.values[:1000]
negative2_i=set(negative2_i).difference(stop2)

# We proced to delete repeated words.
positive2=set(positive2_i).difference(negative2_i)
negative2=set(negative2_i).difference(positive2_i)
positive2=list(positive2)[:P2]
negative2=list(negative2)[:P2]

positive2 = [porter.stem(w) for w in positive2]
negative2 = [porter.stem(w) for w in negative2]

print(positive2)
print(negative2)

['older', 'terrif', 'beauti', 'follow', 'set', 'move', 'begin', 'disney', 'thank', 'centuri', 'imag', 'season', 'solid', 'match', 'deep', 'move', 'person', 'agre', 'touch', 'york', 'awesom', 'fantasi', 'truth', 'wonder', 'portray', 'present', 'today', 'sweet', 'enjoy', 'situat', 'brother', 'lead', 'creat', 'keep', 'western', 'emot', 'beauti', 'earlier', 'creepi', 'featur', 'rare', 'offic', 'scott', 'clever', 'uniqu', 'realist', 'portray', 'fill', 'return', 'adventur', 'marri', 'masterpiec', 'favorit', 'secret', 'memor', 'danc', 'plenti', 'natur', 'romanc', 'popular', '9', 'bring', 'social', 'stun', 'japanes', 'oscar', 'anim', 'see', 'geniu', 'manag', 'edg', 'natur', 'outstand', 'perfectli', 'anim', 'outsid', '70', 'touch', 'tale', 'insid', 'mark', 'spirit', 'cold', 'polit', 'subtl', 'support', 'die', 'pace', 'larg', 'dream', 'incred', 'creat', 'appreci', 'emot', 'recent', 'busi', 'paul', 'intellig', 'variou', 'surprisingli']
['maker', 'hardli', 'monster', 'fail', 'shoot', 'direct', 'la

Calculating the new features

In [170]:
X2=np.zeros((N,2*P2))
idx=0
for com in docs:
    #if com.count('terrific'):
        #print('foundddddd')
    if idx%1000==0:
        print(idx)
    idx2=0
    for word in positive2:
        X2[idx,idx2]=com.count(word)
        idx2+=1
    #print(idx2)
    for word in negative2:
        X2[idx,idx2]=com.count(word)
        idx2+=1
    idx+=1
print(X.shape)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
(50000, 6)


### 1.5 Merging all features

In [172]:
X_data=np.concatenate((X, X2), axis=1)

### 1.5 Splitting dataset

In [173]:
y=np.asarray(y)
y=y.reshape(y.shape[0],1)
x_train=X_data[:40000]
x_valid=X_data[40000:45000]
x_test=X_data[45000:50000]
y_train=y[:40000]
y_valid=y[40000:45000]
y_test=y[45000:50000]


In [174]:
def sigmoid(x):
    return 1 / (1 + np.e ** -x)

### 1.5 Training

In [179]:
############################################
# BATCH
############################################
alfa=0.001
reg=0.002
epochs=100000
W=np.random.rand(1,x_train.shape[1])/x_train.shape[1]
bias=np.random.rand(1,1)/x_train.shape[1]
W2=W*1
prec=0
ep=0
for epoch in range(epochs):
    err=np.transpose(y_train)-sigmoid(bias+np.matmul(W2,np.transpose(x_train)))
    dw=alfa*np.matmul(err,x_train)/x_train.shape[0]
    W2+=dw
    if epoch%1000==0:
        y_pred=np.round(sigmoid(np.matmul(x_valid,np.transpose(W2))))
        precision=100*(1-sum(abs(y_pred-y_valid))/y_pred.shape[0])
        print('Epoca: ',epoch)
        print('--------------')
        print('Precision: ',precision)
        if prec<precision:
            prec=precision
            ep=epoch
            W=W2

print(W2)


Epoca:  0
--------------
Precision:  [49.1]
Epoca:  1000
--------------
Precision:  [72.94]
Epoca:  2000
--------------
Precision:  [73.62]
Epoca:  3000
--------------
Precision:  [74.1]
Epoca:  4000
--------------
Precision:  [74.48]
Epoca:  5000
--------------
Precision:  [75.02]
Epoca:  6000
--------------
Precision:  [75.48]
Epoca:  7000
--------------
Precision:  [75.82]
Epoca:  8000
--------------
Precision:  [76.02]
Epoca:  9000
--------------
Precision:  [76.06]
Epoca:  10000
--------------
Precision:  [76.2]
Epoca:  11000
--------------
Precision:  [76.54]
Epoca:  12000
--------------
Precision:  [76.86]
Epoca:  13000
--------------
Precision:  [77.08]
Epoca:  14000
--------------
Precision:  [77.3]
Epoca:  15000
--------------
Precision:  [77.38]
Epoca:  16000
--------------
Precision:  [77.38]
Epoca:  17000
--------------
Precision:  [77.52]
Epoca:  18000
--------------
Precision:  [77.68]
Epoca:  19000
--------------
Precision:  [77.8]
Epoca:  20000
--------------
Precision

### 1.5 Getting results in test

In [180]:
err=np.transpose(y_train)-sigmoid(bias+np.matmul(W2,np.transpose(x_train)))
err.shape

(1, 40000)

In [181]:
from sklearn.metrics import confusion_matrix
y_pred2=np.round(sigmoid(np.matmul(x_test,np.transpose(W))))
confusion_matrix(y_test, y_pred2)


array([[1896,  572],
       [ 461, 2071]])

In [182]:
precision

array([79.86])