# NLP - Sentiment Analysis from scratch

## Bogdan Macovei, Grupa 344
## Universitatea din Bucuresti, Facultatea de Matematica si Informatica

## Tf-idf vectorizer

In [1]:
import numpy as np
import math

In [2]:
corpus = ['El se duce sa vada unde este.', 
          'Se pare ca este unde se credea.', 
          'Cine se duce sa il caute?']

In [3]:
corpus

['El se duce sa vada unde este.',
 'Se pare ca este unde se credea.',
 'Cine se duce sa il caute?']

In [4]:
words = []
for i in range(0, len(corpus)):
    word = corpus[i].split(' ')
    for j in range(0, len(word)):
        words.append(word[j].replace("-", '').replace(",", '').replace(".", '').replace("[", '')
             .replace("]", '').replace("_", '').replace("\\", '').replace("\'", '')
             .replace("0", '').replace("1", '').replace("2", '').replace("3", '').replace("4", '')
             .replace("5", '').replace("6", '').replace("7", '').replace("8", '').replace("9", '')
             .replace("(", '').replace(")", '').replace("?", '').lower())

In [5]:
words = set(words)
words = list(words)

In [6]:
X = np.zeros((len(corpus), len(words)))

## Tf

In [7]:
for i in range(0, len(corpus)):
    for j in range(0, len(words)):
        X[i][j] = corpus[i].lower().count(words[j])

In [8]:
X

array([[0., 1., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 1.],
       [1., 1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 2.],
       [0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1.]])

In [9]:
tf = list(map(lambda x: list(map(lambda y: y / len(x), x)), X))

## Idf

In [10]:
N = len(corpus)
idf = np.zeros((len(corpus), len(words)))
for j in range(0, len(words)):
    count = 0
    for i in range(0, N):
        if X[i][j] != 0:
            count = count + 1
        if count != 0:
            for i in range(0, N):
                if X[i][j] != 0:
                    idf[i][j] = math.log(N / count)
            

In [11]:
idf

array([[0.        , 0.40546511, 0.        , 1.09861229, 0.40546511,
        1.09861229, 0.        , 0.        , 0.        , 0.40546511,
        0.        , 0.40546511, 0.        ],
       [1.09861229, 0.40546511, 1.09861229, 0.        , 0.        ,
        0.        , 0.40546511, 0.        , 0.        , 0.40546511,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.40546511,
        0.        , 0.40546511, 1.09861229, 1.09861229, 0.        ,
        1.09861229, 0.40546511, 0.        ]])

In [12]:
tf

[[0.0,
  0.07692307692307693,
  0.0,
  0.07692307692307693,
  0.07692307692307693,
  0.07692307692307693,
  0.0,
  0.0,
  0.0,
  0.07692307692307693,
  0.0,
  0.07692307692307693,
  0.07692307692307693],
 [0.07692307692307693,
  0.07692307692307693,
  0.07692307692307693,
  0.0,
  0.0,
  0.0,
  0.07692307692307693,
  0.0,
  0.0,
  0.07692307692307693,
  0.0,
  0.0,
  0.15384615384615385],
 [0.0,
  0.0,
  0.0,
  0.0,
  0.07692307692307693,
  0.0,
  0.07692307692307693,
  0.07692307692307693,
  0.07692307692307693,
  0.0,
  0.07692307692307693,
  0.07692307692307693,
  0.07692307692307693]]

In [13]:
tf * idf

array([[0.        , 0.03118962, 0.        , 0.08450864, 0.03118962,
        0.08450864, 0.        , 0.        , 0.        , 0.03118962,
        0.        , 0.03118962, 0.        ],
       [0.08450864, 0.03118962, 0.08450864, 0.        , 0.        ,
        0.        , 0.03118962, 0.        , 0.        , 0.03118962,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.03118962,
        0.        , 0.03118962, 0.08450864, 0.08450864, 0.        ,
        0.08450864, 0.03118962, 0.        ]])

## Demo Movie Review - Sentiment Analysis

In [14]:
import csv
import matplotlib.pyplot as plt

### Preluarea datelor

In [15]:
with open('train.tsv', 'r') as fin:
    cr = csv.reader(fin, delimiter='\t')
    corpus = [line for line in cr]

In [16]:
corpus = corpus[1:]

In [17]:
df = []
for i in range(0, len(corpus)):
    df.append(corpus[i][2]
      .replace("-", '').replace(",", '').replace(".", '').replace("[", '')
      .replace("]", '').replace("_", '').replace("\\", '').replace("\'", '')
      .replace("0", '').replace("1", '').replace("2", '').replace("3", '').replace("4", '')
      .replace("5", '').replace("6", '').replace("7", '').replace("8", '').replace("9", '')
      .replace("(", '').replace(")", '').lower())

### Aplicare Tfidf-vectorizer

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
vectorizer = TfidfVectorizer(strip_accents='ascii', stop_words='english', token_pattern=r'(?u)\b[A-Za-z]+\b')
vector = vectorizer.fit_transform(df)

In [22]:
vector.shape

(156060, 15896)

In [23]:
X = vector.toarray()

In [18]:
y = []
for i in range(0, len(corpus)):
    if (corpus[i][3] == '0' or corpus[i][3] == '1'):
        y.append(0)
    else:
        y.append(1)

Pentru a antrena mai rapid, vom limita setul de date (e doar demonstrativ, pentru o analiza reala se elimina celula urmatoare):

In [59]:
X = X[0:8000]
y = y[0:8000]

Split-uim datele in train si test:

In [60]:
X_train = []
y_train = []
X_test = []
y_test = []
for i in range(0, X.shape[0]):
    if i % 5 == 0:
        X_test.append(X[i])
        y_test.append(y[i])
    else:
        X_train.append(X[i])
        y_train.append(y[i])

Definim o functie care sa ne ajute in evaluarea modelului:

In [28]:
def model_evaluation(prediction, y_test):
    tp = 0; tn = 0; fp = 0; fn = 0;
    for i in range(0, len(prediction)):
        if prediction[i] == 0 and y_test[i] == 0:
            tn = tn + 1
        elif prediction[i] == 1 and y_test[i] == 1:
            tp = tp + 1
        elif prediction[i] == 1 and y_test[i] == 0:
            fp = fp + 1
        else:
            fn = fn + 1
        
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    
    return precision, recall, accuracy

Definim functia sigmoid, necesara antrenarii modelului in implementarea Gradient Descent:

In [29]:
def sigmoid(x):
    return 1 / (1 + math.exp(-x))

### Antrenarea modelului

#### Modelul Perceptron

In [31]:
def perceptron(X_train, y_train):
    m = len(X_train); n = len(X_train[0])
    
    weights = np.zeros((n, 1))
    weights = weights.reshape((n, 1))
    bias = np.zeros((1, 1))
    
    max_iter = 1 # setam numarul de perioade de evolutie
    
    for iteration in range(0, max_iter):
        print(iteration, '/', max_iter - 1)
        errors = 0
        for i in range(0, m):
            xi = np.array(X_train[i])
            xi = xi.reshape((n, 1))
            predicted = np.sign(weights.T.dot(xi) + bias)
            
            if y_train[i] * predicted <= 0:
                weights = weights + y_train[i] * xi
                bias = bias + y_train[i]
                errors = errors + 1
                
        if errors == 0:
            break
        
        y_pred = np.sign(X_t.dot(weights) + bias)
        precision, recall, accuracy = model_evaluation(y_pred, y_test)
        print('Errors:', errors, 'Train accuracy:', (m - errors) / m, 'Test accuracy:', accuracy)
        print('')
        
    return weights, bias

Pentru rapiditate, vom antrena pe o singura epoca:

In [33]:
X_t = np.array(X_test)
weights, bias = perceptron(X_train, y_train)

0 / 0
Errors: 4715 Train accuracy: 0.8035416666666667 Test accuracy: 0.7998333333333333



In [34]:
y_pred = np.sign(X_t.dot(weights) + bias)

In [35]:
precision, recall, accuracy = model_evaluation(y_pred, y_test)

In [36]:
accuracy

0.7998333333333333

In [37]:
precision

0.7998333333333333

In [38]:
recall

1.0

#### Modelul utilizand Gradient Descent

In [74]:
def GradientDescentFit(X_train, y_train):
    m = len(X_train); alpha = 1e-1; coef = alpha/m
    oldW = np.random.rand(len(X_train[0]) + 1)
    oldW = oldW.reshape((len(X_train[0]) + 1, 1))

    localX_train = list(X_train)
    localX_train = np.array(list(map(lambda x: np.concatenate([[1], x]), localX_train))) # adaug o coloana de 1
    
    localy_train = np.array(y_train)
    localy_train = localy_train.reshape((len(y_train), 1))
    
    while 1:
        product = localX_train.dot(oldW)
        applied_sigmoid = np.array(list(map(lambda x: sigmoid(x), product)))
        applied_sigmoid = applied_sigmoid.reshape(localy_train.shape)
        
        newW = oldW - coef * localX_train.T.dot(applied_sigmoid - localy_train)
        
        if np.linalg.norm(newW) > np.linalg.norm(oldW):
            print(np.linalg.norm(newW) - np.linalg.norm(oldW))
            break
        
        oldW = newW
    
    return newW

In [75]:
weights = GradientDescentFit(X_train, y_train)

In [65]:
X_t = list(X_test)
X_t = np.array(list(map(lambda x: np.concatenate([[1], x]), X_t)))

In [66]:
pred_init = X_t.dot(weights)
y_pred = list(map(lambda x: round(sigmoid(x)), pred_init))

In [79]:
precision, recall, accuracy = model_evaluation(y_pred, y_test)
print('Precision: ', precision, 'Recall: ', recall, 'Accuracy: ', accuracy)

Precision:  0.800625 Recall:  1.0 Accuracy:  0.800625
