In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_svmlight_file
import os

In [24]:
import nltk
from nltk.corpus import stopwords

In [25]:
stopword_lst = list(stopwords.words('english'))

## Importing Bag of Words for Reviews

The feature indices in these files start from 0, and the text
tokens corresponding to a feature index is found in [imdb.vocab]. So a
line with 0:7 in a .feat file means the first word in [imdb.vocab]
(the) appears 7 times in that review.

In [27]:
X_train, y_train = load_svmlight_file("../Data/train/labeledBow.feat")

X_train = X_train.todense()
y_train = pd.DataFrame(y_train).rename(columns = {0 :"Score"})

In [28]:
X_train

matrix([[ 9.,  1.,  4., ...,  0.,  0.,  0.],
        [ 7.,  4.,  2., ...,  0.,  0.,  0.],
        [ 4.,  4.,  4., ...,  0.,  0.,  0.],
        ...,
        [17.,  6.,  7., ...,  0.,  0.,  0.],
        [15.,  8.,  3., ...,  0.,  0.,  0.],
        [10.,  2.,  2., ...,  0.,  0.,  0.]])

In [29]:
y_train.head()

Unnamed: 0,Score
0,9.0
1,7.0
2,9.0
3,10.0
4,8.0


In [362]:
y_train["Sentiment"] = 0
y_train.loc[y_train["Score"] >= 7, ("Sentiment")] = 1
y_train.loc[y_train["Score"] <= 4, ("Sentiment")] = 0
y_train

Unnamed: 0,Score,Sentiment
0,9.0,1
1,7.0,1
2,9.0,1
3,10.0,1
4,8.0,1
...,...,...
24995,1.0,0
24996,1.0,0
24997,4.0,0
24998,2.0,0


## Importing Vocab

In [31]:
Vocabulary = []
f = open("../Data/imdb.vocab", "r")
for token in f:
    Vocabulary.append(token.replace('\n',''))
f.close()

## Importing Expected Rating Per Token

In [32]:
Expected_Vocabulary_Rating = []
f = open("../Data/imdbEr.txt", "r")
for expected_rating in f:
    Expected_Vocabulary_Rating.append(float(expected_rating.replace('\n','')))
f.close()

## Reduce Dimensions

In [None]:
stopwords_idx = []
for word in stopword_lst:
    try:
        stopwords_idx.append(Vocabulary.index(word)) 
    except:
        pass

In [114]:
def calc_tf_idf(X, max_features=None):
    num_terms_in_document = X.sum(axis=1).reshape(X.shape[0],1)

    tf = (X / num_terms_in_document)

    num_documents = X.shape[0]

    num_documents_with_t = np.count_nonzero(X, axis=0) + 1

    idf = np.log((num_documents + 1) / num_documents_with_t) + 1

    return np.multiply(tf,idf)

In [115]:
tf_idf = calc_tf_idf(X_train)

In [116]:
tf_idf = pd.DataFrame(tf_idf)

In [117]:
tf_idf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,89517,89518,89519,89520,89521,89522,89523,89524,89525,89526
0,0.065762,0.007493,0.029961,0.030506,0.046244,0.032135,0.016168,0.016339,0.036578,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.061917,0.036284,0.018134,0.018464,0.000000,0.038900,0.009786,0.000000,0.022139,0.019287,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.035073,0.035968,0.035953,0.064063,0.018498,0.009640,0.009701,0.009803,0.000000,0.009560,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.096957,0.019886,0.019878,0.000000,0.030681,0.021320,0.042907,0.021680,0.000000,0.010571,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.056503,0.040116,0.026732,0.018146,0.009169,0.023893,0.048086,0.029156,0.000000,0.009477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.061319,0.027948,0.031428,0.026667,0.019763,0.014982,0.007538,0.020948,0.017053,0.014856,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24996,0.021454,0.044004,0.043985,0.016795,0.016973,0.000000,0.035604,0.017990,0.013425,0.017543,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24997,0.074208,0.026860,0.031322,0.022781,0.004604,0.009599,0.009659,0.009761,0.010926,0.004759,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24998,0.098858,0.054070,0.020267,0.041273,0.000000,0.043476,0.014583,0.022105,0.000000,0.014371,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
weights = tf_idf.sum(axis = 0)

In [126]:
top_1000 = weights.sort_values(ascending = False).head(1000).index

In [747]:
X_train_subset = pd.DataFrame(X_train[:,top_1000])
X_train_subset_tfidf = tf_idf[top_1000]

In [731]:
X_train_subset_tfidf

Unnamed: 0,0,2,1,3,4,5,6,8,7,9,...,963,877,1231,1029,1024,1083,956,967,1119,1020
0,0.065762,0.029961,0.007493,0.030506,0.046244,0.032135,0.016168,0.036578,0.016339,0.000000,...,0.0,0.034249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,0.061917,0.018134,0.036284,0.018464,0.000000,0.038900,0.009786,0.022139,0.000000,0.019287,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,0.035073,0.035953,0.035968,0.064063,0.018498,0.009640,0.009701,0.000000,0.009803,0.009560,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,0.096957,0.019878,0.019886,0.000000,0.030681,0.021320,0.042907,0.000000,0.021680,0.010571,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,0.056503,0.026732,0.040116,0.018146,0.009169,0.023893,0.048086,0.000000,0.029156,0.009477,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.061319,0.031428,0.027948,0.026667,0.019763,0.014982,0.007538,0.017053,0.020948,0.014856,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
24996,0.021454,0.043985,0.044004,0.016795,0.016973,0.000000,0.035604,0.013425,0.017990,0.017543,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
24997,0.074208,0.031322,0.026860,0.022781,0.004604,0.009599,0.009659,0.010926,0.009761,0.004759,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
24998,0.098858,0.020267,0.054070,0.041273,0.000000,0.043476,0.014583,0.000000,0.022105,0.014371,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031924


## General ML Functions

In [851]:
# Splitting Training Set into Training and Validation 
def train_test_split(frac, X, y):
    X_train = X.sample(frac = frac)
    X_test = X.drop(X_train.index, axis = 0)
    

    y_train = y.loc[X_train.index]
    
   
    y_test = y.loc[X_test.index]
    return X_train, y_train, X_test, y_test

def accuracy(yhat,y):
    yhat_copy = yhat.copy()
    yhat_copy[yhat >= 0.5] = 1
    yhat_copy[yhat < 0.5] = 0
    return (yhat_copy == y).mean()


## Logistic Regression

Use linear predictor to model the log-odds

### Form

\begin{align}
\log \frac{P(Y_i = 1)}{1 - P(Y_i = 1)} &= \beta_0 + \beta_1x_i \\
P(Y_i = 1) &= \frac{1}{1 + e^{-(\beta_0 + \beta_1x_i)}}
\end{align}

### Loss Function

\begin{align}
L(Y, \hat p) &= -\log [\hat p ^{Y}(1 - \hat p)^{1 - Y}] \\
&= -Y \log \hat p - (1 - Y) \log (1 -\hat p)
\end{align}

### Estimating Logistic Regression Coefficients

WTF $\beta_j \leftarrow \beta_j - \eta\frac{\partial L}{\partial \beta_j}$

\begin{align}
\frac{\partial L}{\partial \beta_j} &= \frac{\partial L}{\partial p}\frac{\partial \hat p}{\partial \beta_j} \\
&=  \bigg(-\frac{Y}{\hat p} + \frac{1 - Y}{1 - \hat p}\bigg)\big(\hat p (1 - \hat p)\cdot x_j\big) \\
&= \big(\hat p - Y\big)x_j
\end{align}


In [709]:
class LogisticRegression:

    def fit(self, X, y, descent = "stochastic"):
        X = X.values
        intercept_col = np.ones(X_train.shape[0]).reshape(X_train.shape[0],1)
        X = np.hstack((intercept_col, X))
        self._X = X
        self._y = y
        if descent == "stochastic":
            self._b = self._stochastic_gradient_descent(X,
                                                        y,
                                                        n_iter = 100000,
                                                        learning_rate = 0.1,
                                                        lossConvergence = .001,
                                                        batch_size = 32)
        else:
             self._b = self._gradient_descent(X,
                                             y,
                                             n_iter = 10000,
                                             learning_rate = 0.1)           
        
    # Stochastic Gradient Descent Algorithm for Logistic Regression 
    def _stochastic_gradient_descent(self, X, y, n_iter = 10000, learning_rate = 0.01, lossConvergence = .001, batch_size = 128):
        y = y.values.reshape(len(y),1)
        betas = np.zeros(X.shape[1]).reshape(X.shape[1],1) + 0.1
        for i in range(n_iter):
            X_index = np.arange(X.shape[0])
            np.random.shuffle(X_index)
            batch_index = X_index[:batch_size]

            X_batch = X[batch_index,:]
            y_batch = y[batch_index,:]

            yhat = self._sigmoid(np.dot(X_batch, betas))
            gradient = np.dot(X_batch.T,(y_batch - yhat))
            if self._loss(yhat, y_batch) < lossConvergence:
                break
            betas +=  learning_rate * (gradient/X.shape[1])
        return betas
    
   # Gradient Descent Algorithm for Logistic Regression 
    def _gradient_descent(self, X,y, n_iter = 10000, learning_rate = 0.01, lossConvergence = .001):
        y = y.values.reshape(len(y),1)
        betas = np.zeros(X.shape[1]).reshape(X.shape[1],1) + 0.1
        for i in range(n_iter):
            yhat = self._sigmoid(np.dot(X, betas))
            gradient = np.dot(X.T,(y - yhat))
            if self._loss(yhat, y) < lossConvergence:
                break
            betas +=  learning_rate * (gradient/X.shape[1])
        return X, betas
    
    def _loss(self, yhat, y):
        loss_vals = yhat.copy()
        loss_vals[y == 1] = -np.log(yhat[y==1])
        loss_vals[y == 0] = -np.log(1 - yhat[y==0])
        return(loss_vals.mean())

    def _sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
        
    def predict(self, X):
        X = X.values
        intercept_col = np.ones(X_train.shape[0]).reshape(X_train.shape[0],1)
        X = np.hstack((intercept_col, X))
        return self._sigmoid(np.dot(X, self._b))
    

In [710]:
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train_subset_tfidf, y_train["Sentiment"])

In [852]:
yhat = logistic_regression.predict(X_train_subset_tfidf)
accuracy(yhat,y)

0.75468

## Linear Discriminant Analysis

In [847]:
class LinearDiscriminantAnalysis:

    def fit(self, df, y, label = "Sentiment"):
        self._df = df.copy()
        self._label = label
        self._df[label] = y
        self._classes =  self._df[label].unique()
        
        self._class_means = self._get_class_means()
        self._cov_matrix = df.cov()
        
        self._w = self._w_calc()
        self._c = self._c_calc()
    
    def _w_calc(self):
        cov_inv = np.linalg.inv(self._cov_matrix)
        diff_vec = self._class_means[1] - self._class_means[0]
        w = np.dot(cov_inv, diff_vec)
        return w
    
    def _c_calc(self):
        sum_vec = self._class_means[1] + self._class_means[0]
        return np.dot(self._w, 0.5 * (sum_vec))
        
    def predict(self, X):
        X = X.values
        w = self._w
        c = self._c
        score = np.dot(X, w)
        yhat = score.copy()
        yhat[score > c] = 1
        yhat[score <= c] = 0
        return yhat.reshape(len(X),1)
            
            
    def _get_class_means(self):
        class_feature_means = pd.DataFrame()

        for c, rows in self._df.groupby(self._label):
            class_feature_means[c] = rows.mean()

        return class_feature_means.drop([self._label], axis = 0)
    
    def _discrim_func(self, X, k):
    
        pi_k = len(self._df[self._df[self._label] == k]) #[0]

        c_means = self._class_means[k]

        cov_inv = np.linalg.inv(self._cov_matrix)

        result = X.transpose().dot(cov_inv).dot(c_means)
        result = result - (1/2) * c_means.transpose().dot(cov_inv).dot(c_means) 
        result = result + np.log(pi_k)

        return result

In [848]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train_subset_tfidf, y_train["Sentiment"])

In [850]:
def accuracy(yhat,y):
    yhat_copy = yhat.copy()
    yhat_copy[yhat >= 0.5] = 1
    yhat_copy[yhat < 0.5] = 0
    return (yhat_copy == y).mean()

In [853]:
yhat = lda.predict(X_train_subset_tfidf)
accuracy(yhat, y)

0.87164