In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_svmlight_file

## Importing Bag of Words for Reviews

The feature indices in these files start from 0, and the text
tokens corresponding to a feature index is found in [imdb.vocab]. So a
line with 0:7 in a .feat file means the first word in [imdb.vocab]
(the) appears 7 times in that review.

In [2]:
X_train, y_train = load_svmlight_file("../Data/train/labeledBow.feat")

X_train = pd.DataFrame(X_train.todense())
y_train = pd.DataFrame(y_train).rename(columns = {0 :"Score"})

In [3]:
intercept_col = np.ones(X_train.shape[0]).reshape(X_train.shape[0],1)
X_train = pd.DataFrame(np.hstack((intercept_col, X_train.values)))

In [4]:
X_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,89518,89519,89520,89521,89522,89523,89524,89525,89526,89527
0,1.0,9.0,1.0,4.0,4.0,6.0,4.0,2.0,2.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,7.0,4.0,2.0,2.0,0.0,4.0,1.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,4.0,4.0,4.0,7.0,2.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,10.0,2.0,2.0,0.0,3.0,2.0,4.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,13.0,9.0,6.0,4.0,2.0,5.0,10.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
y_train.head()

Unnamed: 0,Score
0,9.0
1,7.0
2,9.0
3,10.0
4,8.0


In [6]:
y_train["Sentiment"] = ""
y_train.loc[y_train["Score"] >= 7, ("Sentiment")] = 1
y_train.loc[y_train["Score"] <= 4, ("Sentiment")] = 0
y_train

Unnamed: 0,Score,Sentiment
0,9.0,1
1,7.0,1
2,9.0,1
3,10.0,1
4,8.0,1
...,...,...
24995,1.0,0
24996,1.0,0
24997,4.0,0
24998,2.0,0


## Importing Vocab

In [7]:
Vocabulary = []
f = open("../Data/imdb.vocab", "r")
for token in f:
    Vocabulary.append(token.replace('\n',''))
f.close()

## Importing Expected Rating Per Token

In [8]:
Expected_Vocabulary_Rating = []
f = open("../Data/imdbEr.txt", "r")
for expected_rating in f:
    Expected_Vocabulary_Rating.append(float(expected_rating.replace('\n','')))
f.close()

## Logistic Regression

Use linear predictor to model the log-odds

### Form

\begin{align}
\log \frac{P(Y_i = 1)}{1 - P(Y_i = 1)} &= \beta_0 + \beta_1x_i \\
P(Y_i = 1) &= \frac{1}{1 + e^{-(\beta_0 + \beta_1x_i)}}
\end{align}

### Loss Function

\begin{align}
L(Y, \hat p) &= -\log [\hat p ^{Y}(1 - \hat p)^{1 - Y}] \\
&= -Y \log \hat p - (1 - Y) \log (1 -\hat p)
\end{align}

### Estimating Logistic Regression Coefficients

WTF $\beta_j \leftarrow \beta_j - \eta\frac{\partial L}{\partial \beta_j}$

\begin{align}
\frac{\partial L}{\partial \beta_j} &= \frac{\partial L}{\partial p}\frac{\partial \hat p}{\partial \beta_j} \\
&=  \bigg(-\frac{Y}{\hat p} + \frac{1 - Y}{1 - \hat p}\bigg)\big(\hat p (1 - \hat p)\cdot x_j\big) \\
&= \big(\hat p - Y\big)x_j
\end{align}


In [52]:
# Get Odds of a Series
def odds(series, success_value):
    return((series == success_value).mean())

# Get Log Odds of a Series
def log_odds(series, success_value):
    return np.log(odds(series,success_value))

# Get y hats for a given set of coefficients and dataset
def logistic_function(X,b):
    y_pred = ((1 + np.exp((X @ b) * -1)) ** (-1))
    return y_pred

def gradient_descent(X,y):
    learning_rate = 0.01
    X = X.values
    y = y.values
    b = np.zeros(X.shape[1]).reshape(X.shape[1],1)
    y_pred = logistic_function(X, b)
    gradient = X.T @ (y - y_pred)
    """
    for i in range(2):
        y_pred = logistic_function(X, b)
        gradient = X.T @ (y - y_pred) 
        print(gradient.shape)
    """
    return(gradient)

In [53]:
odds(y_train["Sentiment"],1)

0.5

In [54]:
log_odds(y_train["Sentiment"],1)

-0.6931471805599453

In [55]:
np.log(0.5)

-0.6931471805599453

In [None]:
gradient_descent(X_train,y_train["Score"])