In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df=pd.read_csv("sentiment.csv")
df.head()

Unnamed: 0,Phrase,label
0,Spider-man is better than any summer blockbust...,Positive
1,of good sense,Positive
2,", then knock yourself out and enjoy the big sc...",Negative
3,So could young romantics out on a date .,Positive
4,humour,Positive


In [3]:
df.shape

(2800, 2)

In [6]:
len(df[df['label']=='Positive'])

1800

In [7]:
len(df[df['label']=='Negative'])

1000

## Vectorization of Phrase

In [4]:
total_len=len(df)
train_len=int(0.6*total_len)
val_len=int(0.2*total_len)
test_len=int(0.2*total_len)
print("total=> {} train_len=> {} val_len=> {} test_len =>{}".format(total_len,train_len,val_len,test_len))

total=> 2800 train_len=> 1680 val_len=> 560 test_len =>560


*Split dataset into train, val and test set*

In [5]:
train_df=df[0:train_len]
val_df=df[train_len:train_len+val_len]
test_df=df[train_len+val_len:]

In [6]:
print("Length of train dataset, validation dataset and test dataset")
print("train_df_len=> {} val_df_len=> {} test_df_len =>{}".format(len(train_df),len(val_df),len(test_df)))

Length of train dataset, validation dataset and test dataset
train_df_len=> 1680 val_df_len=> 560 test_df_len =>560


In [7]:
vocab=list()
#create a corpus of vocab
for i in train_df.Phrase.tolist():
    vocab.extend(i.split(" "))
    
#take unique words from vocab   
unique_vocab=list(set(vocab)) #set can't be indexed so change it to list for simplicity    

In [8]:
print("Vocab Length => {} Unique Vocab Length => {}".format(len(vocab),len(unique_vocab)))

Vocab Length => 13855 Unique Vocab Length => 1844


**Feature_Extraction**

In [39]:
def extract_feature(unique_vocab,dataset):

    feature=[]
    label=[]
    for index,row in dataset.iterrows():
        words=row["Phrase"].split(" ") 
        f=[]
        for v in unique_vocab:
            if v in words:
                f.append(1)
            else:
                f.append(0)
                
        if row["label"]=="Positive":
            label.append(1)
        elif row["label"]=="Negative":
            label.append(0)

        feature.append(f)
    
    #change features and label list into numpy array
    feature=np.array(feature)
    label=np.array(label) 
    #change label into 2D
    label=label.reshape(-1,1)
    print(feature)
    
    return feature,label 

**create train, val and test set feature and labels**

In [40]:
X_train,y_train=extract_feature(unique_vocab,train_df)
X_val,y_val=extract_feature(unique_vocab,val_df)
X_test,y_test=extract_feature(unique_vocab,test_df)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [41]:
X_train.shape

(1680, 1844)

**Add bias to X**

In [42]:
X_train=np.insert(X_train,0,values=1,axis=1)
X_val=np.insert(X_val,0,values=1,axis=1)
X_test=np.insert(X_test,0,values=1,axis=1)

In [46]:
X_val

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

**Build Model**

In [53]:
np.random.seed(42)
params={
    "num_of_iterations": 1000,
    "learning_rate": 0.0001
}

grid_param={
    "num_of_iteration":[1000,1500,2000],
    "learning_rate":[0.001,0.1,0.01]
}

In [61]:
#Binary Cross Entropy Loss
def bce_loss(h,y):
    m=len(h)
    return -np.average(y*np.log(h)+(1-y)*np.log(1-h))

In [62]:
#Sigmoid function
def sigmoid(x):
    return 1/(1+np.exp(-x))
    

In [63]:
#train model 
def train_model(X_train,y_train, X_val, y_val, param):
    W=np.random.rand(X_train.shape[1]).reshape(-1,1)
    train_error=[]
    for i in range(1,param["num_of_iterations"]):
        h_train=sigmoid(np.matmul(X_train,W))
        train_loss=bce_loss(h_train,y_train)
        
        train_error.append(train_loss)
        
        gradient=(np.matmul(np.transpose(X_train),(h_train-y_train)))/X_train.shape[0]
        W-=params["learning_rate"]*gradient
        
    h_val=sigmoid(np.matmul(X_val,W))
    val_loss=bce_loss(h_val,y_val)        
    print(param,val_loss)
    return W 

**Hyperparameter Tuning**

In [64]:
import itertools
grid=list(itertools.product(grid_param["num_of_iteration"],grid_param["learning_rate"]))
grid

[(1000, 0.001),
 (1000, 0.1),
 (1000, 0.01),
 (1500, 0.001),
 (1500, 0.1),
 (1500, 0.01),
 (2000, 0.001),
 (2000, 0.1),
 (2000, 0.01)]

In [None]:
for g in grid:
    p={
        'num_of_iterations':g[0],
        'learning_rate':g[1]
    }
    train_model(X_train,y_train,X_val,y_val,p)

{'num_of_iterations': 1000, 'learning_rate': 0.001} 1.3523121603855173
{'num_of_iterations': 1000, 'learning_rate': 0.1} 1.5596584495616708
{'num_of_iterations': 1000, 'learning_rate': 0.01} 1.5973515652125052
{'num_of_iterations': 1500, 'learning_rate': 0.001} 1.4572115616899546
{'num_of_iterations': 1500, 'learning_rate': 0.1} 1.4611984029003207
{'num_of_iterations': 1500, 'learning_rate': 0.01} 1.4999655632346918
{'num_of_iterations': 2000, 'learning_rate': 0.001} 1.3914841157000288
{'num_of_iterations': 2000, 'learning_rate': 0.1} 1.3551164954664228
