# Logistic Regression, Naive Bayes, SVM

### Importing Libraries

In [1]:
import re
import pandas as pd
from tqdm import tqdm
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import naive_bayes, linear_model, svm
from sklearn.metrics import confusion_matrix

### Loading Dataset

In [2]:
dataset = pd.read_csv('Sentiments.csv', delimiter='\t')
labels = dataset.Labels.copy()
reviews = dataset.Reviews.copy()

### Data Preprocessing

In [3]:
stemmer = SnowballStemmer('english')
for ind,text in tqdm(enumerate(reviews)):
    text = re.sub('[!.]','',text.lower()).split()
    review = [stemmer.stem(word) 
              for word in text
              if word.isalpha()
              ]
    reviews[ind] = ' '.join(review)

1000it [00:00, 7291.09it/s]


In [4]:
vectorizer1 = TfidfVectorizer()
reviews1 = vectorizer1.fit_transform(reviews)
reviews1 = reviews1.toarray()

In [5]:
vectorizer2 = CountVectorizer()
reviews2 = vectorizer2.fit_transform(reviews)
reviews2 = reviews2.toarray()

In [8]:
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(reviews1, labels, train_size=0.8)
x_train_cv, x_test_cv, y_train_cv, y_test_cv = train_test_split(reviews2, labels, train_size=0.8)

### Model Creation

In [9]:
model = [naive_bayes.GaussianNB ,linear_model.LogisticRegression, svm.SVC]

In [10]:
def model_designing(model, x, y):
    # training the model
    model = model()
    model.fit(x, y)
    return model

In [11]:
models_tfidf = [model_designing(model[i], x_train_tfidf, y_train_tfidf) for i in range(3)]
models_cv = [model_designing(model[i], x_train_cv, y_train_cv) for i in range(3)]

### Model Prediction 

In [12]:
def prediction_evulation(model, x_test1):
    # testing model
    y_pred1, y_pred2, y_pred3 = [model[i].predict(x_test1) for i in range(3)]
    return y_pred1, y_pred2, y_pred3

In [13]:
y_tfidf = prediction_evulation(models_tfidf, x_test_tfidf)
y_cv = prediction_evulation(models_cv, x_test_cv)

In [14]:
# confusion_martix
def confusion_martices(y, y_pred, dataset_name, model_name) :
        y_pred = confusion_matrix(y, y_pred)
        print("Result for "+dataset_name+" and "+model_name+":\n", 
              y_pred,"\n",
              "Correct prediction are:",sum(y_pred.diagonal()),"\n")
        
for j,algorithm in  enumerate(["logistic regression","naive bayes","svm"]):
    confusion_martices(y_tfidf[j], y_test_tfidf, 'tf-idf', algorithm)
    confusion_martices(y_cv[j], y_test_cv, 'cv', algorithm)

Result for tf-idf and logistic regression:
 [[55 20]
 [43 82]] 
 Correct prediction are: 137 

Result for cv and logistic regression:
 [[64  8]
 [52 76]] 
 Correct prediction are: 140 

Result for tf-idf and naive bayes:
 [[88 22]
 [10 80]] 
 Correct prediction are: 168 

Result for cv and naive bayes:
 [[95 18]
 [21 66]] 
 Correct prediction are: 161 

Result for tf-idf and svm:
 [[92 24]
 [ 6 78]] 
 Correct prediction are: 170 

Result for cv and svm:
 [[96 20]
 [20 64]] 
 Correct prediction are: 160 



Since the dataset is small, how <b>train test split</b> works is very important. That decides the number of words that will be considered for training and what will be the count of number of labels in each of train and test set.