## Loading the Necessary Libraries

In [249]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Loading Yelp Dataset

In [250]:
data_yelp = pd.read_csv('yelp_labelled.txt', sep ='\t', header = None)

In [251]:
data_yelp.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [252]:
#Assigning Names for Columns 0(Review) and 1(Sentiment)
column_name = ['Review','Sentiment']
data_yelp.columns = column_name
data_yelp.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [253]:
data_yelp.shape

(1000, 2)

## Loading Amazon Dataset

In [254]:
data_amazon = pd.read_csv('amazon_cells_labelled.txt', sep ='\t', header = None)

In [255]:
data_amazon.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [256]:
#Assigning Names for Columns 0(Review) and 1(Sentiment)
data_amazon.columns = column_name
data_amazon.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [257]:
data_amazon.shape

(1000, 2)

## Loading IMDB Dataset

In [258]:
data_imdb = pd.read_csv('imdb_labelled.txt', sep ='\t', header = None)

In [259]:
data_imdb.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [260]:
#Assigning Names for Columns 0(Review) and 1(Sentiment)
data_imdb.columns = column_name
data_imdb.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [261]:
data_imdb.shape

(748, 2)

## Appending all the Datasets

In [262]:
data = data_yelp.append([data_amazon, data_imdb], ignore_index = True)
data.shape

(2748, 2)

In [263]:
data.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [264]:
#Checking total no of positive and negative sentiments
data['Sentiment'].value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [265]:
#checking no of null values
data.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [266]:
#x is the input data set and y is the output data set
x = data['Review']
y = data['Sentiment']

## Data Cleaning or Data Preprocessing for a Single Record

In [267]:
import string
punct = string.punctuation
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [268]:
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)

In [269]:
import spacy
nlp = spacy.load('en_core_web_sm')
def text_data_cleaning(sentence):
    doc=nlp(sentence)
    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [270]:
text_data_cleaning("Hello all, It's a beautiful day outside there!")

['hello', 'beautiful', 'day', 'outside']

## Data Preprocessing for All the Records

In [271]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [272]:
tfidf = TfidfVectorizer(tokenizer = text_data_cleaning)
classifier = LinearSVC()

In [273]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((2198,), (550,), (2198,), (550,))

In [288]:
l1 = []

## Applying SVM Algorithm

In [289]:
svm = Pipeline([("tfidf",TfidfVectorizer()) , ("classifier",LinearSVC())])
svm.fit(x_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('classifier', LinearSVC())])

In [290]:
y_pred_svm = svm.predict(x_test)

In [291]:
print(accuracy_score(y_test, y_pred_svm))

0.8236363636363636


In [292]:
print(confusion_matrix(y_test, y_pred_svm))

[[228  51]
 [ 46 225]]


In [293]:
l1.append(accuracy_score(y_test, y_pred_svm))

In [294]:
clf.predict(["Wow, Iam learning Natural Language Processing in fun fashion!"])

array([1], dtype=int64)

In [295]:
clf.predict(["It's hard to learn new things!"])

array([0], dtype=int64)

## Applying  Random Forest

In [296]:
from sklearn.ensemble import RandomForestClassifier

In [297]:
rf = Pipeline([("tfidf",TfidfVectorizer()) , ("classifier",RandomForestClassifier())])
rf.fit(x_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('classifier', RandomForestClassifier())])

In [298]:
y_pred_rf = rf.predict(x_test)

In [299]:
print(accuracy_score(y_test, y_pred_rf))

0.8090909090909091


In [300]:
l1.append(accuracy_score(y_test, y_pred_rf))

In [301]:
print(confusion_matrix(y_test, y_pred_rf))

[[232  47]
 [ 58 213]]


In [302]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.80      0.83      0.82       279
           1       0.82      0.79      0.80       271

    accuracy                           0.81       550
   macro avg       0.81      0.81      0.81       550
weighted avg       0.81      0.81      0.81       550



## Applying Logistic Regression

In [303]:
from sklearn.linear_model import LogisticRegression
lm = Pipeline([("tfidf",TfidfVectorizer()) , ("classifier",LogisticRegression(max_iter=800))])

In [304]:
lm.fit(x_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('classifier', LogisticRegression(max_iter=800))])

In [305]:
y_pred_lm = lm.predict(x_test)

In [306]:
print(accuracy_score(y_test, y_pred_lm))
l1.append(accuracy_score(y_test, y_pred_lm))

0.8254545454545454


In [307]:
print(confusion_matrix(y_test, y_pred_lm))

[[231  48]
 [ 48 223]]


In [308]:
print(classification_report(y_test, y_pred_lm))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       279
           1       0.82      0.82      0.82       271

    accuracy                           0.83       550
   macro avg       0.83      0.83      0.83       550
weighted avg       0.83      0.83      0.83       550



## Applying Decision Tree

In [309]:
from sklearn.tree import DecisionTreeClassifier
dst= Pipeline([("tfidf",TfidfVectorizer()) , ("classifier",DecisionTreeClassifier(max_depth = 1000, random_state = 0,criterion = 'entropy'))])

In [310]:
dst.fit(x_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('classifier',
                 DecisionTreeClassifier(criterion='entropy', max_depth=1000,
                                        random_state=0))])

In [311]:
y_pred_dst = dst.predict(x_test)

In [312]:
print(accuracy_score(y_test,y_pred_dst))
l1.append(accuracy_score(y_test, y_pred_dst))

0.6945454545454546


In [313]:
print(confusion_matrix(y_test, y_pred_dst))

[[208  71]
 [ 97 174]]


In [314]:
print(classification_report(y_test, y_pred_dst))

              precision    recall  f1-score   support

           0       0.68      0.75      0.71       279
           1       0.71      0.64      0.67       271

    accuracy                           0.69       550
   macro avg       0.70      0.69      0.69       550
weighted avg       0.70      0.69      0.69       550



## Applying Naive Bayes

In [315]:
from sklearn.naive_bayes import MultinomialNB
munb = Pipeline([("tfidf",TfidfVectorizer()) , ("classifier",MultinomialNB())])

In [316]:
munb.fit(x_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('classifier', MultinomialNB())])

In [317]:
y_pred_munb = munb.predict(x_test)

In [318]:
print(accuracy_score(y_test,y_pred_munb))
l1.append(accuracy_score(y_test, y_pred_munb))

0.8181818181818182


In [319]:
print(confusion_matrix(y_test, y_pred_munb))

[[226  53]
 [ 47 224]]


In [320]:
print(classification_report(y_test, y_pred_munb))

              precision    recall  f1-score   support

           0       0.83      0.81      0.82       279
           1       0.81      0.83      0.82       271

    accuracy                           0.82       550
   macro avg       0.82      0.82      0.82       550
weighted avg       0.82      0.82      0.82       550



## Applying KNN

In [321]:
from sklearn.neighbors import KNeighborsClassifier
knn = Pipeline([("tfidf",TfidfVectorizer()) , ("classifier",KNeighborsClassifier(n_neighbors=50))])

In [322]:
knn.fit(x_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('classifier', KNeighborsClassifier(n_neighbors=50))])

In [323]:
y_pred_knn = knn.predict(x_test)

In [324]:
print(accuracy_score(y_test,y_pred_knn))
l1.append(accuracy_score(y_test, y_pred_knn))

0.7490909090909091


In [325]:
print(confusion_matrix(y_test, y_pred_knn))

[[207  72]
 [ 66 205]]


In [326]:
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       0.76      0.74      0.75       279
           1       0.74      0.76      0.75       271

    accuracy                           0.75       550
   macro avg       0.75      0.75      0.75       550
weighted avg       0.75      0.75      0.75       550



## Comparison

In [334]:
alg = ["SVM", "RF", "LR", "DT", "NB", "KNN" ]

In [335]:
for i in range(6):
    print(f" {alg[i]} -> {round(l1[i]*100,2)} %")

 SVM -> 82.36 %
 RF -> 80.91 %
 LR -> 82.55 %
 DT -> 69.45 %
 NB -> 81.82 %
 KNN -> 74.91 %
