In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
dataset = pd.read_csv('Reviews.tsv', delimiter = '\t', quoting = 3)

In [3]:
import re
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer #For reducing the dimension of the sparse matrix. Love vs loved
corpus = []
for i in range(0, 1000):
    review = dataset['Review'][i]
    # Cleaning Step 1 : re.sub -> Replaces one thing with other. All things like ,'' etc with space
    review = re.sub('[^a-zA-Z]', ' ', review)
    # Cleaning Step 1 continued -> Replaces words like don't can't with do not, can not
    review = re.sub(r"\b([a-zA-Z]{1,})n't\b", r'\1 not', review)
    # Cleaning Step 2 : Converting all to lower case.
    review = review.lower() 
    # Cleaning Step 3 : Splitting review into single words that can be used in stemming later.
    review = review.split() 
    ps = PorterStemmer()
    # Defining all english stepwords
    all_stopwords = stopwords.words('english') 
    # To not include the not in the stopwords, as not gives essential info about the data
    all_stopwords.remove('not')
    # Cleaning Step 4 : Getting rid of stop words, by applying stemming
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]    
    # Joining all words together back to form the review, including space intentionally.
    review = ' '.join(review)
    # Appending back to corpus
    corpus.append(review)

In [4]:
# print(corpus)
#for x in corpus:
#    print("--> ", x)

In [5]:
# Making Bag of words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
# Fit will take all the words, Transform will put those in the columns
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

In [7]:
# Splitting

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [9]:
# Training Naive Bayes

In [10]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

In [11]:
# Predicting

In [12]:
y_pred = classifier.predict(X_test)
#print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

In [13]:
# Confusion Matrix

In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("Accuracy = ","{:.2f}".format(accuracy_score(y_test, y_pred)*100))
print("Precision = ","{:.2f}".format(precision_score(y_test, y_pred)*100))
print("F1 Score = ","{:.2f}".format(f1_score(y_test, y_pred)*100))
print("Recall Score = ","{:.2f}".format(recall_score(y_test, y_pred)*100))

[[55 42]
 [12 91]]
Accuracy =  73.00
Precision =  68.42
F1 Score =  77.12
Recall Score =  88.35


## Trying SVM

In [15]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

SVC(kernel='linear', random_state=0)

In [16]:
y_pred = classifier.predict(X_test)

In [17]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("Accuracy = ","{:.2f}".format(accuracy_score(y_test, y_pred)*100))
print("Precision = ","{:.2f}".format(precision_score(y_test, y_pred)*100))
print("F1 Score = ","{:.2f}".format(f1_score(y_test, y_pred)*100))
print("Recall Score = ","{:.2f}".format(recall_score(y_test, y_pred)*100))

[[79 18]
 [24 79]]
Accuracy =  79.00
Precision =  81.44
F1 Score =  79.00
Recall Score =  76.70


## Trying Random forrest

In [18]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("Accuracy = ","{:.2f}".format(accuracy_score(y_test, y_pred)*100))
print("Precision = ","{:.2f}".format(precision_score(y_test, y_pred)*100))
print("F1 Score = ","{:.2f}".format(f1_score(y_test, y_pred)*100))
print("Recall Score = ","{:.2f}".format(recall_score(y_test, y_pred)*100))

[[87 10]
 [45 58]]
Accuracy =  72.50
Precision =  85.29
F1 Score =  67.84
Recall Score =  56.31


## Trying KNN

In [19]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("Accuracy = ","{:.2f}".format(accuracy_score(y_test, y_pred)*100))
print("Precision = ","{:.2f}".format(precision_score(y_test, y_pred)*100))
print("F1 Score = ","{:.2f}".format(f1_score(y_test, y_pred)*100))
print("Recall Score = ","{:.2f}".format(recall_score(y_test, y_pred)*100))

[[74 23]
 [45 58]]
Accuracy =  66.00
Precision =  71.60
F1 Score =  63.04
Recall Score =  56.31


## Trying kernel SVM

In [20]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("Accuracy = ","{:.2f}".format(accuracy_score(y_test, y_pred)*100))
print("Precision = ","{:.2f}".format(precision_score(y_test, y_pred)*100))
print("F1 Score = ","{:.2f}".format(f1_score(y_test, y_pred)*100))
print("Recall Score = ","{:.2f}".format(recall_score(y_test, y_pred)*100))

[[89  8]
 [36 67]]
Accuracy =  78.00
Precision =  89.33
F1 Score =  75.28
Recall Score =  65.05


## Trying Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("Accuracy = ","{:.2f}".format(accuracy_score(y_test, y_pred)*100))
print("Precision = ","{:.2f}".format(precision_score(y_test, y_pred)*100))
print("F1 Score = ","{:.2f}".format(f1_score(y_test, y_pred)*100))
print("Recall Score = ","{:.2f}".format(recall_score(y_test, y_pred)*100))

[[80 17]
 [28 75]]
Accuracy =  77.50
Precision =  81.52
F1 Score =  76.92
Recall Score =  72.82


## Trying Decision Tree

In [22]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("Accuracy = ","{:.2f}".format(accuracy_score(y_test, y_pred)*100))
print("Precision = ","{:.2f}".format(precision_score(y_test, y_pred)*100))
print("F1 Score = ","{:.2f}".format(f1_score(y_test, y_pred)*100))
print("Recall Score = ","{:.2f}".format(recall_score(y_test, y_pred)*100))


[[78 19]
 [31 72]]
Accuracy =  75.00
Precision =  79.12
F1 Score =  74.23
Recall Score =  69.90
