In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
dataset = pd.read_csv('Reviews.tsv', delimiter = '\t', quoting = 3)

In [3]:
import re
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer #For reducing the dimension of the sparse matrix. Love vs loved
# Build the cleaned corpus: one lower-cased, stop-word-free, stemmed string per review.
# Everything that does not depend on the individual review is created once, up front,
# instead of once per review (stemmer, stop-word list) or once per word (stop-word set).
ps = PorterStemmer()
# Defining all English stopwords
all_stopwords = stopwords.words('english')
# Keep 'not' out of the stop-word list, as negation gives essential info about the data
all_stopwords.remove('not')
# A set gives constant-time membership tests; build it a single time and reuse it
stopword_set = set(all_stopwords)

corpus = []
# Iterate over every review actually present in the dataset rather than a hard-coded
# 1000 rows, so the corpus always has exactly one entry per row of `dataset`.
for raw_review in dataset['Review']:
    # Cleaning Step 1: re.sub replaces every non-letter character (punctuation, digits, ...) with a space
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    # Cleaning Step 2: converting everything to lower case
    review = review.lower()
    # Cleaning Step 3: splitting the review into single words that can be stemmed
    review = review.split()
    # Cleaning Step 4: dropping stop words and stemming the words that remain
    review = [ps.stem(word) for word in review if word not in stopword_set]
    # Joining the stemmed words back into one space-separated string
    review = ' '.join(review)
    # Appending the cleaned review to the corpus
    corpus.append(review)

In [4]:
# print(corpus)
#for x in corpus:
#    print("--> ", x)

In [5]:
# Making Bag of words

## Trying on the top 60% most frequent words

In [6]:
# Tokenisation: making one column per word (bag-of-words model)
from sklearn.feature_extraction.text import CountVectorizer
# Vocabulary cap: keep only the MAX_FEATURES most frequent words. The value was chosen
# from the size of the vocabulary produced without any limit, which was 1566 in this case.
MAX_FEATURES = 1000
cv = CountVectorizer(max_features = MAX_FEATURES)
# fit collects the vocabulary from the corpus, transform puts the word counts into the columns;
# toarray() turns the result into a dense array
X = cv.fit_transform(corpus).toarray()
# Labels are read from the last column of the dataset
y = dataset.iloc[:, -1].values

In [7]:
# Number of columns in the first row of X, i.e. the size of the bag-of-words vocabulary
n_features = len(X[0])
print(n_features)

1000


In [8]:
# Splitting

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Training Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes classifier created with its default settings.
classifier = GaussianNB()
# Train on the training split; as the cell's last expression, the result of the call is echoed below.
classifier.fit(X_train, y_train)

GaussianNB()

In [12]:
# Predicting

In [13]:
# Predict labels for the held-out test split with the classifier fitted above (Gaussian Naive Bayes).
y_pred = classifier.predict(X_test)

In [14]:
# Confusion Matrix

In [15]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of true test labels against the current predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Fraction of test samples predicted correctly; shown as the cell's result.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[55 42]
 [12 91]]


0.73

## Trying SVM

In [16]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and a fixed seed.
classifier = SVC(kernel='linear', random_state=0)
# Train on the training split; the result of the call is echoed as the cell output.
classifier.fit(X_train, y_train)

SVC(kernel='linear', random_state=0)

In [17]:
# Predict labels for the held-out test split with the classifier fitted above (linear SVM).
y_pred = classifier.predict(X_test)

In [18]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of true test labels against the current predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Fraction of test samples predicted correctly; shown as the cell's result.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[75 22]
 [27 76]]


0.755

## Trying Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Random forest of 10 trees using the entropy split criterion, with a fixed seed.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
classifier.fit(X_train, y_train)

# Predict the held-out test split.
y_pred = classifier.predict(X_test)

# Confusion matrix is printed; the accuracy is shown as the cell's result.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[83 14]
 [31 72]]


0.775

## Trying KNN

In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# k-nearest neighbours with k = 5 and the Minkowski metric at p = 2.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
classifier.fit(X_train, y_train)

# Predict the held-out test split.
y_pred = classifier.predict(X_test)

# Confusion matrix is printed; the accuracy is shown as the cell's result.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[82 15]
 [57 46]]


0.64

## Trying kernel SVM

In [21]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

# Support vector classifier with the RBF kernel and a fixed seed.
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train, y_train)

# Predict the held-out test split.
y_pred = classifier.predict(X_test)

# Confusion matrix is printed; the accuracy is shown as the cell's result.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[89  8]
 [37 66]]


0.775

## Trying Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Logistic regression with default settings apart from the fixed seed.
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# Predict the held-out test split.
y_pred = classifier.predict(X_test)

# Confusion matrix is printed; the accuracy is shown as the cell's result.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[80 17]
 [29 74]]


0.77

## Trying Decision Tree

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Single decision tree using the entropy split criterion, with a fixed seed.
classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)
classifier.fit(X_train, y_train)

# Predict the held-out test split.
y_pred = classifier.predict(X_test)

# Confusion matrix is printed; the accuracy is shown as the cell's result.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[69 28]
 [29 74]]


0.715