In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv('datasets/Restaurant_reviews.tsv' , delimiter= '\t')
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
dataset.isnull().sum()

Review    0
Liked     0
dtype: int64

In [4]:
dataset['length'] = [len(t) for t in dataset.Review]
dataset.head()

Unnamed: 0,Review,Liked,length
0,Wow... Loved this place.,1,24
1,Crust is not good.,0,18
2,Not tasty and the texture was just nasty.,0,41
3,Stopped by during the late May bank holiday of...,1,87
4,The selection on the menu was great and so wer...,1,59


In [5]:
dataset[dataset.length < 15].head()

Unnamed: 0,Review,Liked,length
9,A great touch.,1,14
39,Service sucks.,0,14
109,Sooooo good!!,1,13
114,Good prices.,1,12
115,Check it out.,1,13


Overall Distribution of the length of the reviews under each sentiment class

In [6]:
# fig,ax = plt.subplots(figsize=(5,5))
# plt.boxplot(dataset.length)
# plt.show()

In [7]:
# Cleaning the text (Basically We are removing non-determiners and stemming)
# stopwords is a list of unwanted words like the,and,of,etc...
# corpus is a collection of text.
import re                          # Regular Expression
import nltk                        # natural language tool kit - Used for processing of natutal languages i.e., human languages
# un comment nltk.download('stopwords') if you're running for the first time
#nltk.download('stopwords')         # Download stopwords from nltk
#stopwords_location = 'C:\\Users\\Ankit Sharma\\AppData\\Roaming\\nltk_data\\corpora\\stopwords'
from nltk.corpus import stopwords
#We'll be performing stemming now
from nltk.stem.porter import PorterStemmer
#ps = PorterStemmer()               # Object of PorterStemmer class
#words = stopwords.words('english')
corpus = []                        # Initialize an empty list for all reviews

# Iterating through all the reviews
for i in range(0,1000):
    # Removing unnecessary punctuations and numbers except letters and replacing removed words with space.
    #review = re.sub('[^a-zA-Z]', ' ', dataset.Review[i])
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    # converting review to lowercase
    review = review.lower()
    # Converting review to list(of Strings)
    review  = review.split()
    ps = PorterStemmer()
    words = stopwords.words('english')
    words.remove('not')
    words.remove('but')
    words.remove('is')
    # Loop through all words and keep those which are not in stopwords list.
    # set is much faster than a list and is considered when the review is very large eg. an article,a book
    review = [ps.stem(word) for word in review if not word in set(words)]
    # Joining back the review list to a string with each word seperated by a space.
    review = ' '.join(review)
    corpus.append(review)

In [8]:
# stopwords_ = list(words)
# print(stopwords_)

In [9]:
# corpus

In [10]:
corpus[120]

'know place manag serv blandest food ever eaten prepar indian cuisin'

#### Applying PCA

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
#X = cv.fit_transform(corpus)                          
X = cv.fit_transform(corpus).toarray()                 # toarray() is used to convert into matrix
y = dataset.iloc[:,1].values

In [12]:
X[0:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [13]:
y[0:5]

array([1, 0, 0, 1, 1], dtype=int64)

In [14]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.1, random_state = 25)

In [15]:
# from sklearn.decomposition import PCA
# pca = PCA(n_components=None) # since we do not know how many eigenvectors
# # are needed we keep the value of n_components = None so that we can the
# # find eigenvalues of all the eigenvectors to figure out the best one
# X_train = pca.fit_transform(X_train)
# X_test = pca.transform(X_test)
# explained_variance = pca.explained_variance_ratio_
# print(explained_variance)

In [16]:
# len(explained_variance)

In [17]:
# sum = 0
# ev = list(explained_variance)
# for e in range(len(ev)-200):
#     sum = ev[e] + sum

In [18]:
# sum

In [19]:
from sklearn.decomposition import PCA
pca = PCA() # since we do not know how many eigenvectors
# are needed we keep the value of n_components = None so that we can the
# find eigenvalues of all the eigenvectors to figure out the best one
X = pca.fit_transform(X)
# X_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
len(explained_variance)

1000

In [20]:
from sklearn.svm import SVC
clf = SVC(kernel = 'linear')
clf.fit(X, y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [21]:
data1 = pd.read_csv('datasets/Restaurant_reviews_test.tsv',sep = '\t', quoting = 3)
data1.head()

Unnamed: 0,Review,Liked
0,I really liked this hamburger,1
1,I like this,1
2,Very bad food.,0
3,Foods are very tasty. Behaviour is good. Hygie...,1
4,The breakfast brunch was great.,1


In [22]:
# Cleaning the text (Basically We are removing non-determiners and stemming)
# stopwords is a list of unwanted words like the,and,of,etc...
# corpus is a collection of text.
import re                          # Regular Expression
import nltk                        # natural language tool kit - Used for processing of natutal languages i.e., human languages
# un comment nltk.download('stopwords') if you're running for the first time
#nltk.download('stopwords')         # Download stopwords from nltk
#stopwords_location = 'C:\\Users\\Ankit Sharma\\AppData\\Roaming\\nltk_data\\corpora\\stopwords'
from nltk.corpus import stopwords
#We'll be performing stemming now
from nltk.stem.porter import PorterStemmer
#ps = PorterStemmer()               # Object of PorterStemmer class
#words = stopwords.words('english')
corpus1 = []                        # Initialize an empty list for all reviews

# Iterating through all the reviews
for i in range(0,len(data1)):
    # Removing unnecessary punctuations and numbers except letters and replacing removed words with space.
    #review = re.sub('[^a-zA-Z]', ' ', dataset.Review[i])
    review = re.sub('[^a-zA-Z]', ' ', data1['Review'][i])
    # converting review to lowercase
    review = review.lower()
    # Converting review to list(of Strings)
    review  = review.split()
    ps = PorterStemmer()
    words = stopwords.words('english')
    words.remove('not')
    words.remove('but')
    words.remove('is')
    # Loop through all words and keep those which are not in stopwords list.
    # set is much faster than a list and is considered when the review is very large eg. an article,a book
    review = [ps.stem(word) for word in review if not word in set(words)]
    # Joining back the review list to a string with each word seperated by a space.
    review = ' '.join(review)
    corpus1.append(review)

In [23]:
test_data_x = cv.fit_transform(corpus1).toarray()                 # toarray() is used to convert into matrix
test_data_y = dataset.iloc[:,1].values

In [24]:
# test_data_x = data1['Review']
# test_data_y = data1.iloc[:,-1].values

# test_data_x = cv.fit_transform(test_data_x).toarray() 

test_data_x = pca.transform(test_data_x)
pred = clf.predict(test_data_x)

ValueError: operands could not be broadcast together with shapes (140,547) (1568,) 

In [None]:
from sklearn.metrics import plot_confusion_matrix, accuracy_score, confusion_matrix
plot_confusion_matrix(clf,test_data_x , test_data_y, cmap = plt.cm.Blues)
plt.show()
cm = confusion_matrix(test_data_y, pred)
print(cm)
accuracy = accuracy_score(test_data_y, pred)
print('Accuracy Score: ',accuracy)

In [None]:
# y_pred = clf.predict(X_test)
# y_pred

In [None]:
# from sklearn.metrics import plot_confusion_matrix, accuracy_score, confusion_matrix
# plot_confusion_matrix(clf,X_test , y_test, cmap = plt.cm.Blues)
# plt.show()
# cm = confusion_matrix(y_test, y_pred)
# print(cm)
# accuracy = accuracy_score(y_test, y_pred)
# print('Accuracy Score: ',accuracy)

In [None]:
# from sklearn.metrics import classification_report
# print(classification_report(y_test,y_pred))