In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the tab-separated restaurant reviews.
# quoting=3 is csv.QUOTE_NONE: double quotes inside a review are kept as
# ordinary characters instead of being treated as field delimiters.
dataset = pd.read_csv(
    'datasets/Restaurant_reviews.tsv',
    sep='\t',
    quoting=3,
)
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
# Text cleaning: keep letters only, lowercase, drop stopwords, then stem.
# Stopwords are very common words (the, and, of, ...) that add little signal.
# `corpus` collects the cleaned reviews, one space-joined string per review.
import re                          # regular expressions
import nltk                        # Natural Language Toolkit - processing of natural (human) languages
# nltk.download('stopwords')       # uncomment on first run to fetch the stopword list
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Loop-invariant setup: build the stemmer and the stopword set once instead of
# once per review (and, for the set, once per word).
ps = PorterStemmer()
words = stopwords.words('english')
# These four words are deliberately NOT treated as stopwords, so they survive
# cleaning and remain available as features (e.g. "not good" keeps its "not").
words.remove('not')
words.remove('no')
words.remove('but')
words.remove('is')
stop_set = set(words)              # set gives O(1) membership checks

corpus = []                        # cleaned reviews, in dataset order

# Iterate over every row actually present rather than a hard-coded 1000,
# so the corpus always stays aligned with the dataset length.
for i in range(len(dataset)):
    # Replace everything that is not an ASCII letter (digits, punctuation) with a space.
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    # Lowercase, then split on whitespace into a list of tokens.
    review = review.lower().split()
    # Keep the tokens that are not stopwords, reduced to their Porter stem.
    review = [ps.stem(word) for word in review if word not in stop_set]
    # Join the stems back into one space-separated string.
    review = ' '.join(review)
    corpus.append(review)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model with the vocabulary capped at 1500 terms.
cv = CountVectorizer(max_features=1500)

# fit_transform returns a sparse document-term matrix; densify it into a
# regular NumPy array for the estimators used below.
bow_sparse = cv.fit_transform(corpus)
X = bow_sparse.toarray()

# Target vector: the column at position 1 of `dataset`
# (shown as `Liked` in the dataset preview).
y = dataset.iloc[:, 1].values

In [5]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

In [7]:
# Candidate estimators and the hyper-parameter grid to search for each one.
# Every grid is currently pinned to a single value. Wider grids tried earlier:
#   svm: C in [1, 10, 20], kernel in ['rbf', 'linear'],
#        random_state in [0, 5, 10, ..., 50]
#   knn: n_neighbors in [1, ..., 10]
svm_grid = {
    'C': [1],
    'kernel': ['linear'],
    'random_state': [0],
}
knn_grid = {
    'n_neighbors': [3],
}

model_params = {
    'svm': {'model': SVC(gamma='auto'), 'params': svm_grid},
    'knn': {'model': KNeighborsClassifier(), 'params': knn_grid},
}

In [8]:
from sklearn.model_selection import GridSearchCV

# Run a 5-fold cross-validated grid search per candidate model and record
# the best mean CV score together with the parameters that achieved it.
scores = []

for model_name, mp in model_params.items():
    estimator, param_grid = mp['model'], mp['params']
    clf = GridSearchCV(estimator, param_grid, cv=5, return_train_score=False)
    clf.fit(X, y)
    scores.append(dict(
        model=model_name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    ))

# One row per model, columns in a fixed order for display.
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.796,"{'C': 1, 'kernel': 'linear', 'random_state': 0}"
1,knn,0.705,{'n_neighbors': 3}
