In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [40]:
df= pd.read_csv('../../DataSets/Restaurant_Reviews.tsv', sep='\t')

In [41]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [42]:
X= df['Review']
y= df['Liked']

In [43]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=16)

In [45]:
X_train.shape, y_train.shape

((800,), (800,))

# Text preprocessing

# Same Case- i.e. convert whole text into lover case 

In [7]:
review= 'Wow... Loved this place.'
review.lower()


'wow... loved this place.'

# Removing Unwanted Charecters from Text

In [46]:
import re

In [9]:
review= 'Wow... Loved this place.'
review= review.lower()
re.sub('[^a-z]',' ',review)


'wow    loved this place '

In [10]:
review= 'Wow... Loved this place.'
review= review.lower()
review= re.sub('[^a-z]',' ',review)
review.split()


['wow', 'loved', 'this', 'place']

In [11]:
review= 'Wow... Loved this place.'
review= review.lower()
review= re.sub('[^a-z]',' ',review)
words= review.split()


# Removing Stopwords

In [47]:
import nltk

In [48]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pavilion\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [49]:
stopwords= nltk.corpus.stopwords.words('english')    # List of Stopwords

In [50]:
stopwords.remove('not')


In [16]:
review= 'Wow... Loved this place.'
review= review.lower()
review= re.sub('[^a-z]',' ',review)
words= review.split()
final_words= [word for word in words if not word in stopwords]
final_words

['wow', 'loved', 'place']

# Stemming

In [12]:
stemmer = nltk.stem.PorterStemmer()     # Stemming
stemmer.stem('running')

'run'

In [18]:
review= 'Wow... Loved this place.'
review= review.lower()
review= re.sub('[^a-z]',' ',review)
words= review.split()
final_words= [stemmer.stem(word) for word in words if not word in stopwords]
' '.join(final_words)

'wow love place'

In [19]:
# Defining User Define Func

In [51]:
def preprocess(review):
    review= review.lower()
    review= re.sub('[^a-z]',' ',review)
    words= review.split()
    final_words= [stemmer.stem(word) for word in words if not word in stopwords]
    return ' '.join(final_words)

In [21]:
preprocess('Not tasty and the texture was just nasty.')

'not tasti textur nasti'

In [22]:
preprocess('Crust is not good.')    # as not is in stopwords it is predicting wrong

'crust not good'

In [23]:
# Apply function to DF

In [30]:
df['Review'].apply(preprocess)    # Preprocessing Done

0                                         wow love place
1                                         crust not good
2                                 not tasti textur nasti
3      stop late may bank holiday rick steve recommen...
4                                select menu great price
                             ...                        
995                        think food flavor textur lack
996                               appetit instantli gone
997                 overal not impress would not go back
998    whole experi underwhelm think go ninja sushi n...
999    wast enough life pour salt wound draw time too...
Name: Review, Length: 1000, dtype: object

In [25]:
df['processed_review']= df['Review'].apply(preprocess)

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
vectorizer = CountVectorizer()
vectorizer.fit(df['processed_review'])

CountVectorizer()

In [28]:
len(vectorizer.get_feature_names())

1566

# Bag OF Words Model

In [29]:
bow_table= vectorizer.transform(df['processed_review'])

In [30]:
bow_table   # In the form of sparse metrix ... so need to convert it into array

<1000x1566 sparse matrix of type '<class 'numpy.int64'>'
	with 5484 stored elements in Compressed Sparse Row format>

In [31]:
bow_table.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [32]:
X_train= bow_table.toarray()
y_yrain= df['Liked']

In [33]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
model= RandomForestClassifier(n_estimators=90,max_depth=3,min_samples_leaf=2)
model.fit(X_train,y_yrain)

RandomForestClassifier(max_depth=3, min_samples_leaf=2, n_estimators=90)

In [35]:
def predict_review(review):
    review=preprocess(review)
    X_test=vectorizer.transform([review])
    return model.predict(X_test)

In [36]:
predict_review('Not tasty and the texture was just nasty.')

array([0], dtype=int64)

In [37]:
predict_review('food was not good')

array([1], dtype=int64)

# Using Bag Of Words i.e. BOW

In [52]:
from sklearn.feature_extraction.text import CountVectorizer

In [53]:
X_train_processed= X_train.apply(preprocess)
X_test_processed = X_test.apply(preprocess)

In [54]:
vectorizer_BOW = CountVectorizer(max_features=1000,ngram_range=(1,2))
vectorizer_BOW.fit(X_train_processed)

CountVectorizer(max_features=1000, ngram_range=(1, 2))

In [55]:
bow_table= vectorizer_BOW.transform(X_train_processed)

In [56]:
X_train= bow_table.toarray()


In [57]:
from sklearn.ensemble import RandomForestClassifier

In [58]:
model_bow= RandomForestClassifier(n_estimators=90,max_depth=4)
model_bow.fit(X_train,y_train)


RandomForestClassifier(max_depth=4, n_estimators=90)

In [63]:
X_test_processed= X_test.apply(preprocess)
X_test_processed=vectorizer_BOW.transform(X_test_processed)

In [64]:
model_bow.score(X_test_processed,y_test)

0.74

# TF-IDF

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [66]:
vectorizer_tfidf = TfidfVectorizer(max_features=1000)
vectorizer_tfidf.fit(X_train_processed)
tfidf_table = vectorizer_tfidf.transform(X_train_processed)

model_tfidf= RandomForestClassifier(n_estimators=90,max_depth=4)
model_tfidf.fit(tfidf_table.toarray(),y_train)


RandomForestClassifier(max_depth=4, n_estimators=90)

In [67]:
model_tfidf.score(X_test_processed,y_test)

0.555

# N-grams

In [69]:
vectorizer_ngrams = CountVectorizer(max_features=1000, ngram_range=(1,2))
vectorizer_ngrams.fit(X_train_processed)
ngrams_table = vectorizer_ngrams.transform(X_train_processed)

model_ngrams= RandomForestClassifier(n_estimators=90,max_depth=4)
model_ngrams.fit(ngrams_table.toarray(),y_train)


RandomForestClassifier(max_depth=4, n_estimators=90)

In [70]:
model_ngrams.score(X_test_processed,y_test)

0.74