In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier,AdaBoostClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix
from lightgbm import LGBMClassifier

In [2]:
ps = PorterStemmer()

In [3]:
data = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)

In [4]:
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [6]:
data['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

In [7]:
data.isnull().sum()

Review    0
Liked     0
dtype: int64

In [8]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [9]:
reverse = ["not", "isn't", "wasn't", "won't", "don't", "n't"]
custom_stop = [word for word in stopwords.words('english') if word not in reverse]

In [10]:
def sentencing(review):
    review = re.sub( "[^(a-zA-Z')]", ' ', review ).lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in custom_stop]
    return ' '.join(review)

In [11]:
review = data['Review'][10]
review

'Service was very prompt.'

In [12]:
review = sentencing(review)
review

'servic prompt'

In [13]:
corpus = []
for index in range(len(data['Review'])):
    data.loc[index, 'Review'] = sentencing(data.loc[index, 'Review'])
    corpus.append(data.loc[index, 'Review'])

In [14]:
data.head()

Unnamed: 0,Review,Liked
0,wow love place,1
1,crust not good,0
2,not tasti textur nasti,0
3,stop late may bank holiday rick steve recommen...,1
4,select menu great price,1


In [15]:
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

In [16]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
y = data.iloc[:,1].values

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [19]:
gnb = GaussianNB()
gnb.fit(X_train, y_train).score(X_test, y_test)*100

73.5

In [20]:
def bagging(model):
    bgr = BaggingClassifier(base_estimator=model, n_estimators=100, verbose=1)
    print(bgr.fit(X_train, y_train).score(X_test, y_test)*100)
    
def ada_boost(model):
    ada = AdaBoostClassifier(base_estimator=model,n_estimators=100)
    print(ada.fit(X_train, y_train).score(X_test, y_test)*100)

In [21]:
ada_boost(gnb)

75.5


In [22]:
bagging(gnb)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


75.0


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s finished


In [23]:
rf = RandomForestClassifier(n_estimators=1000, max_depth=15)
rf.fit(X_train, y_train).score(X_test, y_test)*100

79.0

In [24]:
lgbm = LGBMClassifier(boosting_type="gbdt", max_depth=8, learning_rate=0.7, feature_fraction=0.8,
                      min_data_in_leaf=8, objective='binary', metrics="accuracy", n_estimators=60
                     )
lgbm.fit(X_train, y_train).score(X_test, y_test)*100

80.0

In [25]:
y_pred = lgbm.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[87, 10],
       [30, 73]], dtype=int64)

In [26]:
def sentiment_predictor(review):
    review = sentencing(review)
    input_data = cv.transform([review]).toarray()
    input_pred = lgbm.predict(input_data)
    if input_pred[0]==1:
        return print("Review is Positive")
    return print("Review is Negative")

In [27]:
sentiment_predictor('This place is awesome, the food sound tatsy')

Review is Positive


In [28]:
sentiment_predictor('The food is bad, and service also')

Review is Negative
