In [464]:
import pandas as pd 
import numpy as np

#stats
from scipy import sparse
import scipy.stats as ss

#viz
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud ,STOPWORDS
from PIL import Image

#nlp
import string
import re    #for regex
import nltk
from nltk.corpus import stopwords

#import spacy
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


#FeatureEngineering
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, decomposition, ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

import textblob
import xgboost

from textblob import TextBlob

In [465]:
yelp= pd.read_csv('reviews.csv')

In [466]:
yelp.head(5)

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [467]:
### Drop the columns that are not needed for the analysis

In [468]:
yelp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   business_id  10000 non-null  object
 1   date         10000 non-null  object
 2   review_id    10000 non-null  object
 3   stars        10000 non-null  int64 
 4   text         10000 non-null  object
 5   type         10000 non-null  object
 6   user_id      10000 non-null  object
 7   cool         10000 non-null  int64 
 8   useful       10000 non-null  int64 
 9   funny        10000 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 781.4+ KB


In [469]:
yelp=yelp.drop(columns=['business_id','review_id','user_id','type','date'])

In [470]:
yelp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   stars   10000 non-null  int64 
 1   text    10000 non-null  object
 2   cool    10000 non-null  int64 
 3   useful  10000 non-null  int64 
 4   funny   10000 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 390.8+ KB


In [471]:
yelp.head(5)

Unnamed: 0,stars,text,cool,useful,funny
0,5,My wife took me here on my birthday for breakf...,2,5,0
1,5,I have no idea why some people give bad review...,0,0,0
2,4,love the gyro plate. Rice is so good and I als...,0,1,0
3,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",1,2,0
4,5,General Manager Scott Petello is a good egg!!!...,0,0,0


In [472]:
#### We will perform the lemmatization first.

In [473]:
from nltk.stem import WordNetLemmatizer

In [474]:
lemmatizer=WordNetLemmatizer()

In [475]:
def lemmatization(x, pos='a'):
    """Lemmatize every token of a text and join the tokens back with spaces.

    The text is split with ``word_tokenize`` and each token is passed to the
    notebook-level ``lemmatizer``. Because the tokens are re-joined with a
    single space, punctuation ends up separated from the neighbouring words
    (e.g. ``"plate."`` becomes ``"plate ."``).

    Parameters
    ----------
    x : str
        Text to lemmatize.
    pos : str, optional
        WordNet part-of-speech tag forwarded to ``lemmatizer.lemmatize``.
        Defaults to ``'a'`` (adjective), which is what this notebook has
        always used. Pass ``'v'`` to lemmatize verbs or ``'n'`` for nouns.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    # NOTE: an earlier comment here said "parts of speech as verb", but the
    # tag in use was 'a' (adjective). The default keeps that behaviour so the
    # downstream scores do not change; choose another tag explicitly if needed.
    return ' '.join(lemmatizer.lemmatize(word, pos=pos) for word in word_tokenize(x))

yelp['text'] = yelp['text'].apply(lemmatization)

In [476]:
yelp

Unnamed: 0,stars,text,cool,useful,funny
0,5,My wife took me here on my birthday for breakf...,2,5,0
1,5,I have no idea why some people give bad review...,0,0,0
2,4,love the gyro plate . Rice is so good and I al...,0,1,0
3,5,"Rosie , Dakota , and I LOVE Chaparral Dog Park...",1,2,0
4,5,General Manager Scott Petello is a good egg ! ...,0,0,0
...,...,...,...,...,...
9995,3,First visit ... Had lunch here today - used my...,1,2,0
9996,4,Should be called house of deliciousness ! I co...,0,0,0
9997,4,I recently visited Olive and Ivy for business ...,0,0,0
9998,2,My nephew just moved to Scottsdale recently so...,0,0,0


In [477]:
# Binary target: 1 for reviews rated 3 stars or lower, 0 for 4- and 5-star reviews.
yelp['Is_negative'] = (~(yelp['stars'] > 3)).astype(int)

In [478]:
yelp

Unnamed: 0,stars,text,cool,useful,funny,Is_negative
0,5,My wife took me here on my birthday for breakf...,2,5,0,0
1,5,I have no idea why some people give bad review...,0,0,0,0
2,4,love the gyro plate . Rice is so good and I al...,0,1,0,0
3,5,"Rosie , Dakota , and I LOVE Chaparral Dog Park...",1,2,0,0
4,5,General Manager Scott Petello is a good egg ! ...,0,0,0,0
...,...,...,...,...,...,...
9995,3,First visit ... Had lunch here today - used my...,1,2,0,1
9996,4,Should be called house of deliciousness ! I co...,0,0,0,0
9997,4,I recently visited Olive and Ivy for business ...,0,0,0,0
9998,2,My nephew just moved to Scottsdale recently so...,0,0,0,1


In [479]:
r1="""My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.

Do yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I've ever had.  I'm pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.

While EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I've ever had.

Anyway, I can't wait to go back!"""

In [480]:
from textblob import TextBlob

In [481]:
blob1 = TextBlob(r1)

In [482]:
blob1.polarity

0.40246913580246907

In [483]:
yelp['sentiment_score'] =  yelp.text.apply(lambda x: TextBlob(x).sentiment.polarity)

In [484]:
yelp

Unnamed: 0,stars,text,cool,useful,funny,Is_negative,sentiment_score
0,5,My wife took me here on my birthday for breakf...,2,5,0,0,0.413580
1,5,I have no idea why some people give bad review...,0,0,0,0,0.228478
2,4,love the gyro plate . Rice is so good and I al...,0,1,0,0,0.566667
3,5,"Rosie , Dakota , and I LOVE Chaparral Dog Park...",1,2,0,0,0.608646
4,5,General Manager Scott Petello is a good egg ! ...,0,0,0,0,0.468125
...,...,...,...,...,...,...,...
9995,3,First visit ... Had lunch here today - used my...,1,2,0,1,0.217708
9996,4,Should be called house of deliciousness ! I co...,0,0,0,0,0.375541
9997,4,I recently visited Olive and Ivy for business ...,0,0,0,0,0.264633
9998,2,My nephew just moved to Scottsdale recently so...,0,0,0,1,-0.114063


In [485]:
yelp[yelp.sentiment_score>0.1]

Unnamed: 0,stars,text,cool,useful,funny,Is_negative,sentiment_score
0,5,My wife took me here on my birthday for breakf...,2,5,0,0,0.413580
1,5,I have no idea why some people give bad review...,0,0,0,0,0.228478
2,4,love the gyro plate . Rice is so good and I al...,0,1,0,0,0.566667
3,5,"Rosie , Dakota , and I LOVE Chaparral Dog Park...",1,2,0,0,0.608646
4,5,General Manager Scott Petello is a good egg ! ...,0,0,0,0,0.468125
...,...,...,...,...,...,...,...
9993,4,"Judging by some of the reviews , maybe I went ...",1,1,0,0,0.430303
9995,3,First visit ... Had lunch here today - used my...,1,2,0,1,0.217708
9996,4,Should be called house of deliciousness ! I co...,0,0,0,0,0.375541
9997,4,I recently visited Olive and Ivy for business ...,0,0,0,0,0.264633


In [486]:
# Bucket the polarity score: above 0.1 -> Positive, below -0.1 -> Negative,
# everything in between -> Neutral.
yelp['sentiment'] = np.select(
    [yelp['sentiment_score'] > 0.1, yelp['sentiment_score'] < -0.1],
    ['Positive', 'Negative'],
    default='Neutral',
)

In [487]:
yelp['sentiment_score'].min()

-1.0

In [488]:
without_cleaning=pd.crosstab(yelp.stars, yelp.sentiment)

In [489]:
### Now we will perform further cleaning of the text and check the difference in the classification results

In [490]:
import re

def clean_text(x):
    """Lower-case a text, delete punctuation and digits, and tidy whitespace.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        The lower-cased text with the listed punctuation characters and the
        digits 0-9 removed, runs of whitespace collapsed to a single space,
        and no leading or trailing whitespace.
    """
    x = x.lower()
    # Characters are deleted (not replaced by a space), so "it's" -> "its".
    x = re.sub(r"[-()\"#/@;:{}`+=~|.!?,'0-9]", "", x)
    # Deleting stand-alone punctuation leaves double spaces and can expose
    # whitespace at the ends, so whitespace is normalised after the removal
    # rather than before it.
    x = re.sub(r"\s+", " ", x)
    return x.strip()

In [491]:
yelp['text']=yelp['text'].apply(clean_text)

In [492]:
yelp

Unnamed: 0,stars,text,cool,useful,funny,Is_negative,sentiment_score,sentiment
0,5,my wife took me here on my birthday for breakf...,2,5,0,0,0.413580,Positive
1,5,i have no idea why some people give bad review...,0,0,0,0,0.228478,Positive
2,4,love the gyro plate rice is so good and i als...,0,1,0,0,0.566667,Positive
3,5,rosie dakota and i love chaparral dog park ...,1,2,0,0,0.608646,Positive
4,5,general manager scott petello is a good egg ...,0,0,0,0,0.468125,Positive
...,...,...,...,...,...,...,...,...
9995,3,first visit had lunch here today used my gro...,1,2,0,1,0.217708,Positive
9996,4,should be called house of deliciousness i cou...,0,0,0,0,0.375541,Positive
9997,4,i recently visited olive and ivy for business ...,0,0,0,0,0.264633,Positive
9998,2,my nephew just moved to scottsdale recently so...,0,0,0,1,-0.114063,Negative


In [493]:
yelp['sentiment_score_cleaned'] =  yelp.text.apply(lambda x: TextBlob(x).sentiment.polarity)

In [494]:
yelp

Unnamed: 0,stars,text,cool,useful,funny,Is_negative,sentiment_score,sentiment,sentiment_score_cleaned
0,5,my wife took me here on my birthday for breakf...,2,5,0,0,0.413580,Positive,0.413580
1,5,i have no idea why some people give bad review...,0,0,0,0,0.228478,Positive,0.220870
2,4,love the gyro plate rice is so good and i als...,0,1,0,0,0.566667,Positive,0.600000
3,5,rosie dakota and i love chaparral dog park ...,1,2,0,0,0.608646,Positive,0.493333
4,5,general manager scott petello is a good egg ...,0,0,0,0,0.468125,Positive,0.361111
...,...,...,...,...,...,...,...,...,...
9995,3,first visit had lunch here today used my gro...,1,2,0,1,0.217708,Positive,0.217708
9996,4,should be called house of deliciousness i cou...,0,0,0,0,0.375541,Positive,0.375541
9997,4,i recently visited olive and ivy for business ...,0,0,0,0,0.264633,Positive,0.256820
9998,2,my nephew just moved to scottsdale recently so...,0,0,0,1,-0.114063,Negative,-0.100000


In [495]:
# Same bucketing as for the raw scores, applied to the scores of the cleaned
# text: above 0.1 -> Positive, below -0.1 -> Negative, otherwise Neutral.
yelp['sentiment_new'] = np.select(
    [yelp['sentiment_score_cleaned'] > 0.1, yelp['sentiment_score_cleaned'] < -0.1],
    ['Positive', 'Negative'],
    default='Neutral',
)

In [496]:
with_cleaning=pd.crosstab(yelp.stars,yelp.sentiment_new)

In [497]:
without_cleaning

sentiment,Negative,Neutral,Positive
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,193,373,183
2,78,398,451
3,26,341,1094
4,30,355,3141
5,19,231,3087


In [498]:
with_cleaning

sentiment_new,Negative,Neutral,Positive
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,188,381,180
2,76,401,450
3,27,347,1087
4,23,377,3126
5,19,254,3064


In [499]:
#### Analysis

In [500]:
### Classify whether a review is negative or not.

In [501]:
yelp=yelp.drop(columns=['sentiment_score','sentiment'])

In [502]:
yelp.rename(columns={'sentiment_score_cleaned':'sentiment_score','sentiment_new':'sentiment'},inplace=True)

In [503]:
y=yelp['Is_negative']
x=yelp.text

In [504]:
stop=stopwords.words('english')

In [505]:
##Train-Test split

In [506]:
train_X, test_X, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=123)

In [583]:
# Word-level unigram TF-IDF features, restricted to the 1000 highest-ranked
# terms that occur in at least 5 training documents; NLTK English stop words
# (the `stop` list) are excluded.
tfidf_vect = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    token_pattern=r'\w{1,}',
    lowercase=True,
    stop_words=stop,
    min_df=5,
    max_features=1000,
    encoding='latin-1',
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is projected onto that same vocabulary.
train_x_tfidf = tfidf_vect.fit_transform(train_X)
test_x_tfidf = tfidf_vect.transform(test_X)

In [508]:
train_x_tfidf

<7000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 257489 stored elements in Compressed Sparse Row format>

In [509]:
test_x_tfidf

<3000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 110829 stored elements in Compressed Sparse Row format>

In [510]:
# Build dense document-term DataFrames with one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vect.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vect.get_feature_names()

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
dtm_tfidf_train=pd.DataFrame(train_x_tfidf.toarray(),columns=tfidf_feature_names)
dtm_tfidf_test=pd.DataFrame(test_x_tfidf.toarray(),columns=tfidf_feature_names)



In [511]:
### training of model

In [512]:
from sklearn.svm import LinearSVC

In [513]:
svc=LinearSVC(C=1)

In [514]:
svc_t=svc.fit(dtm_tfidf_train,train_y)

In [515]:
train_y_pred=svc_t.predict(dtm_tfidf_train)

In [516]:
test_y_pred=svc_t.predict(dtm_tfidf_test)

In [517]:
print(metrics.classification_report(train_y,train_y_pred))

              precision    recall  f1-score   support

           0       0.89      0.94      0.91      4808
           1       0.85      0.75      0.80      2192

    accuracy                           0.88      7000
   macro avg       0.87      0.84      0.86      7000
weighted avg       0.88      0.88      0.88      7000



In [518]:
print(metrics.classification_report(test_y,test_y_pred))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87      2055
           1       0.73      0.65      0.69       945

    accuracy                           0.81      3000
   macro avg       0.79      0.77      0.78      3000
weighted avg       0.81      0.81      0.81      3000



In [597]:
### Now we will check the output by passing in a review

In [600]:
test="""I have no idea why some people give bad reviews about this place. It goes to show you, you can please everyone. They are probably griping about something that their own fault...there are many people like that."""

In [601]:
test=lemmatization(test)

In [602]:
test=clean_text(test)

In [542]:
s=tfidf_vect.transform(test)

In [544]:
svc_t.predict(pd.DataFrame(s,columns=vect.get_feature_names()))

<362x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [603]:
s=test.strip()

In [604]:
s

'i have no idea why some people give bad reviews about this place  it goes to show you  you can please everyone  they are probably griping about something that their own fault  there are many people like that'

In [605]:
test=[s]

In [606]:
test

['i have no idea why some people give bad reviews about this place  it goes to show you  you can please everyone  they are probably griping about something that their own fault  there are many people like that']

In [607]:
s=tfidf_vect.transform(test)

In [609]:
svc_t.predict(s)

array([1])