In [None]:
# https://www.analyticsvidhya.com/blog/2021/06/how-to-load-kaggle-datasets-directly-into-google-colab/
# https://www.marktechpost.com/2019/06/07/how-to-connect-google-colab-with-google-drive/

# To mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# To load the Kaggle dataset directly into Google Colab
# Install the Kaggle library
! pip install kaggle
# Make a directory named “.kaggle”
! mkdir ~/.kaggle
# Copy the “kaggle.json” into this new directory
!cp '/content/drive/MyDrive/Colab Notebooks/kaggle.json' ~/.kaggle/kaggle.json

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Download the dataset archive with the Kaggle CLI (requires the API token to be in place)
! kaggle datasets download thedevastator/yelp-reviews-sentiment-dataset

Downloading yelp-reviews-sentiment-dataset.zip to /content
 99% 160M/162M [00:06<00:00, 28.8MB/s]
100% 162M/162M [00:06<00:00, 27.3MB/s]


In [None]:
# Unzip the file and choose only train.csv
! unzip yelp-reviews-sentiment-dataset.zip -x train.csv

Archive:  yelp-reviews-sentiment-dataset.zip
  inflating: test.csv                


In [None]:
# Inspect the dataset: load the extracted test.csv into a DataFrame
import pandas as pd
df = pd.read_csv('test.csv')

In [None]:
df.shape

(38000, 2)

In [None]:
df.head(10)

Unnamed: 0,text,label
0,"Contrary to other reviews, I have zero complai...",1
1,Last summer I had an appointment to get new ti...,0
2,"Friendly staff, same starbucks fair you get an...",1
3,The food is good. Unfortunately the service is...,0
4,Even when we didn't have a car Filene's Baseme...,1
5,"Picture Billy Joel's \""""Piano Man\"""" DOUBLED m...",1
6,Mediocre service. COLD food! Our food waited s...,0
7,Ok! Let me tell you about my bad experience fi...,0
8,I used to love D&B when it first opened in the...,0
9,"Like any Barnes & Noble, it has a nice comfy c...",1


In [None]:
df.tail(10)

Unnamed: 0,text,label
37990,This was my first Yelp Elite event and my firs...,1
37991,I'm in love with this adorable little shop! It...,1
37992,'Twas the week before Christmas and all throug...,1
37993,This place was just okay for me. It was nothin...,0
37994,Extremely disappointing and frustrating experi...,0
37995,If I could give 0...I would. Don't do it.,0
37996,Items Selected:\nChocolate Cinnamon Horn\nSmal...,1
37997,Expensive lunch meals. Fried pickles were goo...,0
37998,Highly overpriced and food was cold. Our waitr...,0
37999,I have been using this company for 11 months. ...,0


In [None]:
# Check if there are other languages other than English; if yes, remove them
# 1. using detect_langs
! pip install langdetect
from langdetect import detect_langs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
def isEnglish(languages_list):
  """Return True when English is the most probable detected language.

  Parameters
  ----------
  languages_list : iterable
      Candidate languages as produced by ``langdetect.detect_langs``; each
      item must expose ``.lang`` (language code) and ``.prob`` (probability).

  Returns
  -------
  bool
      True if the highest-probability candidate has ``lang == 'en'``;
      False otherwise, including for an empty list.

  Notes
  -----
  Accepting 'en' anywhere in the list would flag a review as English even
  when English is only a low-probability runner-up (e.g. ``[de:0.86, en:0.14]``).
  """
  candidates = list(languages_list)
  if not candidates:
    return False
  # Pick the top candidate explicitly rather than relying on the list order.
  best = max(candidates, key=lambda language: language.prob)
  return best.lang == 'en'

# langdetect is non-deterministic by default; fixing the seed makes the
# detected languages (and therefore the isEnglish flags) reproducible across runs.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

# Candidate languages (with probabilities) for every review
df['languages'] = df['text'].apply(detect_langs)
# Boolean flag derived from those candidates
df['isEnglish'] = df['languages'].apply(isEnglish)

In [None]:
# see what the rows flagged as non-English look like
df.loc[df.isEnglish == False]

Unnamed: 0,text,label,languages,isEnglish
500,"MALL CLOSED\n\nJune 30, 2010\n\nRIP",0,"[af:0.5714265752853798, fr:0.4285719117849144]",False
705,Decent!,1,[nl:0.9999955439398616],False
1395,I do not recommend.,0,"[it:0.7142840936631788, pt:0.14285838311894292...",False
2612,terrible,0,"[da:0.8571399543924237, ca:0.14285951959290602]",False
2912,still my FAV,1,[sv:0.9999947660032711],False
...,...,...,...,...
37291,Je tenais vraiment \u00e0 oublier ma premi\u00...,1,[fr:0.9999954204596291],False
37411,ramen is good\u3001severs good too\u3002\u3002...,1,[hr:0.9999979306957532],False
37601,Tr\u00e8s bon restaurant en plein coeur de Cen...,1,[fr:0.9999952559255993],False
37759,"Beratungskompetenz gleich Null, die Mitarbeite...",0,[de:0.9999972859992705],False


In [None]:
# 2. using fastText
# https://medium.com/@c.chaitanya/language-identification-in-python-using-fasttext-60359dc30ed0
# https://docs.python.org/3/howto/urllib2.html

! pip install fasttext
import fasttext
from urllib import request
pretrained_lang_model_URL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
path = "lid.176.bin"
request.urlretrieve(pretrained_lang_model_URL, path)

model = fasttext.load_model(path)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/




In [None]:
# check that it works and what the output looks like
model.predict("Expensive lunch meals")

(('__label__en',), array([0.96638221]))

In [None]:
def isEnglishFast(text, lang_model=None):
  """Return True when the predicted language label for ``text`` is English.

  Parameters
  ----------
  text : str
      The review text. Non-string values (e.g. NaN from a missing cell)
      are treated as not English.
  lang_model : object, optional
      Any object with a fastText-style ``predict(text)`` method returning
      ``(labels, probabilities)``. Defaults to the notebook-level ``model``.

  Returns
  -------
  bool
      True if ``'__label__en'`` is among the predicted labels.
  """
  if not isinstance(text, str):
    return False
  if lang_model is None:
    lang_model = model
  # fastText predicts one line at a time and rejects input that contains a
  # newline, so fold line breaks into spaces first.
  single_line = text.replace('\n', ' ')
  labels = lang_model.predict(single_line)[0]
  # Compare the whole label: a two-character suffix check would also match
  # any other label that merely ends in "en".
  return '__label__en' in labels

df['isEnglishFast'] = df['text'].apply(isEnglishFast)

# see what the rows not detected as English look like
df.loc[df.isEnglishFast != True]

Unnamed: 0,text,label,languages,isEnglish,isEnglishFast
3688,"TRES VECES, la orden me ha salido mal! No rec...",0,[es:0.9999961338990258],False,False
3869,"Schneller, freundlicher Service. Preis/Leistun...",1,"[de:0.8571372666042625, en:0.1428596291038731]",True,False
4409,Delycios,1,"[es:0.8571410515455712, pl:0.14285670635308906]",False,False
6301,Mojitos.,1,[lt:0.9999959011062911],False,False
6365,"Nella mia visita a Phoenix, i miei amici mi po...",1,[it:0.9999961063146598],False,False
...,...,...,...,...,...
37065,"29 mai 2014 - serveurs courtois, pr\u00e9senta...",0,[fr:0.9999958145455384],False,False
37291,Je tenais vraiment \u00e0 oublier ma premi\u00...,1,[fr:0.9999954204596291],False,False
37601,Tr\u00e8s bon restaurant en plein coeur de Cen...,1,[fr:0.9999952559255993],False,False
37759,"Beratungskompetenz gleich Null, die Mitarbeite...",0,[de:0.9999972859992705],False,False


In [None]:
# Compare the two detectors:
# - detect_langs is much slower and less accurate
# - fastText is faster and more accurate

# Show the reviews on which the two detectors disagree, limited to those
# that detect_langs flagged as not English
df.loc[~df['isEnglish'] & df['isEnglishFast']]

Unnamed: 0,text,label,languages,isEnglish,isEnglishFast
500,"MALL CLOSED\n\nJune 30, 2010\n\nRIP",0,"[af:0.5714265752853798, fr:0.4285719117849144]",False,True
705,Decent!,1,[nl:0.9999955439398616],False,True
1395,I do not recommend.,0,"[it:0.7142840936631788, pt:0.14285838311894292...",False,True
2612,terrible,0,"[da:0.8571399543924237, ca:0.14285951959290602]",False,True
2912,still my FAV,1,[sv:0.9999947660032711],False,True
3327,Awesome gyros!,1,"[cy:0.8571413182732788, lt:0.1428572198135441]",False,True
4167,Very poor housekeeping,0,[af:0.9999929283385025],False,True
5062,Friendliest bartenders and best martinis in Ma...,1,[de:0.9999979601786492],False,True
5175,no......just no!,0,[sl:0.9999924677553107],False,True
5598,Awesome burgers.,1,[af:0.9999940843688306],False,True


In [None]:
# Keep only the reviews that fastText detected as English. Take an explicit
# copy so that later column assignments modify an independent DataFrame
# rather than a slice of the original (avoids pandas' SettingWithCopyWarning).
df = df.loc[df.isEnglishFast == True].copy()

In [None]:
df.shape

(37769, 5)

In [None]:
# Text preprocessing
# Step 1: lowercase every review
# (Series.str.lower() converts each string in the Series to lowercase)
df['text'] = df['text'].str.lower()

In [None]:
# check if they are in lower case
df.head()

Unnamed: 0,text,label,languages,isEnglish,isEnglishFast
0,"contrary to other reviews, i have zero complai...",1,[en:0.9999986395268634],True,True
1,last summer i had an appointment to get new ti...,0,[en:0.9999970957499589],True,True
2,"friendly staff, same starbucks fair you get an...",1,[en:0.9999959616159918],True,True
3,the food is good. unfortunately the service is...,0,[en:0.9999960258972647],True,True
4,even when we didn't have a car filene's baseme...,1,[en:0.9999975520745594],True,True


In [None]:
# 2. remove punctuation marks, stopwords, digits, etc.
import nltk
from nltk.tokenize import RegexpTokenizer

# Fetch NLTK's stopword corpus
nltk.download('stopwords')

# RegexpTokenizer splits a string into the substrings matched by a regular expression.
# r'\w+' matches each run of one or more word characters, so punctuation and
# whitespace never end up inside a token.
tokenizer = RegexpTokenizer(r'\w+')
df['tokens'] = df['text'].apply(tokenizer.tokenize)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
df.head()

Unnamed: 0,text,label,languages,isEnglish,isEnglishFast,tokens
0,"contrary to other reviews, i have zero complai...",1,[en:0.9999986395268634],True,True,"[contrary, to, other, reviews, i, have, zero, ..."
1,last summer i had an appointment to get new ti...,0,[en:0.9999970957499589],True,True,"[last, summer, i, had, an, appointment, to, ge..."
2,"friendly staff, same starbucks fair you get an...",1,[en:0.9999959616159918],True,True,"[friendly, staff, same, starbucks, fair, you, ..."
3,the food is good. unfortunately the service is...,0,[en:0.9999960258972647],True,True,"[the, food, is, good, unfortunately, the, serv..."
4,even when we didn't have a car filene's baseme...,1,[en:0.9999975520745594],True,True,"[even, when, we, didn, t, have, a, car, filene..."


In [None]:
# remove stopwords
my_stopwords = nltk.corpus.stopwords.words('english')
stopwords_set = set(my_stopwords)
# add() inserts the string as a single element; update() would iterate over
# it and insert every character separately, which breaks as soon as a
# multi-character entry is added this way.
stopwords_set.add('&')

# Keep a token only if it is not a stopword, is longer than one character,
# and consists solely of alphabetic characters.
df['tokens_clean'] = df['tokens'].apply(
    lambda tokens: [
        token for token in tokens
        if token not in stopwords_set and len(token) > 1 and token.isalpha()
    ]
)

In [None]:
df.head(15)

Unnamed: 0,text,label,languages,isEnglish,isEnglishFast,tokens,tokens_clean
0,"contrary to other reviews, i have zero complai...",1,[en:0.9999986395268634],True,True,"[contrary, to, other, reviews, i, have, zero, ...","[contrary, reviews, zero, complaints, service,..."
1,last summer i had an appointment to get new ti...,0,[en:0.9999970957499589],True,True,"[last, summer, i, had, an, appointment, to, ge...","[last, summer, appointment, get, new, tires, w..."
2,"friendly staff, same starbucks fair you get an...",1,[en:0.9999959616159918],True,True,"[friendly, staff, same, starbucks, fair, you, ...","[friendly, staff, starbucks, fair, get, anywhe..."
3,the food is good. unfortunately the service is...,0,[en:0.9999960258972647],True,True,"[the, food, is, good, unfortunately, the, serv...","[food, good, unfortunately, service, hit, miss..."
4,even when we didn't have a car filene's baseme...,1,[en:0.9999975520745594],True,True,"[even, when, we, didn, t, have, a, car, filene...","[even, car, filene, basement, worth, bus, trip..."
5,"picture billy joel's \""""piano man\"""" doubled m...",1,[en:0.9999954560513125],True,True,"[picture, billy, joel, s, piano, man, doubled,...","[picture, billy, joel, piano, man, doubled, mi..."
6,mediocre service. cold food! our food waited s...,0,[en:0.9999991385429662],True,True,"[mediocre, service, cold, food, our, food, wai...","[mediocre, service, cold, food, food, waited, ..."
7,ok! let me tell you about my bad experience fi...,0,[en:0.9999968675346629],True,True,"[ok, let, me, tell, you, about, my, bad, exper...","[ok, let, tell, bad, experience, first, went, ..."
8,i used to love d&b when it first opened in the...,0,[en:0.9999955057633292],True,True,"[i, used, to, love, d, b, when, it, first, ope...","[used, love, first, opened, waterfront, gone, ..."
9,"like any barnes & noble, it has a nice comfy c...",1,[en:0.9999956289178797],True,True,"[like, any, barnes, noble, it, has, a, nice, c...","[like, barnes, noble, nice, comfy, cafe, large..."


In [None]:
# 3. Stemming
from nltk.stem import PorterStemmer
porter = PorterStemmer()

# Replace every cleaned token by its Porter stem, one review at a time
df['tokens_clean2'] = df['tokens_clean'].apply(
    lambda tokens: list(map(porter.stem, tokens))
)

In [None]:
df.head()

Unnamed: 0,text,label,languages,isEnglish,isEnglishFast,tokens,tokens_clean,tokens_clean2
0,"contrary to other reviews, i have zero complai...",1,[en:0.9999986395268634],True,True,"[contrary, to, other, reviews, i, have, zero, ...","[contrary, reviews, zero, complaints, service,...","[contrari, review, zero, complaint, servic, pr..."
1,last summer i had an appointment to get new ti...,0,[en:0.9999970957499589],True,True,"[last, summer, i, had, an, appointment, to, ge...","[last, summer, appointment, get, new, tires, w...","[last, summer, appoint, get, new, tire, wait, ..."
2,"friendly staff, same starbucks fair you get an...",1,[en:0.9999959616159918],True,True,"[friendly, staff, same, starbucks, fair, you, ...","[friendly, staff, starbucks, fair, get, anywhe...","[friendli, staff, starbuck, fair, get, anywher..."
3,the food is good. unfortunately the service is...,0,[en:0.9999960258972647],True,True,"[the, food, is, good, unfortunately, the, serv...","[food, good, unfortunately, service, hit, miss...","[food, good, unfortun, servic, hit, miss, main..."
4,even when we didn't have a car filene's baseme...,1,[en:0.9999975520745594],True,True,"[even, when, we, didn, t, have, a, car, filene...","[even, car, filene, basement, worth, bus, trip...","[even, car, filen, basement, worth, bu, trip, ..."


In [None]:
# Join each row's token list back into a single space-separated string
df['clean_text'] = df['tokens_clean2'].str.join(" ")

In [None]:
df.head()

Unnamed: 0,text,label,languages,isEnglish,isEnglishFast,tokens,tokens_clean,tokens_clean2,clean_text
0,"contrary to other reviews, i have zero complai...",1,[en:0.9999986395268634],True,True,"[contrary, to, other, reviews, i, have, zero, ...","[contrary, reviews, zero, complaints, service,...","[contrari, review, zero, complaint, servic, pr...",contrari review zero complaint servic price ge...
1,last summer i had an appointment to get new ti...,0,[en:0.9999970957499589],True,True,"[last, summer, i, had, an, appointment, to, ge...","[last, summer, appointment, get, new, tires, w...","[last, summer, appoint, get, new, tire, wait, ...",last summer appoint get new tire wait super lo...
2,"friendly staff, same starbucks fair you get an...",1,[en:0.9999959616159918],True,True,"[friendly, staff, same, starbucks, fair, you, ...","[friendly, staff, starbucks, fair, get, anywhe...","[friendli, staff, starbuck, fair, get, anywher...",friendli staff starbuck fair get anywher els s...
3,the food is good. unfortunately the service is...,0,[en:0.9999960258972647],True,True,"[the, food, is, good, unfortunately, the, serv...","[food, good, unfortunately, service, hit, miss...","[food, good, unfortun, servic, hit, miss, main...",food good unfortun servic hit miss main issu s...
4,even when we didn't have a car filene's baseme...,1,[en:0.9999975520745594],True,True,"[even, when, we, didn, t, have, a, car, filene...","[even, car, filene, basement, worth, bus, trip...","[even, car, filen, basement, worth, bu, trip, ...",even car filen basement worth bu trip waterfro...


In [None]:
# Split the data into a training set and a test set
from sklearn.model_selection import train_test_split

# Target: the label column; feature: the cleaned review text
y = df['label']
X = df['clean_text']

# Hold out 20% of the rows for testing, stratified on the label, with a
# fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    stratify=y,
)

In [None]:
X_train

14782                           wow best place buy groceri
14189    great hole da wall joint big plu get lot stuff...
19329    know hard go wrong find good restaur montreal ...
9937                             even go write anyth miser
8656     cannot commend star enough put fenc keep rabid...
                               ...                        
27016    wife went vida base good yelp review disappoin...
36585    friend recommend restaur japan amaz food custo...
14425    ok mostli like go nwe place order take pork fo...
30933    recent came doubl date anoth coupl place one f...
25940    past week went stingray lunch late lunch aroun...
Name: clean_text, Length: 30215, dtype: object

In [None]:
y_train

14782    1
14189    1
19329    1
9937     0
8656     1
        ..
27016    1
36585    1
14425    0
30933    1
25940    1
Name: label, Length: 30215, dtype: int64

In [None]:
X_test

29724    total fraud went look reput pool compani fact ...
37435    abl eat lunch close seat ask next day thing ha...
26192    cute littl spot need littl indi spot like one ...
24220    love place right next motel food price servic ...
5366     place beauti sunlit airi fill fine modern clea...
                               ...                        
10706                               discrimin men guy ladi
32237    get mad sub contract job detail show home told...
17237    basic get three comedian price one emce short ...
1823     great experi place went lunch chicken fajita l...
10600    best food kind experienc chicken fri steak pre...
Name: clean_text, Length: 7554, dtype: object

In [None]:
y_test

29724    0
37435    0
26192    1
24220    1
5366     0
        ..
10706    0
32237    0
17237    1
1823     1
10600    1
Name: label, Length: 7554, dtype: int64

In [None]:
# vectorization of X_train and X_test using BOW model
from sklearn.feature_extraction.text import CountVectorizer
# Unigrams and bigrams; min_df=50 keeps only terms found in at least 50 training documents
vect = CountVectorizer(ngram_range=(1,2) , min_df=50)

# 1. X_train: learn the vocabulary on the training split and encode it
review_train = vect.fit_transform(X_train)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older versions.
if hasattr(vect, 'get_feature_names_out'):
  feature_names = vect.get_feature_names_out()
else:
  feature_names = vect.get_feature_names()

X_train_transformed = pd.DataFrame(review_train.toarray(), columns=feature_names)

# 2. X_test: encode with the vocabulary fitted on the training split (no refit)
review_test = vect.transform(X_test)
X_test_transformed = pd.DataFrame(review_test.toarray(), columns=feature_names)



In [None]:
X_train_transformed

Unnamed: 0,abil,abl,abl get,absolut,absolut amaz,absolut delici,absolut love,abund,ac,accent,...,young,young ladi,younger,yr,yuck,yum,yummi,zero,zero star,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30211,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30212,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
30213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
y_train

14782    1
14189    1
19329    1
9937     0
8656     1
        ..
27016    1
36585    1
14425    0
30933    1
25940    1
Name: label, Length: 30215, dtype: int64

In [None]:
X_test_transformed

Unnamed: 0,abil,abl,abl get,absolut,absolut amaz,absolut delici,absolut love,abund,ac,accent,...,young,young ladi,younger,yr,yuck,yum,yummi,zero,zero star,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7549,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7550,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7551,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7552,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Train the model
# 1. Naive Bayes model (multinomial variant, constructed with no arguments)
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [None]:
# Supervised learning: fit Naive Bayes on the vectorized training reviews and their labels
nb.fit(X_train_transformed,y_train)

MultinomialNB()

In [None]:
# Predict labels for the vectorized test reviews with the fitted Naive Bayes model
nb_predict_y=nb.predict(X_test_transformed)

In [None]:
nb_predict_y

array([0, 0, 1, ..., 0, 1, 1])

In [None]:
# Measure the NB model's performance on the test split

from sklearn.metrics import classification_report, confusion_matrix

class_names = ['negative (0)', 'positive (1)']
print(classification_report(y_test, nb_predict_y, target_names = class_names))

# Rows are the true labels and columns the predicted labels (scikit-learn's
# convention). Pin the label order and name both axes so the table is
# self-explanatory and always matches class_names.
cm = confusion_matrix(y_test, nb_predict_y, labels=[0, 1])
print(pd.DataFrame(cm,
                   index=['actual ' + name for name in class_names],
                   columns=['predicted ' + name for name in class_names]))

              precision    recall  f1-score   support

negative (0)       0.89      0.84      0.87      3782
positive (1)       0.85      0.90      0.87      3772

    accuracy                           0.87      7554
   macro avg       0.87      0.87      0.87      7554
weighted avg       0.87      0.87      0.87      7554

      0     1
0  3179   603
1   389  3383


In [None]:
# Train the model
# 2. Logistic Regression model (solver allowed up to 1000 iterations)
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=1000)

In [None]:
# Supervised learning: fit Logistic Regression on the vectorized training reviews and their labels
lr.fit(X_train_transformed,y_train)

LogisticRegression(max_iter=1000)

In [None]:
# Predict labels for the vectorized test reviews with the fitted Logistic Regression model
lr_predict_y=lr.predict(X_test_transformed)

In [None]:
# Measure the LR model's performance on the test split

from sklearn.metrics import classification_report, confusion_matrix

class_names = ['negative (0)', 'positive (1)']
print(classification_report(y_test, lr_predict_y, target_names = class_names))

# Rows are the true labels and columns the predicted labels (scikit-learn's
# convention). Pin the label order and name both axes so the table is
# self-explanatory and always matches class_names.
cm = confusion_matrix(y_test, lr_predict_y, labels=[0, 1])
print(pd.DataFrame(cm,
                   index=['actual ' + name for name in class_names],
                   columns=['predicted ' + name for name in class_names]))

              precision    recall  f1-score   support

negative (0)       0.92      0.90      0.91      3782
positive (1)       0.90      0.92      0.91      3772

    accuracy                           0.91      7554
   macro avg       0.91      0.91      0.91      7554
weighted avg       0.91      0.91      0.91      7554

      0     1
0  3411   371
1   311  3461
