In [1]:
import pandas as pd
df = pd.read_csv("Restaurant_Reviews.csv")
df.head(3)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0


In [2]:
import re
def preprocess(text):
    text = re.sub(r'[^\w\s\']',' ',text)
    text = re.sub(r'[ \n]+',' ',text)
    return text.strip().lower()

In [3]:
import spacy
def lemmatize_text(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc]
    # Join the lemmas into a single string with spaces between each lemma
    lemmas_string = ' '.join(lemmas)
    print(lemmas_string)
    return lemmas_string

In [4]:
df.Review = df.Review.map(preprocess)

In [5]:
df.Review = df.Review.map(lemmatize_text)

wow love this place
crust be not good
not tasty and the texture be just nasty
stop by during the late may bank holiday off rick steve recommendation and love it
the selection on the menu be great and so be the price
now I be get angry and I want my damn pho
honeslty it do not taste that fresh
the potato be like rubber and you could tell they have be make up ahead of time be keep under a warmer
the fry be great too
a great touch
service be very prompt
would not go back
the cashier have no care what so ever on what I have to say it still end up be wayyy overprice
I try the cape cod ravoli chicken with cranberry mmmm
I be disgusted because I be pretty sure that be human hair
I be shocked because no sign indicate cash only
highly recommend
waitress be a little slow in service
this place be not worth your time let alone vegas
do not like at all
the burrittos blah
the food amazing
service be also cute
I could care less the interior be just beautiful
so they perform
that be right the red velv

In [6]:
df.Liked.value_counts()

Liked
1    500
0    500
Name: count, dtype: int64

In [7]:
def preprocess_reviews_for_fasttext(df):
    df['Sentiment'] = df['Liked'].apply(lambda x: '__label__liked' if x == 1 else '__label__not')
    df['FastTextFormat'] = df['Sentiment'] + ' ' + df['Review']


In [8]:
preprocess_reviews_for_fasttext(df)

In [9]:
df.head(30)

Unnamed: 0,Review,Liked,Sentiment,FastTextFormat
0,wow love this place,1,__label__liked,__label__liked wow love this place
1,crust be not good,0,__label__not,__label__not crust be not good
2,not tasty and the texture be just nasty,0,__label__not,__label__not not tasty and the texture be just...
3,stop by during the late may bank holiday off r...,1,__label__liked,__label__liked stop by during the late may ban...
4,the selection on the menu be great and so be t...,1,__label__liked,__label__liked the selection on the menu be gr...
5,now I be get angry and I want my damn pho,0,__label__not,__label__not now I be get angry and I want my ...
6,honeslty it do not taste that fresh,0,__label__not,__label__not honeslty it do not taste that fresh
7,the potato be like rubber and you could tell t...,0,__label__not,__label__not the potato be like rubber and you...
8,the fry be great too,1,__label__liked,__label__liked the fry be great too
9,a great touch,1,__label__liked,__label__liked a great touch


In [10]:
from sklearn.model_selection import train_test_split
train,test = train_test_split(df,test_size=0.2)

In [11]:
test.shape

(200, 4)

In [12]:
train.to_csv("train1.train",columns=["FastTextFormat"],index=False,header=False)
test.to_csv("test1.test",columns=["FastTextFormat"],index=False,header=False)

In [13]:
lrs = [0.01,0.02,0.03,0.1,0.2,0.3,0.9]
# losses = ["softmax", "hs", "hierarchical", "ns", "kullback-leibler"]
epochs = [10,20,50,100,200,500,1000,1500,2000]
verbose = True

In [14]:
import numpy as np
import fasttext
best_model_acc = -np.inf
best_model = None
counter = 1
for lr in lrs:
        for epoch in epochs:
            print(counter , " : " , "epoch =" , epoch ,"  lr = " , lr)
            model = fasttext.train_supervised(input="train1.train",lr=lr,epoch=epoch,verbose=verbose)
            test = model.test("test1.test")
            print("model acc = " ,test)
            counter+=1
            print("--------------------------------------------------")
            if test[1] > best_model_acc:
                best_model_acc = test[1]
                best_model = model
                
        

1  :  epoch = 10   lr =  0.01
model acc =  (200, 0.525, 0.525)
--------------------------------------------------
2  :  epoch = 20   lr =  0.01
model acc =  (200, 0.535, 0.535)
--------------------------------------------------
3  :  epoch = 50   lr =  0.01
model acc =  (200, 0.56, 0.56)
--------------------------------------------------
4  :  epoch = 100   lr =  0.01
model acc =  (200, 0.77, 0.77)
--------------------------------------------------
5  :  epoch = 200   lr =  0.01
model acc =  (200, 0.805, 0.805)
--------------------------------------------------
6  :  epoch = 500   lr =  0.01
model acc =  (200, 0.795, 0.795)
--------------------------------------------------
7  :  epoch = 1000   lr =  0.01
model acc =  (200, 0.8, 0.8)
--------------------------------------------------
8  :  epoch = 1500   lr =  0.01
model acc =  (200, 0.805, 0.805)
--------------------------------------------------
9  :  epoch = 2000   lr =  0.01
model acc =  (200, 0.805, 0.805)
------------------------

In [15]:
best_model_acc

0.815

In [31]:
best_model.save_model("Reviews_classification_final1.bin")

In [19]:
model = fasttext.train_supervised(input="train1.train")
test = model.test("test1.test")
print(test)

(200, 0.55, 0.55)


In [17]:
model.save_model("Reviews_classification_final.bin")

In [29]:
text = lemmatize_text("there is to much noise in this place i could not focus")
model.predict(text)

there be to much noise in this place I could not focus


(('__label__not',), array([0.51359254]))

In [30]:
best_model.predict(text)

(('__label__not',), array([1.00000393]))