In [1]:
import os
import json
import re
import random
import pandas as pd
import fasttext
from tqdm import trange

#### Data Preparation ####

##### data url
##### https://www.kaggle.com/yelp-dataset/yelp-dataset

In [2]:
## number of reviews in dataset

!wc -l yelp_academic_dataset_review.json 

8635403 /kaggle/input/yelp-dataset/yelp_academic_dataset_review.json


In [3]:
## sample data

reviews_path = 'yelp_academic_dataset_review.json'
with open(reviews_path) as f:
    line = json.loads(f.readline())
    for key,val in line.items():
        print(f'{key} -> {val}')

review_id -> lWC-xP3rd6obsecCYsGZRg
user_id -> ak0TdVmGKo4pwqdJSTLwWw
business_id -> buF9druCkbuXLX526sGELQ
stars -> 4.0
useful -> 3
funny -> 1
cool -> 1
text -> Apparently Prides Osteria had a rough summer as evidenced by the almost empty dining room at 6:30 on a Friday night. However new blood in the kitchen seems to have revitalized the food from other customers recent visits. Waitstaff was warm but unobtrusive. By 8 pm or so when we left the bar was full and the dining room was much more lively than it had been. Perhaps Beverly residents prefer a later seating. 

After reading the mixed reviews of late I was a little tentative over our choice but luckily there was nothing to worry about in the food department. We started with the fried dough, burrata and prosciutto which were all lovely. Then although they don't offer half portions of pasta we each ordered the entree size and split them. We chose the tagliatelle bolognese and a four cheese filled pasta in a creamy sauce with bacon,

In [4]:
def strip_formatting(string):
    string = string.replace("\n"," ")
    string = string.lower()
    string = re.sub(r"([.!?,'/()])", r" \1 ", string)
    return string

def prepare_dataset(dataset_path,dataset_size = 500000,split_ratio = 0.9):
    count = 0
    with open(dataset_path) as f,open('train.txt','w') as train_,open('test.txt','w') as test_:
        for i in trange(dataset_size):
            line = json.loads(f.readline())
            rating = line['stars']
            text = strip_formatting(line['text'])
            
            ## convert data to fasttext required format ##
            form_line = "__label__{} {}\n".format(rating, text)
            
            if random.random() <= split_ratio:
                train_.write(form_line)
            else:
                test_.write(form_line)
    return os.path.join(os.getcwd(),'train.txt'),os.path.join(os.getcwd(),'test.txt')

def display_test(model):
    _,p1,r1 = model.test(test_path)
    _,p2,r2 = model.test(test_path,k=2)
    _,p3,r3 = model.test(test_path,k=3)
    return pd.DataFrame({'K' : [1,2,3],'Precision' : [p1,p2,p3],'Recall' : [r1,r2,r3]})

In [5]:
train_path,test_path = prepare_dataset(reviews_path)
print(train_path,test_path)

100%|██████████| 500000/500000 [00:28<00:00, 17351.86it/s]


/kaggle/working/train.txt /kaggle/working/test.txt


In [6]:
## check number of line in train test set

# train set
!wc -l /kaggle/working/train.txt | awk -F " " '{print $1}'

# test set
!wc -l /kaggle/working/test.txt | awk -F " " '{print $1}'

450075
49925


In [7]:
## Training model with fasttext

model_unigram = fasttext.train_supervised(input = train_path,lr = 0.05,epoch = 100)

res_unigram = display_test(model_unigram)
res_unigram

Unnamed: 0,K,Precision,Recall
0,1,0.644086,0.644086
1,2,0.449534,0.899069
2,3,0.322891,0.968673


In [8]:
model_bigram = fasttext.train_supervised(input = train_path,lr = 0.05,epoch = 100,wordNgrams = 2)

res_bigram = display_test(model_bigram)
res_bigram

Unnamed: 0,K,Precision,Recall
0,1,0.642143,0.642143
1,2,0.452188,0.904377
2,3,0.323639,0.970916


In [9]:
sample_revs = ['This place is great! Atmosphere is chill and cool but the staff is also really friendly. They know what they’re doing and what they’re talking about, and you can tell making the customers happy is their main priority. Food is pretty good, some italian classics and some twists, and for their prices it’s 100% worth it.',
              'This cozy restaurant has left the best impressions! Hospitable hosts, delicious dishes, beautiful presentation, wide wine list and wonderful dessert. I recommend to everyone! I would like to come back here again and again.',
              'this is a terrible restaurant . i hate it so much .']

In [10]:
for text in sample_revs:
    (label,prob) = model_unigram.predict(text)
    print('Stars : {} , Confidence : {:.4f}'.format(label[0].replace('__label__',''),prob[0]))

Stars : 5.0 , Confidence : 0.6875
Stars : 5.0 , Confidence : 0.9843
Stars : 1.0 , Confidence : 0.9948
