# Libraries

In [1]:
import pandas as pd
import orjson

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


# Data preparation

## Data loading

### Reviews

We load the *review* and the *business* json files and transform them into a DataFrame 

In [2]:
with open('data/yelp_academic_dataset_review.json', 'rb') as file:
    reviews = [orjson.loads(line) for line in file]

df_reviews = pd.DataFrame.from_dict(reviews)
print(df_reviews.shape)
df_reviews.head()

(6990280, 9)


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


We will only consider reviews shorter than 800 characters.

In [3]:
df_reviews_short = df_reviews[df_reviews['text'].apply(lambda x: len(str(x)) < 800)]
print(f"Current shape: {df_reviews_short.shape} (removed {df_reviews.shape[0] - df_reviews_short.shape[0]} rows)")

Current shape: (5512211, 9) (removed 1478069 rows)


We will now perform undersampling so we have balanced data based on the number of stars. In our case, the undersampling is also preferd as we do not have the computational power to do upsampling.

In [4]:
# Get minimal value
min_count = df_reviews_short["stars"].value_counts()
min_category = min_count.idxmin()
min_value = min_count[min_category]
# Downsample
df_reviews_down = df_reviews_short[df_reviews_short["stars"] == min_category]
for category in [1,2,3,4,5]:
    if category != min_category:
        df_sample = df_reviews_short[df_reviews_short["stars"] == category]
        df_sample = df_sample.sample(n = min_value, random_state = 99)
        df_reviews_down = pd.concat([df_reviews_down, df_sample], ignore_index=True)
        
print(f"Current shape: {df_reviews_down.shape} (removed {df_reviews_short.shape[0] - df_reviews_down.shape[0]} rows)")

Current shape: (1867950, 9) (removed 3644261 rows)


Lets now control it.

In [5]:
df_reviews_down["stars"].value_counts()

stars
2.0    373590
1.0    373590
3.0    373590
4.0    373590
5.0    373590
Name: count, dtype: int64

Stats *funny*, *useful*, and *cool* are rescaled on a $\langle 0,1 \rangle$ range.

In [6]:
df_reviews_down["useful"] = (df_reviews_down["useful"]  - min(df_reviews_down["useful"]))/(max(df_reviews_down["useful"] ) - min(df_reviews_down["useful"]))
df_reviews_down["funny"]  = (df_reviews_down["funny"]   - min(df_reviews_down["funny"]))/ (max(df_reviews_down["funny"] )  - min(df_reviews_down["funny"]))
df_reviews_down["cool"]   = (df_reviews_down["cool"]    - min(df_reviews_down["cool"]))/  (max(df_reviews_down["cool"] )   - min(df_reviews_down["cool"]))

### Business

In [7]:
with open('data/yelp_academic_dataset_business.json', 'rb') as file:
    business = [orjson.loads(line) for line in file]

df_business = pd.DataFrame.from_dict(business)
df_business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


We only care about restaurants.

In [9]:
df_business_rest = df_business[df_business['categories'].str.contains('Restaurants', case=False, na=False)]
print(f"Current shape: {df_business_rest.shape} (removed {df_business.shape[0] - df_business_rest.shape[0]} rows)")

Current shape: (52268, 14) (removed 98078 rows)


## Data merge

To get a complete overview of the data, tables are joined by the *business_id* and the variable *stars* is renamed.

In [10]:
df_business = df_business_rest.rename(axis=1, mapper={"stars": "stars_business"})
df_reviews = df_reviews_down.rename(axis=1, mapper={"stars": "stars_reviews"})
df = pd.merge(df_business, df_reviews, on='business_id', how="inner")
pd.set_option('display.max_columns', None)
print(df.shape)
df.head()

(1328409, 22)


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_business,review_count,is_open,attributes,categories,hours,review_id,user_id,stars_reviews,useful,funny,cool,text,date
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",coVz18GyxT9PJSkl-H8KLQ,19cvBxYcO52xV5x6EKTa_A,2.0,0.001072,0.00277,0.002786,Went just for the egg tarts. The crust was inc...,2015-04-06 14:31:37
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",I8k0__Bp6EYuu73hxZ0kYg,UuVWbpQu76pJOFc1SQNk6A,2.0,0.002144,0.00554,0.002786,The lady on the counter looked like she did no...,2017-11-18 00:13:43
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",QAy1XATH29BEtCBhuW2CeA,oDuMcQ73TF60-TuWDu_GUQ,1.0,0.002144,0.01108,0.002786,Horrible 3rd world service. After I picked out...,2016-03-13 03:26:59
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",DbARmo95_axUdC9FU8o_Jw,X_DkwPTzdO_VWzUcbUXREg,3.0,0.0,0.0,0.0,so I dropped in to this store since it was the...,2012-08-16 19:42:17
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",fI_dVXwDk_MtmCufRabMgg,nDFRVVcNLLUt-F_s0yBIPA,3.0,0.002144,0.0,0.002786,Arguably the best Chinese pastry shop in Phill...,2009-02-21 22:07:07


## Column selection

Some columns definitely won't be helpfull and we decided to drop them.

In [11]:
cols_to_use = ["stars_reviews", "useful", "funny", "cool", "text"]
df = df.loc[:, cols_to_use]
df.head()

Unnamed: 0,stars_reviews,useful,funny,cool,text
0,2.0,0.001072,0.00277,0.002786,Went just for the egg tarts. The crust was inc...
1,2.0,0.002144,0.00554,0.002786,The lady on the counter looked like she did no...
2,1.0,0.002144,0.01108,0.002786,Horrible 3rd world service. After I picked out...
3,3.0,0.0,0.0,0.0,so I dropped in to this store since it was the...
4,3.0,0.002144,0.0,0.002786,Arguably the best Chinese pastry shop in Phill...


# Transformer

First, the text needs to be encoded.

In [12]:
df.loc[0:1000, "stars_reviews"].value_counts()

stars_reviews
3.0    236
4.0    213
2.0    202
1.0    185
5.0    165
Name: count, dtype: int64

In [13]:
def format_data(row):
    return f"Metadata: Review stars = {row['stars_reviews']}, Usefulness = {row['useful']}, Funny: {row['funny']}, Cool: {row['cool']}; Review: {row['text']}"

formatted_text = [format_data(row) for _, row in df.iterrows()]
formatted_text[1]

'Metadata: Review stars = 2.0, Usefulness = 0.0021436227224008574, Funny: 0.00554016620498615, Cool: 0.002785515320334262; Review: The lady on the counter looked like she did not want to do business at all, her face was very unpleasant. I went to this store specifically to get their egg tarts cause they WERE delicious. Now, not anymore. The fillings had watery texture instead of the creamy taste they should have. \n\nI also ordered "honey citron tea" which the sign said $1.25, then the lady charged me $1.50. I positively thought they must have changed the price. As I went back, she got my order wrong and gave me "honey ginseng tea" instead. Ugh. I also bought a slice of tiramisu cake, and it was ok, not too sweet but I hardly tasted the coffee at all.\n\nI will not be back.'

----------------------------------------------------------------------------------------------------
LLAMA MODEL alternative
----------------------------------------------------------------------------------------------------

In [15]:
from transformers import (AutoModelForCausalLM, AutoTokenizer,  TrainingArguments,)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from transformers import LlamaTokenizer, LlamaForCausalLM

In [None]:
# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"

# Fine-tuned model name
new_model = "finetuned_model"

model = AutoModelForCausalLM.from_pretrained(
    model_name
    #,torch_dtype=torch.float16
)

: 

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Set training parameters
training_arguments = TrainingArguments(
    output_dir= "./results"
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset = formatted_text[0:1000],
    #dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_arguments
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)

----------------------------------------------------------------------------------------------------

Model and tokenizer creation.

In [53]:
model_name = 'gpt2'  # or 'gpt2-medium', 'gpt2-large', 'gpt2-xl' depending on the model size you want
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

Model training.

In [55]:
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

formatted_text_sample = formatted_text[0:1000]
for i, sentence in enumerate(formatted_text_sample):
    # Encode each sentence separately
    encoded_sentence = tokenizer.encode(sentence, return_tensors='pt')
    
    # Training loop for each sentence
    optimizer.zero_grad()
    outputs = model(encoded_sentence, labels=encoded_sentence)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    print(f"({i+1}/{len(formatted_text_sample)}) Sentence: {sentence} - Loss: {loss.item()}")

(1/1000) Sentence: Metadata: Review stars = 2.0, Usefulness = 0.0010718113612004287, Funny: 0.002770083102493075, Cool: 0.002785515320334262; Review: Went just for the egg tarts. The crust was incredibly thick and while flaky, the edges had the texture of being stale. The egg custard was sweeter than I am used to but I did not mind that as much the crust. - Loss: 3.5797476768493652
(2/1000) Sentence: Metadata: Review stars = 2.0, Usefulness = 0.0021436227224008574, Funny: 0.00554016620498615, Cool: 0.002785515320334262; Review: The lady on the counter looked like she did not want to do business at all, her face was very unpleasant. I went to this store specifically to get their egg tarts cause they WERE delicious. Now, not anymore. The fillings had watery texture instead of the creamy taste they should have. 

I also ordered "honey citron tea" which the sign said $1.25, then the lady charged me $1.50. I positively thought they must have changed the price. As I went back, she got my ord

Finally, save the model.

In [56]:
model.save_pretrained('trained_gpt2_model')
tokenizer.save_pretrained('trained_gpt2_model')

('trained_gpt2_model\\tokenizer_config.json',
 'trained_gpt2_model\\special_tokens_map.json',
 'trained_gpt2_model\\vocab.json',
 'trained_gpt2_model\\merges.txt',
 'trained_gpt2_model\\added_tokens.json')