# Libraries

In [2]:
import pandas as pd
import orjson

# Data preparation

## Data loading

We load the *review* and *business* JSON files and convert each of them into a DataFrame.

In [3]:
# The review dump holds one JSON document per line; parse every line with orjson.
with open('data/yelp_academic_dataset_review.json', 'rb') as review_file:
    reviews = list(map(orjson.loads, review_file))

# Turn the list of parsed records into a DataFrame and preview the first rows.
df_reviews = pd.DataFrame.from_dict(reviews)
df_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [4]:
# The business dump holds one JSON document per line; parse every line with orjson.
with open('data/yelp_academic_dataset_business.json', 'rb') as business_file:
    business = list(map(orjson.loads, business_file))

# Turn the list of parsed records into a DataFrame and preview the first rows.
df_business = pd.DataFrame.from_dict(business)
df_business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


## Data merge

To get a complete overview of the data, the two tables are joined on *business_id*. Both tables contain a *stars* column, so it is first renamed to *stars_reviews* and *stars_business* respectively.

In [5]:
# Both tables carry a `stars` column; give each one an unambiguous name before joining.
df_business = df_business.rename(columns={"stars": "stars_business"})
df_reviews = df_reviews.rename(columns={"stars": "stars_reviews"})

# Join on the shared key so that every review row also carries its business's columns.
df = df_reviews.merge(df_business, on='business_id')

# Do not hide any columns when frames are displayed from here on.
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,review_id,user_id,business_id,stars_reviews,useful,funny,cool,text,date,name,address,city,state,postal_code,latitude,longitude,stars_business,review_count,is_open,attributes,categories,hours
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
1,VJxlBnJmCDIy8DFG0kjSow,Iaee7y6zdSB3B-kRCo4z1w,XQfwVwDr-v0ZS3_CbbE5Xw,2.0,0,0,0,This is the second time we tried turning point...,2017-05-13 17:06:55,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
2,S6pQZQocMB1WHMjTRbt77A,ejFxLGqQcWNLdNByJlIhnQ,XQfwVwDr-v0ZS3_CbbE5Xw,4.0,2,0,1,The place is cute and the staff was very frien...,2017-08-08 00:58:18,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
3,WqgTKVqWVHDHjnjEsBvUgg,f7xa0p_1V9lx53iIGN5Sug,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,We came on a Saturday morning after waiting a ...,2017-11-19 02:20:23,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
4,M0wzFFb7pefOPcxeRVbLag,dCooFVCk8M1nVaQqcfTL3Q,XQfwVwDr-v0ZS3_CbbE5Xw,2.0,0,0,0,"Mediocre at best. The decor is very nice, and ...",2017-09-09 17:49:47,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."


## Data filtering

Because the dataset also contains reviews of businesses other than restaurants, only reviews of businesses whose *categories* mention "Restaurants" are kept.

In [6]:
# Case-insensitive search in the category text; rows without categories count as non-matches.
is_restaurant = df['categories'].str.contains('Restaurants', case=False, na=False)
df = df[is_restaurant]
df.head()

Unnamed: 0,review_id,user_id,business_id,stars_reviews,useful,funny,cool,text,date,name,address,city,state,postal_code,latitude,longitude,stars_business,review_count,is_open,attributes,categories,hours
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
1,VJxlBnJmCDIy8DFG0kjSow,Iaee7y6zdSB3B-kRCo4z1w,XQfwVwDr-v0ZS3_CbbE5Xw,2.0,0,0,0,This is the second time we tried turning point...,2017-05-13 17:06:55,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
2,S6pQZQocMB1WHMjTRbt77A,ejFxLGqQcWNLdNByJlIhnQ,XQfwVwDr-v0ZS3_CbbE5Xw,4.0,2,0,1,The place is cute and the staff was very frien...,2017-08-08 00:58:18,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
3,WqgTKVqWVHDHjnjEsBvUgg,f7xa0p_1V9lx53iIGN5Sug,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,We came on a Saturday morning after waiting a ...,2017-11-19 02:20:23,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
4,M0wzFFb7pefOPcxeRVbLag,dCooFVCk8M1nVaQqcfTL3Q,XQfwVwDr-v0ZS3_CbbE5Xw,2.0,0,0,0,"Mediocre at best. The decor is very nice, and ...",2017-09-09 17:49:47,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."


## Column selection

Some columns will not be helpful for the task, so we keep only the ones we need and drop the rest.

In [7]:
# Columns kept for modelling: both star ratings, the three review statistics and the review text.
cols_to_use = [
    "stars_reviews",
    "stars_business",
    "useful",
    "funny",
    "cool",
    "text",
]
df = df.loc[:, cols_to_use]
df.head()

Unnamed: 0,stars_reviews,stars_business,useful,funny,cool,text
0,3.0,3.0,0,0,0,"If you decide to eat here, just be aware it is..."
1,2.0,3.0,0,0,0,This is the second time we tried turning point...
2,4.0,3.0,2,0,1,The place is cute and the staff was very frien...
3,3.0,3.0,0,0,0,We came on a Saturday morning after waiting a ...
4,2.0,3.0,0,0,0,"Mediocre at best. The decor is very nice, and ..."


## Scaling

The *useful*, *funny*, and *cool* statistics are min-max rescaled to the $\langle 0,1 \rangle$ range.

In [8]:
def min_max_scale(values):
    """Linearly rescale a numeric Series so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    values : pd.Series
        Numeric column to rescale.

    Returns
    -------
    pd.Series
        ``(values - min) / (max - min)``. If every value is identical the
        result is all zeros instead of the NaN that 0/0 would produce.
    """
    # Series.min()/max() are vectorised; the builtin min()/max() would iterate
    # the column element by element in Python.
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Constant column: there is no spread to normalise by.
        return values * 0.0
    return (values - lowest) / span


# assign() returns a new frame (existing columns are replaced in their current
# position), so nothing is written into a frame that was derived from a slice.
df = df.assign(**{col: min_max_scale(df[col]) for col in ("useful", "funny", "cool")})
df.head()

Unnamed: 0,stars_reviews,stars_business,useful,funny,cool,text
0,3.0,3.0,0.002375,0.001261,0.002469,"If you decide to eat here, just be aware it is..."
1,2.0,3.0,0.002375,0.001261,0.002469,This is the second time we tried turning point...
2,4.0,3.0,0.007126,0.001261,0.004938,The place is cute and the staff was very frien...
3,3.0,3.0,0.002375,0.001261,0.002469,We came on a Saturday morning after waiting a ...
4,2.0,3.0,0.002375,0.001261,0.002469,"Mediocre at best. The decor is very nice, and ..."


# Transformer

First, each row is formatted into a single text string that combines the metadata with the review text.

In [18]:
def format_data(row):
    """Render one review row as a single string: the metadata fields first, then the review text.

    ``row`` must be indexable by the keys ``stars_reviews``, ``stars_business``,
    ``useful``, ``funny``, ``cool`` and ``text``.
    """
    metadata = (
        f"Stars for the review = {row['stars_reviews']}, "
        f"Stars for the business: {row['stars_business']}, "
        f"Usefulness = {row['useful']}, "
        f"Funny: {row['funny']}, "
        f"Cool: {row['cool']}"
    )
    return f"Metadata: {metadata}; Review: {row['text']}"

# Work on a small sample of the prepared frame: positional rows 1 through 999.
# NOTE(review): the slice starts at 1, so row 0 is left out - confirm this is intended.
sample_rows = df.iloc[1:1000]
formatted_text = [format_data(review) for _, review in sample_rows.iterrows()]
formatted_text[1]

'Metadata: Stars for the review = 4.0, Stars for the business: 3.0, Usefulness = 0.007125890736342043, Funny: 0.0012610340479192938, Cool: 0.0049382716049382715; Review: The place is cute and the staff was very friendly. Nice menu. Good for brunch. We had lunch and were seated right away. I really enjoyed the avocado  toast and side of bacon. Nice to have another brunch place nearby.'

GPT-2 has a context limit of 1024 tokens. As a simple proxy, we keep only the formatted texts that are at most 1024 characters long.

In [26]:
# Keep only the formatted examples of at most MAX_CHARS characters, then report how many remain.
MAX_CHARS = 1024
formatted_text = list(filter(lambda text: len(text) <= MAX_CHARS, formatted_text))
len(formatted_text)

776

Model and tokenizer creation.

In [10]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Checkpoint to start from; 'gpt2-medium', 'gpt2-large' and 'gpt2-xl' are the
# alternatives if a different model size is wanted.
model_name = "gpt2"

# The tokenizer and the language-model head are both loaded from the same checkpoint name.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

Model "training".

In [28]:
import torch

# Put the model in training mode before fine-tuning.
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# A character-length filter does not strictly bound the number of tokens, so
# cap every example at the model's context size; a longer sequence would
# otherwise index past the position embeddings.
max_tokens = model.config.n_positions

# Single pass over the examples, one optimisation step per example.
for i, sentence in enumerate(formatted_text):
    # Encode each sentence separately
    encoded_sentence = tokenizer.encode(
        sentence,
        return_tensors='pt',
        truncation=True,
        max_length=max_tokens,
    )

    optimizer.zero_grad()
    # The input ids double as labels, so the model output carries the loss.
    outputs = model(encoded_sentence, labels=encoded_sentence)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    print(f"({i+1}/{len(formatted_text)}) Sentence: {sentence} - Loss: {loss.item()}")

(0/776) Sentence: Metadata: Stars for the review = 2.0, Stars for the business: 3.0, Usefulness = 0.0023752969121140144, Funny: 0.0012610340479192938, Cool: 0.0024691358024691358; Review: This is the second time we tried turning point at this location. The first time we had a long wait for food after ordering, this time we had an even longer wait of over 40 minutes. I had the omelette skillet and there was hardly any egg in it, I felt like I was eating chopped onions and chopped tomatoes. My wife had a BLT and had a hard time finding the tomato and the avocado that was supposed be on it. Overall,The experience was stressful, mainly because of the long wait. - Loss: 1.760439395904541
(1/776) Sentence: Metadata: Stars for the review = 4.0, Stars for the business: 3.0, Usefulness = 0.007125890736342043, Funny: 0.0012610340479192938, Cool: 0.0049382716049382715; Review: The place is cute and the staff was very friendly. Nice menu. Good for brunch. We had lunch and were seated right away. I

Model evaluation.

In [55]:
# Switch the model from training mode to evaluation mode before generating text
model.eval()

# Function to generate text
def generate_text(prompt_text, max_length=50, temperature=0.7):
    """Sample one continuation of ``prompt_text`` from the module-level ``model``.

    Parameters
    ----------
    prompt_text : str
        Text the generation is conditioned on.
    max_length : int, default 50
        Forwarded to ``model.generate`` as ``max_length``; the returned text
        contains the prompt as well as the continuation.
    temperature : float, default 0.7
        Sampling temperature forwarded to ``model.generate``.

    Returns
    -------
    str
        The first generated sequence, decoded with special tokens skipped.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention mask.
    encoded_prompt = tokenizer(prompt_text, return_tensors='pt')

    # The pad token is set to the EOS token below, so generate() cannot reliably
    # infer an attention mask from the input ids; pass it explicitly instead.
    output = model.generate(
        input_ids=encoded_prompt['input_ids'],
        attention_mask=encoded_prompt['attention_mask'],
        max_length=max_length,
        temperature=temperature,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1  # You can adjust the number of generated sequences
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example prompt: a metadata header in the same layout as the training examples,
# followed by a free-text instruction.
prompt = (
    "Metadata: Stars for the review = 1.0, Stars for the business: 5.0, "
    "Usefulness = 0.5, Funny: 0, Cool: 0; "
    "You can't end the review in the middle of a sentence."
)

# Sample a continuation and show it under a heading.
generated_text = generate_text(prompt, max_length=100, temperature=0.8)
print("Generated Text:", generated_text, sep="\n")

Generated Text:
Metadata: Stars for the review = 1.0, Stars for the business: 5.0, Usefulness = 0.5, Funny: 0, Cool: 0; You can't end the review in the middle of a sentence. The food is amazing, the place is really casual and all the staff are extremely helpful. The drinks are delicious.

This place is really close to the highway. The food is amazing and I'll definitely be back.

Love the beer selection


Finally, save the model.

In [49]:
# Store the tokenizer files next to the fine-tuned weights so that the output
# directory can be reloaded on its own with from_pretrained().
output_dir = 'trained_gpt2_model'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)