# Libraries

In [1]:
import pandas as pd
import numpy as np
import orjson

import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel




# Data preparation

## Data loading

We load the *review* and *business* JSON files (one JSON record per line) and convert each of them into its own DataFrame.

In [2]:
# Read the review dump in binary mode and parse it one line at a time;
# every line is handed to orjson as its own JSON document.
reviews = []
with open('data/yelp_academic_dataset_review.json', 'rb') as file:
    for line in file:
        reviews.append(orjson.loads(line))

# Turn the list of parsed records into a table and preview the first rows.
df_reviews = pd.DataFrame.from_dict(reviews)
df_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [3]:
# Read the business dump in binary mode and parse it one line at a time;
# every line is handed to orjson as its own JSON document.
business = []
with open('data/yelp_academic_dataset_business.json', 'rb') as file:
    for line in file:
        business.append(orjson.loads(line))

# Turn the list of parsed records into a table and preview the first rows.
df_business = pd.DataFrame.from_dict(business)
df_business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


## Data merge

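To get a complete overview of the data, the *stars* column of each table is first renamed (to *stars_reviews* and *stars_business*) so that the two ratings remain distinguishable, and the tables are then joined on *business_id*.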

In [4]:
# Both tables carry a "stars" column; give each a distinct name before the
# join so neither rating is lost or auto-suffixed.
df_business = df_business.rename(columns={"stars": "stars_business"})
df_reviews = df_reviews.rename(columns={"stars": "stars_reviews"})

# Attach the business attributes to every review through the shared
# business_id key (pandas' default join type, i.e. an inner join).
df = df_reviews.merge(df_business, on='business_id')
df.head()

Unnamed: 0,review_id,user_id,business_id,stars_reviews,useful,funny,cool,text,date,name,...,state,postal_code,latitude,longitude,stars_business,review_count,is_open,attributes,categories,hours
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,Turning Point of North Wales,...,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
1,VJxlBnJmCDIy8DFG0kjSow,Iaee7y6zdSB3B-kRCo4z1w,XQfwVwDr-v0ZS3_CbbE5Xw,2.0,0,0,0,This is the second time we tried turning point...,2017-05-13 17:06:55,Turning Point of North Wales,...,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
2,S6pQZQocMB1WHMjTRbt77A,ejFxLGqQcWNLdNByJlIhnQ,XQfwVwDr-v0ZS3_CbbE5Xw,4.0,2,0,1,The place is cute and the staff was very frien...,2017-08-08 00:58:18,Turning Point of North Wales,...,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
3,WqgTKVqWVHDHjnjEsBvUgg,f7xa0p_1V9lx53iIGN5Sug,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,We came on a Saturday morning after waiting a ...,2017-11-19 02:20:23,Turning Point of North Wales,...,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
4,M0wzFFb7pefOPcxeRVbLag,dCooFVCk8M1nVaQqcfTL3Q,XQfwVwDr-v0ZS3_CbbE5Xw,2.0,0,0,0,"Mediocre at best. The decor is very nice, and ...",2017-09-09 17:49:47,Turning Point of North Wales,...,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."


In [11]:
# Lift the column limit so wide frames are displayed without the "..." gap,
# then preview the merged table again with every column visible.
pd.options.display.max_columns = None
df.head()

Unnamed: 0,review_id,user_id,business_id,stars_reviews,useful,funny,cool,text,date,name,address,city,state,postal_code,latitude,longitude,stars_business,review_count,is_open,attributes,categories,hours
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
1,VJxlBnJmCDIy8DFG0kjSow,Iaee7y6zdSB3B-kRCo4z1w,XQfwVwDr-v0ZS3_CbbE5Xw,2.0,0,0,0,This is the second time we tried turning point...,2017-05-13 17:06:55,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
2,S6pQZQocMB1WHMjTRbt77A,ejFxLGqQcWNLdNByJlIhnQ,XQfwVwDr-v0ZS3_CbbE5Xw,4.0,2,0,1,The place is cute and the staff was very frien...,2017-08-08 00:58:18,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
3,WqgTKVqWVHDHjnjEsBvUgg,f7xa0p_1V9lx53iIGN5Sug,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,We came on a Saturday morning after waiting a ...,2017-11-19 02:20:23,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
4,M0wzFFb7pefOPcxeRVbLag,dCooFVCk8M1nVaQqcfTL3Q,XQfwVwDr-v0ZS3_CbbE5Xw,2.0,0,0,0,"Mediocre at best. The decor is very nice, and ...",2017-09-09 17:49:47,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."


## Data filtering

Because the dataset also contains reviews of businesses other than restaurants, only reviews whose business categories include "Restaurants" are kept.

In [54]:
# Flag the rows whose category string mentions "Restaurants" (matched
# case-insensitively); rows with a missing category count as non-matches.
is_restaurant = df['categories'].str.contains('Restaurants', case=False, na=False)

# Keep only the flagged rows and preview the result.
df = df[is_restaurant]
df.head()

Unnamed: 0,stars_reviews,stars_business,useful,funny,cool,categories,text
0,3.0,3.0,0,0,0,"Restaurants, Breakfast & Brunch, Food, Juice B...","If you decide to eat here, just be aware it is..."
1,2.0,3.0,0,0,0,"Restaurants, Breakfast & Brunch, Food, Juice B...",This is the second time we tried turning point...
2,4.0,3.0,2,0,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",The place is cute and the staff was very frien...
3,3.0,3.0,0,0,0,"Restaurants, Breakfast & Brunch, Food, Juice B...",We came on a Saturday morning after waiting a ...
4,2.0,3.0,0,0,0,"Restaurants, Breakfast & Brunch, Food, Juice B...","Mediocre at best. The decor is very nice, and ..."


## Column selection

Some columns definitely won't be helpful, so we drop them and keep only the two star ratings, the *useful*, *funny*, and *cool* stats, and the review text.

In [55]:
# Columns retained for modelling; everything else from the merge is dropped.
cols_to_use = [
    "stars_reviews",   # rating given in the individual review
    "stars_business",  # rating of the reviewed business
    "useful",
    "funny",
    "cool",
    "text",            # review body
]

# Select all rows, restricted to the retained columns, and preview.
df = df.loc[:, cols_to_use]
df.head()

Unnamed: 0,stars_reviews,stars_business,useful,funny,cool,text
0,3.0,3.0,0,0,0,"If you decide to eat here, just be aware it is..."
1,2.0,3.0,0,0,0,This is the second time we tried turning point...
2,4.0,3.0,2,0,1,The place is cute and the staff was very frien...
3,3.0,3.0,0,0,0,We came on a Saturday morning after waiting a ...
4,2.0,3.0,0,0,0,"Mediocre at best. The decor is very nice, and ..."


## Scaling

The *useful*, *funny*, and *cool* stats are min-max rescaled to the range $\langle 0, 1 \rangle$.

In [61]:
def min_max_scale(values):
    """Linearly rescale a numeric Series so its minimum maps to 0 and its maximum to 1.

    The extremes are taken with the vectorized ``Series.min``/``Series.max``
    instead of Python's builtin ``min``/``max``: the builtins walk the Series
    element by element and give order-dependent results when NaN is present,
    whereas the Series methods skip missing values.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to rescale.

    Returns
    -------
    pandas.Series
        ``(values - min) / (max - min)``, aligned with the input index.
        A column whose values are all equal has a zero range and therefore
        comes out as NaN (0/0), exactly as the plain formula would.
    """
    lowest = values.min()
    value_range = values.max() - lowest
    return (values - lowest) / value_range


# Rescale the three stats with one shared helper instead of three
# copy-pasted formulas. `assign` returns a new frame (column order is
# unchanged), so nothing is written into the frame produced by the earlier
# filtering step.
df = df.assign(**{name: min_max_scale(df[name]) for name in ("useful", "funny", "cool")})
df.head()

Unnamed: 0,stars_reviews,stars_business,useful,funny,cool,text
0,3.0,3.0,0.002375,0.001261,0.002469,"If you decide to eat here, just be aware it is..."
1,2.0,3.0,0.002375,0.001261,0.002469,This is the second time we tried turning point...
2,4.0,3.0,0.007126,0.001261,0.004938,The place is cute and the staff was very frien...
3,3.0,3.0,0.002375,0.001261,0.002469,We came on a Saturday morning after waiting a ...
4,2.0,3.0,0.002375,0.001261,0.002469,"Mediocre at best. The decor is very nice, and ..."


# Transformer

First, each review is combined with its metadata into a single text string, which then needs to be encoded (tokenized) for the model.

In [72]:
def format_data(row):
    """Render one review record as a single prompt string.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys ``stars_reviews``, ``stars_business``,
        ``useful``, ``funny``, ``cool`` and ``text`` (in this notebook, a
        DataFrame row).

    Returns
    -------
    str
        ``"Metadata: <stats>; Review: <text>"`` where ``<stats>`` lists the
        five metadata values in the order given above.
    """
    # Adjacent f-string pieces are concatenated into one metadata segment.
    metadata = (
        f"Stars for the review = {row['stars_reviews']}, "
        f"Stars for the business: {row['stars_business']}, "
        f"Usefulness = {row['useful']}, "
        f"Funny: {row['funny']}, "
        f"Cool: {row['cool']}"
    )
    return f"Metadata: {metadata}; Review: {row['text']}"

# Format a small sample only: positional rows 1 through 9 (nine rows; the
# row at position 0 is not part of the slice).
# NOTE(review): confirm that skipping the first row is intended.
formatted_text = []
for _, review_row in df.iloc[1:10].iterrows():
    formatted_text.append(format_data(review_row))

In [95]:
# Load the pretrained GPT-2 tokenizer and the matching TensorFlow
# language-model head from the same checkpoint name.
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = TFGPT2LMHeadModel.from_pretrained(model_name)

# Encode each formatted review on its own (no batching), returning
# TensorFlow tensors and cutting anything beyond 512 tokens.
encoded_texts = []
for text in formatted_text:
    encoded_texts.append(
        tokenizer(text, return_tensors='tf', max_length=512, truncation=True)
    )

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


{'input_ids': <tf.Tensor: shape=(1, 83), dtype=int32, numpy=
array([[ 9171, 14706,    25, 10271,   329,   262,  2423,   796,   604,
           13,    15,    11, 10271,   329,   262,  1597,    25,   513,
           13,    15,    11,  5765, 15538,   796,   362,    11, 40473,
           25,   657,    11, 15226,    25,   352,    11,  6602,    25,
          383,  1295,   318, 13779,   290,   262,  3085,   373,   845,
         8030,    13, 18460,  6859,    13,  4599,   329, 50219,    13,
          775,   550,  9965,   290,   547, 21639,   826,  1497,    13,
          314,  1107,  8359,   262, 40377,   220, 27805,   290,  1735,
          286, 21385,    13, 18460,   284,   423,  1194, 50219,  1295,
         6716,    13]])>, 'attention_mask': <tf.Tensor: shape=(1, 83), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,