In [1]:
import numpy as np
import pandas as pd
import torch

In [17]:
# Load the complete McDonald's reviews CSV into a DataFrame and preview the first rows.
# NOTE(review): the vocabulary printed later in this notebook contains "ï", "¿" and "½",
# which is what a UTF-8 replacement character looks like when its bytes are decoded as
# ISO-8859-1 — confirm that this encoding is really the right one for the file.
df = pd.read_csv(
    "./data/McDonald_s_Reviews.csv",
    encoding="ISO-8859-1",
)
df.head(3)

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating
0,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star
1,2,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars
2,3,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,Made a mobile order got to the speaker and che...,1 star


In [23]:
# Pull out the review text and join every entry into one newline-separated string
review_series = df["review"]
reviews = review_series.str.cat(sep="\n")
print(f"There are {len(reviews):,} characters in the dataset.")

There are 4,238,053 characters in the dataset.


In [27]:
# Collect the distinct characters that occur in the corpus, in sorted order;
# the number of distinct characters is the vocabulary size
chars = sorted(set(reviews))
vocab_size = len(chars)
print("".join(chars))
print(f"Vocabulary Size: {vocab_size}")


 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_abcdefghijklmnopqrstuvwxyz{}~½¿ïý
Vocabulary Size: 98


In [28]:
# Tokenize at the character level: every distinct character gets an integer id
# equal to its position in the sorted vocabulary `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))  # integer id -> character


def encode(s):
    """Encode a string as a list of integer token ids.

    Raises KeyError if `s` contains a character that is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decode an iterable of integer token ids back into a string.

    Raises KeyError if an id is not in the vocabulary.
    """
    return "".join(itos[i] for i in l)


# Round-trip sanity check: decoding an encoded string gives back the original text
print(encode("Test"))
print(decode(encode("Test")))

[53, 69, 83, 84]
Test


In [38]:
# Tokenize, or encode, the entire dataset and store the token ids in a 1-D tensor
data = torch.tensor(encode(reviews), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:10])  # Input will look like this to GPT

torch.Size([4238053]) torch.int64
tensor([56, 72, 89,  1, 68, 79, 69, 83,  1, 73])


In [None]:
# Use the first 90% of the token stream for training and hold out the rest for validation
split = int(len(data) * 0.9)
train_data, val_data = data[:split], data[split:]