In [10]:
import pandas as pd
import random
import json
random.seed(8888)

In [11]:
movie_data = pd.read_csv('../data/movie_data.csv')
movie_data.dropna(subset=['year_released'], inplace=True)
drop_cols = ['image_url', 'imdb_id', 'imdb_link', 'tmdb_id', 'tmdb_link']
movie_data['year_released'] = movie_data['year_released'].astype('Int16')
movie_data['runtime'] = movie_data['runtime'].astype('Int16', errors='ignore')
movie_data.set_index('movie_id', inplace=True)

In [12]:
movie_data.head()

Unnamed: 0_level_0,_id,genres,image_url,imdb_id,imdb_link,movie_title,original_language,overview,popularity,production_countries,release_date,runtime,spoken_languages,tmdb_id,tmdb_link,vote_average,vote_count,year_released
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
football-freaks,5fc85f606758f69634496fd3,"[""Music"",""Animation""]",film-poster/4/6/4/4/4/0/464440-football-freaks...,,,Football Freaks,en,"Football crazy, football mad. Don’t watch this...",0.6,"[""United Kingdom""]",1971-12-05,0.0,[],535272.0,https://www.themoviedb.org/movie/535272/,0.0,0.0,1971
aftermath-1960,5fc85ff26758f696344ace0c,[],film-poster/2/4/5/5/0/0/245500-aftermath-0-230...,tt0586129,http://www.imdb.com/title/tt0586129/maindetails,Aftermath,en,Aftermath was the pilot for an unsold TV serie...,0.6,[],1960-04-17,22.0,[],318331.0,https://www.themoviedb.org/movie/318331/,8.0,1.0,1960
where-chimneys-are-seen,5fc85f606758f69634496fcd,"[""Drama""]",film-poster/9/3/3/1/8/93318-where-chimneys-are...,tt0045731,http://www.imdb.com/title/tt0045731/maindetails,Where Chimneys Are Seen,ja,Gosho’s most celebrated film both in Japan and...,1.568,"[""Japan""]",1953-03-05,108.0,"[""日本語""]",117779.0,https://www.themoviedb.org/movie/117779/,6.6,10.0,1953
the-musicians-daughter,5fc85f606758f69634496fd1,"[""Drama""]",,tt0187327,http://www.imdb.com/title/tt0187327/maindetails,The Musician's Daughter,en,Carl Wagner's good wife was dying. His heart b...,0.6,"[""United States of America""]",1911-12-12,15.0,[],560377.0,https://www.themoviedb.org/movie/560377/,0.0,0.0,1911
50-years-of-fabulous,5fc85f606758f69634496fd4,"[""Documentary""]",film-poster/4/5/4/6/0/3/454603-50-years-of-fab...,tt4769914,http://www.imdb.com/title/tt4769914/maindetails,50 Years of Fabulous,en,50 Years of Fabulous recounts the rich history...,0.6,[],2018-05-17,75.0,[],525187.0,https://www.themoviedb.org/movie/525187/,0.0,0.0,2018


In [13]:
def format_movie_data(movie_data):
    """
    Converts a dictionary of movie data into a string format suitable for LLM fine-tuning.
    
    Args:
        movie_data (dict): A dictionary containing movie attributes such as title, language,
                           overview, and other metadata.
    
    Returns:
        str: A formatted string representation of the movie data.
    """
    # Define the order and the keys to include in the string
    keys_order = [
        "movie_title", "rating_val", "genres", "year_released", "popularity", "vote_average", "vote_count", "runtime", "production_countries", "original_language", "spoken_languages", "overview"
    ]
    
    # Building the string with key-value pairs
    formatted_string = " | ".join(f"{key}: {movie_data.get(key, 'N/A')}" for key in keys_order if key in movie_data)
    
    return formatted_string

In [14]:
import numpy as np

nan2list = lambda x: x if type(x) is str else '[]'

def format_movie_data_v2(movie_data): # , user_rating, user_review):
    release_yr = movie_data.get("year_released", None)
    title_fmt = movie_data.get("movie_title", "N/A") + (f" ({release_yr})" if release_yr else "")
    genre_fmt = ' and '.join([x.lower() for x in json.loads(nan2list(movie_data.get('genres', "[]")))])
    runtime = movie_data.get('runtime', None)
    if runtime is not None and not np.isnan(runtime):
        hours = int(runtime // 60)
        minutes = int(runtime % 60)
        runtime_fmt = f"{hours}h {minutes}m"
    else:
        runtime_fmt = "N/A"
    
    avg_rating = movie_data.get('vote_average')
    votes = movie_data.get('vote_count')
    if votes is not None and not np.isnan(votes) and votes > 0:
        votes = int(votes)
        avg_rating_fmt = f"{avg_rating:.2f} ({votes} vote(s))"
    else:
        avg_rating_fmt = "N/A"
    
    production_countries_fmt = ' and '.join(json.loads(nan2list(movie_data.get('production_countries', "[]")))) or 'N/A'
    languages_fmt = ' and '.join(json.loads(nan2list(movie_data.get('spoken_languages', "[]")))) or 'N/A'
    overview = movie_data.get("overview", "N/A")
    
    return f"""
Title: {title_fmt}
Genres: {genre_fmt}
Runtime: {runtime_fmt}
Average rating: {avg_rating_fmt}
Production countries: {production_countries_fmt}
Languages: {languages_fmt}
Overview: {overview}
""".strip()
        

In [15]:
print("Stringification V1")
print(format_movie_data(movie_data.iloc[100]))
print()
print("Stringification V2")
print(format_movie_data_v2(movie_data.iloc[100]))

Stringification V1
movie_title: Pelican Blood | genres: ["Drama"] | year_released: 2019 | popularity: 1.256 | vote_average: 6.3 | vote_count: 9.0 | runtime: 121.0 | production_countries: ["Bulgaria","Germany"] | original_language: de | spoken_languages: ["Magyar","Deutsch"] | overview: The horse trainer Wiebke adopts Raya from abroad, but soon she has to learn that the girl suffers from an attachment disorder and does not build an emotional connection to anybody around her. Raya constantly puts others in great danger, especially her older adoptive sister Nicolina. After a neurologist explains that Raya will have life-long troubles and does not feel any empathy, Wiebke has to decide whether she is willing to keep her and risk Nicolina’s well-being.

Stringification V2
Title: Pelican Blood (2019)
Genres: drama
Runtime: 2h 1m
Average rating: 6.30 (9 vote(s))
Production countries: Bulgaria and Germany
Languages: Magyar and Deutsch
Overview: The horse trainer Wiebke adopts Raya from abroad,

# user samples

In [16]:
ratings = pd.read_csv("../data/ratings_export.csv")
ratings.head()


Unnamed: 0,_id,movie_id,rating_val,user_id
0,5fc57c5d6758f6963451a07f,feast-2014,7,deathproof
1,5fc57c5d6758f6963451a063,loving-2016,7,deathproof
2,5fc57c5d6758f6963451a0ef,scripted-content,7,deathproof
3,5fc57c5d6758f6963451a060,the-future,4,deathproof
4,5fc57c5c6758f69634519398,mank,5,deathproof


## Fine-tuning GPT2 model

We will stream this dataset, because it takes a while to precompute it. We would rather like to see the language model's progress as it trains.


In [19]:
import torch
import torch.utils.data
import tqdm

class MovieRatingDataset(torch.utils.data.Dataset):
    def __init__(self, ratings, n_context_movies: int):
        self.n_context_movies = n_context_movies
        
        user_counts = ratings.groupby('user_id').size().sort_values(ascending=False)
        user_ids = user_counts[user_counts > 10].index

        # Filter the ratings DataFrame to only include these users
        ratings = ratings[ratings['user_id'].isin(user_ids)]
        
        # Filter to movie ids that are in the movie index
        ratings = ratings[ratings['movie_id'].isin(movie_data.index)]
        
        # Generate rating spans
        rating_spans = []
        self.ratings_per_user = {}
        
        with tqdm.tqdm(user_ids, desc='Generating rating spans...') as pbar:
            grouped = ratings.groupby('user_id')
            for user_id in pbar:
                user_ratings = grouped.get_group(user_id)
                n_ratings = len(user_ratings)
                for start_i in range(0, max(1, n_ratings - n_context_movies)):
                    rating_spans.append((user_id, start_i, min(n_ratings, start_i + n_context_movies)))

                self.ratings_per_user[user_id] = user_ratings
                
        self.rating_spans = rating_spans
        
    def __len__(self):
        return len(self.rating_spans)
    
    def __getitem__(self, index: int):
        (user_id, start_index, end_index) = self.rating_spans[index]
        
        ratings = self.ratings_per_user[user_id].iloc[start_index:end_index]
        
        # Construct target string.
        target = ""
        for i, rating in ratings.iterrows():
            target += '---\n' + format_movie_data_v2(movie_data.loc[rating['movie_id']]) + f"\n---\nRating: {rating['rating_val']} / 10\n"
            
        return target


In [20]:
dataset = MovieRatingDataset(ratings, n_context_movies=10)

Generating rating spans...: 100%|██████████| 7366/7366 [00:09<00:00, 745.99it/s] 


In [21]:
print(dataset[100000])

---
Title: The Concert (1962)
Genres: music and animation
Runtime: 0h 7m
Average rating: 5.10 (20 vote(s))
Production countries: France
Languages: N/A
Overview: A short film by Walerian Borowczyk. Ostensibly a film of a concert given by the round, unassuming Monsieur Kabal and his spiky, terrifying wife, it's actually a cover for their frequent attempts at causing each other extreme physical harm.
---
Rating: 3 / 10
---
Title: Regular Show: The Movie (2015)
Genres: animation and comedy and science fiction and tv movie
Runtime: 1h 10m
Average rating: 7.80 (247 vote(s))
Production countries: United States of America
Languages: English
Overview: To save the universe, and their friendship, Mordecai and Rigby must defeat an evil volleyball coach.
---
Rating: 4 / 10
---
Title: Yevade Subramanyam (2015)
Genres: adventure and comedy and drama
Runtime: 2h 40m
Average rating: 7.40 (10 vote(s))
Production countries: N/A
Languages: తెలుగు
Overview: A corporate man who sets out on a journey of self

In [22]:
import transformers

In [None]:
import os
os.environ['HUGGINGFACE_HUB_CACHE'] = '/scratch/gsk6me/huggingface_cache'

In [8]:
gpt2 = transformers.GPT2LMHeadModel.from_pretrained("openai-community/gpt2").to("cuda")
gpt2_tokenizer = transformers.GPT2Tokenizer.from_pretrained("openai-community/gpt2")

In [26]:
import torch

optimizer = torch.optim.Adam(gpt2.parameters(), lr=1e-4)

order = torch.randperm(len(dataset))
for step in range(100000):
    index = order[step]
    string = dataset[index]
    
    # Train to maximize the ll of this string.
    tokenization = gpt2_tokenizer(string, return_tensors='pt')['input_ids'].cuda()
    output = gpt2(tokenization)
    print(output)
    break


/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [148,0,0], thread: [96,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [148,0,0], thread: [97,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [148,0,0], thread: [98,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [148,0,0], thread: [99,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [148,0,0], thread: [100,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/con

RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

In [None]:
import tqdm

def construct_movie_string_dictionary():
    results = {}
    with tqdm.tqdm(list(movie_data.index), desc='Serializing movie data...') as pbar:
        for movie_id in pbar:
            results[movie_id] = format_movie_data_v2(movie_data.loc[movie_id])
    return results

movie_string_dictionary = construct_movie_string_dictionary()


In [None]:
# construct some strings for each of these users
def construct_ratings_set():
    user_ids = []
    rating_strings = []

    for user_id in tqdm.tqdm(users_filtered, desc='Creating dataset...'):
        user_ratings = filtered_ratings[filtered_ratings['user_id'] == user_id]

        # construct strings from each of these users
        user_rating_strings = []
        for rating_id, (_id, movie_id, rating_val, _user_id) in user_ratings.iterrows():
            if movie_id not in movie_data.index:
                continue
            movie_data = movie_data.loc[movie_id]
            user_rating_strings.append(format_movie_data_v2(movie_data, rating_val, "N/A"))
        user_ids.append(user_id)
        rating_strings.append(user_rating_strings)


In [None]:
all_user_ids = filtered_ratings['user_id'].unique() 
random.shuffle(all_user_ids)
train_cutoff = int(len(all_user_ids)*4/5)
train_movie_ids = all_user_ids[:train_cutoff]
test_movie_ids = all_user_ids[train_cutoff:]

In [20]:
train_movie_ids[0]

'rtown004'

In [37]:
training_samples = []
system_prompt = "A movie recommendation system which takes in a list of movies and outputs the title of the next movie to watch. Only output the title of the movie."
user_history = "I have watched the following movies:\n"

for user in train_movie_ids[:1]:
    rating_history = filtered_ratings[filtered_ratings['user_id'] == user]
    combined = rating_history.merge(movie_data, on='movie_id')
    combined['movie_string'] = combined.apply(format_movie_data, axis=1)

    # split data into groups of 11 where 10 are the user's history and the last is the next movie to watch
    for i in range(0, len(combined), 10):
        if i + 10 >= len(combined):
            break
        watch_history = user_history + "\n".join(combined['movie_string'][i:i+10])
        next_movie = combined['movie_title'][i+10]
        sample = {
            "messages": 
                [
                    {"role": "system", "content": f"{system_prompt}"}, 
                    {"role": "user", "content": f"{watch_history}"}, 
                    {"role": "assistant", "content": f"{next_movie}"}
                ]
        }
        training_samples.append(sample)

with open('training_samples.jsonl', 'w') as file:
    for sample in training_samples:
        json_line = json.dumps(sample)
        file.write(json_line + '\n')

In [None]:
client = OpenAI()
job_id = ""
client.fine_tuning.jobs.retrieve(f"{job_id}")

In [None]:
# client = OpenAI()
# completion = client.chat.completions.create(
#   model="ft:gpt-3.5-turbo:my-org:custom_suffix:id",
#   messages=[
#     {"role": "system", "content": "You are a helpful assistant."},
#     {"role": "user", "content": "Hello!"}
#   ]
# )
# print(completion.choices[0].message)