In [1]:
import pandas as pd
import random
import json
from openai import OpenAI
random.seed(8888)

In [2]:
# Columns that are only links / external identifiers and are dropped below.
drop_cols = ['image_url', 'imdb_id', 'imdb_link', 'tmdb_id', 'tmdb_link']

# Load the movie metadata, discard rows without a release year, and remove the
# columns listed in drop_cols. Chained so that re-running the cell always
# starts from the CSV instead of mutating an existing frame in place.
movie_strings = (
    pd.read_csv('../data/movie_data.csv')
    .dropna(subset=['year_released'])
    .drop(columns=drop_cols)
)

# Store the release year with pandas' nullable Int16 dtype.
movie_strings['year_released'] = movie_strings['year_released'].astype('Int16')

# errors='ignore' returns the column unchanged if the cast cannot be performed.
# NOTE(review): runtime is still displayed as a float (e.g. 121.0) in the
# outputs further down, which suggests this cast is silently not applied -- confirm.
movie_strings['runtime'] = movie_strings['runtime'].astype('Int16', errors='ignore')

In [3]:
# Preview the first rows of movie_strings.
movie_strings.head()

Unnamed: 0,_id,genres,movie_id,movie_title,original_language,overview,popularity,production_countries,release_date,runtime,spoken_languages,vote_average,vote_count,year_released
0,5fc85f606758f69634496fd3,"[""Music"",""Animation""]",football-freaks,Football Freaks,en,"Football crazy, football mad. Don’t watch this...",0.6,"[""United Kingdom""]",12/5/1971,0.0,[],0.0,0.0,1971
1,5fc85ff26758f696344ace0c,[],aftermath-1960,Aftermath,en,Aftermath was the pilot for an unsold TV serie...,0.6,[],4/17/1960,22.0,[],8.0,1.0,1960
2,5fc85f606758f69634496fcd,"[""Drama""]",where-chimneys-are-seen,Where Chimneys Are Seen,ja,Gosho’s most celebrated film both in Japan and...,1.568,"[""Japan""]",3/5/1953,108.0,"[""日本語""]",6.6,10.0,1953
3,5fc85f606758f69634496fd1,"[""Drama""]",the-musicians-daughter,The Musician's Daughter,en,Carl Wagner's good wife was dying. His heart b...,0.6,"[""United States of America""]",12/12/1911,15.0,[],0.0,0.0,1911
4,5fc85f606758f69634496fd4,"[""Documentary""]",50-years-of-fabulous,50 Years of Fabulous,en,50 Years of Fabulous recounts the rich history...,0.6,[],5/17/2018,75.0,[],0.0,0.0,2018


In [4]:
# Take a single movie (positional row 100, an arbitrary pick) as a sample
# record; it is passed to format_movie_data in a later cell.
row = movie_strings.iloc[100]
row

_id                                              5fc85ff26758f696344ad095
genres                                                          ["Drama"]
movie_id                                               pelican-blood-2019
movie_title                                                 Pelican Blood
original_language                                                      de
overview                The horse trainer Wiebke adopts Raya from abro...
popularity                                                          1.256
production_countries                               ["Bulgaria","Germany"]
release_date                                                    9/24/2020
runtime                                                             121.0
spoken_languages                                     ["Magyar","Deutsch"]
vote_average                                                          6.3
vote_count                                                            9.0
year_released                         

In [30]:
def format_movie_data(movie_data):
    """
    Render selected movie attributes as one " | "-separated string for LLM fine-tuning.

    Each included attribute is written as ``key: value``, in a fixed order.
    Keys that are absent from ``movie_data`` are skipped entirely (so a plain
    movie row without ``rating_val`` simply omits that field). Keys that are
    present but hold a missing scalar (``None``, ``NaN``, ``pd.NA``) are
    rendered as ``N/A`` instead of leaking ``nan``/``None`` into the text.

    Args:
        movie_data (dict or pandas.Series): Mapping from attribute name to
            value. It must support ``key in movie_data`` and ``.get(key)``.

    Returns:
        str: The formatted string; empty if none of the known keys are present.
    """
    # Order in which attributes appear in the output; other keys are ignored.
    keys_order = [
        "movie_title", "rating_val", "genres", "year_released", "popularity", "vote_average", "vote_count", "runtime", "production_countries", "original_language", "spoken_languages", "overview"
    ]

    def _render(value):
        # Only scalars are tested: pd.isna() on a list/array returns an array,
        # which has no single truth value.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return 'N/A'
        return value

    # Building the string with key-value pairs
    formatted_string = " | ".join(
        f"{key}: {_render(movie_data.get(key))}" for key in keys_order if key in movie_data
    )

    return formatted_string

In [6]:
# Check the formatter on the sample row selected above.
format_movie_data(row)

'movie_title: Pelican Blood | genres: ["Drama"] | year_released: 2019 | popularity: 1.256 | vote_average: 6.3 | vote_count: 9.0 | runtime: 121.0 | production_countries: ["Bulgaria","Germany"] | original_language: de | spoken_languages: ["Magyar","Deutsch"] | overview: The horse trainer Wiebke adopts Raya from abroad, but soon she has to learn that the girl suffers from an attachment disorder and does not build an emotional connection to anybody around her. Raya constantly puts others in great danger, especially her older adoptive sister Nicolina. After a neurologist explains that Raya will have life-long troubles and does not feel any empathy, Wiebke has to decide whether she is willing to keep her and risk Nicolina’s well-being.'

# user samples

In [12]:
# Load the exported ratings (one row per user/movie rating) and preview them.
ratings = pd.read_csv("../data/ratings_export.csv")
ratings.head()


Unnamed: 0,_id,movie_id,rating_val,user_id
0,5fc57c5d6758f6963451a07f,feast-2014,7,deathproof
1,5fc57c5d6758f6963451a063,loving-2016,7,deathproof
2,5fc57c5d6758f6963451a0ef,scripted-content,7,deathproof
3,5fc57c5d6758f6963451a060,the-future,4,deathproof
4,5fc57c5c6758f69634519398,mank,5,deathproof


In [19]:
# Number of ratings per user, most active users first.
user_counts = (
    ratings
    .groupby('user_id')
    .size()
    .sort_values(ascending=False)
)

# Retain only users who have rated more than 200 movies.
users_filtered = user_counts.loc[user_counts.gt(200)].index
print(len(users_filtered))

# Keep just the rating rows that belong to the retained users.
filtered_ratings = ratings.loc[ratings['user_id'].isin(users_filtered)]


6378


In [None]:
# Split at the user level: every retained user goes to exactly one of the two sets.
all_user_ids = filtered_ratings['user_id'].unique()

# Shuffles in place using the `random` module, which is seeded in the import cell.
random.shuffle(all_user_ids)

# First 4/5 of the shuffled users are used for training, the rest for testing.
# Integer arithmetic avoids a float round-trip.
train_cutoff = len(all_user_ids) * 4 // 5
train_user_ids = all_user_ids[:train_cutoff]
test_user_ids = all_user_ids[train_cutoff:]

# Backward-compatible aliases. Despite their names these hold *user* IDs, not
# movie IDs; they are kept because other cells refer to them.
train_movie_ids = train_user_ids
test_movie_ids = test_user_ids

In [20]:
# Peek at the first entry of the training split (a user ID, despite the variable name).
train_movie_ids[0]

'rtown004'

In [37]:
training_samples = []
system_prompt = "A movie recommendation system which takes in a list of movies and outputs the title of the next movie to watch. Only output the title of the movie."
user_history = "I have watched the following movies:\n"
history_size = 10  # number of movies given as context before the movie to predict

# NOTE(review): the [:1] slice means only the first training user is processed --
# confirm whether that is intentional before using the output file for fine-tuning.
for user in train_movie_ids[:1]:
    # All ratings of this user, joined with the movie metadata.
    rating_history = filtered_ratings[filtered_ratings['user_id'] == user]
    combined = rating_history.merge(movie_strings, on='movie_id')
    combined['movie_string'] = combined.apply(format_movie_data, axis=1)

    movie_lines = combined['movie_string']
    movie_titles = combined['movie_title']

    # A window starts every `history_size` rows and needs one further row as
    # its target. Consecutive windows therefore overlap by one row: the target
    # of one sample is the first history entry of the next.
    for start in range(0, len(combined) - history_size, history_size):
        target_pos = start + history_size
        watch_history = user_history + "\n".join(movie_lines.iloc[start:target_pos])
        next_movie = movie_titles.iloc[target_pos]
        sample = {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": watch_history},
                {"role": "assistant", "content": str(next_movie)},
            ]
        }
        training_samples.append(sample)

# Write one JSON object per line (JSONL).
with open('training_samples.jsonl', 'w') as file:
    file.writelines(json.dumps(record) + '\n' for record in training_samples)

In [None]:
# Look up a fine-tuning job by its ID.
# NOTE(review): job_id is blank here -- it has to be filled in before this cell is run.
job_id = ""
client = OpenAI()
client.fine_tuning.jobs.retrieve(job_id)

In [None]:
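# Disabled example: request a chat completion from a fine-tuned model.
# (The model id below looks like a placeholder -- replace it before enabling.)
# client = OpenAI()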
# completion = client.chat.completions.create(
#   model="ft:gpt-3.5-turbo:my-org:custom_suffix:id",
#   messages=[
#     {"role": "system", "content": "You are a helpful assistant."},
#     {"role": "user", "content": "Hello!"}
#   ]
# )
# print(completion.choices[0].message)