In [1]:
import pandas as pd
import numpy as np
import ast
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# import warnings # For handling error messages.
# warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
movie_data = pd.read_csv('data/cleaned_data.csv')
movie_data = movie_data.drop(columns=['Unnamed: 0', 'id'])

In [3]:
movie_data.head()

Unnamed: 0,title,vote_average,year,month,genre_ids,budget,revenue,runtime,director_id
0,Psycho,8.434,1960,6,"[27, 53, 9648]",806947.0,32000000.0,109,2636
1,Spartacus,7.5,1960,10,"[36, 10752, 18, 12]",12000000.0,60000000.0,197,240
2,The Magnificent Seven,7.5,1960,10,"[37, 28, 12]",2000000.0,4905000.0,127,14520
3,The Apartment,8.214,1960,6,"[35, 18, 10749]",3000000.0,25000000.0,125,3146
4,La Dolce Vita,8.121,1960,2,"[35, 18]",1403473.5,19647000.0,176,4415


In [4]:
# Convert string representations of lists to actual lists of integers
movie_data['genre_ids'] = movie_data['genre_ids'].apply(lambda x: ast.literal_eval(x))

# Flatten the list of lists into a single list
flattened_list = [number for sublist in movie_data['genre_ids'] for number in sublist]

# Convert the flattened list into a set to find unique numbers
unique_genres = set(flattened_list)

# Count the number of unique elements
unique_count = len(unique_genres)

print(f"Number of unique numbers in the 'genre' column: {unique_count}")

Number of unique numbers in the 'genre' column: 19


In [5]:
genre_dict = {'Action':28, 'Adventure':12, 'Animation':16, 'Comedy':35, 'Crime':80, 'Documentary':99,
              'Drama':18, 'Family':10751, 'Fantasy':14, 'History':36, 'Horror':27, 'Music':10402,
              'Mystery':9648, 'Romance':10749, 'Science_Fiction':878, 'Thriller':53,
              'War':10752, 'Western':37}

In [6]:
id_to_genre = {v: k for k, v in genre_dict.items()}

# Add new columns for each genre, initializing with 0
for genre in genre_dict.keys():
    movie_data[genre] = 0

# Iterate through the dataframe and set genre columns
for index, row in movie_data.iterrows():
    for genre_id in row['genre_ids']:
        genre_name = id_to_genre.get(genre_id)
        if genre_name:
            movie_data.at[index, genre_name] = 1

movie_data.drop(columns='genre_ids', inplace=True)

In [7]:
X = movie_data[['budget', 'revenue', 'runtime']]
movie_data[['budget', 'revenue', 'runtime']] = StandardScaler().fit(X).transform(X)

In [8]:
director_counts = movie_data.director_id.value_counts()
threshold = 10
filtered_directors = director_counts[director_counts > threshold].index

movie_data['filtered_director_id'] = movie_data['director_id'].apply(
    lambda x: x if x in filtered_directors else 0)

one_hot_encoded_directors = pd.get_dummies(movie_data['filtered_director_id'], prefix='director', dtype=int)
movie_data = movie_data.drop(columns=['director_id', 'filtered_director_id'])
movie_data = pd.concat([movie_data, one_hot_encoded_directors], axis=1)
movie_data

Unnamed: 0,title,vote_average,year,month,budget,revenue,runtime,Action,Adventure,Animation,...,director_12995,director_15217,director_16938,director_17494,director_17825,director_18878,director_19303,director_20907,director_36602,director_90367
0,Psycho,8.434,1960,6,-0.841548,-0.388650,0.012591,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Spartacus,7.500,1960,10,-0.578510,-0.236352,4.473263,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,The Magnificent Seven,7.500,1960,10,-0.813511,-0.536026,0.925002,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,The Apartment,8.214,1960,6,-0.790011,-0.426725,0.823623,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,La Dolce Vita,8.121,1960,2,-0.827530,-0.455841,3.408785,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6416,There's Still Tomorrow,8.295,2023,10,-0.649010,-0.351526,0.468797,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6417,Dune: Part Two,8.549,2024,2,3.604516,0.405475,2.901890,1,1,0,...,0,0,0,0,0,0,0,0,0,0
6418,Badland Hunters,6.800,2024,1,1.783255,0.328361,-0.088787,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6419,The Beekeeper,7.400,2024,1,-0.038006,0.251246,-0.190166,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
movie_data.columns

Index(['title', 'vote_average', 'year', 'month', 'budget', 'revenue',
       'runtime', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror',
       'Music', 'Mystery', 'Romance', 'Science_Fiction', 'Thriller', 'War',
       'Western', 'director_0', 'director_24', 'director_59', 'director_108',
       'director_138', 'director_190', 'director_224', 'director_280',
       'director_309', 'director_366', 'director_488', 'director_510',
       'director_525', 'director_564', 'director_578', 'director_608',
       'director_865', 'director_893', 'director_956', 'director_1032',
       'director_1150', 'director_1152', 'director_1223', 'director_1243',
       'director_1776', 'director_1884', 'director_2034', 'director_2127',
       'director_2294', 'director_3556', 'director_4014', 'director_5140',
       'director_5306', 'director_5572', 'director_5655', 'director_6046',
       'director_6159', 'director_7187', 'dir

In [9]:
import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [12]:
X = movie_data.drop(columns=['title', 'vote_average'])
y = movie_data['vote_average']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)