In [1]:
import pandas as pd
import numpy as np
import ast
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# import warnings # For handling error messages.
# warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
# Read the cleaned dataset and, in the same expression, drop the
# 'Unnamed: 0' and 'id' columns, which none of the cells below use.
# NOTE(review): 'Unnamed: 0' is presumably a row index saved into the CSV - confirm.
movie_data = pd.read_csv('data/cleaned_data.csv').drop(columns=['Unnamed: 0', 'id'])

In [3]:
# Preview the first rows of the loaded frame to check the remaining columns.
movie_data.head()

Unnamed: 0,title,vote_average,year,month,genre_ids,budget,revenue,runtime,director_id
0,Psycho,8.434,1960,6,"[27, 53, 9648]",806947.0,32000000.0,109,2636
1,Spartacus,7.5,1960,10,"[36, 10752, 18, 12]",12000000.0,60000000.0,197,240
2,The Magnificent Seven,7.5,1960,10,"[37, 28, 12]",2000000.0,4905000.0,127,14520
3,The Apartment,8.214,1960,6,"[35, 18, 10749]",3000000.0,25000000.0,125,3146
4,La Dolce Vita,8.121,1960,2,"[35, 18]",1403473.5,19647000.0,176,4415


In [4]:
# Convert string representations of lists (e.g. "[27, 53, 9648]") to actual
# lists of integers. Values that are no longer strings are passed through
# unchanged so the cell can be re-run safely: ast.literal_eval rejects a value
# that is already a list with a ValueError.
movie_data['genre_ids'] = movie_data['genre_ids'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# Flatten the per-movie lists into a single list of genre ids
flattened_list = [genre_id for id_list in movie_data['genre_ids'] for genre_id in id_list]

# Convert the flattened list into a set to find the distinct genre ids
unique_genres = set(flattened_list)

# Count the number of distinct genre ids
unique_count = len(unique_genres)

print(f"Number of unique numbers in the 'genre' column: {unique_count}")

Number of unique numbers in the 'genre' column: 19


In [5]:
# Genre name -> numeric genre id. The insertion order is significant: the
# one-hot cell below creates the indicator columns in exactly this order.
# NOTE(review): the previous cell reports 19 distinct genre ids, but only 18
# are mapped here, so one id never gets a column. It looks like the TMDB
# "TV Movie" id (10770) is the one left out - confirm whether that is intended.
genre_dict = {
    'Action': 28,
    'Adventure': 12,
    'Animation': 16,
    'Comedy': 35,
    'Crime': 80,
    'Documentary': 99,
    'Drama': 18,
    'Family': 10751,
    'Fantasy': 14,
    'History': 36,
    'Horror': 27,
    'Music': 10402,
    'Mystery': 9648,
    'Romance': 10749,
    'Science_Fiction': 878,
    'Thriller': 53,
    'War': 10752,
    'Western': 37,
}

In [6]:
# Reverse lookup: numeric genre id -> genre name
id_to_genre = {v: k for k, v in genre_dict.items()}

# One-hot encode the genres: one 0/1 integer column per entry of genre_dict,
# created in genre_dict order. The guard makes the cell safe to re-run after
# 'genre_ids' has already been expanded and dropped.
if 'genre_ids' in movie_data.columns:
    genre_lists = movie_data['genre_ids']

    # Build each indicator column in a single pass over the lists instead of
    # writing individual cells row by row with iterrows()/.at.
    for genre_name, genre_id in genre_dict.items():
        movie_data[genre_name] = [int(genre_id in ids) for ids in genre_lists]

    # Ids that have no entry in genre_dict get no indicator column; report
    # them instead of discarding them silently.
    unmapped_genre_ids = {gid for ids in genre_lists for gid in ids} - set(id_to_genre)
    if unmapped_genre_ids:
        print(f"Genre ids without an indicator column (ignored): {sorted(unmapped_genre_ids)}")

    # The raw list column is no longer needed once the indicators exist
    movie_data = movie_data.drop(columns='genre_ids')

In [7]:
# Hash-encode 'director_id' into a fixed number of bucket columns. Only the
# column listed in `cols` is encoded; the cell that renames the output columns
# expects the same bucket count.
encoder = ce.HashingEncoder(
    cols=['director_id'],
    n_components=32,  # n_components is the number of hash buckets
)
df_hashed = encoder.fit_transform(movie_data)

In [8]:
# Columns that were not hash-encoded, in their original order
non_hashed_columns = list(movie_data.columns.drop('director_id'))

# Derive the number of hash buckets from the encoded frame instead of repeating
# the literal used when the encoder was built, so the two cannot drift apart.
n_hash_buckets = df_hashed.shape[1] - len(non_hashed_columns)

# Generate descriptive column names
hashed_column_names = [f'director_hash_{i}' for i in range(n_hash_buckets)]

# Replace the generic column names with descriptive ones.
# The rename is positional: it assumes the encoder output lists the hash-bucket
# columns first, followed by the untouched columns in their original order.
# It is skipped when the descriptive names are already present, because
# re-applying it to the reordered frame would silently mislabel every column.
if not set(hashed_column_names).issubset(df_hashed.columns):
    df_hashed.columns = hashed_column_names + non_hashed_columns

# Reorder the columns to place hashed columns at the end
df_hashed = df_hashed[non_hashed_columns + hashed_column_names]

In [9]:
# Show the column labels to confirm the renamed hash columns come last.
df_hashed.columns

Index(['title', 'vote_average', 'year', 'month', 'budget', 'revenue',
       'runtime', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror',
       'Music', 'Mystery', 'Romance', 'Science_Fiction', 'Thriller', 'War',
       'Western', 'director_hash_0', 'director_hash_1', 'director_hash_2',
       'director_hash_3', 'director_hash_4', 'director_hash_5',
       'director_hash_6', 'director_hash_7', 'director_hash_8',
       'director_hash_9', 'director_hash_10', 'director_hash_11',
       'director_hash_12', 'director_hash_13', 'director_hash_14',
       'director_hash_15', 'director_hash_16', 'director_hash_17',
       'director_hash_18', 'director_hash_19', 'director_hash_20',
       'director_hash_21', 'director_hash_22', 'director_hash_23',
       'director_hash_24', 'director_hash_25', 'director_hash_26',
       'director_hash_27', 'director_hash_28', 'director_hash_29',
       'director_hash_30', 'director_

In [10]:
# Feature scaling, written back into df_hashed:
#   - budget / revenue / runtime are standardized (StandardScaler)
#   - year / month are rescaled with MinMaxScaler
# NOTE(review): both scalers are fitted on the whole frame and then discarded;
# if a train/test split happens later, fit them on the training part only and
# keep the fitted objects so new data can be transformed the same way - confirm.
standardized_cols = ['budget', 'revenue', 'runtime']
minmax_cols = ['year', 'month']

X = df_hashed[standardized_cols]
df_hashed[standardized_cols] = StandardScaler().fit_transform(X)
df_hashed[minmax_cols] = MinMaxScaler().fit_transform(df_hashed[minmax_cols])

In [11]:
# Display the scaled feature frame (the rendered output abbreviates rows and columns).
df_hashed

Unnamed: 0,title,vote_average,year,month,budget,revenue,runtime,Action,Adventure,Animation,...,director_hash_22,director_hash_23,director_hash_24,director_hash_25,director_hash_26,director_hash_27,director_hash_28,director_hash_29,director_hash_30,director_hash_31
0,Psycho,8.434,0.000000,0.454545,-0.841548,-0.388650,0.012591,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Spartacus,7.500,0.000000,0.818182,-0.578510,-0.236352,4.473263,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,The Magnificent Seven,7.500,0.000000,0.818182,-0.813511,-0.536026,0.925002,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,The Apartment,8.214,0.000000,0.454545,-0.790011,-0.426725,0.823623,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,La Dolce Vita,8.121,0.000000,0.090909,-0.827530,-0.455841,3.408785,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6416,There's Still Tomorrow,8.295,0.984375,0.818182,-0.649010,-0.351526,0.468797,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6417,Dune: Part Two,8.549,1.000000,0.090909,3.604516,0.405475,2.901890,1,1,0,...,0,0,0,0,0,0,0,0,0,0
6418,Badland Hunters,6.800,1.000000,0.000000,1.783255,0.328361,-0.088787,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6419,The Beekeeper,7.400,1.000000,0.000000,-0.038006,0.251246,-0.190166,1,0,0,...,0,0,0,0,0,0,0,0,0,0
