In [1]:
import pandas as pd

In [2]:
# Load the cleaned movie table; the first CSV column becomes the index.
data = pd.read_csv(
    "all_datasets/content_based/cleaned_movies_info.csv",
    index_col=0,
    low_memory=False,
)

In [3]:
# Average the ratings of all movies sharing a directorId, then attach that
# average to every movie row of the same director.
ratings_by_director = data['rating'].groupby(data['directorId'])
director_mean_ratings = ratings_by_director.mean()
data['director_mean_rating'] = data['directorId'].map(director_mean_ratings)

In [4]:
# One-hot encode the decade of each movie: floor the year to a multiple of 10,
# then expand it into integer indicator columns prefixed with 'decade'.
data['decade'] = data['year'].floordiv(10).mul(10)
decade_onehot = pd.get_dummies(data['decade'], prefix='decade').astype(int)
data = pd.concat(objs=[data, decade_onehot], axis=1)

In [5]:
# One-hot encode genres: the 'genres' strings are split on '|' and each
# distinct token becomes its own indicator column appended to the frame.
genres_onehot = data['genres'].str.get_dummies(sep='|')
data = pd.concat(objs=[data, genres_onehot], axis=1)

In [7]:
# Temporarily replace missing actor ids (movies with only one or two actors)
# with the placeholder id -1. Only the actor columns are filled: a frame-wide
# fillna would also turn missing values in every other column (e.g. 'rating')
# into -1 and skew the means computed from them.
actor_id_columns = ['actor1', 'actor2', 'actor3']
data[actor_id_columns] = data[actor_id_columns].fillna(-1)

In [9]:
# Compute each actor's mean rating by id: stack the three actor columns into
# one long column, repeat the movie ratings alongside so every (actor, rating)
# pair is one row, then average per actorId.
actor_columns = ['actor1', 'actor2', 'actor3']
actors = pd.concat([data[column] for column in actor_columns], ignore_index=True)
ratings = pd.concat([data['rating'] for _ in actor_columns], ignore_index=True)
actor_ratings = pd.DataFrame({'actorId': actors, 'rating': ratings})
actor_mean_ratings = actor_ratings.groupby('actorId')['rating'].mean()

In [15]:
# Remove the placeholder actorId (-1) by label rather than by position.
# Dropping the first row unconditionally would discard a real actor whenever
# no placeholder is present, or when this cell is executed more than once.
actor_mean_ratings = actor_mean_ratings[actor_mean_ratings.index != -1]

In [16]:
# Add the per-actor means to the movie dataframe, one new column per actor
# slot; ids that are absent from actor_mean_ratings map to NaN.
for actor_col in ('actor1', 'actor2', 'actor3'):
    data[actor_col + '_mean_rating'] = data[actor_col].map(actor_mean_ratings)

In [17]:
# Where an actor slot has no mean rating, substitute the unweighted mean of
# all per-actor mean ratings.
overall_mean_rating = actor_mean_ratings.mean()
for actor_mean_col in ('actor1_mean_rating', 'actor2_mean_rating', 'actor3_mean_rating'):
    data[actor_mean_col] = data[actor_mean_col].fillna(overall_mean_rating)

In [19]:
# Drop the raw columns that the cells above have already turned into features.
obsolete_columns = ['directorId', 'genres', 'year', 'decade', 'actor1', 'actor2', 'actor3']
data.drop(columns=obsolete_columns, inplace=True)

In [21]:
# Write the processed frame to CSV (the index is written as well).
processed_path = 'all_datasets/content_based/processed_movies_info.csv'
data.to_csv(processed_path)