In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import random



In [2]:
# Load the three input tables in one pass: numerical features, textual
# features and per-user preferences (roles as suggested by the file names).
df_numerical, df_textual, df_preference = (
    pd.read_csv(csv_path)
    for csv_path in (
        'numerical_features_cleaned.csv',
        'textual_features_cleaned.csv',
        '../Data_Files/Raw_Data/user_movie_preferences_20users.csv',
    )
)

In [3]:
# Combine both feature tables on the (user, movie) key. No `how` is given, so
# pandas performs its default inner join.
df = df_numerical.merge(df_textual, on=['UserID', 'Title'])
df.head()

Unnamed: 0,UserID,Title,Year,Genre,Votes,Movie_Rating,User_Rating,Brightness,Contrast,Saturation,...,review_topic__5.0,review_topic__6.0,review_topic__7.0,review_topic__8.0,review_topic__9.0,review_topic__10.0,review_topic__11.0,review_topic__12.0,review_topic__13.0,review_topic__14.0
0,ur3223254,The Making of 'West Side Story',1985,"Documentary, Music",186.0,8.1,10,28.345112,1.0,0.483944,...,0,0,1,0,0,0,0,0,0,0
1,ur3223254,West Side Story (2021),2021,"Crime, Drama, Musical",87055.0,7.2,4,56.452657,1.0,0.387084,...,0,0,0,0,0,0,0,0,0,0
2,ur3223254,Supernova,2020,"Drama, Romance",12422.0,6.9,10,129.423365,1.0,0.336827,...,0,0,0,0,0,0,0,0,0,1
3,ur3223254,The Many Saints of Newark,2021,"Crime, Drama",56763.0,6.3,2,59.251703,1.0,0.147507,...,0,0,0,0,0,0,0,0,1,0
4,ur3223254,Never Look Away,2018,"Biography, Drama, Romance",23016.0,7.7,10,95.750548,1.0,0.333741,...,0,0,0,0,0,0,0,0,0,0


# Get dataframe of movies that each user has not seen

In [4]:
# Upper bound on how many unseen titles are assigned to a single user.
MAX_UNSEEN_PER_USER = 900

# Dedicated, seeded generator so that re-running this cell (or the whole
# notebook on a fresh kernel) picks the same titles every time. The seed is
# the same value used as `random_state` for the train/test split.
unseen_rng = random.Random(123)

all_movies = set(df['Title'])
added_movies = set()   # titles already assigned to an earlier user
movie_df_dict = {}     # UserID -> rows of `df` for the titles assigned to that user

for user_id in df['UserID'].unique():
    seen_movies = set(df.loc[df['UserID'] == user_id, 'Title'])

    # Candidates are titles this user has no row for and that no earlier user
    # has been given. Sorting before shuffling removes the dependence on set
    # iteration order, which is not stable between interpreter sessions;
    # key=str keeps the sort safe should a non-string title be present.
    unseen_movies = sorted(all_movies - seen_movies - added_movies, key=str)
    unseen_rng.shuffle(unseen_movies)
    unseen_movies = unseen_movies[:MAX_UNSEEN_PER_USER]
    added_movies.update(unseen_movies)

    # Keeps every row of `df` for the chosen titles, i.e. rows that belong to
    # the other users who did rate them.
    df_unseen = df[df['Title'].isin(unseen_movies)]
    movie_df_dict[user_id] = df_unseen

### Fit the scaler on X_train and use it to transform the unseen-movie datasets

In [5]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns, then hold out 20% of
# the rows as a test set with a fixed random_state for a repeatable split.
target_col = 'User_Rating'
x = df.drop(columns=[target_col])
y = df[[target_col]]   # double brackets keep y a one-column DataFrame
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=123,
)

In [6]:
# Work on independent copies: the frames returned by train_test_split are
# selections of `x`, and assigning columns into them directly is what triggers
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fill missing values with the column means of the TRAINING split. The same
# training means are used for the test split so that no statistic computed
# from held-out rows enters the preprocessing.
cols_to_impute = ['Votes', 'duration_mins', 'sentiment_score']
train_means = X_train[cols_to_impute].mean()
X_train[cols_to_impute] = X_train[cols_to_impute].fillna(train_means)
X_test[cols_to_impute] = X_test[cols_to_impute].fillna(train_means)

# Fit the scaler on the training split only; `scaler` and `col_to_scale` are
# reused further down to transform the unseen-movie rows.
# NOTE: re-running this cell on its own refits the scaler on already-scaled
# columns. Re-run the train/test split cell first.
scaler = StandardScaler()
col_to_scale = ['Votes', 'Movie_Rating', 'num_years_released','duration_mins', 'Brightness', 'Contrast', 'Saturation', 'Hue', 'Texture', 'Entropy', 'Noise']
X_train[col_to_scale] = scaler.fit_transform(X_train[col_to_scale])

In [7]:
# For every user: remove the rating column from their candidate rows and stamp
# the rows with that user's id (they originally carry the ids of whoever rated
# the title). The cleaned frame is written back to the dict and also collected
# in a list for concatenation.
unseen_df_list = []
for user_id, user_rows in movie_df_dict.items():
    df_unseen = user_rows.drop(columns=['User_Rating']).assign(UserID=user_id)
    movie_df_dict[user_id] = df_unseen
    unseen_df_list.append(df_unseen)

In [8]:
# Stack the per-user frames, keep the first row for each (user, title) pair
# and renumber the index from zero.
df_unseen_final = (
    pd.concat(unseen_df_list)
    .drop_duplicates(subset=['UserID', 'Title'], keep='first')
    .reset_index(drop=True)
)

# Sanity check: number of rows whose Title appears more than once.
int(df_unseen_final.duplicated(subset='Title', keep=False).sum())

0

In [9]:
# Impute with the means of the TRAINING rows rather than the means of the
# unseen rows, so the imputation uses the same reference data as the scaler
# fitted on X_train. The means are read from `x`, which no cell above assigns
# into, restricted to the rows selected for training.
cols_to_impute = ['Votes', 'duration_mins', 'sentiment_score']
train_fill_values = x.loc[X_train.index, cols_to_impute].mean()
df_unseen_final[cols_to_impute] = df_unseen_final[cols_to_impute].fillna(train_fill_values)

# Apply the already-fitted scaler; it must not be refitted on the unseen rows.
df_unseen_final[col_to_scale] = scaler.transform(df_unseen_final[col_to_scale])

# Attach each user's preference columns. The left join keeps every unseen row
# even when a user has no entry in the preference table.
df_unseen_final = pd.merge(df_unseen_final, df_preference, on = 'UserID', how = 'left')

In [10]:
def check_pref_genre(row):
    """Return 1 if any of the movie's genres is one of the user's top genres, else 0.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``row['show_genres']`` (list of genre names; anything
        else yields 0) and ``row['Top 3 Genres']`` (the user's preferred
        genres, either as text or as a list/tuple/set of names).

    Returns
    -------
    int
        1 on a match, otherwise 0. Also 0 when the preference is missing
        (e.g. NaN after a left merge) instead of raising ``TypeError``.

    Notes
    -----
    When the preference is text, a genre only counts if it appears as a whole
    name, i.e. not directly surrounded by letters, digits, underscores or
    hyphens. A plain substring test would report "Music" as present in
    "Musical", and would report an empty genre token as always present.
    """
    import re  # local import: this is the only place the notebook needs it

    show_genres = row['show_genres']
    preferred = row['Top 3 Genres']

    if not isinstance(show_genres, list):
        return 0

    if isinstance(preferred, (list, tuple, set, frozenset)):
        # Already split into names: exact membership is sufficient.
        return 1 if any(genre in preferred for genre in show_genres) else 0

    if not isinstance(preferred, str):
        # Missing preference (NaN/None): nothing can match.
        return 0

    for genre in show_genres:
        if not isinstance(genre, str) or not genre:
            continue
        # Whole-name match, independent of the delimiter/quoting used in the text.
        whole_name = r"(?<![\w-])" + re.escape(genre) + r"(?![\w-])"
        if re.search(whole_name, preferred):
            return 1
    return 0
        
def check_pref_year(row):
    """Return 1 if the movie's (start) year is among the user's top years, else 0.

    The year is converted to text BEFORE taking its first four characters, so
    numeric years work as well as strings. A missing 'Top 3 Years' value
    (e.g. NaN after the left merge) yields 0 instead of raising ``TypeError``.
    """
    top_years = row['Top 3 Years']
    if not isinstance(top_years, (str, list, tuple, set, frozenset)):
        return 0
    return 1 if str(row['Year'])[:4] in top_years else 0


# Turn the comma-separated Genre text into a list of names with all spaces
# removed; non-string values (e.g. NaN) are passed through unchanged.
df_unseen_final['show_genres'] = df_unseen_final['Genre'].apply(
    lambda x: x.strip().replace(" ",  "").split(",") if isinstance(x, str) else x
)
df_unseen_final['is_top3_genre'] = df_unseen_final.apply(check_pref_genre, axis = 1)
df_unseen_final['is_top3_year'] = df_unseen_final.apply(check_pref_year, axis = 1)

In [11]:
# The genre/year source columns have served their purpose once is_top3_genre
# and is_top3_year exist, so they are removed from the feature table.
columns_to_remove = ['show_genres', 'Genre', 'Year', 'Top 3 Genres', 'Top 3 Years']
df_unseen_final = df_unseen_final.drop(columns=columns_to_remove)

In [12]:
# Count missing values per column and print the complete listing; the display
# limits are lifted only for the duration of the `with` block.
show_everything = ('display.max_rows', None, 'display.max_columns', None)
nan_counts = df_unseen_final.isnull().sum()
with pd.option_context(*show_everything):
    print(nan_counts)

UserID                     0
Title                      0
Votes                      0
Movie_Rating               0
Brightness                 0
Contrast                   0
Saturation                 0
Hue                        0
Texture                    0
Entropy                    0
Noise                      0
Colorfulness               0
is_top3_genre              0
is_top3_year               0
num_years_released         0
duration_mins              0
Ad/SciFi/Fantasy           0
Adult                      0
Animation                  0
Crime/Mystery/Film-Noir    0
Drama                      0
Family                     0
Horror/Thriller            0
Music                      0
Reality                    0
RomCom                     0
Short                      0
Sport/Action/Adventure     0
War/History/Biography      0
Western                    0
viewer__(Banned)           0
viewer__Children           0
viewer__GP                 0
viewer__M                  0
viewer__NC-17 

In [13]:
# Number of columns in the final unseen-movie table.
df_unseen_final.shape[1]

64

In [14]:
# Write the final table without the index column. The 'utf-8-sig' codec puts a
# UTF-8 byte-order mark at the start of the file.
output_csv = '../Data_Files/Training_Data/unseen_movies.csv'
df_unseen_final.to_csv(output_csv, encoding='utf-8-sig', index=False)