In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import random



In [2]:
# Load the two cleaned feature tables (paths are relative to the notebook's
# working directory).
df_numerical, df_textual = (
    pd.read_csv(csv_name)
    for csv_name in ('numerical_features_cleaned.csv', 'textual_features_cleaned.csv')
)

In [3]:
# Join the numerical and textual feature tables on the (UserID, Title) pair;
# the default inner join keeps only pairs present in both tables.
df = df_numerical.merge(df_textual, on=['UserID', 'Title'])

# Build, for each user, a dataframe of the movies that user has not seen

In [4]:
# Every distinct title present in the merged frame.
all_movies = list(set(df['Title']))

# One entry per user of interest. 'original' holds the full merged frame; each
# user entry starts as None and is replaced below by that user's "unseen" frame.
movie_df_dict = {'original': df,
                 'ur117926588': None,
                 'ur15298231': None,
                 'ur1994077': None,
                 'ur17646017': None,
                 'ur4532636': None,
                 'ur22171966': None,
                 'ur3223254': None,
                 'ur66111139': None,
                 'ur63040106': None,
                 'ur84924605': None,
                 'ur4103165': None,
                 'ur59627333': None,
                 'ur98435364': None,
                 'ur65836273': None,
                 'ur44059846': None,
                 'ur7813355': None,
                 'ur98240498': None,
                 'ur59184301': None,
                 'ur57691865': None,
                 'ur3793011': None}

# Set view of the titles so each user's unseen titles come from a single set
# difference instead of a list-membership test per title (which is quadratic).
all_movie_set = set(all_movies)

for user_id in movie_df_dict:
    if user_id == 'original':
        continue

    # Titles that appear in at least one row belonging to this user.
    seen_movies = set(df.loc[df['UserID'] == user_id, 'Title'])
    unseen_movies = all_movie_set - seen_movies

    # Keep every row (from any user) whose title this user has no row for.
    df_unseen = df[df['Title'].isin(unseen_movies)]

    movie_df_dict[user_id] = df_unseen

### Fit the scaler on X_train and use it to transform the unseen-movie datasets

In [5]:
from sklearn.model_selection import train_test_split

# Boolean mask over the columns that marks the target column.
target_mask = df.columns == 'User_Rating'

x = df.loc[:, ~target_mask]  # every column except the target
y = df.loc[:, target_mask]   # the target, kept as a DataFrame

# Hold out 20% of the rows; the fixed seed makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=123,
)

In [6]:
# Work on explicit copies so the column assignments below modify these frames
# unambiguously (assigning into the frames returned by the split can raise
# SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Fill missing values with the TRAINING mean in both splits: imputation
# statistics must come from the training data only, so the test split is
# preprocessed exactly like the training split and nothing leaks from it.
for col in ['Votes', 'duration_mins']:
    train_mean = X_train[col].mean()
    X_train[col] = X_train[col].fillna(train_mean)
    X_test[col] = X_test[col].fillna(train_mean)

# Fit the scaler on the training split only; `scaler` and `col_to_scale` are
# reused later to transform the unseen-movie frames.
# NOTE: this overwrites X_train[col_to_scale] with scaled values, so re-running
# this cell without re-running the split would refit on already-scaled data.
scaler = StandardScaler()
col_to_scale = ['Votes', 'Movie_Rating', 'num_years_released','duration_mins', 'Brightness', 'Contrast', 'Saturation', 'Hue', 'Texture', 'Entropy', 'Noise']
X_train[col_to_scale] = scaler.fit_transform(X_train[col_to_scale])

In [7]:
# Maximum number of rows kept per user (the first rows in frame order).
MAX_UNSEEN_ROWS = 1000

unseen_df_list = []
for user_id in movie_df_dict:
    if user_id == 'original':
        continue

    # Rows with any missing value are dropped (not imputed), then truncated.
    # dropna()/head() return new objects and .copy() detaches the result, so
    # the column assignment below never writes into a slice of another frame.
    df_unseen = movie_df_dict[user_id].dropna().head(MAX_UNSEEN_ROWS).copy()

    # Apply the scaler fitted on X_train. Only the retained rows are scaled;
    # the transform is element-wise, so the values match scaling every row
    # first and truncating afterwards.
    df_unseen[col_to_scale] = scaler.transform(df_unseen[col_to_scale])

    # Remove the target and the title, and relabel every row with the user
    # this frame was built for (overwrites the existing UserID column).
    df_unseen = (
        df_unseen
        .drop(columns=['User_Rating', 'Title'])
        .assign(UserID=user_id)
    )

    # NOTE: this replaces the raw per-user frame with the processed one, so the
    # cell cannot be re-run without first rebuilding movie_df_dict (the dropped
    # columns would be missing and the features would be scaled twice).
    movie_df_dict[user_id] = df_unseen

    unseen_df_list.append(df_unseen)

In [8]:
# Stack the per-user frames, remove exact duplicate rows, and renumber the
# index from 0.
df_unseen_final = (
    pd.concat(unseen_df_list)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [13]:
# Write the combined unseen-movie frame to disk without the index column;
# 'utf-8-sig' prefixes the file with a UTF-8 byte-order mark.
df_unseen_final.to_csv(
    '../Data_Files/Training_Data/unseen_movies.csv',
    encoding='utf-8-sig',
    index=False,
)