In [31]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [15]:
# Read the two pre-cleaned feature tables (numerical and textual) from CSV files
# located next to this notebook.
df_numerical = pd.read_csv(filepath_or_buffer='numerical_features_cleaned.csv')
df_textual = pd.read_csv(filepath_or_buffer='textual_features_cleaned.csv')

In [16]:
# Preview the first five rows of the numerical feature table (head() defaults to 5).
df_numerical.head()

Unnamed: 0,UserID,Title,Votes,Movie_Rating,User_Rating,Brightness,Contrast,Saturation,Hue,Texture,...,viewer__(Banned),viewer__GP,viewer__M,viewer__M18,viewer__NC-17,viewer__Others,viewer__PG,viewer__R21,viewer__TV,viewer__Unrated
0,ur3223254,The Making of 'West Side Story',186.0,8.1,10,28.345112,1.0,0.483944,0.286751,0.095318,...,0,0,0,0,0,0,0,0,0,1
1,ur3223254,West Side Story (2021),87055.0,7.2,4,56.452657,1.0,0.387084,0.630909,0.117362,...,0,0,0,0,0,0,1,0,0,0
2,ur3223254,Supernova,12422.0,6.9,10,129.423365,1.0,0.336827,0.314486,0.064798,...,0,0,0,0,0,0,0,1,0,0
3,ur3223254,The Many Saints of Newark,56763.0,6.3,2,59.251703,1.0,0.147507,0.146196,0.076623,...,0,0,0,0,0,0,0,1,0,0
4,ur3223254,Never Look Away,23016.0,7.7,10,95.750548,1.0,0.333741,0.614238,0.074675,...,0,0,0,0,0,0,0,1,0,0


In [17]:
# Preview the first five rows of the textual feature table (head() defaults to 5).
df_textual.head()

Unnamed: 0,UserID,Title,sentiment_score,desc_topic__0,desc_topic__1,desc_topic__2,desc_topic__3,desc_topic__4,desc_topic__5,desc_topic__6,...,review_topic__13.0,review_topic__14.0,review_topic__15.0,review_topic__16.0,review_topic__17.0,review_topic__18.0,review_topic__19.0,review_topic__20.0,review_topic__21.0,review_topic__22.0
0,ur3223254,The Making of 'West Side Story',0.6249,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,ur3223254,West Side Story (2021),,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ur3223254,Supernova,0.25,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,ur3223254,The Many Saints of Newark,0.4926,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ur3223254,Never Look Away,0.4215,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Count the missing entries in every column of the textual feature table.
nan_counts = df_textual.isnull().sum(axis=0)
print(nan_counts)

UserID                    0
Title                     0
sentiment_score       11073
desc_topic__0             0
desc_topic__1             0
desc_topic__2             0
desc_topic__3             0
desc_topic__4             0
desc_topic__5             0
desc_topic__6             0
desc_topic__7             0
desc_topic__8             0
desc_topic__9             0
desc_topic__10            0
desc_topic__11            0
desc_topic__12            0
desc_topic__13            0
desc_topic__14            0
desc_topic__15            0
desc_topic__16            0
desc_topic__17            0
desc_topic__18            0
desc_topic__19            0
desc_topic__20            0
desc_topic__21            0
desc_topic__22            0
review_topic__0.0         0
review_topic__1.0         0
review_topic__2.0         0
review_topic__3.0         0
review_topic__4.0         0
review_topic__5.0         0
review_topic__6.0         0
review_topic__7.0         0
review_topic__8.0         0
review_topic__9.0   

In [19]:
# Count the missing entries in every column of the numerical feature table.
nan_counts = df_numerical.isnull().sum(axis=0)
print(nan_counts)

UserID                  0
Title                   0
Votes                   3
Movie_Rating            0
User_Rating             0
Brightness              0
Contrast                0
Saturation              0
Hue                     0
Texture                 0
Entropy                 0
Noise                   0
Colorfulness            0
num_years_released      0
duration_mins         244
Ad/SciFi/Fantasy        0
Adult                   0
Animation               0
Comedy                  0
Crime/Mystery           0
Drama                   0
Family                  0
Film-Noir               0
Horror/Thriller         0
Music                   0
None                    0
Reality                 0
Romance                 0
Short                   0
Sport/Action            0
War/History             0
Western                 0
viewer__(Banned)        0
viewer__GP              0
viewer__M               0
viewer__M18             0
viewer__NC-17           0
viewer__Others          0
viewer__PG  

In [20]:
# Display the (rows, columns) dimensions of the textual feature table.
df_textual.shape

(37798, 49)

In [21]:
# Display the (rows, columns) dimensions of the numerical feature table.
df_numerical.shape

(37798, 42)

In [26]:
# Join the numerical and textual feature tables on the (UserID, Title) pair and
# then remove those two join-key columns from the combined frame.
key_columns = ['UserID', 'Title']
df_final = (
    pd.merge(df_numerical, df_textual, on=key_columns)
    .drop(columns=key_columns)
)

In [27]:
# Display the (rows, columns) dimensions of the merged frame after the key columns were dropped.
df_final.shape

(37798, 87)

In [29]:
from sklearn.model_selection import train_test_split

# Separate the prediction target from the feature columns with one boolean
# column mask; `y` stays a single-column DataFrame rather than a Series.
target_mask = df_final.columns == 'User_Rating'
x = df_final.loc[:, ~target_mask]
y = df_final.loc[:, target_mask]

# Hold out 20% of the rows for testing, using a fixed random_state.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=123,
)

In [32]:
# Impute missing values and standardise the feature splits using statistics
# computed from the TRAINING split only, so that nothing measured on the test
# split influences preprocessing.

# The splits are derived from `x`; take explicit copies so the column
# assignments below act on independent frames and cannot trigger pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fill NaNs in both splits with the training-set mean of each column.
# (Filling the test split with its own mean would leak test statistics and
# would be inconsistent with the scaler below, which is fitted on train only.)
cols_to_impute = ['Votes', 'duration_mins', 'sentiment_score']
train_means = X_train[cols_to_impute].mean()
X_train[cols_to_impute] = X_train[cols_to_impute].fillna(train_means)
X_test[cols_to_impute] = X_test[cols_to_impute].fillna(train_means)

# Standardise the listed columns: fit the scaler on the training split and
# apply the same fitted transform to the test split.
# NOTE(review): 'Colorfulness' appears among the numerical feature columns but
# is not in this list -- confirm whether leaving it unscaled is intentional.
scaler = StandardScaler()
col_to_scale = ['Votes', 'Movie_Rating', 'num_years_released','duration_mins', 'Brightness', 'Contrast', 'Saturation', 'Hue', 'Texture', 'Entropy', 'Noise']
X_train[col_to_scale] = scaler.fit_transform(X_train[col_to_scale])
X_test[col_to_scale] = scaler.transform(X_test[col_to_scale])


In [35]:
# Write each split to its own CSV file, omitting the DataFrame index.
split_outputs = [
    (X_train, '../Data_Files/Training_Data/x_train.csv'),
    (X_test, '../Data_Files/Training_Data/x_test.csv'),
    (Y_train, '../Data_Files/Training_Data/y_train.csv'),
    (Y_test, '../Data_Files/Training_Data/y_test.csv'),
]
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path, index=False)