In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the two pre-cleaned feature tables (numerical first, textual second).
# Both files are read with pandas' default CSV settings.
df_numerical, df_textual = (
    pd.read_csv(csv_name)
    for csv_name in ('numerical_features_cleaned.csv', 'textual_features_cleaned.csv')
)

In [3]:
# Preview the first five rows of the numerical table (head() defaults to n=5).
df_numerical.head()

Unnamed: 0,UserID,Title,Genre,Votes,Movie_Rating,User_Rating,Brightness,Contrast,Saturation,Hue,...,War/History/Biography,Western,viewer__(Banned),viewer__Children,viewer__GP,viewer__M,viewer__NC-17,viewer__PG,viewer__R21,viewer__Unrated
0,ur3223254,The Making of 'West Side Story',"Documentary, Music",186.0,8.1,10,28.345112,1.0,0.483944,0.286751,...,0,0,0,0,0,0,0,0,0,1
1,ur3223254,West Side Story (2021),"Crime, Drama, Musical",87055.0,7.2,4,56.452657,1.0,0.387084,0.630909,...,0,0,0,0,0,0,0,1,0,0
2,ur3223254,Supernova,"Drama, Romance",12422.0,6.9,10,129.423365,1.0,0.336827,0.314486,...,0,0,0,0,0,0,0,0,1,0
3,ur3223254,The Many Saints of Newark,"Crime, Drama",56763.0,6.3,2,59.251703,1.0,0.147507,0.146196,...,0,0,0,0,0,0,0,0,1,0
4,ur3223254,Never Look Away,"Biography, Drama, Romance",23016.0,7.7,10,95.750548,1.0,0.333741,0.614238,...,1,0,0,0,0,0,0,0,1,0


In [4]:
# Preview the first five rows of the textual table (head() defaults to n=5).
df_textual.head()

Unnamed: 0,UserID,Title,sentiment_score,desc_topic__0,desc_topic__1,desc_topic__2,desc_topic__3,desc_topic__4,desc_topic__5,desc_topic__6,...,review_topic__5.0,review_topic__6.0,review_topic__7.0,review_topic__8.0,review_topic__9.0,review_topic__10.0,review_topic__11.0,review_topic__12.0,review_topic__13.0,review_topic__14.0
0,ur3223254,The Making of 'West Side Story',0.6249,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,ur3223254,West Side Story (2021),,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,ur3223254,Supernova,0.25,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,ur3223254,The Many Saints of Newark,0.4926,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,ur3223254,Never Look Away,0.4215,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Per-column count of missing values in the textual feature table.
# isnull() is pandas' alias of isna(); summing the boolean mask counts the NaNs.
nan_counts = df_textual.isnull().sum()
print(nan_counts)

UserID                    0
Title                     0
sentiment_score       11073
desc_topic__0             0
desc_topic__1             0
desc_topic__2             0
desc_topic__3             0
desc_topic__4             0
desc_topic__5             0
desc_topic__6             0
desc_topic__7             0
desc_topic__8             0
desc_topic__9             0
review_topic__0.0         0
review_topic__1.0         0
review_topic__2.0         0
review_topic__3.0         0
review_topic__4.0         0
review_topic__5.0         0
review_topic__6.0         0
review_topic__7.0         0
review_topic__8.0         0
review_topic__9.0         0
review_topic__10.0        0
review_topic__11.0        0
review_topic__12.0        0
review_topic__13.0        0
review_topic__14.0        0
dtype: int64


In [6]:
# Per-column count of missing values in the numerical feature table.
# isnull() is pandas' alias of isna(); summing the boolean mask counts the NaNs.
nan_counts = df_numerical.isnull().sum()
print(nan_counts)

UserID                       0
Title                        0
Genre                        4
Votes                        3
Movie_Rating                 0
User_Rating                  0
Brightness                   0
Contrast                     0
Saturation                   0
Hue                          0
Texture                      0
Entropy                      0
Noise                        0
Colorfulness                 0
is_top3_genre                0
is_top3_year                 0
num_years_released           0
duration_mins              244
Ad/SciFi/Fantasy             0
Adult                        0
Animation                    0
Crime/Mystery/Film-Noir      0
Drama                        0
Family                       0
Horror/Thriller              0
Music                        0
Reality                      0
RomCom                       0
Short                        0
Sport/Action/Adventure       0
War/History/Biography        0
Western                      0
viewer__

In [7]:
# (rows, columns) of the textual table, for comparison with the numerical
# table before the two are merged.
df_textual.shape

(37798, 28)

In [8]:
# (rows, columns) of the numerical table, for comparison with the textual
# table before the two are merged.
df_numerical.shape

(37798, 40)

In [11]:
# Join the numerical and textual feature tables on the (UserID, Title) key
# (pd.merge performs an inner join by default), then remove the identifier
# and raw-genre columns, which are not used as model inputs.
#
# The result is built as a new frame instead of using inplace=True, so this
# cell can be re-run safely.
#
# NOTE(review): 'Year' does not appear among the df_numerical columns listed
# by the NaN-count cell, so a strict drop would raise KeyError on a fresh
# kernel. It is therefore dropped only if present (errors='ignore'); the
# other three columns are still required to exist.
df_final = (
    pd.merge(df_numerical, df_textual, on=['UserID', 'Title'])
    .drop(columns=['UserID', 'Title', 'Genre'])
    .drop(columns=['Year'], errors='ignore')
)

In [12]:
# (rows, columns) of the merged table after the identifier columns were dropped.
df_final.shape

(37798, 63)

In [11]:
from sklearn.model_selection import train_test_split

# Separate the target column from the features with a single boolean mask
# over the column labels. `y` stays a one-column DataFrame (not a Series).
is_target = df_final.columns == 'User_Rating'
x = df_final.loc[:, ~is_target]
y = df_final.loc[:, is_target]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=123,
)

In [12]:
# Impute missing values and standardise the continuous features.
#
# Every preprocessing statistic (imputation means, scaler mean/std) is learned
# from the training split only and then applied unchanged to the test split.
# Filling the test split with its own means would leak test-set information
# into preprocessing and be inconsistent with how the scaler is fitted.
#
# NOTE: this cell reassigns X_train / X_test, so re-running it without first
# re-running the train/test split cell would standardise the data twice.

# Work on explicit copies so the assignments below never write into a slice
# of df_final (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Columns known to contain NaNs: fill with the *training* mean of each column.
# fillna() with a Series matches the fill value to each column by label.
cols_to_impute = ['Votes', 'duration_mins', 'sentiment_score']
train_means = X_train[cols_to_impute].mean()
X_train[cols_to_impute] = X_train[cols_to_impute].fillna(train_means)
X_test[cols_to_impute] = X_test[cols_to_impute].fillna(train_means)

# Standardise the continuous columns: fit on train, reuse the fit on test.
# NOTE(review): 'Colorfulness' is not listed here although the other image
# statistics are — confirm that leaving it unscaled is intentional.
scaler = StandardScaler()
col_to_scale = ['Votes', 'Movie_Rating', 'num_years_released','duration_mins', 'Brightness', 'Contrast', 'Saturation', 'Hue', 'Texture', 'Entropy', 'Noise']
X_train[col_to_scale] = scaler.fit_transform(X_train[col_to_scale])
X_test[col_to_scale] = scaler.transform(X_test[col_to_scale])


In [13]:
# Persist the four splits as CSV files without the index column.
# Written in the order: features (train, test), then targets (train, test).
for frame, destination in (
    (X_train, '../Data_Files/Training_Data/x_train.csv'),
    (X_test, '../Data_Files/Training_Data/x_test.csv'),
    (Y_train, '../Data_Files/Training_Data/y_train.csv'),
    (Y_test, '../Data_Files/Training_Data/y_test.csv'),
):
    frame.to_csv(destination, index=False)