In [22]:
# Intialization
import os
import time

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

# data science imports
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error

In [2]:
# Set data location
data_path = 'C:/Users/IOLAP-USER/Documents/Movie-Recommendation-System/Data/Reviews-1M/'

In [3]:
# Import Movie and User data
movies_df = pd.read_csv(data_path + 'movie_metadata_ohe_subset.csv')

users_df = pd.read_csv(data_path + 'users_metadata.csv')
users_df = users_df.fillna(0)

ratings_df = pd.read_csv(data_path + 'ratings.dat',
                        sep = '::', header = None)
ratings_df.columns = ['userId', 'itemId', 'label', 'timestamp']
ratings_df.drop(['timestamp'], axis = 1, inplace = True)

  


In [4]:
ratings_metadata = ratings_df.join(movies_df, ['itemId'], how = 'left', rsuffix='_right')
ratings_metadata.drop(['itemId_right'], axis = 1, inplace = True)

ratings_full = ratings_metadata.join(users_df, ['userId'], how = 'left', rsuffix='_right')
rating_labels = ratings_full.label
ratings_full.drop(['userId_right', 'userId', 'itemId', 'title', 
                        'imdb_id', 'label'], axis = 1, inplace = True)
ratings_full = ratings_full.fillna(0)

del ratings_df, users_df

In [5]:
ratings_train, ratings_test, labels_train, labels_test = train_test_split(ratings_full, rating_labels, 
                                                                          test_size=0.33, random_state=42)
del ratings_full, rating_labels

## Metadata and Full User Data
### Random Forest Classifier

In [15]:
%%time
rfc  = RandomForestClassifier(n_estimators = 100, 
                              max_depth = 20, 
                              random_state = 42, 
                              max_features = 'sqrt', 
                              min_samples_leaf = 20, 
                              n_jobs = 2)
rfc_model = rfc.fit(ratings_train, labels_train)

Wall time: 10min 3s


In [16]:
%%time
rfc_model_preds = rfc_model.predict(ratings_test)

Wall time: 14 s


In [24]:
%%time
print('Accuracy:', accuracy_score(labels_test, rfc_model_preds))
print('Mean Absolute Error:', mean_absolute_error(labels_test, rfc_model_preds))
print('Mean Squared Error:', mean_squared_error(labels_test, rfc_model_preds))
del rfc_model_preds

Accuracy: 0.39568696242300855
Mean Absolute Error: 0.8120665678994392
Mean Squared Error: 1.337617286082607
Wall time: 113 ms


### Random Forest Regressor

In [18]:
%%time
rfr  = RandomForestRegressor(n_estimators = 100, 
                              max_depth = 10, 
                              random_state = 42, 
                              max_features = 'sqrt', 
                              min_samples_leaf = 20, 
                              n_jobs = 2)
rfr_model = rfr.fit(ratings_train, labels_train)

Wall time: 3min 57s


In [19]:
%%time
rfr_model_preds = rfr_model.predict(ratings_test)

Wall time: 5.51 s


In [23]:
%%time
print('Mean Absolute Error:', mean_absolute_error(labels_test, rfr_model_preds))
print('Mean Squared Error:', mean_squared_error(labels_test, rfr_model_preds))
del rfr_model_preds

Mean Absolute Error: 0.9007535521813503
Mean Squared Error: 1.173028708242167
Wall time: 44.1 ms


### Gradient Boosting Tree Classifier

In [None]:
%%time
gbc  = GradientBoostingClassifier(n_estimators = 100, 
                                  learning_rate = 0.1, 
                                  max_depth = 10,
                                  random_state = 42, 
                                  max_features = 'sqrt', 
                                  min_samples_leaf = 20,
                                  n_iter_no_change = 5)
gbc_model = gbc.fit(ratings_train, labels_train)

In [None]:
%%time
gbc_model_preds = gbc_model.predict(ratings_test)

In [None]:
%%time
print('Accuracy:', accuracy_score(labels_test, gbc_model_preds))
print('Mean Absolute Error:', mean_absolute_error(labels_test, gbc_model_preds))
print('Mean Squared Error:', mean_squared_error(labels_test, gbc_model_preds))
del gbc_model_preds

### Gradient Boosting Tree Regressor

In [None]:
%%time
gbr  = GradientBoostingRegressor(n_estimators = 100, 
                                 learning_rate = 0.1, 
                                 max_depth = 10,
                                 random_state = 42, 
                                 max_features = 'sqrt', 
                                 min_samples_leaf = 20,
                                 n_iter_no_change = 5)
gbr_model = gbr.fit(ratings_train, labels_train)

In [None]:
%%time
gbr_model_preds = gbr_model.predict(ratings_test)

In [None]:
%%time
print('Mean Absolute Error:', mean_absolute_error(labels_test, rfr_model_preds))
print('Mean Squared Error:', mean_squared_error(labels_test, rfr_model_preds))
del gbr_model_preds