In [None]:
# Initialization
import os
import time

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

# data science imports
import numpy as np
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error

In [None]:
# Where the input data lives, expressed against a configurable root prefix.
# With the root left empty, the dataset path below begins with '/'.
system_path = ''
data_path = ''.join((system_path, '/Movie-Recommendation-System/Data/Reviews-1M/'))
# model_path = system_path + '/Movie-Recommendation-System/Without-Spark/Models/'

In [None]:
# Import Movie and User data
movies_df = pd.read_csv(data_path + 'movies_metadata_ohe_subset.csv')

# Missing user metadata values are replaced with 0.
users_df = pd.read_csv(data_path + 'users_metadata.csv').fillna(0)

# ratings.dat has no header row and separates fields with the two-character
# delimiter '::'. pandas' C parser only supports single-character delimiters,
# so request the Python engine explicitly instead of relying on the implicit
# fallback (which emits a ParserWarning on every run).
ratings_df = pd.read_csv(data_path + 'ratings.dat',
                         sep='::', header=None, engine='python',
                         names=['userId', 'itemId', 'label', 'timestamp'])
# The timestamp is not used as a feature downstream.
ratings_df = ratings_df.drop(['timestamp'], axis=1)

In [None]:
# Attach movie metadata to every rating.
# DataFrame.join matches the caller's `on` column against the *index* of the
# other frame. The metadata frames come straight from read_csv and therefore
# carry a positional RangeIndex, so they must be re-indexed by their key first;
# otherwise each rating is paired with whatever row sits at position itemId.
ratings_metadata = ratings_df.join(movies_df.set_index('itemId'), on='itemId',
                                   how='left', rsuffix='_right')

# Attach user metadata the same way, keyed by userId.
ratings_full = ratings_metadata.join(users_df.set_index('userId'), on='userId',
                                     how='left', rsuffix='_right')

# A left join against unique keys must neither add nor drop ratings.
assert len(ratings_full) == len(ratings_df), \
    'metadata contains duplicate itemId/userId keys'

# Keep the target aside, then strip identifiers and the target from the features.
rating_labels = ratings_full.label
ratings_full = ratings_full.drop(['userId', 'itemId', 'title',
                                  'imdb_id', 'label'], axis=1)
# Ratings without matching metadata end up with 0 in every metadata column.
ratings_full = ratings_full.fillna(0)

# Release the source frames; only the joined feature matrix is used from here on.
del ratings_df, users_df

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
(ratings_train, ratings_test,
 labels_train, labels_test) = train_test_split(
    ratings_full,
    rating_labels,
    test_size=0.33,
    random_state=42,
)
# The unsplit feature matrix and labels are no longer needed.
del ratings_full, rating_labels

## Metadata and Full User Data
### Random Forest Classifier

In [None]:
%%time
rfc  = RandomForestClassifier(n_estimators = 500, 
                              max_depth = 10, 
                              random_state = 42, 
                              max_features = 'log2', 
                              min_samples_leaf = 20, 
                              n_jobs = 3)
rfc_model = rfc.fit(ratings_train, labels_train)

In [None]:
%%time
# Predict a rating class for every row of the held-out split.
rfc_model_preds = rfc_model.predict(ratings_test)

In [None]:
%%time
print('Accuracy:', accuracy_score(labels_test, rfc_model_preds))
print('Mean Absolute Error:', mean_absolute_error(labels_test, rfc_model_preds))
print('Mean Squared Error:', mean_squared_error(labels_test, rfc_model_preds))
del rfc_model_preds

In [None]:
# save the model to disk
# `models_write_path` was never defined anywhere in the notebook (NameError on
# a fresh kernel); use the same relative 'Models/' directory the regressor
# model is written to, and make sure it exists before opening the file.
models_write_path = 'Models/'
os.makedirs(models_write_path, exist_ok=True)
filename = models_write_path + 'rfc_model.sav'
# The context manager guarantees the file is flushed and closed after the dump.
with open(filename, 'wb') as model_file:
    pickle.dump(rfc_model, model_file)

In [None]:
# # load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))

# model_preds = loaded_model.predict(ratings_test)

# print('Accuracy:', accuracy_score(labels_test, model_preds))
# print('Mean Absolute Error:', mean_absolute_error(labels_test, model_preds))
# print('Mean Squared Error:', mean_squared_error(labels_test, model_preds))

### Random Forest Regressor

In [None]:
%%time
rfr  = RandomForestRegressor(n_estimators = 100, 
                              max_depth = 30, 
                              random_state = 42, 
                              max_features = 'sqrt', 
                              min_samples_leaf = 20, 
                              n_jobs = 3)
rfr_model = rfr.fit(ratings_train, labels_train)

In [None]:
%%time
# Predict a numeric rating for every row of the held-out split.
rfr_model_preds = rfr_model.predict(ratings_test)

In [None]:
%%time
print('Mean Absolute Error:', mean_absolute_error(labels_test, rfr_model_preds))
print('Mean Squared Error:', mean_squared_error(labels_test, rfr_model_preds))
del rfr_model_preds

In [None]:
# save the model to disk
# open() does not create missing directories, so ensure 'Models/' exists first.
os.makedirs('Models', exist_ok=True)
filename = 'Models/rfr_model.sav'
# The context manager guarantees the file is flushed and closed after the dump.
with open(filename, 'wb') as model_file:
    pickle.dump(rfr_model, model_file)

### Gradient Boosting Tree Classifier

In [None]:
%%time
gbc  = GradientBoostingClassifier(n_estimators = 10, 
                                  learning_rate = 0.1, 
                                  max_depth = 10,
                                  random_state = 42, 
                                  max_features = 'sqrt', 
                                  min_samples_leaf = 20,
                                  n_iter_no_change = 5)
gbc_model = gbc.fit(ratings_train, labels_train)

In [None]:
%%time
# Predict a rating class for every row of the held-out split.
gbc_model_preds = gbc_model.predict(ratings_test)

In [None]:
%%time
print('Accuracy:', accuracy_score(labels_test, gbc_model_preds))
print('Mean Absolute Error:', mean_absolute_error(labels_test, gbc_model_preds))
print('Mean Squared Error:', mean_squared_error(labels_test, gbc_model_preds))
del gbc_model_preds

### Gradient Boosting Tree Regressor

In [None]:
%%time
gbr  = GradientBoostingRegressor(n_estimators = 10, 
                                 learning_rate = 0.1, 
                                 max_depth = 10,
                                 random_state = 42, 
                                 max_features = 'sqrt', 
                                 min_samples_leaf = 20,
                                 n_iter_no_change = 5)
gbr_model = gbr.fit(ratings_train, labels_train)

In [None]:
%%time
# Predict a numeric rating for every row of the held-out split.
gbr_model_preds = gbr_model.predict(ratings_test)

In [None]:
%%time
print('Mean Absolute Error:', mean_absolute_error(labels_test, gbr_model_preds))
print('Mean Squared Error:', mean_squared_error(labels_test, gbr_model_preds))
del gbr_model_preds