In [1]:
# Intialization
import os
import time

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

# data science imports
import numpy as np
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error

In [2]:
# Set data location
data_path = 'C:/Users/IOLAP-USER/Documents/Movie-Recommendation-System/Data/Reviews-1M/'
model_write_path = 'C:/Users/IOLAP-USER/Documents/Movie-Recommendation-System/Without-Spark/Models/'

In [3]:
# Import Movie and User data
movies_df = pd.read_csv(data_path + 'movies_metadata_ohe_subset.csv')

users_df = pd.read_csv(data_path + 'users_metadata.csv')
users_df = users_df.fillna(0)

ratings_df = pd.read_csv(data_path + 'ratings.dat',
                        sep = '::', header = None)
ratings_df.columns = ['userId', 'itemId', 'label', 'timestamp']
ratings_df.drop(['timestamp'], axis = 1, inplace = True)

  


In [4]:
ratings_metadata = ratings_df.join(movies_df, ['itemId'], how = 'left', rsuffix='_right')
ratings_metadata.drop(['itemId_right'], axis = 1, inplace = True)

ratings_full = ratings_metadata.join(users_df, ['userId'], how = 'left', rsuffix='_right')
rating_labels = ratings_full.label
ratings_full.drop(['userId_right', 'userId', 'itemId', 'title', 
                        'imdb_id', 'label'], axis = 1, inplace = True)
ratings_full = ratings_full.fillna(0)

del ratings_df, users_df

In [5]:
ratings_train, ratings_test, labels_train, labels_test = train_test_split(ratings_full, rating_labels, 
                                                                          test_size=0.33, random_state=42)
del ratings_full, rating_labels

## Metadata and Full User Data
### Random Forest Classifier

In [7]:
%%time
rfc  = RandomForestClassifier(n_estimators = 500, 
                              max_depth = 10, 
                              random_state = 42, 
                              max_features = 'log2', 
                              min_samples_leaf = 20, 
                              n_jobs = 3)
rfc_model = rfc.fit(ratings_train, labels_train)

Wall time: 10min 15s


In [8]:
%%time
rfc_model_preds = rfc_model.predict(ratings_test)

Wall time: 25 s


In [9]:
%%time
print('Accuracy:', accuracy_score(labels_test, rfc_model_preds))
print('Mean Absolute Error:', mean_absolute_error(labels_test, rfc_model_preds))
print('Mean Squared Error:', mean_squared_error(labels_test, rfc_model_preds))
del rfc_model_preds

Accuracy: 0.3590097828029897
Mean Absolute Error: 0.8529307508430055
Mean Squared Error: 1.3857647946338494
Wall time: 59.8 ms


In [10]:
# save the model to disk
filename = models_write_path + 'rfc_model.sav'
pickle.dump(rfc_model, open(filename, 'wb'))

In [39]:
# # load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))

# model_preds = loaded_model.predict(ratings_test)

# print('Accuracy:', accuracy_score(labels_test, model_preds))
# print('Mean Absolute Error:', mean_absolute_error(labels_test, model_preds))
# print('Mean Squared Error:', mean_squared_error(labels_test, model_preds))

### Random Forest Regressor

In [34]:
%%time
rfr  = RandomForestRegressor(n_estimators = 100, 
                              max_depth = 30, 
                              random_state = 42, 
                              max_features = 'sqrt', 
                              min_samples_leaf = 20, 
                              n_jobs = 3)
rfr_model = rfr.fit(ratings_train, labels_train)

Wall time: 6min 23s


In [35]:
%%time
rfr_model_preds = rfr_model.predict(ratings_test)

Wall time: 10.5 s


In [36]:
%%time
print('Mean Absolute Error:', mean_absolute_error(labels_test, rfr_model_preds))
print('Mean Squared Error:', mean_squared_error(labels_test, rfr_model_preds))
del rfr_model_preds

Mean Absolute Error: 0.8248584405217898
Mean Squared Error: 1.053643693704921
Wall time: 32.7 ms


In [43]:
# save the model to disk
filename = 'Models/rfr_model.sav'
pickle.dump(rfr_model, open(filename, 'wb'))

### Gradient Boosting Tree Classifier

In [9]:
%%time
gbc  = GradientBoostingClassifier(n_estimators = 10, 
                                  learning_rate = 0.1, 
                                  max_depth = 10,
                                  random_state = 42, 
                                  max_features = 'sqrt', 
                                  min_samples_leaf = 20,
                                  n_iter_no_change = 5)
gbc_model = gbc.fit(ratings_train, labels_train)

Wall time: 24min 51s


In [10]:
%%time
gbc_model_preds = gbc_model.predict(ratings_test)

Wall time: 6.57 s


In [11]:
%%time
print('Accuracy:', accuracy_score(labels_test, gbc_model_preds))
print('Mean Absolute Error:', mean_absolute_error(labels_test, gbc_model_preds))
print('Mean Squared Error:', mean_squared_error(labels_test, gbc_model_preds))
del gbc_model_preds

Accuracy: 0.37357037467923376
Mean Absolute Error: 0.8322684044851228
Mean Squared Error: 1.346300319024204
Wall time: 76.1 ms


### Gradient Boosting Tree Regressor

In [12]:
%%time
gbr  = GradientBoostingRegressor(n_estimators = 10, 
                                 learning_rate = 0.1, 
                                 max_depth = 10,
                                 random_state = 42, 
                                 max_features = 'sqrt', 
                                 min_samples_leaf = 20,
                                 n_iter_no_change = 5)
gbr_model = gbr.fit(ratings_train, labels_train)

Wall time: 5min 4s


In [13]:
%%time
gbr_model_preds = gbr_model.predict(ratings_test)

Wall time: 3.05 s


In [15]:
%%time
print('Mean Absolute Error:', mean_absolute_error(labels_test, gbr_model_preds))
print('Mean Squared Error:', mean_squared_error(labels_test, gbr_model_preds))
del gbr_model_preds

Mean Absolute Error: 0.9085235750165572
Mean Squared Error: 1.1890251845425888
Wall time: 39.4 ms
