COS80024

DATA SCIENCE PROJECT 1

PROJECT 4: MOVIE RECOMMENDATION SYSTEM

# S3.5.3: Hybrid Approach B (Content-Based and Model-Based Collaborative Filter)  (Executor: Nakib)

This task aims to develop, select, train and tune parameters for a hybrid approach using collaborative filter (user-based) and conten-based.

Task Leader: Nakib

In [1]:
# Importing all the necessary libraries
import numpy as np
import pandas as pd
from sklearn import metrics
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn import neighbors
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Importing the csv data files 

#predictions for trainset by content based model
df1 = pd.read_csv('train_predict_content_df.csv',error_bad_lines=False, warn_bad_lines=True)
#predictions for testset by content based model
df2 = pd.read_csv('predicted_content_based_test_df.csv',error_bad_lines=False, warn_bad_lines=True)
#predictions for trainset by model-based collaborative filter 
df3 = pd.read_csv('train_df_cf_model.csv',error_bad_lines=False, warn_bad_lines=True)
#predictions for testset by model-based collaborative filter 
df4 = pd.read_csv('test_df_cf_model.csv',error_bad_lines=False, warn_bad_lines=True)

In [3]:
#Concatenating predictions made on the trainset by the two models
x_train = pd.merge(df1, df3)

In [4]:
#Keeping only the relvant columns
x_train = x_train[['prediction','pred_rating','rating']]

In [5]:
#Setting actual rating values for trainset as label
y_train = x_train[['rating']]

In [6]:
#Getting rid of the column 'rating' from x_train
x_train = x_train.drop(columns=['rating'])

In [7]:
#Concatenating predictions made on the testset by the two models
x_test = pd.merge(df2, df4)

In [8]:
#Keeping only the relvant columns
x_test = x_test[['predict','pred_rating']]

In [9]:
#Setting actual rating values for testset as label
y_test = df2[['rating']]

# Linear Regression Model

In [10]:
lr = LinearRegression()
hybrid_b = lr.fit(x_train, y_train)
lr_pred = lr.predict(x_test) 

In [11]:
#calculate rmse
error1 = sqrt(mean_squared_error(y_test, lr_pred)) 
#calculate mae
error2 = mean_absolute_error(y_test, lr_pred)
print('RMSE value is:', np.round(error1, 4))
print('MAE value is:', np.round(error2, 4))

RMSE value is: 0.9434
MAE value is: 0.7131


# Decision Tree

In [12]:
dtr = tree.DecisionTreeRegressor()
dtr.fit(x_train, y_train)
dtr_pred = dtr.predict(x_test) 

In [13]:
#calculate rmse
error1 = sqrt(mean_squared_error(y_test, dtr_pred)) 
#calculate mae
error2 = mean_absolute_error(y_test, dtr_pred)
print('RMSE value is:', np.round(error1, 4))
print('MAE value is:', np.round(error2, 4))

RMSE value is: 1.0619
MAE value is: 0.7875


# RandomForest Model

In [14]:

grid_param = {'n_estimators': [7,8,9,10,12,15], 'max_depth':  [5,6,7], \
              'min_samples_split': [100,200,300], 'max_leaf_nodes': [4,5,6,7]}
rf = RandomForestRegressor(random_state=0)
rf_h = RandomizedSearchCV(estimator = rf, param_distributions = grid_param, cv=5, verbose = 3, random_state= 42 )
rf_h.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=10, min_samples_split=100, max_leaf_nodes=7, max_depth=7 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=10, min_samples_split=100, max_leaf_nodes=7, max_depth=7, score=0.629, total=   0.2s
[CV] n_estimators=10, min_samples_split=100, max_leaf_nodes=7, max_depth=7 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=10, min_samples_split=100, max_leaf_nodes=7, max_depth=7, score=0.650, total=   0.2s
[CV] n_estimators=10, min_samples_split=100, max_leaf_nodes=7, max_depth=7 
[CV]  n_estimators=10, min_samples_split=100, max_leaf_nodes=7, max_depth=7, score=0.694, total=   0.2s
[CV] n_estimators=10, min_samples_split=100, max_leaf_nodes=7, max_depth=7 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=10, min_samples_split=100, max_leaf_nodes=7, max_depth=7, score=0.570, total=   0.2s
[CV] n_estimators=10, min_samples_split=100, max_leaf_nodes=7, max_depth=7 
[CV]  n_estimators=10, min_samples_split=100, max_leaf_nodes=7, max_depth=7, score=0.653, total=   0.2s
[CV] n_estimators=10, min_samples_split=300, max_leaf_nodes=7, max_depth=7 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=10, min_samples_split=300, max_leaf_nodes=7, max_depth=7, score=0.629, total=   0.2s
[CV] n_estimators=10, min_samples_split=300, max_leaf_nodes=7, max_depth=7 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=10, min_samples_split=300, max_leaf_nodes=7, max_depth=7, score=0.650, total=   0.3s
[CV] n_estimators=10, min_samples_split=300, max_leaf_nodes=7, max_depth=7 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=10, min_samples_split=300, max_leaf_nodes=7, max_depth=7, score=0.694, total=   0.2s
[CV] n_estimators=10, min_samples_split=300, max_leaf_nodes=7, max_depth=7 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=10, min_samples_split=300, max_leaf_nodes=7, max_depth=7, score=0.570, total=   0.2s
[CV] n_estimators=10, min_samples_split=300, max_leaf_nodes=7, max_depth=7 
[CV]  n_estimators=10, min_samples_split=300, max_leaf_nodes=7, max_depth=7, score=0.653, total=   0.2s

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)



[CV] n_estimators=7, min_samples_split=300, max_leaf_nodes=7, max_depth=6 
[CV]  n_estimators=7, min_samples_split=300, max_leaf_nodes=7, max_depth=6, score=0.630, total=   0.1s
[CV] n_estimators=7, min_samples_split=300, max_leaf_nodes=7, max_depth=6 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=7, min_samples_split=300, max_leaf_nodes=7, max_depth=6, score=0.652, total=   0.1s
[CV] n_estimators=7, min_samples_split=300, max_leaf_nodes=7, max_depth=6 
[CV]  n_estimators=7, min_samples_split=300, max_leaf_nodes=7, max_depth=6, score=0.693, total=   0.2s
[CV] n_estimators=7, min_samples_split=300, max_leaf_nodes=7, max_depth=6 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=7, min_samples_split=300, max_leaf_nodes=7, max_depth=6, score=0.568, total=   0.2s
[CV] n_estimators=7, min_samples_split=300, max_leaf_nodes=7, max_depth=6 
[CV]  n_estimators=7, min_samples_split=300, max_leaf_nodes=7, max_depth=6, score=0.653, total=   0.1s
[CV] n_estimators=10, min_samples_split=300, max_leaf_nodes=5, max_depth=7 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=10, min_samples_split=300, max_leaf_nodes=5, max_depth=7, score=0.608, total=   0.2s
[CV] n_estimators=10, min_samples_split=300, max_leaf_nodes=5, max_depth=7 
[CV]  n_estimators=10, min_samples_split=300, max_leaf_nodes=5, max_depth=7, score=0.625, total=   0.2s
[CV] n_estimators=10, min_samples_split=300, max_leaf_nodes=5, max_depth=7 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=10, min_samples_split=300, max_leaf_nodes=5, max_depth=7, score=0.676, total=   0.2s
[CV] n_estimators=10, min_samples_split=300, max_leaf_nodes=5, max_depth=7 
[CV]  n_estimators=10, min_samples_split=300, max_leaf_nodes=5, max_depth=7, score=0.545, total=   0.2s
[CV] n_estimators=10, min_samples_split=300, max_leaf_nodes=5, max_depth=7 
[CV]  n_estimators=10, min_samples_split=300, max_leaf_nodes=5, max_depth=7, score=0.631, total=   0.2s

  estimator.fit(X_train, y_train, **fit_params)



[CV] n_estimators=10, min_samples_split=300, max_leaf_nodes=4, max_depth=5 
[CV]  n_estimators=10, min_samples_split=300, max_leaf_nodes=4, max_depth=5, score=0.586, total=   0.2s
[CV] n_estimators=10, min_samples_split=300, max_leaf_nodes=4, max_depth=5 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=10, min_samples_split=300, max_leaf_nodes=4, max_depth=5, score=0.604, total=   0.2s
[CV] n_estimators=10, min_samples_split=300, max_leaf_nodes=4, max_depth=5 
[CV]  n_estimators=10, min_samples_split=300, max_leaf_nodes=4, max_depth=5, score=0.650, total=   0.2s
[CV] n_estimators=10, min_samples_split=300, max_leaf_nodes=4, max_depth=5 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=10, min_samples_split=300, max_leaf_nodes=4, max_depth=5, score=0.531, total=   0.2s
[CV] n_estimators=10, min_samples_split=300, max_leaf_nodes=4, max_depth=5 
[CV]  n_estimators=10, min_samples_split=300, max_leaf_nodes=4, max_depth=5, score=0.612, total=   0.2s
[CV] n_estimators=10, min_samples_split=100, max_leaf_nodes=6, max_depth=6 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=10, min_samples_split=100, max_leaf_nodes=6, max_depth=6, score=0.619, total=   0.2s
[CV] n_estimators=10, min_samples_split=100, max_leaf_nodes=6, max_depth=6 
[CV]  n_estimators=10, min_samples_split=100, max_leaf_nodes=6, max_depth=6, score=0.637, total=   0.2s

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)



[CV] n_estimators=10, min_samples_split=100, max_leaf_nodes=6, max_depth=6 
[CV]  n_estimators=10, min_samples_split=100, max_leaf_nodes=6, max_depth=6, score=0.685, total=   0.2s
[CV] n_estimators=10, min_samples_split=100, max_leaf_nodes=6, max_depth=6 
[CV]  n_estimators=10, min_samples_split=100, max_leaf_nodes=6, max_depth=6, score=0.560, total=   0.2s
[CV] n_estimators=10, min_samples_split=100, max_leaf_nodes=6, max_depth=6 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=10, min_samples_split=100, max_leaf_nodes=6, max_depth=6, score=0.640, total=   0.2s
[CV] n_estimators=9, min_samples_split=100, max_leaf_nodes=6, max_depth=7 
[CV]  n_estimators=9, min_samples_split=100, max_leaf_nodes=6, max_depth=7, score=0.619, total=   0.1s
[CV] n_estimators=9, min_samples_split=100, max_leaf_nodes=6, max_depth=7 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=9, min_samples_split=100, max_leaf_nodes=6, max_depth=7, score=0.638, total=   0.2s
[CV] n_estimators=9, min_samples_split=100, max_leaf_nodes=6, max_depth=7 
[CV]  n_estimators=9, min_samples_split=100, max_leaf_nodes=6, max_depth=7, score=0.685, total=   0.1s
[CV] n_estimators=9, min_samples_split=100, max_leaf_nodes=6, max_depth=7 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=9, min_samples_split=100, max_leaf_nodes=6, max_depth=7, score=0.560, total=   0.1s
[CV] n_estimators=9, min_samples_split=100, max_leaf_nodes=6, max_depth=7 
[CV]  n_estimators=9, min_samples_split=100, max_leaf_nodes=6, max_depth=7, score=0.638, total=   0.2s
[CV] n_estimators=8, min_samples_split=100, max_leaf_nodes=4, max_depth=6 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=8, min_samples_split=100, max_leaf_nodes=4, max_depth=6, score=0.586, total=   0.1s
[CV] n_estimators=8, min_samples_split=100, max_leaf_nodes=4, max_depth=6 
[CV]  n_estimators=8, min_samples_split=100, max_leaf_nodes=4, max_depth=6, score=0.605, total=   0.1s
[CV] n_estimators=8, min_samples_split=100, max_leaf_nodes=4, max_depth=6 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=8, min_samples_split=100, max_leaf_nodes=4, max_depth=6, score=0.650, total=   0.1s
[CV] n_estimators=8, min_samples_split=100, max_leaf_nodes=4, max_depth=6 
[CV]  n_estimators=8, min_samples_split=100, max_leaf_nodes=4, max_depth=6, score=0.528, total=   0.1s
[CV] n_estimators=8, min_samples_split=100, max_leaf_nodes=4, max_depth=6 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=8, min_samples_split=100, max_leaf_nodes=4, max_depth=6, score=0.612, total=   0.1s
[CV] n_estimators=8, min_samples_split=200, max_leaf_nodes=7, max_depth=7 
[CV]  n_estimators=8, min_samples_split=200, max_leaf_nodes=7, max_depth=7, score=0.629, total=   0.1s
[CV] n_estimators=8, min_samples_split=200, max_leaf_nodes=7, max_depth=7 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=8, min_samples_split=200, max_leaf_nodes=7, max_depth=7, score=0.651, total=   0.1s
[CV] n_estimators=8, min_samples_split=200, max_leaf_nodes=7, max_depth=7 
[CV]  n_estimators=8, min_samples_split=200, max_leaf_nodes=7, max_depth=7, score=0.693, total=   0.1s
[CV] n_estimators=8, min_samples_split=200, max_leaf_nodes=7, max_depth=7 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=8, min_samples_split=200, max_leaf_nodes=7, max_depth=7, score=0.568, total=   0.1s
[CV] n_estimators=8, min_samples_split=200, max_leaf_nodes=7, max_depth=7 
[CV]  n_estimators=8, min_samples_split=200, max_leaf_nodes=7, max_depth=7, score=0.653, total=   0.1s
[CV] n_estimators=8, min_samples_split=300, max_leaf_nodes=7, max_depth=6 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=8, min_samples_split=300, max_leaf_nodes=7, max_depth=6, score=0.629, total=   0.1s
[CV] n_estimators=8, min_samples_split=300, max_leaf_nodes=7, max_depth=6 
[CV]  n_estimators=8, min_samples_split=300, max_leaf_nodes=7, max_depth=6, score=0.651, total=   0.1s
[CV] n_estimators=8, min_samples_split=300, max_leaf_nodes=7, max_depth=6 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=8, min_samples_split=300, max_leaf_nodes=7, max_depth=6, score=0.693, total=   0.1s
[CV] n_estimators=8, min_samples_split=300, max_leaf_nodes=7, max_depth=6 
[CV]  n_estimators=8, min_samples_split=300, max_leaf_nodes=7, max_depth=6, score=0.568, total=   0.1s
[CV] n_estimators=8, min_samples_split=300, max_leaf_nodes=7, max_depth=6 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=8, min_samples_split=300, max_leaf_nodes=7, max_depth=6, score=0.653, total=   0.1s


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    8.1s finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=0),
                   param_distributions={'max_depth': [5, 6, 7],
                                        'max_leaf_nodes': [4, 5, 6, 7],
                                        'min_samples_split': [100, 200, 300],
                                        'n_estimators': [7, 8, 9, 10, 12, 15]},
                   random_state=42, verbose=3)

In [15]:
rf_pred = rf_h.predict(x_test) 

In [16]:
#calculate rmse
error1 = sqrt(mean_squared_error(y_test, rf_pred)) 
#calculate mae
error2 = mean_absolute_error(y_test, rf_pred)
print('RMSE value is:', np.round(error1, 4))
print('MAE value is:', np.round(error2, 4))

RMSE value is: 0.9434
MAE value is: 0.7152


Linear Regression Model gives the best accuracy: 0.9434. 

In [17]:
import pickle

In [18]:
# Importing pickle file
filename = 'hybrid_b.pickle'
pickle.dump(hybrid_b, open(filename, 'wb'))