## Movie Recommendation System Models

In [1]:
#Import necessary libraries
import numpy as np
import pandas as pd
from surprise import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV


In [2]:
#Import data into a DataFrame and drop unnecessary columns 
df = pd.read_csv('cleaneddata', index_col=False)
df2 = df[['userId', 'movieId', 'rating']]

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df2.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [7]:
#Check sparsity of matrix
numratings = len(df2['rating'])
numusers = len(df2['userId'].unique())
numitems = len(df2['movieId'].unique())

sparse = 1 - (numratings / (numusers*numitems))
sparse

0.9829821819213007

Our matrix is very sparse, which could negatively impact our model results. In order to improve this issue, we will remove any users that have rated less than 200 movies. 

In [4]:
#Remove users who have rated less than 200 movies
df3 = df2.groupby('userId').filter(lambda x : len(x)>200)
df4 = df3.groupby('movieId').filter(lambda x : len(x)>10)
df.to_csv('d4')
df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47232 entries, 0 to 100817
Data columns (total 3 columns):
userId     47232 non-null int64
movieId    47232 non-null int64
rating     47232 non-null float64
dtypes: float64(1), int64(2)
memory usage: 1.4 MB


In [24]:
len(df4['userId'].unique())

133

In [6]:
#Check sparsity of new matrix
numratings = len(df4['rating'])
numusers = len(df4['userId'].unique())
numitems = len(df4['movieId'].unique())

sparse = 1 - (numratings / (numusers*numitems))
sparse

0.7931695867508024

This result looks pretty good; our resulting matrix is much less sparse, and is less than 95%, so we hope to see improvements in our SVD models. Let's begin modeling and investigate our results.

In [7]:
#Look at the distribution of ratings again 
df4.rating.value_counts()

4.0    12991
3.0     9065
3.5     6970
5.0     5598
4.5     4265
2.0     3239
2.5     2911
1.0     1030
1.5      716
0.5      447
Name: rating, dtype: int64

In [8]:
#Instansiate reader and data 
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df4, reader) 

In [9]:
#Train test split 
trainset, testset = train_test_split(data, test_size=.2)

In [10]:
# Print number of uses and items for the trainset 
print('Number of users in train set : ', trainset.n_users, '\n')
print('Number of items in train set : ', trainset.n_items, '\n')


Number of users in train set :  133 

Number of items in train set :  1717 



### Baseline Model

In [11]:
#Instansiate a baseline model using KNNBaseline 
baseline = KNNBaseline(random_state=42)

In [12]:
#Fit model on the trainset 
baseline.fit(trainset)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7fe7b91be2b0>

In [13]:
#Predict on the test set 
baselinepreds = baseline.test(testset)

In [14]:
#Check RMSE and MAE results 
accuracy.rmse(baselinepreds)
accuracy.mae(baselinepreds)

RMSE: 0.8090
MAE:  0.6230


0.6230389428536773

In [15]:
#Run 3-fold cross validation on the data and print results 
cv_baseline = cross_validate(baseline, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8221  0.8133  0.8120  0.8158  0.0045  
MAE (testset)     0.6301  0.6231  0.6228  0.6253  0.0033  
Fit time          0.08    0.06    0.07    0.07    0.01    
Test time         1.28    1.10    1.17    1.18    0.07    


In [16]:
# Print out the RMSE score for each fold 
for i in cv_baseline.items():
    print(i)

('test_rmse', array([0.82205868, 0.81332741, 0.81203809]))
('test_mae', array([0.63006249, 0.62313599, 0.6228419 ]))
('fit_time', (0.07831215858459473, 0.05973505973815918, 0.06706714630126953))
('test_time', (1.2760586738586426, 1.104356288909912, 1.1722657680511475))


In [17]:
#Find the average test RMSE from the 3-Fold cross-validation
np.mean(cv_baseline['test_rmse'])

0.8158080602830129

### Model 1

In [18]:
#Set parameters for GridSearch on SVD model 
parameters = {'n_factors': [20, 50, 80],
             'reg_all': [0.04, 0.06],
             'n_epochs': [10, 20, 30],
             'lr_all': [.002, .005, .01]}
gridsvd = GridSearchCV(SVD, param_grid=parameters, n_jobs=-1)

In [19]:
#Fit SVD model on data
gridsvd.fit(data)

In [20]:
#Print best score and best parameters from the GridSearch 
print(gridsvd.best_score)
print(gridsvd.best_params)

{'rmse': 0.7983138047656222, 'mae': 0.6111846045487044}
{'rmse': {'n_factors': 80, 'reg_all': 0.06, 'n_epochs': 30, 'lr_all': 0.01}, 'mae': {'n_factors': 80, 'reg_all': 0.06, 'n_epochs': 30, 'lr_all': 0.01}}


In [21]:
#Reinstantiate the model with the best parameters fromGridSearch 
svdtuned = SVD(n_factors=80,
               reg_all=0.06,
               n_epochs=30,
               lr_all=0.01)

In [22]:
#Fit and predict the model 
svdtuned.fit(trainset)
svdpreds = svdtuned.test(testset)

In [23]:
#Print RMSE and MAE results 
accuracy.rmse(svdpreds)
accuracy.mae(svdpreds)

RMSE: 0.7974
MAE:  0.6134


0.6134127853460452

In [24]:
#Perform 3-Fold cross validation for SVD tuned model
cv_svd_tuned = cross_validate(svdtuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8078  0.8133  0.8033  0.8081  0.0041  
MAE (testset)     0.6189  0.6237  0.6165  0.6197  0.0030  
Fit time          2.23    2.40    2.29    2.30    0.07    
Test time         0.12    0.11    0.13    0.12    0.01    


In [25]:
#Display the results for all 3-folds 
for i in cv_svd_tuned.items():
    print(i)

('test_rmse', array([0.8077734 , 0.81329083, 0.80329865]))
('test_mae', array([0.61888004, 0.6237021 , 0.61648926]))
('fit_time', (2.225506067276001, 2.396040916442871, 2.289271831512451))
('test_time', (0.11564493179321289, 0.11256980895996094, 0.1343250274658203))


In [26]:
# Print out the average RMSE score for the test set
np.mean(cv_svd_tuned['test_rmse'])

0.8081209584157191

### Model 2

In [27]:
# Set parameters to be used in KNN models 
knn_params = {'name': ['cosine', 'pearson'],
              'user_based':[True, False], 
              'min_support':[True, False],
            'min_k' : [1, 2]}

In [28]:
# Apply GridSearch to the KNN Basic model to identify the best parameters
gsknnbasic = GridSearchCV(KNNBasic, knn_params, measures=['rmse', 'mae'], cv=3)
gsknnbasic.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [29]:
#Display the best scores and parameters from GridSearch
print(gsknnbasic.best_score)
print(gsknnbasic.best_params)

{'rmse': 0.884847778767107, 'mae': 0.6857712646134838}
{'rmse': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 1}, 'mae': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 1}}


In [30]:
#Reinstantiate the model with the best parameters from GridSearch 
knnbasic_tuned = KNNBasic(sim_options={'name': 'cosine', 
                                       'user_based': True, 
                                       'min_support':True, 
                                       'min_k':2, })

In [31]:
#Fit on the train set and predict on the test set 
knnbasic_tuned.fit(trainset)
knnbpreds = knnbasic_tuned.test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [32]:
#Print RMSE and MAE results 
accuracy.rmse(knnbpreds)
accuracy.mae(knnbpreds)

RMSE: 0.8966
MAE:  0.6992


0.6992134285971248

Another way to evalute the model is to perform a cross validation and print the resulting scores. We will explore this below:

In [33]:
#Conduct cross validation for the KNNBasic tuned model 
cv_knn_basic = cross_validate(knnbasic_tuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9152  0.9019  0.8975  0.9049  0.0075  
MAE (testset)     0.7089  0.7008  0.6988  0.7028  0.0043  
Fit time          0.03    0.03    0.03    0.03    0.00    
Test time         0.83    0.83    0.84    0.83    0.01    


In [34]:
# Print out results from the cross-valdiatoin 
for i in cv_knn_basic.items():
    print(i)

('test_rmse', array([0.91521319, 0.90191786, 0.8974969 ]))
('test_mae', array([0.70886452, 0.70079596, 0.69881621]))
('fit_time', (0.02921295166015625, 0.030929088592529297, 0.03017902374267578))
('test_time', (0.8292660713195801, 0.8253788948059082, 0.8378198146820068))


In [35]:
# Print out the average RMSE score for the test set
np.mean(cv_knn_basic['test_rmse'])

0.9048759833874648

This average of test RMSE results in our cross validation is approximately 0.97, similar to the RMSE we found above. This is a significant improvement from our baseline model which had an RMSE of 0.873.

### Model 3

In [36]:
#Apply KNN GridSearch parameters on the KNNBaseline model 
gsknnbaseline = GridSearchCV(KNNBaseline, knn_params, measures=['rmse', 'mae'], cv=3)
gsknnbaseline.fit(data)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matr

In [37]:
#Display the best score and the best parameters 
print(gsknnbaseline.best_score)
print(gsknnbaseline.best_params)

{'rmse': 0.8176944686841295, 'mae': 0.6266536384986057}
{'rmse': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 1}, 'mae': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 1}}


In [38]:
#Reinstantiate the model with the best parameters from GridSearch 
knnbaseline_tuned = KNNBaseline(sim_options={'name': 'cosine', 
                                       'user_based': True, 
                                       'min_support':True, 
                                       'min_k':2, })

In [39]:
#Fit the trainset and predict on the test set 
knnbaseline_tuned.fit(trainset)
knnbaselinepreds = knnbaseline_tuned.test(testset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [40]:
#Print the RMSE and MAE scores 
accuracy.rmse(knnbaselinepreds)
accuracy.mae(knnbaselinepreds)

RMSE: 0.8123
MAE:  0.6259


0.625944142354212

In [41]:
#Perform 3 fold cross validation 
cv_knn_baseline = cross_validate(knnbaseline_tuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8196  0.8276  0.8123  0.8198  0.0063  
MAE (testset)     0.6262  0.6328  0.6262  0.6284  0.0031  
Fit time          0.08    0.07    0.09    0.08    0.01    
Test time         1.25    1.20    1.22    1.22    0.02    


In [42]:
#Show the mean RMSE score for the test set 
np.mean(cv_knn_baseline['test_rmse'])

0.81982896491215

### Model 4

In [43]:
#Apply GridSearch to the KNNWithMeans model 
gsknnWM = GridSearchCV(KNNWithMeans, knn_params, measures=['rmse', 'mae'], cv=3)
gsknnWM.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [44]:
#Display the best score and best parameters from GridSearch 
print(gsknnWM.best_score)
print(gsknnWM.best_params)

{'rmse': 0.8240119392584893, 'mae': 0.6319216808849227}
{'rmse': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 1}, 'mae': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 1}}


In [45]:
#Reinstansiate the model with the best parameters 
knnwm_tuned = KNNWithMeans(sim_options={'name': 'cosine', 
                                       'user_based': True, 
                                       'min_support':True, 
                                       'min_k':2, })

In [46]:
#Fit on the trainset, predict on the testset 
knnwm_tuned.fit(trainset)
knnwmpreds = knnwm_tuned.test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [47]:
#Print RMSE and MAE results
accuracy.rmse(knnwmpreds)
accuracy.mae(knnwmpreds)

RMSE: 0.8197
MAE:  0.6309


0.6309305074204756

In [48]:
#Perform 3-Fold cross validation on KNNWithMeans model 
cv_knn_wm = cross_validate(knnwm_tuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8251  0.8251  0.8324  0.8275  0.0035  
MAE (testset)     0.6340  0.6310  0.6390  0.6347  0.0033  
Fit time          0.03    0.04    0.06    0.04    0.01    
Test time         0.88    1.34    1.11    1.11    0.18    


In [49]:
#Print the average RMSE score for the test set 
np.mean(cv_knn_wm['test_rmse'])

0.8275205999348221

### All results

In [50]:
#Create a dictionary for each models' results 
baselineresult = {'model': 'baseline','RMSE': accuracy.rmse(baselinepreds), 'MAE': accuracy.mae(baselinepreds), 'CV': np.mean(cv_baseline['test_rmse'])}
svdresult = {'model':'svd', 'RMSE': accuracy.rmse(svdpreds), 'MAE': accuracy.mae(svdpreds), 'CV': np.mean(cv_svd_tuned['test_rmse'])}
knnbasicresult = {'model':'knnbasic','RMSE': accuracy.rmse(knnbpreds), 'MAE': accuracy.mae(knnbpreds), 'CV': np.mean(cv_knn_basic['test_rmse'])}
knnbaselineresult = {'model':'knnbaseline','RMSE': accuracy.rmse(knnbaselinepreds), 'MAE': accuracy.mae(knnbaselinepreds), 'CV': np.mean(cv_knn_baseline['test_rmse'])}
knnwmresult = {'model':'knnwm','RMSE': accuracy.rmse(knnwmpreds), 'MAE': accuracy.mae(knnwmpreds), 'CV': np.mean(cv_knn_wm['test_rmse'])}

RMSE: 0.8090
MAE:  0.6230
RMSE: 0.7974
MAE:  0.6134
RMSE: 0.8966
MAE:  0.6992
RMSE: 0.8123
MAE:  0.6259
RMSE: 0.8197
MAE:  0.6309


In [51]:
#Combine all the results into a list 
result_list = [baselineresult, svdresult, knnbasicresult, knnbaselineresult, knnwmresult]

In [52]:
#Transform the results lists into a DataFrame 
df_results_updated = pd.DataFrame.from_dict(result_list, orient='columns')
df_results_updated = df_results_updated.set_index('model')

In [53]:
#Display the results for all of the models 
df_results_updated

Unnamed: 0_level_0,RMSE,MAE,CV
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
baseline,0.808995,0.623039,0.815808
svd,0.797356,0.613413,0.808121
knnbasic,0.89663,0.699213,0.904876
knnbaseline,0.812294,0.625944,0.819829
knnwm,0.819708,0.630931,0.827521


### Generating New Ratings 

We will create a function that generates ratings for a brand new user. We will then show how our model can use these ratings in order to make predictions. This step is important as it shows how our models and our recommendation systems can actually make predictions on new ratings!


In [5]:
#Define function that can generate new user movie ratings 
def movie_rater(movie_df,num, genre=None):
    #Create new user with userId = 1000
    userID = 1000
    
    #Create an empty list of ratings 
    rating_list = []
    
    #For all number of ratings, provide a random movie sample within the specified genre for the user to rate 
    while num > 0:
        if genre:
            movie = movie_df[movie_df['genres'].str.contains(genre)].sample(1)
        else:
            movie = movie_df.sample(1)
        print(movie)
    
    #Provide user with a prompt to rate the movie, then print the userID, movieID, then title, then append 
    #results to the rating_list 
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'userId':userID,'movieId':movie['movieId'].values[0],'title':movie['title'].values[0], 'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return rating_list  

In [6]:
dfnew = df[['userId', 'movieId', 'rating', 'title', 'genres']]

In [7]:
userrating = movie_rater(dfnew, 3, 'Action')

       userId  movieId  rating                      title  \
18664     119    87232     4.0  X-Men: First Class (2011)   

                                     genres  
18664  Action|Adventure|Sci-Fi|Thriller|War  
How do you rate this movie on a scale of 1-5, press n if you have not seen :
3
       userId  movieId  rating               title         genres
11011      68     4701     2.0  Rush Hour 2 (2001)  Action|Comedy
How do you rate this movie on a scale of 1-5, press n if you have not seen :
4
       userId  movieId  rating           title                  genres
98727     608      379     3.0  Timecop (1994)  Action|Sci-Fi|Thriller
How do you rate this movie on a scale of 1-5, press n if you have not seen :
2


In [8]:
## Display the new user ratings 
userrating

[{'userId': 1000,
  'movieId': 87232,
  'title': 'X-Men: First Class (2011)',
  'rating': '3'},
 {'userId': 1000,
  'movieId': 4701,
  'title': 'Rush Hour 2 (2001)',
  'rating': '4'},
 {'userId': 1000, 'movieId': 379, 'title': 'Timecop (1994)', 'rating': '2'}]

The new user has rated three new movies: Dawn of the Dead, The Four Musketeers, and Braveheart. Our model can now provide predictions for these ratings by adding them to our model.

In [9]:
#Add new ratings to our DataFrame
new_ratings_df = df4.append(userrating,ignore_index=True, sort=False)

In [10]:
#Drop the 'title' column so that our dataframe is ready to be put into surprise
new_ratings_df.drop(['title'], axis=1, inplace=True)

In [11]:
#Investigate new DataFrame
new_ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4
1,1,3,4
2,1,6,4
3,1,47,5
4,1,50,5


Now we will redo the same modeling process as above in order to find predictions for the above movies.

In [12]:
#Instansiate reader and data 
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(new_ratings_df, reader)

In [13]:
#Train test split 
trainset, testset = train_test_split(data, test_size=.2)

In [14]:
#Reinstantiate the model with the best parameters from GridSearch and fit on the trainset 
svdtuned2 = SVD(n_factors=80,
               reg_all=0.06,
               n_epochs=30,
               lr_all=0.01)
svdtuned2.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc01e8436a0>

In [20]:
#Find predictions for the three movies that user with userId=1000 just rated
print(svdtuned2.predict(1000,736))
print(svdtuned2.predict(1000,163))
print(svdtuned2.predict(1000,87192))


user: 1000       item: 736        r_ui = None   est = 3.16   {'was_impossible': False}
user: 1000       item: 163        r_ui = None   est = 3.68   {'was_impossible': False}
user: 1000       item: 87192      r_ui = None   est = 3.48   {'was_impossible': False}


Now we have predictions for User 1000's movies that it has rated so far. We have now seen how our model can generate new ratings, and formulate predictions for these ratings as well.

In [15]:
#Create list of unique userIds and movieIds 
userids = new_ratings_df['userId'].unique()
movieids = new_ratings_df['movieId'].unique()


In [16]:
#Create a list and append the userId, movieId, and estimated ratings 
predictions = [] 
for u in userids:
    for m in movieids:
        predicted = svdtuned2.predict(u, m)
        predictions.append([u, m, predicted[3]])

In [17]:
#Convert the list to a dataframe
estimated = pd.DataFrame(predictions)


In [18]:
#rename columns of DataFrame 
estimated.rename(columns={0: 'userId', 1: 'movieId', 2:'estimatedrating'}, inplace=True)

In [19]:
#Print the final dataFrame
estimated

Unnamed: 0,userId,movieId,estimatedrating
0,1,1,4.648733
1,1,3,3.981910
2,1,6,4.386362
3,1,47,4.731515
4,1,50,4.864031
...,...,...,...
230073,1000,8640,3.086044
230074,1000,51412,2.792345
230075,1000,85510,2.983296
230076,1000,111364,2.618949


In [22]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47232 entries, 0 to 100817
Data columns (total 3 columns):
userId     47232 non-null int64
movieId    47232 non-null int64
rating     47232 non-null float64
dtypes: float64(1), int64(2)
memory usage: 1.4 MB


In [20]:
estimated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230078 entries, 0 to 230077
Data columns (total 3 columns):
userId             230078 non-null int64
movieId            230078 non-null int64
estimatedrating    230078 non-null float64
dtypes: float64(1), int64(2)
memory usage: 5.3 MB


In [None]:
##Post Modeling EDA 

# have to double loop through all the users for each item and loop through movieID for each item 
# estimated = svdtuned.predict(1,1)[3]
# estimated

In [None]:
svdtuned.predict(1,1)
#do post model EDA on userId, movieID and estimated rating 
#is there popularity bias 
#can do a histogram of all the errors - distribution of the errors (leave to end)