In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.sparse as sp

from sklearn.model_selection import train_test_split as train_test_split_sklearn
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder


from surprise import Dataset
from surprise import KNNBasic, SVD
from surprise import Reader
from surprise.model_selection import cross_validate, split
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV

In [2]:
movies = pd.read_csv('/kaggle/input/movietweetings/movies.dat', delimiter='::', engine='python', header=None, names = ['Movie ID', 'Movie Title', 'Genre'])
users = pd.read_csv('/kaggle/input/movietweetings/users.dat', delimiter='::', engine='python', header=None, names = ['User ID', 'Twitter ID'])
ratings = pd.read_csv('/kaggle/input/movietweetings/ratings.dat', delimiter='::', engine='python', header=None, names = ['User ID', 'Movie ID', 'Rating', 'Rating Timestamp'])
movies.head()

Unnamed: 0,Movie ID,Movie Title,Genre
0,8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short
1,10,La sortie des usines Lumière (1895),Documentary|Short
2,12,The Arrival of a Train (1896),Documentary|Short
3,25,The Oxford and Cambridge University Boat Race ...,
4,91,Le manoir du diable (1896),Short|Horror


In [3]:
movies.set_index("Movie ID", inplace=True)

In [4]:
users.head()

Unnamed: 0,User ID,Twitter ID
0,1,139564917
1,2,17528189
2,3,522540374
3,4,475571186
4,5,215022153


In [5]:
ratings.head()

Unnamed: 0,User ID,Movie ID,Rating,Rating Timestamp
0,1,114508,8,1381006850
1,2,499549,9,1376753198
2,2,1305591,8,1376742507
3,2,1428538,1,1371307089
4,3,75314,1,1595468524


In [6]:
ratings.describe()

Unnamed: 0,User ID,Movie ID,Rating,Rating Timestamp
count,906831.0,906831.0,906831.0,906831.0
mean,35597.050164,2239036.0,7.315742,1462485000.0
std,20522.988882,2122506.0,1.852979,71823020.0
min,1.0,8.0,0.0,1362062000.0
25%,18147.0,765010.0,6.0,1397069000.0
50%,35396.0,1723121.0,8.0,1451972000.0
75%,53034.0,2938956.0,9.0,1518231000.0
max,70783.0,14740900.0,10.0,1623118000.0


In [7]:
ratings.head()

Unnamed: 0,User ID,Movie ID,Rating,Rating Timestamp
0,1,114508,8,1381006850
1,2,499549,9,1376753198
2,2,1305591,8,1376742507
3,2,1428538,1,1371307089
4,3,75314,1,1595468524


In [8]:
# make dataset
reader = Reader(rating_scale=(0, 10))
data_suprise = Dataset.load_from_df(ratings[[ 'User ID', 'Movie ID', 'Rating']], reader)

In [9]:
#param_grid = {'n_factors': [30, 50, 100]}

#gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

#gs.fit(data_suprise)

# best RMSE score
#print(gs.best_score['rmse'])
#print(gs.best_params['rmse'])

In [10]:
trainset, testset = train_test_split(data_suprise, test_size=0.2)

# will use SVD 
algo_svd = SVD(n_factors=30, n_epochs=20)

algo_svd.fit(trainset)

predictions = algo_svd.test(testset)

accuracy.rmse(predictions)

RMSE: 1.4989


1.4988807906496073

In [11]:
# sample predicitions
for item in ratings.sample(10).values:
    #print(item)
    uid = item[0] 
    iid = item[1]
    pred = algo_svd.predict(uid, iid, verbose=False)
    print("{:7.3} {:4}".format(pred.est, item[2]))

   4.77    4
   5.01    5
    3.2    2
   6.48    3
   7.81    7
    7.5   10
   6.64    7
   4.89    5
   7.14    5
   8.29    8


In [12]:
def sample_recommendation(model, data, num, user_ids):

    items = data['Movie ID'].unique()

    for user_id in user_ids:
        print()
        print("*" * 60)
        print("  Recomendations for user_id =", user_id)
        print("*" * 60)
        
        print("\n     Already seen:")
        rtg = data[data["User ID"]== user_id]["Rating"].values
        for i, item in enumerate(data[data["User ID"] == user_id]["Movie ID"].values):           
            print("{:50} user rating = {}".format(movies.loc[item, "Movie Title"], rtg[i]))
        
        #known_positives = data['Movie ID'][data.tocsr()[user_id].indices]
        scores = []
        for item in items:
            scores.append(model.predict(user_id, item).est)
        
        #print(items[:10])
        argsort_scores_idx = np.argsort(scores)[-num:]
        
        #print(top_items[:10])
        #print(scores[np.argsort(scores)])
        
        
        print("\n     Recommended:")

        for id in argsort_scores_idx:
            movie_id = items[id]
            print("{:50} predicted rating = {}".format(movies.loc[movie_id, "Movie Title"], scores[id]))
        print()
sample_recommendation(algo_svd, ratings, 10, [529, 999, 4, 15, 22, 99, 150, 222])


************************************************************
  Recomendations for user_id = 529
************************************************************

     Already seen:
Alan Partridge: Alpha Papa (2013)                  user rating = 9
The Wolf of Wall Street (2013)                     user rating = 5
The Lone Ranger (2013)                             user rating = 8
The Wolverine (2013)                               user rating = 8
Pacific Rim (2013)                                 user rating = 1
Inside Llewyn Davis (2013)                         user rating = 6
The Monuments Men (2014)                           user rating = 5
White House Down (2013)                            user rating = 5

     Recommended:
Man of Steel (2013)                                predicted rating = 8.973886577017216
Gisaengchung (2019)                                predicted rating = 8.985167757233798
The Lord of the Rings: The Return of the King (2003) predicted rating = 8.998128238029599
1