In [1]:
import io
import operator
from surprise import Dataset
from surprise import KNNWithMeans,accuracy
from surprise.model_selection import train_test_split

## Put movie IDs and names in dictionary

In [2]:
movies = dict()
data_movies = io.open('ml-100k/u.item',encoding="ISO-8859-1").read().split('\n')
for i in data_movies:
    info = i.split('|')
    if(info != ['']): 
        movie_id = (info[0])
        movie_name = (info[1])
        movies[movie_id]=[movie_name]

## Stores movies the user has already watched in a dictionary

In [3]:
movies_watched = dict()
data_users = io.open('ml-100k/u.data',encoding="ISO-8859-1").read().split('\n')
for i in data_users:
    info = i.split('\t')
    if(info != ['']):
        userID = info[0]
        movieID = info[1]
        if(userID in movies_watched):
            movies_watched[int(userID)].append(int(movieID))
        else:
            movies_watched[int(userID)] = [int(movieID)]

## Select movies that the user didn't watched

In [9]:
indications = dict()
moviesIds = set(movies.keys())
for i in movies_watched:
    moviesUserWatched= set(movies_watched[i])
    indications[i] = moviesIds.difference(moviesUserWatched)

## Use KNN algorithm to train the data

In [10]:
data = Dataset.load_builtin('ml-100k')
trainset,testset = train_test_split(data,test_size =.2)

algo = KNNWithMeans(k=4, sim_options={'name': 'cosine', 'user_based': True})

algo.fit(trainset)
predictions = algo.test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


## Select the top 5 movies 

In [11]:
def top5Movies(userId):
    indicationsByRating = dict()
    indicationsByUser = list(indications[userId])

    for i in indicationsByUser:
        indicationsByRating[i] = algo.predict(uid=str(userId),iid=str(i)).est
    indicationsByRating = sorted(indicationsByRating.items(), key=lambda x: x[1],reverse = True)
    top5 = indicationsByRating[:5]
    for w in top5:
        print(movies[w[0]],w[1])

## Show the top 3 neighbors of user

In [12]:
def top3Neighbors(uid):
    neighbors = algo.get_neighbors(iid=uid, k=3)
    return neighbors


In [13]:
top5Movies(230)
top3 = top3Neighbors(360)
print('3 users neighbors of user id 360: ' + str(top3))
print('KNN RMSE: %.3f' % accuracy.rmse(predictions,verbose = False))

([u'Good Will Hunting (1997)'], 5)
([u'Close Shave, A (1995)'], 5)
([u'Star Kid (1997)'], 5)
([u'Great Day in Harlem, A (1994)'], 5)
([u'Usual Suspects, The (1995)'], 5)
3 users neighbors of user id 360: [34, 345, 371]
KNN RMSE: 1.041
