# Assignment 2: recommender System

## Code from the lecture

In [1]:
#read data to DataFrames
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('../../DATA/movielens100k/u.user', sep='|', names=u_cols, encoding = "ISO-8859-1")

r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('../../DATA/movielens100k/u.data', sep='\t', names=r_cols, encoding = "ISO-8859-1")

m_cols=['movie_id', 'title', 'release date', 'video release date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('../../DATA/movielens100k/u.item', sep='|', names=m_cols ,encoding = "ISO-8859-1" )

In [2]:
#generate matrix (this can be done more efficiently!)
R=np.zeros((np.shape(users)[0],np.shape(movies)[0]))
for i in range(np.shape(ratings)[0]):
    R[ratings['user_id'][i]-1, ratings['movie_id'][i]-1]=ratings['rating'][i]

In [3]:
def CosineDist(a,b):
    return a.dot(b)/(np.linalg.norm(a)*np.linalg.norm(b))

In [4]:
#again, implementation not efficient: better use lamda opperators 
D=np.zeros((np.shape(movies)[0],np.shape(movies)[0]))
for i in range(0,np.shape(movies)[0]):
    for j in range(0,np.shape(movies)[0]):
        if i!=j:
            D[i,j]=CosineDist(R[:,i],R[:,j])

In [35]:
def getTopN(movie_id,D,N=5):
    return D[movie_id,:].argsort()[-N:]

def getIDbyName(name):
    if np.size(movies.movie_id[movies.title.str.contains(name)]) > 0:
        m = int(movies.movie_id[movies.title.str.contains(name)][:1]), str(movies.title[movies.title.str.contains(name)][:1])
        return m[0]-1
    else:
        return -1
    
def getNameByID(IDs):
    res=movies.iloc[IDs]
    return res.title

def CII(title, D):
    if getIDbyName(title) > 0:
        print ("recommending movies for: '" + str(getNameByID(getIDbyName(title)))+"'")
        return getNameByID(getTopN(getIDbyName(title),D))[::-1]
    else:
        print ("no movie title containing " + str(title) + "found...")
        
def Score_byID(ID, D, Test):
    #print ("Hit Scores for: ", getNameByID(ID))
    res_id = getTopN(ID,D)[::-1]
    res_title = getNameByID(getTopN(ID,D))[::-1]
    res_score = Test[res_id]
    return res_id, res_title, res_score, np.mean(res_score)
    
#compute scores for all test users
def test_Score(D_train, R_test):
    userScores=[]
    for i in range(R_test.shape[0]):
        userScore=0
        userTop = np.argsort(R_test[i])[-5:]
        for e in userTop:
            res_id, res_title, res_score, av_score = Score_byID(e,D_train, R_test[i])
            userScore+=av_score
        userScores.append(userScore/(5))
    return userScores

## 4.1 Alternative Distance Measures
Implement a different distance measure, e.g. eucledian and compare the ressults.

### Euklidischer Abstand:
$\vec{p},\vec{q}$ in $R$ gilt die Distanz:<BR><BR>
$d(\vec{p},\vec{q}) := {||\vec{q}-\vec{p}||_{2}} = {\sqrt{{(q_1-p_1)²} \ + \ ... \ + \ {(q_n-p_n)²}}} $ <BR><BR>

In [9]:
def euklidDist(p,q):
    return np.linalg.norm(q-p)

In [41]:
#again, implementation not efficient: better use lamda opperators 
D2=np.zeros((np.shape(movies)[0],np.shape(movies)[0]))
for i in range(0,np.shape(movies)[0]):
    for j in range(0,np.shape(movies)[0]):
        if i!=j:
            D2[i,j]=euklidDist(R[:,i],R[:,j])

In [65]:
D3=np.zeros((np.shape(movies)[0],np.shape(movies)[0]))
mask = [(i, j) for i in range(0,np.shape(movies)[0]) for j in range(0,np.shape(movies)[0]) if i!=j]
for i, j in mask:
    D3[i,j] = euklidDist(R[:,i],R[:,j])

In [None]:
#[euklidDist(R[:,i],R[:,j]) for i in range(0,np.shape(movies)[0]) for j in range(0,np.shape(movies)[0]) if i!=j]

In [81]:
assert D2[4,4]==D3[4,4], "D2 and D3 aren't the same"

In [30]:
# Recommendations by using eucledian distance
D2[getIDbyName('Star Wars'),:].argsort()[-5:]

array([1242,  937, 1126, 1061,  989])

In [31]:
# Recommendations by using cosine distance
D[getIDbyName('Star Wars'),:].argsort()[-5:]

array([126,   0, 171, 173, 180])

In [36]:
CII('Star Wars', D)

recommending movies for: 'Star Wars (1977)'


180          Return of the Jedi (1983)
173     Raiders of the Lost Ark (1981)
171    Empire Strikes Back, The (1980)
0                     Toy Story (1995)
126              Godfather, The (1972)
Name: title, dtype: object

In [37]:
CII('Star Wars', D2)

recommending movies for: 'Star Wars (1977)'


989              Anna Karenina (1997)
1061    Four Days in September (1997)
1126          Truman Show, The (1998)
937        Smile Like Yours, A (1997)
1242               Night Flier (1997)
Name: title, dtype: object

## 4.2 Baseline Results
Implement and compare results of
* random recommendations
* always recommending the top 5 movies (over all users)

In [69]:
#split into train and test data
from sklearn.model_selection import train_test_split
R_train, R_test = train_test_split(R, test_size=0.1)

In [71]:
DCos=np.zeros((np.shape(movies)[0],np.shape(movies)[0]))
mask = [(i, j) for i in range(0,np.shape(movies)[0]) for j in range(0,np.shape(movies)[0]) if i!=j]
for i, j in mask:
    DCos[i,j] = CosineDist(R_train[:,i],R_train[:,j])
    
DEuc=np.zeros((np.shape(movies)[0],np.shape(movies)[0]))
mask = [(i, j) for i in range(0,np.shape(movies)[0]) for j in range(0,np.shape(movies)[0]) if i!=j]
for i, j in mask:
    DEuc[i,j] = euklidDist(R_train[:,i],R_train[:,j])  

  return a.dot(b)/(np.linalg.norm(a)*np.linalg.norm(b))


In [80]:
assert DCos[2,0]!=DEuc[2,0], "D2 and D3 shouldn't be the same"

## 4.3 SVD
Implement the SVD approach. Hints: 
* use numpy.linalg.svd()
* apply SVD on $D$
* plot the singular values and decide where to cut off the reconstruction
* reconstruct and evaluate $D'$
