## DataPreprocess

### Movielen-100k

In [1]:
import numpy as np
import pandas as pd
import os
import pickle #store model

In [2]:
import collections
import itertools
import random
def get_rate():
    ratings = []
    with open('./ml-100k/u.data')as f:
        for line in itertools.islice(f, 0, None):
            user,movie,rate = line.strip('\r\n').split('\t')[:3]
            ratings.append([int(user),int(movie),int(rate)])
    return ratings
res = get_rate()
print(len(res))

100000


In [3]:
def train_test_split(ratings,test_size = 0.2):

    train, test = collections.defaultdict(dict), collections.defaultdict(dict)
    
    trainset_len = 0
    testset_len = 0
    

    for user,movie,rate in ratings:
        if random.random() < test_size:
            test[user][movie] = int(rate)
            testset_len += 1
        else:
            train[user][movie] = int(rate)
            trainset_len += 1
    return train,test
trainset,testset = train_test_split(res)

In [4]:
import collections
import math
from collections import defaultdict

def cal_user_sim(trainset,iif = False):
    print('Building movie-user table...')
    movie_user = collections.defaultdict(set)
    for user,movie in trainset.items():
        for m in movie:
            movie_user[m].add(user)
    print('Build movie-user table success!')
    
    movie_count = len(movie_user)
    
    user_sim_matrix = {}
    print('Build user co-rated movies matrix ...')
    for movie, users in movie_user.items():
            for u in users:
                for v in users:
                    if u == v:
                        continue
                    user_sim_matrix.setdefault(u, defaultdict(int))
                    user_sim_matrix[u].setdefault(v, 0)
                    if iif:
                        user_sim_matrix[u][v] += 1 / math.log(1 + len(users))
                    else:    
                        user_sim_matrix[u][v] += 1
    print('Build user co-rated movies matrix success!')
    
    
    print('Calculating user similarity matrix ...')
    for u, related_users in user_sim_matrix.items():
        for v, count in related_users.items():
            ##print(u,v)
            user_sim_matrix[u][v] = count / math.sqrt(len(trainset[u]) * len(trainset[v]))
    print('Calculate user similarity matrix success!')
    
    return user_sim_matrix,movie_count
user_sim_matrix1,movie_count1 = cal_user_sim(trainset)
user_sim_matrix2,movie_count2 = cal_user_sim(trainset,iif = True)
# print(user_sim_matrix[393])

Building movie-user table...
Build movie-user table success!
Build user co-rated movies matrix ...
Build user co-rated movies matrix success!
Calculating user similarity matrix ...
Calculate user similarity matrix success!
Building movie-user table...
Build movie-user table success!
Build user co-rated movies matrix ...
Build user co-rated movies matrix success!
Calculating user similarity matrix ...
Calculate user similarity matrix success!


In [5]:
def movie_popular(trainset):
    popularity_list = defaultdict(int)

    for user, movies in trainset.items():
        for m in movies:
            popularity_list[m] += 1

    return popularity_list
popularity = movie_popular(trainset)

In [6]:
from operator import itemgetter
def recommend(trainset, user_sim_matrix, user, nsimuser = 20, nrecmov = 10):
    K = nsimuser 
    N = nrecmov
    rank = {}
    watched_movie = trainset[user]
    for v, wuv in sorted(user_sim_matrix[user].items(),key = itemgetter(1),reverse = True)[0:K]:
        for movie,rating in trainset[v].items():
            if movie in watched_movie:
                continue
            rank.setdefault(movie,0)
            rank[movie] += wuv*rating
            
    return sorted(rank.items(),key = itemgetter(1),reverse=True)[0:N]
rec = recommend(trainset, user_sim_matrix1, 1)
movie_data = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1')
print("---------Recommended Movies---------" + '\n')
print([movie_data.iloc[r[0] - 2][1] for r in rec])



---------Recommended Movies---------

['Star Wars (1977)', 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)', 'Pulp Fiction (1994)', "Schindler's List (1993)", 'Fargo (1996)', 'Dead Poets Society (1989)', "One Flew Over the Cuckoo's Nest (1975)", 'Seven (Se7en) (1995)', 'GoodFellas (1990)', 'Hudsucker Proxy, The (1994)']


In [7]:

def evaluate(trainset, testset, movie_count, iif = False, nrecmov = 10):
    print("Evaluation start ...")
    N = nrecmov
    
    hit = 0
    rec_count = 0
    test_count = 0
    
    all_rec_movies = set()
    popular_sum = 0
    for i,user in enumerate(trainset):
        test_movies = testset.get(user,{})
        if not iif:
            rec_movies = recommend(trainset, user_sim_matrix1, user)
        else:
            rec_movies = recommend(trainset, user_sim_matrix2, user)
        for movie, w in rec_movies:
            if movie in test_movies:
                hit += 1
            all_rec_movies.add(movie)
            popular_sum += math.log(1 + popularity[movie])
        rec_count += N
        test_count += len(test_movies)
    precision = hit / (1.0 * rec_count)
    recall = hit / (1.0 * test_count)
    coverage = len(all_rec_movies) / (1.0 * movie_count)
    popularity_score = popular_sum / (1.0 * rec_count)
    print('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f\n' %
          (precision, recall, coverage, popularity_score))
evaluate(trainset, testset, movie_count1,)
evaluate(trainset, testset, movie_count2, iif = True)

Evaluation start ...
precision=0.3131	recall=0.1470	coverage=0.2118	popularity=5.4123

Evaluation start ...
precision=0.3150	recall=0.1479	coverage=0.2203	popularity=5.3870



In [8]:
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import collections
from collections import defaultdict
import itertools
import math

# Read data files:
train_data = pd.read_csv('ml-100k/ua.base', sep='\t')
test_data = pd.read_csv('ml-100k/ua.test', sep='\t')

# Create matrices with ratings
train, test = collections.defaultdict(dict), collections.defaultdict(dict)

for row in train_data.itertuples():
    train[row[1]][row[2]] = row[3]
    
for row in test_data.itertuples():
    test[row[1]][row[2]] = row[3]
#I changed to the same train test set for compare USER and ITEM
train,test = train_test_split(res)

In [9]:
# Calculate popularity of each movie in data set
def cal_popularity(trainset):
    popularity_list = defaultdict(int)

    for user, movies in trainset.items():
        for m in movies:
            popularity_list[m] += 1

    return popularity_list

In [10]:
# Calculate movie similarity matrix
def cal_item_sim(trainset, iuf = False):
    popularity = cal_popularity(trainset)

    sim_matrix = {}
    for user, movies in trainset.items():
        for m1 in movies:
            sim_matrix.setdefault(m1, defaultdict(int))
            for m2 in movies:
                if m1 == m2:
                    continue
                if not iuf:
                    sim_matrix[m1][m2] += 1
                else:# if a person views a lot of movies, items' similarity will be lower.
                    sim_matrix[m1][m2] += 1 / math.log(1 + len(movies))
    
    for m1, similar_movies in sim_matrix.items():
        for m2, score in similar_movies.items():
            sim_matrix[m1][m2] = score / math.sqrt(popularity[m1] * popularity[m2])

    return sim_matrix, popularity

item_sim_matrix1, popularity = cal_item_sim(train)
item_sim_matrix2, popularity = cal_item_sim(train,iuf = True)

In [11]:
from operator import itemgetter

# Recommend n_rec number of movies for specified user
def recommend(trainset, movie_sim_mat, user, n_sim = 20, n_rec = 10):
    scores = defaultdict(int)
    user_movies = trainset[user]
    for movie, rating in user_movies.items():
        for similar_movie, similarity in sorted(movie_sim_mat[movie].items(),
                                                       key=itemgetter(1), reverse=True)[0:n_sim]:
            if similar_movie in user_movies:
                continue
            scores[similar_movie] += similarity * rating
    return sorted(scores.items(), key=itemgetter(1), reverse=True)[0:n_rec]

rec = recommend(train, item_sim_matrix1, 1)
movie_data = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1')
print("---------Recommended Movies---------" + '\n')
print([movie_data.iloc[r[0] - 2][1] for r in rec])

---------Recommended Movies---------

['Raiders of the Lost Ark (1981)', 'E.T. the Extra-Terrestrial (1982)', 'Back to the Future (1985)', 'Speed (1994)', 'Groundhog Day (1993)', 'Alien (1979)', 'Scream of Stone (Schrei aus Stein) (1991)', 'Batman (1989)', 'Die Hard: With a Vengeance (1995)', 'Star Trek: The Wrath of Khan (1982)']


In [12]:
# Test algorithm performance
def evaluate(trainset, testset, iuf = False, n_rec = 10):
    print("Evaluation start ...")
    matches = 0
    rec_count = 0
    test_count = 0
    all_rec_movies = set()
    popular_sum = 0

    for i, user in enumerate(trainset):
        test_movies = testset.get(user, {})
        if not iuf:
            rec_movies = recommend(trainset, item_sim_matrix1, user)  # type:list
        else:
            rec_movies = recommend(trainset, item_sim_matrix2, user) 
        for movie, score in rec_movies:
            if movie in test_movies:
                matches += 1
            all_rec_movies.add(movie)
            popular_sum += math.log(1 + popularity[movie])
        rec_count += n_rec
        test_count += len(test_movies)
    precision = matches / (1.0 * rec_count)
    recall = matches / (1.0 * test_count)
    coverage = len(all_rec_movies) / (1.0 * len(popularity))
    popularity_score = popular_sum / (1.0 * rec_count)

    print('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f\n' %
          (precision, recall, coverage, popularity_score))
    
evaluate(train, test)
evaluate(train,test,iuf=True)

Evaluation start ...
precision=0.3002	recall=0.1415	coverage=0.1245	popularity=5.5230

Evaluation start ...
precision=0.3137	recall=0.1479	coverage=0.1172	popularity=5.5403



Model Base

In [13]:
import random
import math
import numpy as np

def train_test_split(ratings,test_size = 0.2):

    train, test = collections.defaultdict(dict), collections.defaultdict(dict)
    
    trainset_len = 0
    testset_len = 0
    

    for user,movie,rate in ratings:
        if random.random() < test_size:
            test[user][movie] = int(rate)
            testset_len += 1
        else:
            train[user][movie] = int(rate)
            trainset_len += 1
    return train,test
trainset,testset = train_test_split(res)

In [14]:
def movie_popular(trainset):
    popularity_list = defaultdict(int)
    maxnum = 0
    for user, movies in trainset.items():
        for m in movies:
            if m > maxnum:
                maxnum = m
            popularity_list[m] += 1
    return popularity_list,maxnum
popularity_list,maxnum = movie_popular(trainset)
userlen = len(trainset)

In [15]:
def newList(data,movieLen = 1682,userLen = 943):
        mateData = np.zeros((userLen, movieLen), dtype=float)
        for u, item in data.items():
            for n, r in item.items():
                mateData[int(u)-1][int(n)-1] = float(int(r))
        return mateData
mateData = newList(trainset)

In [16]:
def gradDes(dataMatrix,k,alpha,lam,maxCycles):
    m, n = np.shape(dataMatrix)
    p = np.mat(np.random.random((m, k)))
    q = np.mat(np.random.random((k, n)))
    
    for step in range(maxCycles):
        for i in range(m):
            for j in range(n):
                if dataMatrix[i, j] > 0:
                    error = dataMatrix[i, j]
                    for r in range(k):
                        error = error - p[i, r] * q[r, j]
                    for r in range(k):
                        p[i, r] = p[i, r] + alpha * (2 * error * q[r, j] - lam * p[i, r])
                        q[r, j] = q[r, j] + alpha * (2 * error * p[i, r] - lam * q[r, j])

        loss = 0.0
        for i in range(m):
            for j in range(n):
                if dataMatrix[i, j] > 0:
                    error = 0.0
                    for r in range(k):
                        error = error + p[i, r] * q[r, j]
                    # calculate loss function
                    loss = (dataMatrix[i, j] - error) * (dataMatrix[i, j] - error)
                    for r in range(k):
                        loss = loss + lam * (p[i, r] * p[i, r] + q[r, j] * q[r, j]) / 2
        print('This is step: ',step,'The loss is: ',loss)
        if loss < 0.001:
            break
    return p,q
p, q = gradDes(mateData, 5, 0.001, 0.01, 50)

This is step:  0 The loss is:  1.8421470082634357
This is step:  1 The loss is:  1.2878987377553603
This is step:  2 The loss is:  1.0475120043532082
This is step:  3 The loss is:  0.9175784077669128
This is step:  4 The loss is:  0.8334610275911499
This is step:  5 The loss is:  0.7699182444423589
This is step:  6 The loss is:  0.7164167837338128
This is step:  7 The loss is:  0.6684649325831546
This is step:  8 The loss is:  0.624128058045996
This is step:  9 The loss is:  0.5825592586009712
This is step:  10 The loss is:  0.5433709130920916
This is step:  11 The loss is:  0.5063660086769591
This is step:  12 The loss is:  0.4714243343515185
This is step:  13 The loss is:  0.4384551633506663
This is step:  14 The loss is:  0.40737825402614236
This is step:  15 The loss is:  0.37811674006500295
This is step:  16 The loss is:  0.35059488375746145
This is step:  17 The loss is:  0.32473771734788387
This is step:  18 The loss is:  0.300471334256457
This is step:  19 The loss is:  0.27772

In [1]:
def predict(dataMatrix, user, p, q, n_rec = 10):
    n = np.shape(dataMatrix)[1]
    predict = {}
    for j in range(n):
        if dataMatrix[int(user)-1, j] == 0:
            predict[j] = (p[int(user)-1,] * q[:, j])[0, 0]

    return sorted(predict.items(), key=lambda d: d[1], reverse=True)[0:n_rec]
rec = predict(mateData, 1,p,q)
movie_data = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1')
print("---------Recommended Movies---------" + '\n')
print([movie_data.iloc[r[0] - 2][1] for r in rec])

NameError: name 'mateData' is not defined

In [18]:
def evaluate(mateData, trainset, testset, n_rec = 10):
    print("Evaluation start ...")
    matches = 0
    rec_count = 0
    test_count = 0
    all_rec_movies = set()
    popular_sum = 0

    for i, user in enumerate(trainset):
        test_movies = testset.get(user, {})
        rec_movies = predict(mateData, user, p, q, n_rec = 10) 
        for movie, score in rec_movies:
            if movie in testset:
                matches += 1
            all_rec_movies.add(movie)
            popular_sum += math.log(1 + popularity[movie])
        rec_count += n_rec
        test_count += len(test_movies)
    precision = matches / (1.0 * rec_count)
    recall = matches / (1.0 * test_count)
    coverage = len(all_rec_movies) / (1.0 * len(popularity))
    popularity_score = popular_sum / (1.0 * rec_count)

    print('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f\n' %
          (precision, recall, coverage, popularity_score))

In [19]:
evaluate(mateData, trainset, testset, n_rec = 10)

Evaluation start ...
precision=0.8528	recall=0.4031	coverage=0.0843	popularity=3.9963

