In [None]:
import numpy as np
import scipy.stats
import scipy.spatial
from sklearn.cross_validation import KFold
import random
from sklearn.metrics import mean_squared_error
from math import sqrt
import math
import warnings
import sys

# Turn every warning into an exception for the rest of the process, so that
# numerical warnings raised inside library calls surface as errors that the
# calling code has to handle instead of being printed and ignored.
warnings.simplefilter("error")

# Dimensions of the dense rating / similarity matrices built in this file:
# rows are users, columns are items.  Ids are used as 1-based indices
# (index = id - 1), so user ids must lie in 1..users and item indices in
# 1..items.
users = 700
items = 9066

def readingFile(filename):
    """Load a ratings CSV file into a list of rating triples.

    The first line of the file is treated as a header and skipped.  Every
    other non-blank line is split on commas: field 0 is parsed as an integer
    user id, field 1 as an integer movie id and field 2 as a float rating
    (any further fields are ignored).  Movie ids are renumbered to
    consecutive 1-based indices in order of first appearance in the file.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of ``[user_id, movie_index, rating]`` lists in file order.
        An empty or header-only file yields an empty list.

    Raises:
        IOError: If the file cannot be opened.
        ValueError: If one of the first three fields is not numeric.
        IndexError: If a non-blank row has fewer than three fields.
    """
    print("Loading ratings data ...")

    # movie id -> 1-based index; a dict keeps lookup O(1) per row instead of
    # scanning a list for every rating.
    movie_index = {}
    data = []

    with open(filename, "r") as f:
        # Skip the header; the default avoids StopIteration on an empty file.
        next(f, None)
        for row in f:
            if not row.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            r = row.split(',')
            movie_id = int(r[1])
            # First appearance gets the next free index; later appearances
            # reuse the stored one.
            index = movie_index.setdefault(movie_id, len(movie_index) + 1)
            data.append([int(r[0]), index, float(r[2])])

    print("Finished loading ratings data.")

    return data

def similarity_user(data, n_users=None):
    """Compute the user-user Pearson correlation matrix.

    Args:
        data: Indexable collection of per-user rating vectors (one row per
            user, all rows of equal length); a value of 0 marks "no rating".
        n_users: Number of leading rows of ``data`` to compare.  Defaults to
            the module-level ``users`` constant.

    Returns:
        An ``(n_users, n_users)`` numpy array whose ``[i][j]`` entry is the
        Pearson correlation between rows ``i`` and ``j``.  The entry is 0
        when either row contains no non-zero value, when the correlation is
        NaN, or when ``scipy.stats.pearsonr`` raises.

    Raises:
        IndexError: If ``data`` has fewer than ``n_users`` rows.
    """
    if n_users is None:
        n_users = users

    print("Calculating Pearson Correlation between users ...")

    user_similarity_pearson = np.zeros((n_users, n_users))

    # Whether a row has any rating does not depend on the partner row, so
    # evaluate it once per user instead of once per pair.
    has_ratings = [bool(np.count_nonzero(data[u])) for u in range(n_users)]

    for user1 in range(n_users):
        if not has_ratings[user1]:
            continue
        # Pearson correlation is symmetric: compute the upper triangle only
        # and mirror each value into the lower triangle.
        for user2 in range(user1, n_users):
            if not has_ratings[user2]:
                continue
            try:
                r = scipy.stats.pearsonr(data[user1], data[user2])[0]
            except Exception:
                # Best effort: pearsonr can fail on degenerate input, and
                # warnings may have been escalated to exceptions by the
                # warnings filter.  Leave the similarity at 0 in that case,
                # but let KeyboardInterrupt / SystemExit propagate.
                continue
            if not math.isnan(r):
                user_similarity_pearson[user1][user2] = r
                user_similarity_pearson[user2][user1] = r

    print("Finished calculating Pearson Correlation between users.")

    return user_similarity_pearson

def _build_rating_matrix(ratings, shape):
    """Return a dense zero-initialised matrix of ``shape`` filled from ``ratings``.

    Each element of ``ratings`` is indexed as ``[user_id, item_index, rating]``
    with 1-based ids; a cell that receives no rating stays 0.
    """
    matrix = np.zeros(shape)
    for e in ratings:
        matrix[e[0] - 1][e[1] - 1] = e[2]
    return matrix


def crossValidation(data):
    """Evaluate user-based Pearson collaborative filtering with 2-fold CV.

    For every fold a rating matrix is built from the training ratings and
    each test rating is predicted as the similarity-weighted sum of the
    training ratings for that item, divided by the summed absolute
    similarity of the users who rated it.  The prediction falls back to 3.0
    when the user has no training ratings or that normaliser is not
    positive, and is clamped to the range [0, 5].

    One tab-separated line ``user, item, true rating, prediction`` is
    printed per test rating, followed by the fold RMSE; the mean RMSE over
    all folds is printed at the end.

    Args:
        data: List of ``[user_id, item_index, rating]`` entries with 1-based
            ids no larger than the module-level ``users`` / ``items``.

    Returns:
        None.  All results are printed.
    """
    k_fold = KFold(n=len(data), n_folds=2)
    shape = (users, items)

    # NOTE(review): the similarity matrix is computed once from *all*
    # ratings, so it also sees the ratings that are held out in each test
    # fold.  Confirm whether this leakage is intended; computing it from the
    # training matrix of each fold would avoid it at the cost of one
    # similarity computation per fold.
    sim_user_pearson = similarity_user(_build_rating_matrix(data, shape))
    rmse_pearson = []

    for train_indices, test_indices in k_fold:

        M = _build_rating_matrix((data[i] for i in train_indices), shape)
        test = [data[i] for i in test_indices]

        true_rate = []
        pred_rate_pearson = []

        for e in test:
            user = e[0]
            item = e[1]
            true_rate.append(e[2])

            # Default used when no similarity-based estimate is available.
            pred_pearson = 3.0

            if np.count_nonzero(M[user - 1]):

                sim_pearson = sim_user_pearson[user - 1]
                item_ratings = M[:, item - 1]

                # Normalise by the similarities of users who rated the item
                # in the training fold.
                rated = item_ratings > 0
                normal_pearson = np.sum(np.absolute(sim_pearson[rated]))

                if normal_pearson > 0:
                    pred_pearson = np.dot(sim_pearson, item_ratings) / normal_pearson

            # Negative similarities can push the estimate outside the range.
            if pred_pearson < 0:
                pred_pearson = 0

            if pred_pearson > 5:
                pred_pearson = 5

            print("\t".join([str(user), str(item), str(e[2]), str(pred_pearson)]))
            pred_rate_pearson.append(pred_pearson)

        # Compute the fold RMSE once and reuse it for storing and printing.
        fold_rmse = sqrt(mean_squared_error(true_rate, pred_rate_pearson))
        rmse_pearson.append(fold_rmse)

        print(str(fold_rmse))

    mean_rmse = sum(rmse_pearson) / float(len(rmse_pearson))
    print(str(mean_rmse))

    return

# Script entry: load the ratings from "ratings.csv" (relative path, resolved
# against the current working directory) and run the cross-validation, which
# prints its per-rating predictions and RMSE values.
recommend_data = readingFile("ratings.csv")
crossValidation(recommend_data)
