# COEN 272 Project 2: Movie Recommendation
Brandon Quant

In [1]:
import pandas as pd
import numpy as np
import math

# Cosine Similarity

In [2]:
df = pd.read_csv('train.txt', sep="\t", header=None)

In [3]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,5,3,0,3,3,5,0,1,5,3,...,0,0,0,0,0,0,0,0,0,0
1,4,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0,0,0,0,0,0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
196,0,3,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
197,4,0,0,3,0,2,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198,1,0,0,0,0,0,4,0,5,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
'''
Calculates the cosine similarity between two vectors and deals with the following conditions:
    -If there is only 1 dimension that is similar between the two users, calculate the similarity by taking the
     euclidean distance and then taking the inverse
    -If there is no similar dimenions between users, the simliarity is 0
'''
def cos_sim(v1, v2):
    v1_new = []
    v2_new = []
    for index in range(len(v1)):
        if v1[index] != 0 and v2[index] != 0:
            v1_new.append(v1[index])
            v2_new.append(v2[index])
    if(len(v1_new) == 0 and len(v2_new) == 0):
        #print("NO COMMON RATINGS\n")
        sim = 0;
        return sim;
    
    if(len(v1_new) == 1 and len(v2_new) == 1):
        euclidean_distance = np.linalg.norm(v1_new[0]-v2_new[0]);
        sim = 1/(1+euclidean_distance)
        return sim
    
    numerator = np.dot(v1_new,v2_new)
    v_norm_v1 = np.sqrt(np.sum(np.square(v1_new)))
    v_norm_v2 = np.sqrt(np.sum(np.square(v2_new)))
    sim = numerator/(v_norm_v1 * v_norm_v2)
    return sim

In [5]:
arr = df.to_numpy()
print(arr)
print(arr.shape)

[[5 3 0 ... 0 0 0]
 [4 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [4 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [5 4 0 ... 0 0 0]]
(200, 1000)


In [6]:
test5 = pd.read_csv('test5.txt', sep=" ", header=None)
test10 = pd.read_csv('test10.txt', sep=" ", header=None)
test20 = pd.read_csv('test20.txt', sep=" ", header=None)

In [7]:
'''
Creates the 1x1000 list for each of the active users to match the dimensions of the training data lists
Matrix Dimensions: 100x1000
'''
def create_arr_for_new_users(num_ratings, test_file):
    init_ratings = [0] * 1000
    test_users = {}
    ratings = init_ratings
    counter = 0
    for index,row in test_file.iterrows():
        if row[2] == 0:
            continue;
        #print(row[0], row[1], row[2])
        user = row[0]
        index = row[1]-1
        rating = row[2]
        ratings[index] = rating
        counter += 1
        if counter == num_ratings:
            test_users[user] = ratings
            ratings = [0] * 1000
            counter = 0
    return test_users

'''
Helper functions to print out the dimensiosn and movies rated for each test user
'''
def print_dimensions_test_arr(test_arr, starting_user_id):
    print("Rows: " + str(len(test_arr)) + ", Columns: " + str(len(test_arr[starting_user_id])))
    
def print_rankings_test_arr(test_arr):
    for key in test_arr:
        print("New User: " + str(key))
        for index,rating in enumerate(test_arr[key]):
            if rating != 0:
                print("Movie " + str(index+1) + ": " + str(rating))
        print("\n")

def print_users_dict(test_arr):
    for key in test_arr:
        print(key, "->", test_arr[key], "\n")
        
def print_movie_ratings(arr):
    for index, val in enumerate(arr):
        if val != 0:
            print("Movie: " + str(index) + ", Rating: " + str(val))

In [8]:
test5_user_arr = create_arr_for_new_users(5, test5)
test10_user_arr = create_arr_for_new_users(10, test10)
test20_user_arr = create_arr_for_new_users(20, test20)

In [9]:
print_dimensions_test_arr(test5_user_arr, 201)
print_dimensions_test_arr(test10_user_arr, 301)
print_dimensions_test_arr(test20_user_arr, 401)

Rows: 100, Columns: 1000
Rows: 100, Columns: 1000
Rows: 100, Columns: 1000


In [10]:
'''
Calculating the cosine similarity for each active user with every single user in the training data. 
Row 0 is the list of similarities between active user 201 with users 1-200
Matrix Dimensions: 100x200
'''
def create_cos_sim_matrix(matrix, users_arr, training_arr):
    count = 0
    matrix = []
    vals = []
    for key in users_arr:
        for row in training_arr:
            cos_sim_value = cos_sim(users_arr[key],row)
            vals.append(cos_sim_value)
            count += 1
            if count == 200:
                matrix.append(vals)
                vals = []
                count = 0
    return matrix

def print_matrix_dimensions(matrix):
    print("Rows: " + str(len(matrix)))
    print("Columns: " + str(len(matrix[0])) + "\n")

In [11]:
cos_sim_matrix_5 = pd.DataFrame()
cos_sim_matrix_10 = pd.DataFrame()
cos_sim_matrix_20 = pd.DataFrame()
cos_sim_matrix_5 = create_cos_sim_matrix(cos_sim_matrix_5, test5_user_arr, arr)
cos_sim_matrix_10 = create_cos_sim_matrix(cos_sim_matrix_10, test10_user_arr, arr)
cos_sim_matrix_20 = create_cos_sim_matrix(cos_sim_matrix_20, test20_user_arr, arr)

In [12]:
print_matrix_dimensions(cos_sim_matrix_5)
print_matrix_dimensions(cos_sim_matrix_10)
print_matrix_dimensions(cos_sim_matrix_20)

Rows: 100
Columns: 200

Rows: 100
Columns: 200

Rows: 100
Columns: 200



In [13]:
'''
Returns the list of matrix of indexes with the highest cosine similarity in ascending order
Matrix dimensions: 100x200
'''
def index_k_most_similar(matrix):
    max_index = []
    abs_cos_sim = []
    for index,users in enumerate(matrix):
        for cos_sim_val in users:
            abs_cos_sim.append(abs(cos_sim_val))
        #print(abs_cos_sim)
        val = np.argsort(abs_cos_sim)[-200:]
        max_index.append(val.tolist())
        abs_cos_sim = []
    return max_index
   
k_most_similar_indexes_5 = index_k_most_similar(cos_sim_matrix_5)
k_most_similar_indexes_10 = index_k_most_similar(cos_sim_matrix_10)
k_most_similar_indexes_20 = index_k_most_similar(cos_sim_matrix_20)

print("Rows: " + str(len(k_most_similar_indexes_5)) + ", Cols: " + str(len(k_most_similar_indexes_5[0])))
print("Rows: " + str(len(k_most_similar_indexes_10)) + ", Cols: " + str(len(k_most_similar_indexes_10[0])))
print("Rows: " + str(len(k_most_similar_indexes_5)) + ", Cols: " + str(len(k_most_similar_indexes_10[0])))

Rows: 100, Cols: 200
Rows: 100, Cols: 200
Rows: 100, Cols: 200


In [14]:
print(k_most_similar_indexes_5[0], "\n")
print(k_most_similar_indexes_10[0], "\n")
print(k_most_similar_indexes_20[0], "\n")

[99, 132, 131, 130, 128, 126, 125, 123, 122, 121, 113, 110, 109, 106, 105, 104, 102, 101, 97, 96, 95, 92, 90, 87, 86, 85, 82, 76, 75, 71, 133, 137, 138, 139, 196, 194, 191, 190, 189, 186, 182, 179, 178, 174, 171, 169, 168, 167, 69, 165, 162, 160, 157, 155, 154, 153, 152, 150, 148, 147, 146, 145, 142, 141, 164, 66, 199, 34, 48, 50, 40, 39, 37, 52, 36, 54, 35, 33, 32, 29, 28, 59, 24, 27, 60, 64, 2, 4, 7, 22, 11, 18, 8, 20, 21, 45, 156, 188, 170, 30, 5, 38, 115, 149, 124, 117, 197, 31, 0, 81, 93, 70, 198, 49, 94, 55, 26, 58, 84, 151, 19, 158, 15, 77, 176, 9, 181, 6, 187, 72, 193, 195, 17, 134, 46, 47, 107, 119, 112, 118, 120, 43, 100, 144, 74, 114, 91, 16, 180, 44, 143, 108, 175, 159, 62, 12, 163, 103, 63, 88, 53, 80, 57, 61, 129, 185, 56, 65, 135, 67, 98, 23, 51, 68, 25, 14, 1, 41, 89, 136, 111, 192, 3, 140, 73, 127, 166, 183, 116, 83, 79, 177, 78, 161, 10, 173, 172, 42, 13, 184] 

[70, 189, 54, 186, 30, 77, 79, 142, 141, 35, 31, 102, 107, 121, 120, 182, 171, 45, 118, 87, 137, 166, 28, 1

# Calcuating the ranking using a weighted average

In [15]:
'''
Weighted rating function for cosine similarity
'''
def weighted_avg_ranking(test_file, shift_index, k_most_similar_indexes, cos_sim_matrix, k):
    numerator = 0
    denominator = 0
    output = []
    line = []
    
    for index,row in test_file.iterrows():
        temp_counter = 0
        counter = 0
        if row[2] == 0:
            #print(row[0], row[1], row[2])
            user_id = row[0] - shift_index
            movie = row[1]
            movie_id = row[1]-1
            
            indexes = k_most_similar_indexes[user_id]
            reverse_indexes = indexes.copy()
            reverse_indexes.reverse()

            for ind in reverse_indexes:
                cos_sim = cos_sim_matrix[user_id][ind]
                ranking = arr[ind][movie_id]
                if ranking != 0 and cos_sim > 0.6:
                    temp_counter += 1
                    #print("INDEX: " + str(ind) +  ", COS_SIM: " + str(cos_sim) + ", RANKING: " + str(ranking))

                    counter += 1
                    numerator += (cos_sim * ranking)
                    denominator += cos_sim
                    if counter == k:
                        break;
            #print("TEMP COUNTER: " + str(temp_counter))
            if temp_counter == 95: #testing to find users with a true K value of 95
                print("GREATER THAN 95")
            #temp_counter = 0
            if numerator != 0 and denominator != 0:
                #print("Numerator: " + str(numerator) + ", denominator: " + str(denominator))    
                final_rank = numerator/denominator
                final_rank = int(round(final_rank, 0))
                line = [row[0], row[1], final_rank]
                output.append(line)

            elif numerator == 0 and denominator == 0:
                #print("NO USER HAS WATCHED THE MOVIE")
                line = [row[0], row[1], 3]
                output.append(line) 
            numerator = 0
            denominator = 0       
        #print("\n")
    return output       

In [16]:
output5 = weighted_avg_ranking(test5, 201, k_most_similar_indexes_5, cos_sim_matrix_5, 95)

In [17]:
output10 = weighted_avg_ranking(test10, 301, k_most_similar_indexes_10, cos_sim_matrix_10, 95)

In [18]:
output20 = weighted_avg_ranking(test20, 401, k_most_similar_indexes_20, cos_sim_matrix_20, 95)

GREATER THAN 95


In [19]:
def convert_list_to_str(line):
    string = str(line)[1:-1]
    string = string.replace(',', '')
    return string

In [20]:
'''
Function to write the output to a file
'''
def write_to_file(output, output_file):
    with open(output_file, 'w') as f:
        for line in output:
            f.write(convert_list_to_str(line))
            f.write('\n')

In [21]:
write_to_file(output5, 'threshold0.6_k95_cos_sim_result5.txt')

In [22]:
write_to_file(output10, 'threshold0.6_k95_cos_sim_result10.txt')

In [23]:
write_to_file(output20, 'threshold0.6_k95_cos_sim_result20.txt')

# Pearson Correlation

In [24]:
'''
Calculates the mean for each user and subtracting the mean from each rating value != 0
Matrix Dimensions: 200x1000
'''
def pearson_train(training_data):
    pearson_correlation_arr = []
    pearson_correlation_list = []
    pearson_correlation_mean = []

    for index,user_ratings in enumerate(training_data):
        counter = 0
        count = 0
        summation = 0
        mean = 0
        
        #calculating means for each user
        for movie_rating in user_ratings:
            if movie_rating != 0:
                summation += movie_rating
                count += 1
        mean = summation/count
        pearson_correlation_mean.append(mean)

        for rating in user_ratings:
            if rating != 0:
                rating = rating - mean
            pearson_correlation_list.append(rating)
            counter += 1
            if counter == 1000:
                pearson_correlation_arr.append(pearson_correlation_list)
                counter = 0
                pearson_correlation_list = []
    return pearson_correlation_arr, pearson_correlation_mean

In [25]:
train_pearson_arr, train_pearson_means = pearson_train(arr)
#print(train_pearson_arr)
#print(train_pearson_means)

print("Rows of train_pearson_arr: " + str(len(train_pearson_arr)))
print("Cols of train_pearson_arr: " + str(len(train_pearson_arr[0])))
print("Length of train_pearson_means: " + str(len(train_pearson_means)))

Rows of train_pearson_arr: 200
Cols of train_pearson_arr: 1000
Length of train_pearson_means: 200


In [26]:
'''
Calculates the mean for each active user and subtracting the mean from each rating value != 0
Matrix Dimensions: 100x1000
'''
def pearson_test(test_user_arr):
    test_pearson_correlation_arr = []
    test_mean = []
    count = 0
    summation = 0
    for key in test_user_arr:
        summation = 0
        count = 0
        mean = 0
        for rating in test_user_arr[key]:
            if rating != 0:
                summation += rating
                count += 1
        mean = summation/count
        test_mean.append(mean)
        
        for rating in test_user_arr[key]:
            if rating != 0:
                rating = rating - mean
            test_pearson_correlation_arr.append(rating)

        entry = {key: test_pearson_correlation_arr}
        test_user_arr.update(entry)
        test_pearson_correlation_arr = []
    return test_mean

In [27]:
pearson_test5_user_arr = test5_user_arr.copy()
pearson_test10_user_arr = test10_user_arr.copy()
pearson_test20_user_arr = test20_user_arr.copy()

test5_pearson_means = pearson_test(pearson_test5_user_arr)
test10_pearson_means = pearson_test(pearson_test10_user_arr)
test20_pearson_means = pearson_test(pearson_test20_user_arr)

In [28]:
pearson_cos_sim_matrix_5 = pd.DataFrame()
pearson_cos_sim_matrix_10 = pd.DataFrame()
pearson_cos_sim_matrix_20 = pd.DataFrame()

pearson_cos_sim_matrix_5 = create_cos_sim_matrix(pearson_cos_sim_matrix_5, pearson_test5_user_arr, train_pearson_arr)
pearson_cos_sim_matrix_10 = create_cos_sim_matrix(pearson_cos_sim_matrix_10, pearson_test10_user_arr, train_pearson_arr)
pearson_cos_sim_matrix_20 = create_cos_sim_matrix(pearson_cos_sim_matrix_20, pearson_test20_user_arr, train_pearson_arr)

print_matrix_dimensions(pearson_cos_sim_matrix_5)
print_matrix_dimensions(pearson_cos_sim_matrix_10)
print_matrix_dimensions(pearson_cos_sim_matrix_20)

Rows: 100
Columns: 200

Rows: 100
Columns: 200

Rows: 100
Columns: 200



In [29]:
pearson_k_most_similar_indexes_5 = index_k_most_similar(pearson_cos_sim_matrix_5)
pearson_k_most_similar_indexes_10 = index_k_most_similar(pearson_cos_sim_matrix_10)
pearson_k_most_similar_indexes_20 = index_k_most_similar(pearson_cos_sim_matrix_20)

print_matrix_dimensions(pearson_k_most_similar_indexes_5)
print_matrix_dimensions(pearson_k_most_similar_indexes_10)
print_matrix_dimensions(pearson_k_most_similar_indexes_20)

Rows: 100
Columns: 200

Rows: 100
Columns: 200

Rows: 100
Columns: 200



In [30]:
'''
Weighted rating function for Pearson Correlation
'''
def ranking_pearson_correlation(test_file, shift_index, k_most_similar_indexes, cos_sim_matrix, test_pearson_means, train_pearson_means, arr, k):
    numerator = 0
    denominator = 0
    output = []
    line = []
    for index,row in test_file.iterrows():
        counter = 0
        if row[2] == 0:
            #print(row[0], row[1], row[2])
            active_user_id = row[0] - shift_index
            movie = row[1]
            movie_id = row[1]-1

            indexes = k_most_similar_indexes[active_user_id]
            reverse_indexes = indexes.copy()
            reverse_indexes.reverse()
            for ind in reverse_indexes:
                test_mean_val = test_pearson_means[active_user_id]
                mean_val_train_user = train_pearson_means[ind]
                #print("Mean val for user " + str(row[0]) + ": " + str(test_mean_val))
                #print("Mean val for user " + str(ind) + ": " + str(mean_val_train_user))
                cos_sim = cos_sim_matrix[active_user_id][ind]
                ranking = arr[ind][movie_id]
                if ranking != 0:
                    counter += 1
                    numerator += (cos_sim * (ranking - mean_val_train_user))
                    denominator += abs(cos_sim)
                    if counter == k:
                        break;
                #else:
                    #print("Ranking: NOT RANKED")
            if denominator != 0:
                #print("Numerator: " + str(numerator) + ", denominator: " + str(denominator))    
                final_rank = test_mean_val + (numerator/denominator)
                final_rank = int(round(final_rank, 0))
                if final_rank < 1:
                    final_rank = 1
                elif final_rank > 5:
                    final_rank = 5
                line = [row[0], row[1], final_rank]
                output.append(line)
                #print("Final Rank for movie " + str(movie) + ": " + str(final_rank))  
            elif numerator == 0 and denominator == 0:
                #print("NO USER HAS WATCHED THE MOVIE\n")
                line = [row[0], row[1], 3]
                output.append(line)     
            numerator = 0
            denominator = 0
        #print("\n")
    return output

In [31]:
pearson_output5 = ranking_pearson_correlation(test5, 201, pearson_k_most_similar_indexes_5, pearson_cos_sim_matrix_5, test5_pearson_means, train_pearson_means, arr, 20)

In [32]:
pearson_output10 = ranking_pearson_correlation(test10, 301, pearson_k_most_similar_indexes_10, pearson_cos_sim_matrix_10, test10_pearson_means, train_pearson_means, arr, 20)

In [33]:
pearson_output20 = ranking_pearson_correlation(test20, 401, pearson_k_most_similar_indexes_20, pearson_cos_sim_matrix_20, test20_pearson_means, train_pearson_means, arr, 20)

In [34]:
write_to_file(pearson_output5, 'abs_k20_pearson_result5.txt')
write_to_file(pearson_output10, 'abs_k10_pearson_result10.txt')
write_to_file(pearson_output20, 'abs_k10_pearson_result20.txt')

# Pearson Correlation With Inverse User Frequency

In [35]:
print(arr)
print(arr.shape)

[[5 3 0 ... 0 0 0]
 [4 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [4 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [5 4 0 ... 0 0 0]]
(200, 1000)


In [36]:
'''
Calculating the movie counts for the training data
'''
def training_movie_counts():
    movie_rated_counts = [0] * 1000
    for row in arr:
        for index,rating in enumerate(row):
            if rating != 0:
                movie_rated_counts[index] += 1
    print("Length of Training Movie Counts: " + str(len(movie_rated_counts)))
    return movie_rated_counts

In [37]:
'''
Returns an array of iuf values for each training user
'''
def iuf_train_vals_arr(movie_rated_counts):
    iuf_training = []
    for i in range(len(movie_rated_counts)):
        if movie_rated_counts[i] != 0:
            #print("movie_rated_counts[i]: " + str(movie_rated_counts[i]))
            iuf_rating = math.log10((200/movie_rated_counts[i]))
        else:
            iuf_rating = 1
        iuf_training.append(iuf_rating)
    print("Length of IUF Training Values Array: " + str(len(iuf_training)))
    return iuf_training

In [38]:
'''
Returns the training matrix with each rating = rating * IUF value
'''
def create_iuf_matrix(training_data, iuf_training_vals):
    for index,row in enumerate(training_data):
        for index2,rating in enumerate(row):
            training_data[index][index2] = float(training_data[index][index2]) * float(iuf_training_vals[index2])
    return training_data

In [39]:
'''
Complete function to return the training matrix
Matrix Dimensions: 200x1000
'''
def final_iuf_train_arr():
    movie_rated_counts = training_movie_counts()
    iuf_training_vals = iuf_train_vals_arr(movie_rated_counts)
    #print(iuf_training_vals)

    training_data = arr.copy()
    training_data = training_data.astype(float)
    #print(training_data)
    
    iuf_training_data = create_iuf_matrix(training_data, iuf_training_vals)
    print("Rows of iuf_training_data: " + str(len(iuf_training_data)))
    print("Cols of iuf_training_data: " + str(len(iuf_training_data[0])))
    return iuf_training_data

iuf_training_data = final_iuf_train_arr()

Length of Training Movie Counts: 1000
Length of IUF Training Values Array: 1000
Rows of iuf_training_data: 200
Cols of iuf_training_data: 1000


In [40]:
iuf_pearson_test5_user_arr = test5_user_arr.copy()
iuf_pearson_test10_user_arr = test10_user_arr.copy()
iuf_pearson_test20_user_arr = test20_user_arr.copy()

In [41]:
'''
Calculating the movie counts for the test data
'''
def test_movie_counts(test_user_arr):
    test_movie_rated_counts = [0] * 1000
    for key in test_user_arr:
        for index,rating in enumerate(test_user_arr[key]):
            if rating != 0:
                test_movie_rated_counts[index] += 1
    return test_movie_rated_counts

test5_movie_rated_counts = test_movie_counts(iuf_pearson_test5_user_arr)
test10_movie_rated_counts = test_movie_counts(iuf_pearson_test10_user_arr)
test20_movie_rated_counts = test_movie_counts(iuf_pearson_test20_user_arr)

print("Length of test5_movie_rated_counts: " + str(len(test5_movie_rated_counts)))
print("Length of test10_movie_rated_counts: " + str(len(test10_movie_rated_counts)))
print("Length of test20_movie_rated_counts: " + str(len(test20_movie_rated_counts)))

Length of test5_movie_rated_counts: 1000
Length of test10_movie_rated_counts: 1000
Length of test20_movie_rated_counts: 1000


In [42]:
'''
Returns an array of iuf values for each active user
'''
def iuf_test_vals_arr(test_movie_rated_counts):
    iuf_testing_vals = []
    for i in range(len(test_movie_rated_counts)):
        if test_movie_rated_counts[i] != 0:
            iuf_rating = math.log10((200/test_movie_rated_counts[i]))
        else:
            iuf_rating = 1
        iuf_testing_vals.append(iuf_rating)
    return iuf_testing_vals

iuf_test5_vals = iuf_test_vals_arr(test5_movie_rated_counts)
iuf_test10_vals = iuf_test_vals_arr(test10_movie_rated_counts)
iuf_test20_vals = iuf_test_vals_arr(test20_movie_rated_counts)

print("Length of iuf_test5_vals: " + str(len(iuf_test5_vals)))
print("Length of iuf_test10_vals: " + str(len(iuf_test10_vals)))
print("Length of iuf_test20_vals: " + str(len(iuf_test20_vals)))

Length of iuf_test5_vals: 1000
Length of iuf_test10_vals: 1000
Length of iuf_test20_vals: 1000


In [43]:
'''
Updates the test users dictionary with each rating = rating * IUF value
'''
def iuf_test_arr(test_user_arr, iuf_test_vals):
    iuf_testing_list = []
    new_rating = 0
    for key in test_user_arr:
        for index,rating in enumerate(test_user_arr[key]):
            #print(rating, iuf_test_vals[index])
            new_rating = rating * iuf_test_vals[index]
            iuf_testing_list.append(new_rating)
        entry = {key: iuf_testing_list}
        test_user_arr.update(entry)
        iuf_testing_list = []
        
iuf_test_arr(iuf_pearson_test5_user_arr, iuf_test5_vals)
iuf_test_arr(iuf_pearson_test10_user_arr, iuf_test10_vals)
iuf_test_arr(iuf_pearson_test20_user_arr, iuf_test20_vals)

In [44]:
iuf_train_pearson_arr, iuf_train_pearson_means = pearson_train(iuf_training_data)

print("Rows of iuf_train_pearson_arr: " + str(len(iuf_train_pearson_arr)))
print("Cols of iuf_train_pearson_arr: " + str(len(iuf_train_pearson_arr[0])))
print("Length of iuf_train_pearson_means: " + str(len(iuf_train_pearson_means)))

Rows of iuf_train_pearson_arr: 200
Cols of iuf_train_pearson_arr: 1000
Length of iuf_train_pearson_means: 200


In [45]:
iuf_test5_pearson_means = pearson_test(iuf_pearson_test5_user_arr)
iuf_test10_pearson_means = pearson_test(iuf_pearson_test10_user_arr)
iuf_test20_pearson_means = pearson_test(iuf_pearson_test20_user_arr)

print("Length of iuf_test5_pearson_means: " + str(len(iuf_test5_pearson_means)))
print("Length of iuf_test10_pearson_means: " + str(len(iuf_test10_pearson_means)))
print("Length of iuf_test20_pearson_means: " + str(len(iuf_test20_pearson_means)))
#print(iuf_pearson_test5_user_arr)
#print(iuf_test5_pearson_means)

Length of iuf_test5_pearson_means: 100
Length of iuf_test10_pearson_means: 100
Length of iuf_test20_pearson_means: 100


In [46]:
iuf_pearson_cos_sim_matrix_5 = pd.DataFrame()
iuf_pearson_cos_sim_matrix_10 = pd.DataFrame()
iuf_pearson_cos_sim_matrix_20 = pd.DataFrame()

iuf_pearson_cos_sim_matrix_5 = create_cos_sim_matrix(iuf_pearson_cos_sim_matrix_5, iuf_pearson_test5_user_arr, iuf_train_pearson_arr)
iuf_pearson_cos_sim_matrix_10 = create_cos_sim_matrix(iuf_pearson_cos_sim_matrix_10, iuf_pearson_test10_user_arr, iuf_train_pearson_arr)
iuf_pearson_cos_sim_matrix_20 = create_cos_sim_matrix(iuf_pearson_cos_sim_matrix_20, iuf_pearson_test20_user_arr, iuf_train_pearson_arr)

In [47]:
iuf_pearson_k_most_similar_indexes_5 = index_k_most_similar(iuf_pearson_cos_sim_matrix_5)
iuf_pearson_k_most_similar_indexes_10 = index_k_most_similar(iuf_pearson_cos_sim_matrix_10)
iuf_pearson_k_most_similar_indexes_20 = index_k_most_similar(iuf_pearson_cos_sim_matrix_20)

print("Rows: " + str(len(iuf_pearson_k_most_similar_indexes_5)) + ", Cols:" + str(len(iuf_pearson_k_most_similar_indexes_5[0])))
print("Rows: " + str(len(iuf_pearson_k_most_similar_indexes_10)) + ", Cols:" + str(len(iuf_pearson_k_most_similar_indexes_10[0])))
print("Rows: " + str(len(iuf_pearson_k_most_similar_indexes_20)) + ", Cols:" + str(len(iuf_pearson_k_most_similar_indexes_20[0])))


Rows: 100, Cols:200
Rows: 100, Cols:200
Rows: 100, Cols:200


IUF does not affect rating prediction, thus we pass the old cos_sim_matrix, and old training and testing means

In [48]:
iuf_pearson_output5 = ranking_pearson_correlation(test5, 201, iuf_pearson_k_most_similar_indexes_5, pearson_cos_sim_matrix_5, test5_pearson_means, train_pearson_means, arr, 20)

In [49]:
iuf_pearson_output10 = ranking_pearson_correlation(test10, 301, iuf_pearson_k_most_similar_indexes_10, pearson_cos_sim_matrix_10, test10_pearson_means, train_pearson_means, arr, 20)

In [50]:
iuf_pearson_output20 = ranking_pearson_correlation(test20, 401, iuf_pearson_k_most_similar_indexes_20, pearson_cos_sim_matrix_20, test20_pearson_means, train_pearson_means, arr, 20)

In [51]:
write_to_file(iuf_pearson_output5, 'abs_k20_iuf_pearson_result5.txt')
write_to_file(iuf_pearson_output10, 'abs_k20_iuf_pearson_result10.txt')
write_to_file(iuf_pearson_output20, 'abs_k20_iuf_pearson_result20.txt')

# Pearson Correlation With Case Amplification

In [52]:
#pearson_test5_user_arr = test5_user_arr.copy()
#pearson_test10_user_arr = test10_user_arr.copy()
#pearson_test20_user_arr = test20_user_arr.copy()
#test5_pearson_means = pearson_test(pearson_test5_user_arr)
#test10_pearson_means = pearson_test(pearson_test10_user_arr)
#test20_pearson_means = pearson_test(pearson_test20_user_arr)
#print(pearson_test5_user_arr)

print(test5_pearson_means)
print(test10_pearson_means)
print(test20_pearson_means)

[4.4, 3.4, 3.4, 4.0, 2.2, 2.6, 4.0, 4.0, 4.2, 2.2, 3.6, 3.8, 3.6, 4.2, 4.2, 4.0, 3.4, 2.8, 4.8, 3.8, 4.4, 3.2, 3.4, 4.0, 2.6, 4.0, 4.8, 3.6, 3.0, 3.4, 3.4, 3.4, 3.4, 3.4, 3.4, 4.0, 3.2, 4.0, 3.4, 2.8, 3.8, 2.2, 4.2, 3.6, 1.4, 2.4, 4.2, 3.0, 3.4, 4.2, 2.2, 3.6, 3.4, 2.4, 3.6, 3.6, 4.4, 3.8, 3.2, 3.8, 3.4, 3.8, 2.4, 3.8, 3.6, 3.8, 3.8, 3.4, 4.8, 2.8, 3.6, 4.2, 3.6, 4.4, 4.6, 3.4, 4.2, 4.0, 3.4, 3.4, 4.2, 3.6, 2.8, 4.6, 2.6, 3.2, 3.4, 2.6, 3.4, 3.0, 3.2, 3.6, 3.8, 4.4, 3.8, 4.2, 3.4, 3.2, 3.8, 4.0]
[2.6, 2.9, 3.9, 4.8, 3.3, 3.6, 3.6, 3.9, 3.3, 3.0, 3.7, 4.2, 3.6, 3.9, 4.1, 3.5, 3.8, 4.4, 3.0, 3.5, 3.9, 2.8, 3.9, 2.5, 2.8, 4.1, 3.6, 4.4, 3.8, 3.6, 4.5, 4.0, 4.0, 3.7, 3.1, 3.8, 4.2, 4.5, 4.2, 3.1, 4.2, 3.5, 3.8, 3.4, 3.9, 4.0, 3.8, 4.2, 3.9, 4.0, 3.7, 3.7, 4.0, 3.0, 4.3, 3.3, 3.8, 4.0, 4.0, 3.3, 3.1, 3.8, 3.1, 4.4, 4.0, 3.0, 3.8, 2.8, 3.8, 4.1, 4.1, 4.0, 3.7, 3.7, 3.9, 3.7, 4.5, 3.2, 3.8, 3.5, 4.6, 4.0, 3.1, 3.7, 3.3, 4.5, 3.9, 3.8, 3.8, 3.9, 3.7, 4.5, 3.9, 3.5, 3.8, 3.7, 3.2, 3.6, 2.7, 4.2

In [53]:
'''
Returns the original cosine matrix with rating = rating * abs(rating)^1.5
Matrix Dimensions: 100x1000
'''
def case_mod_cos_sim_matrix(cos_sim_matrix):
    for index,row in enumerate(cos_sim_matrix):
        for index2,cos_sim_val in enumerate(row):
            cos_sim_matrix[index][index2] = cos_sim_matrix[index][index2] * (math.pow(abs(cos_sim_matrix[index][index2]),1.5))
    return cos_sim_matrix

case_mod_pearson_cos_sim_matrix_5 = pearson_cos_sim_matrix_5.copy()
case_mod_pearson_cos_sim_matrix_10 = pearson_cos_sim_matrix_10.copy()
case_mod_pearson_cos_sim_matrix_20 = pearson_cos_sim_matrix_20.copy()

case_mod_pearson_cos_sim_matrix_5 = case_mod_cos_sim_matrix(case_mod_pearson_cos_sim_matrix_5)
case_mod_pearson_cos_sim_matrix_10 = case_mod_cos_sim_matrix(case_mod_pearson_cos_sim_matrix_10)
case_mod_pearson_cos_sim_matrix_20 = case_mod_cos_sim_matrix(case_mod_pearson_cos_sim_matrix_20)

In [54]:
case_mod_pearson_k_most_similar_indexes_5 = index_k_most_similar(case_mod_pearson_cos_sim_matrix_5)
case_mod_pearson_k_most_similar_indexes_10 = index_k_most_similar(case_mod_pearson_cos_sim_matrix_10)
case_mod_pearson_k_most_similar_indexes_20 = index_k_most_similar(case_mod_pearson_cos_sim_matrix_20)

print(len(case_mod_pearson_k_most_similar_indexes_5))
print(len(case_mod_pearson_k_most_similar_indexes_10))
print(len(case_mod_pearson_k_most_similar_indexes_20))

100
100
100


In [55]:
case_mod_pearson_output5 = ranking_pearson_correlation(test5, 201, case_mod_pearson_k_most_similar_indexes_5, case_mod_pearson_cos_sim_matrix_5, test5_pearson_means, train_pearson_means, arr, 20)

In [56]:
case_mod_pearson_output10 = ranking_pearson_correlation(test10, 301, case_mod_pearson_k_most_similar_indexes_10, case_mod_pearson_cos_sim_matrix_10, test10_pearson_means, train_pearson_means, arr, 20)

In [57]:
case_mod_pearson_output20 = ranking_pearson_correlation(test20, 401, case_mod_pearson_k_most_similar_indexes_20, case_mod_pearson_cos_sim_matrix_20, test20_pearson_means, train_pearson_means, arr, 20)

In [58]:
write_to_file(case_mod_pearson_output5, 'abs_k20_case_mod_pearson_result5.txt')
write_to_file(case_mod_pearson_output10, 'abs_k20_case_mod_pearson_result10.txt')
write_to_file(case_mod_pearson_output20, 'abs_k20_case_mod_pearson_result20.txt')

# Item Based Collaborative Filtering

In [59]:
item_based_arr = arr.copy()
item_based_arr = item_based_arr.T

print(item_based_arr)
print(len(item_based_arr), len(item_based_arr[0]))

[[5 4 0 ... 4 1 5]
 [3 0 0 ... 0 0 4]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
1000 200


In [60]:
'''
Calculates the adjusted cosine similarity between two movies
'''
def adj_cos_sim(v1,v2,active_user_mean):
    #print(v1, v2, active_user_mean)
    v1_new = []
    v2_new = []
    numerator = 0
    denominator = 0
    counter = 0
    for i in range(0,len(v1)):
        if v1[i] != 0 and v2[i] != 0:
            v1_new.append(v1[i])
            v2_new.append(v2[i])
            numerator += (v1[i] - active_user_mean) * (v2[i] - active_user_mean)
            denominator += (math.sqrt(math.pow(v1[i]-active_user_mean,2)))*(math.sqrt(math.pow(v2[i]-active_user_mean,2)))
            counter += 1
    #print("NUM: " + str(numerator) + ", DENOM: " + str(denominator))
    if(len(v1_new) == 0 and len(v2_new) == 0):
        #print("NO COMMON USERS HAVE RATED THE MOVIE\n")
        sim = 0
        return sim
    elif(len(v1_new) == 1 and len(v2_new) == 1):
        euclidean_distance = np.linalg.norm(v1_new[0]-v2_new[0]);
        sim = 1/(1+euclidean_distance)
        return sim     
    
    if counter > 1:
        if numerator == 0.0 and denominator == 0.0:
            sim = 0 
            return sim
        sim = numerator/denominator
    return sim

In [61]:
test5_item_based_means = test5_pearson_means.copy()
test10_item_based_means = test10_pearson_means.copy()
test20_item_based_means = test20_pearson_means.copy()

#print(test5_item_based_means)
#print(test10_item_based_means)
#print(test20_item_based_means)

In [62]:
'''
Creating a matrix with lists for each active user of the movies they have already rated.
So dimensions for test5, test10, test20 are 100x5, 100x10, 100x20
'''
def create_list_rated_movies_test_users(num_ratings, test_file):
    total_test_users_rated_movie_list = []
    user_rated_movies = []
    counter = 0
    
    for index,row in test_file.iterrows():
        if row[2] == 0:
            continue;

        movie_id = row[1]-1
        user_rated_movies.append((movie_id,row[2]))
        counter += 1
        if counter == num_ratings:
            total_test_users_rated_movie_list.append(user_rated_movies)
            counter = 0
            user_rated_movies = []
    return total_test_users_rated_movie_list

test5_users_rated_movie_matrix = create_list_rated_movies_test_users(5, test5)
test10_users_rated_movie_matrix = create_list_rated_movies_test_users(10, test10)
test20_users_rated_movie_matrix = create_list_rated_movies_test_users(20, test20)

In [63]:
'''
Returns the indexes of the most similar movies in descending order
'''
def item_based_index_sorted(indexes, k):
    val = np.argsort(indexes)[-k:]
    val = val.tolist()
    val.reverse()
    return val

In [64]:
'''
Ranking function for item based CF
'''
def item_based_weighted_ranking(test_file, test_users_rated_movie_matrix, test5_item_based_means, item_based_arr, shift_index, k):
    output = []
    cos_sim_weights = []
    total_cos_sim_weights = []
    for index,row in test_file.iterrows():
        counter = 0
        weight = 0
        numerator = 0
        denominator = 0
        if row[2] == 0:
            #print("\n",row[0], row[1], row[2])
            movie_id = row[0] - shift_index
            #print("MOVIE_ID: " + str(movie_id))
            test_movie = row[1]-1
            #print("TEST_MOVIE: " + str(test_movie))
            
            #print("MEAN: " + str(test5_item_based_means[movie_id]))
            #print("ACTIVE USER RATED MOVIES: " + str(test_users_rated_movie_matrix[movie_id]))
            
            for movie in test_users_rated_movie_matrix[movie_id]:
                weight = adj_cos_sim(item_based_arr[movie[0]],item_based_arr[test_movie],test5_item_based_means[movie_id])
                #print("WEIGHT: " + str(weight))
                cos_sim_weights.append(weight)
                counter += 1
            
            sorted_indexes = item_based_index_sorted(cos_sim_weights, k)
            #print("INDEX: " + str(index))
            #print("SORTED_INDEXES: " + str(sorted_indexes))
            #print("COS_SIM_WEIGHTS: " + str(cos_sim_weights))
            for i in range(k):
                #print("SORTED_INDEXES[I]: " + str(sorted_indexes[i]))
                movie_ranking_tuple_list = test_users_rated_movie_matrix[movie_id]
                #print("MOVIE[movie_id]: " + str(movie_ranking_tuple_list))
                #print("cos_sim_weights[sorted_indexes[i]]: " + str(cos_sim_weights[sorted_indexes[i]]))
                ranking = movie_ranking_tuple_list[sorted_indexes[i]][1]
                if cos_sim_weights[i] > 0.3:
                    #print("ranking: " + str(movie_ranking_tuple_list[sorted_indexes[i]]) + str(ranking))
                    numerator += (cos_sim_weights[sorted_indexes[i]] * ranking)
                    #print(cos_sim_weights[sorted_indexes[i]])
                    denominator += cos_sim_weights[sorted_indexes[i]]
            
            #print("NUMERATOR: " + str(numerator) + ", DENOMINATOR: " + str(denominator))
            if denominator != 0.0:
                final_rank = numerator/denominator
                #print("FINAL_RANK: " + str(final_rank))
                if final_rank > 5:
                    final_rank = 5
                if final_rank < 1:
                    final_rank = 1
                final_rank = int(round(final_rank, 0))
                line = [row[0], row[1], final_rank]
                output.append(line)
                line = [] 
            else:
                line = [row[0], row[1], 3]
                output.append(line)
                line = [] 
            cos_sim_weights = []
    return output 

In [65]:
item_based_output5 = item_based_weighted_ranking(test5, test5_users_rated_movie_matrix, test5_item_based_means, item_based_arr, 201, 3)

In [66]:
item_based_output10 = item_based_weighted_ranking(test10, test10_users_rated_movie_matrix, test10_item_based_means, item_based_arr, 301, 5)

In [67]:
item_based_output20 = item_based_weighted_ranking(test20, test20_users_rated_movie_matrix, test20_item_based_means, item_based_arr, 401, 5)

In [68]:
write_to_file(item_based_output5, 'threshold0.3_k3_item_based_result5.txt')
write_to_file(item_based_output10, 'threshold0.3_k5_item_based_result10.txt')
write_to_file(item_based_output20, 'threshold0.3_k5_item_based_result20.txt')

# Own Algorithm

In [70]:
'''
Taking a weighted average of cosine similarity, pearson correlation, and pearson correlation with IUF ratings
Below is my trial and error using different combinations of weights to find the best MAE
'''
def own_algo(cos_sim_output, pearson_output, iuf_pearson_output):  
    output = []
    for i in range(len(cos_sim_output)):

        #algo1
        #average = (cos_sim_output[i][2] + pearson_output[i][2] + iuf_pearson_output[i][2])/3  0.77
        #algo3
        #average = (cos_sim_output[i][2]*0.3) + (pearson_output[i][2]*0.3) + (iuf_pearson_output[i][2]*0.4) 0.78
        #algo2
        #average = (cos_sim_output[i][2]*0.2) + (pearson_output[i][2]*0.4) + (iuf_pearson_output[i][2]*0.4) 0.77
        #algo4
        #average = (cos_sim_output[i][2]*0.4) + (pearson_output[i][2]*0.2) + (iuf_pearson_output[i][2]*0.4) 0.769
        #algo5: 
        #average = (cos_sim_output[i][2]*0.7) + (pearson_output[i][2]*0.1) + (iuf_pearson_output[i][2]*0.2) 0.778
        #algo6: 0.769
        average = (cos_sim_output[i][2]*0.5) + (pearson_output[i][2]*0.2) + (iuf_pearson_output[i][2]*0.3) 
        #algo7: 
        #average = (cos_sim_output[i][2]*0.6) + (pearson_output[i][2]*0.2) + (iuf_pearson_output[i][2]*0.2) 0.769
        #algo8:
        #average = (cos_sim_output[i][2]*0.65) + (pearson_output[i][2]*0.2) + (iuf_pearson_output[i][2]*0.15) 0.777
        #algo9: close to algo3
        #average = (cos_sim_output[i][2]*0.2) + (pearson_output[i][2]*0.3) + (iuf_pearson_output[i][2]*0.5)
        #algo10: same as 11
        #average = (cos_sim_output[i][2]*0.55) + (pearson_output[i][2]*0.2) + (iuf_pearson_output[i][2]*0.25)
        #algo11: close to 5 and 7
        #average = (cos_sim_output[i][2]*0.55) + (pearson_output[i][2]*0.25) + (iuf_pearson_output[i][2]*0.2)
        #algo12: same as 7
        #average = (cos_sim_output[i][2]*0.6) + (pearson_output[i][2]*0.25) + (iuf_pearson_output[i][2]*0.15)
        #algo13
        #average = (cos_sim_output[i][2]*0.45) + (pearson_output[i][2]*0.2) + (iuf_pearson_output[i][2]*0.35) 0.769
        #algo14
        #average = (cos_sim_output[i][2]*0.4) + (pearson_output[i][2]*0.3) + (iuf_pearson_output[i][2]*0.3)
        #algo 15: possible submission
        #average = (cos_sim_output[i][2]*0.8) + (pearson_output[i][2]*0.1) + (iuf_pearson_output[i][2]*0.1)
        #algo 16: 
        #average = (cos_sim_output[i][2]*0.1) + (pearson_output[i][2]*0.1) + (iuf_pearson_output[i][2]*0.8)
        
        average = int(round(average, 0))
        line = [cos_sim_output[i][0], cos_sim_output[i][1], average]
        #print("LINE: " + str(line) + "\n")
        output.append(line)
        line = []
    return output

own_algo_output5 = own_algo(output5, pearson_output5, iuf_pearson_output5)
own_algo_output10 = own_algo(output10, pearson_output10, iuf_pearson_output10)
own_algo_output20 = own_algo(output20, pearson_output20, iuf_pearson_output20)

In [71]:
#print(own_algo_output5)

In [72]:
#print(own_algo_output10)

In [73]:
#print(own_algo_output20)

In [74]:
write_to_file(own_algo_output5, 'my_algo17_result5.txt')
write_to_file(own_algo_output10, 'my_algo17_result10.txt')
write_to_file(own_algo_output20, 'my_algo17_result20.txt')