In [1]:
import numpy as np
import pandas as pd
import matrix_factorization_utilities as mfu

In [2]:
# Load user ratings from the CSV file into a DataFrame.
# The pivot step further down expects `user_id` and `movie_id` columns in this frame.
df = pd.read_csv('movie_ratings_data_set.csv')

In [3]:
# Load movie titles, using the `movie_id` column as the index so that
# a movie's record can be looked up by id with movies_df.loc[...]
movies_df = pd.read_csv('movies.csv', index_col='movie_id')

In [7]:
# Convert the running list of user ratings into a matrix:
# one row per user_id and one column per movie_id.
# No `values=` is passed, so every remaining column of df is pivoted.
# If a (user_id, movie_id) pair appears more than once, the maximum is kept.
# The string "max" selects the same aggregation as np.max, but avoids relying on
# pandas translating a numpy callable into its own method (newer pandas warns about that).
ratings_df = df.pivot_table(index="user_id", columns="movie_id", aggfunc="max")

In [8]:
# Apply matrix factorization to find the latent features (15 per item,
# regularization amount 1.0).
# DataFrame.as_matrix() was deprecated and later removed from pandas; `.values`
# returns the same NumPy array and works on both old and new pandas versions.
# NOTE(review): presumably U holds the per-user features and M the per-movie
# features -- confirm against matrix_factorization_utilities.
U, M = mfu.low_rank_matrix_factorization(
    ratings_df.values,
    num_features=15,
    regularization_amount=1.0,
)

Optimization terminated successfully.
         Current function value: 312.762757
         Iterations: 1950
         Function evaluations: 2914
         Gradient evaluations: 2914


In [13]:
# Swap the rows and columns of M just so it's easier to work with:
# afterwards a single row (M[i]) can be picked out as one movie's feature vector.
# NOTE: this rebinds M to its own transpose, so running the cell a second time
# undoes the swap.
M = np.transpose(M)

In [14]:
# Choose a movie to find similar movies to. Let's find movies similar to movie #5.
# This id is used below both as a label into movies_df and (minus one) as a row index into M.
movie_id=5

In [18]:
# Get the selected movie's (movie_id) name and genre by looking up its row in movies_df
movie_info=movies_df.loc[movie_id]

In [19]:
# Show the record that was looked up for the selected movie
movie_info

title    The Big City Judge 2
genre             legal drama
Name: 5, dtype: object

In [22]:
# Get the features for the selected movie (movie_id) we found via matrix factorization.
# Row movie_id-1 is used, which assumes movie ids start at 1 and map onto
# consecutive rows of M -- TODO confirm against the ratings data.
curr_movie_features=M[movie_id-1]
curr_movie_features

array([ 0.66560176, -0.82905751, -0.7268944 ,  0.52223663, -0.84842728,
       -1.8416539 , -0.78724325,  0.25976641, -0.11943102,  0.11387471,
       -0.1507451 , -0.17665636, -0.23284005, -0.81289917,  1.08270014])

In [24]:
# Subtract the selected movie's feature vector from every row of M.
# NumPy broadcasts the 1-D vector across the rows, so the selected movie's own
# row becomes all zeros.
diff_bw_selected_and_all = np.subtract(M, curr_movie_features)
diff_bw_selected_and_all

array([[-0.06546423,  0.0560829 , -0.20528189, -0.38391784, -0.40040755,
         0.17205696,  0.81089422,  0.35768483,  0.15416447,  0.16605008,
         0.26100185,  0.03621231, -0.09429087, -0.07122611, -0.24792998],
       [ 0.10538404,  0.30107901,  0.23216003, -0.49243745, -0.02319974,
         0.04410975,  0.73594386,  0.06315603,  0.25126552, -0.44248736,
         0.12882624, -0.40322287, -0.11312932, -0.28904511, -0.06728922],
       [-0.12175367, -0.14143299, -0.3024589 , -0.28790192, -0.07573618,
         0.15921718,  0.06559218,  0.09105583,  0.55450081,  0.3623073 ,
         0.14139169, -0.04943249, -0.05588607, -0.08106114, -0.205808  ],
       [-0.0673468 ,  0.45705304, -0.13010322, -0.99954631,  0.05149285,
         0.46819616,  0.03587744, -0.23470063,  0.57512377, -0.04367244,
        -0.36709991,  0.3027938 ,  0.27076364, -0.18962886,  0.03476994],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        , 

In [25]:
# Drop the sign of every feature difference so that positive and negative
# deviations cannot cancel out when they are summed.
abs_diff = np.absolute(diff_bw_selected_and_all)
abs_diff

array([[ 0.06546423,  0.0560829 ,  0.20528189,  0.38391784,  0.40040755,
         0.17205696,  0.81089422,  0.35768483,  0.15416447,  0.16605008,
         0.26100185,  0.03621231,  0.09429087,  0.07122611,  0.24792998],
       [ 0.10538404,  0.30107901,  0.23216003,  0.49243745,  0.02319974,
         0.04410975,  0.73594386,  0.06315603,  0.25126552,  0.44248736,
         0.12882624,  0.40322287,  0.11312932,  0.28904511,  0.06728922],
       [ 0.12175367,  0.14143299,  0.3024589 ,  0.28790192,  0.07573618,
         0.15921718,  0.06559218,  0.09105583,  0.55450081,  0.3623073 ,
         0.14139169,  0.04943249,  0.05588607,  0.08106114,  0.205808  ],
       [ 0.0673468 ,  0.45705304,  0.13010322,  0.99954631,  0.05149285,
         0.46819616,  0.03587744,  0.23470063,  0.57512377,  0.04367244,
         0.36709991,  0.3027938 ,  0.27076364,  0.18962886,  0.03476994],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        , 

In [27]:
# Add up the absolute differences along each row (axis=1), giving one total
# difference score per row of M; smaller means closer to the selected movie.
summed_up_diff = abs_diff.sum(axis=1)
summed_up_diff

array([  3.48266609,   3.69273554,   2.69553634,   4.22816883,
         0.        ,   3.64420004,   4.61322615,   2.82207339,
         2.59968244,   1.87218408,   4.46729777,   4.11699176,
         5.57529509,   6.55905089,   5.89548655,   5.32925908,
         8.45681864,   7.35792095,   6.93098728,   7.91144358,
         6.02544521,   5.38212229,   8.10955688,   2.78724392,
         4.6647307 ,   3.6912011 ,   3.26915643,   3.40571377,
         8.23616612,   7.52047454,  10.02173057,   9.55479989,
         6.26821213,   8.78375226])

In [28]:
# Attach each movie's total difference as a new column, then preview the first rows.
# The array is assigned by position, so this assumes the rows of movies_df are in
# the same order as the rows of M -- TODO confirm.
# NOTE: this adds the column to movies_df in place.
movies_df['diff_from_selected']=summed_up_diff
movies_df.head()

Unnamed: 0_level_0,title,genre,diff_from_selected
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,The Sheriff 1,"crime drama, western",3.482666
2,The Big City Judge 1,legal drama,3.692736
3,The Sheriff 2,"crime drama, western",2.695536
4,Just a Regular Family,reality,4.228169
5,The Big City Judge 2,legal drama,0.0


In [31]:
# List the movies from most to least similar: the smallest total difference
# comes first, so the selected movie itself (difference 0) heads the table.
movies_df.sort_values(by='diff_from_selected', ascending=True)

Unnamed: 0_level_0,title,genre,diff_from_selected
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,The Big City Judge 2,legal drama,0.0
10,Surrounded by Zombies 1,"horror, zombie fiction",1.872184
9,Biker Gangs,"crime drama, action",2.599682
3,The Sheriff 2,"crime drama, western",2.695536
24,The Big City Judge 3,legal drama,2.787244
8,Sci-Fi Murder Detectives,"supernatural, mystery",2.822073
27,Surrounded by Zombies 2,"horror, zombie fiction",3.269156
28,The Sheriff 4,"crime drama, western",3.405714
1,The Sheriff 1,"crime drama, western",3.482666
6,Attack on Earth 1,"sci-fi, action",3.6442
