In [1]:
import pandas as pd
from scipy import sparse 
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the ratings table; index_col=0 makes the CSV's first column the row
# index (the user labels), leaving one column per movie.
ratings = pd.read_csv('dataset.csv', index_col=0)
ratings

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,4.0,5.0,3.0,,2.0,1.0
user 2,5.0,3.0,3.0,2.0,2.0,
user 3,1.0,,,4.0,5.0,4.0
user 4,,2.0,1.0,4.0,,3.0
user 5,1.0,,2.0,3.0,3.0,4.0


In [3]:
# Replace missing ratings (NaN) with 0. Rebind instead of using inplace=True so
# the frame is not silently mutated behind the cell that displayed it earlier.
# NOTE(review): these zeros are later included in each column's mean and range
# by `standardize`, so "not rated" is weighted like a very low rating — confirm
# that this is intended.
ratings = ratings.fillna(0)

In [4]:
def standardize(row):
    """Mean-center a Series and scale it by its range (max - min).

    Note: despite the parameter name, when this is passed to
    ``DataFrame.apply`` with the default ``axis=0`` (as this notebook does),
    each call receives one *column* of the frame.

    Parameters
    ----------
    row : pandas.Series
        Numeric values to standardize.

    Returns
    -------
    pandas.Series
        ``(row - mean) / (max - min)``. If every value is identical the range
        is zero; the centered values (all zeros) are returned instead of
        dividing, which would otherwise produce NaN and break downstream
        similarity computations.
    """
    centered = row - row.mean()
    spread = row.max() - row.min()
    if spread == 0:
        # Constant input: avoid 0/0 -> NaN.
        return centered
    return centered / spread

# axis=0 (the default) hands `standardize` one column at a time, so each
# movie's ratings are centered and scaled independently.
ratings_std = ratings.apply(standardize, axis=0)

In [5]:
# Display the standardized ratings produced by the previous cell.
ratings_std

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,0.36,0.6,0.4,-0.65,-0.08,-0.35
user 2,0.56,0.2,0.4,-0.15,-0.08,-0.6
user 3,-0.24,-0.4,-0.6,0.35,0.52,0.4
user 4,-0.44,0.0,-0.266667,0.35,-0.48,0.15
user 5,-0.24,-0.4,0.066667,0.1,0.12,0.4


In [6]:
# Transpose so that each row handed to cosine_similarity is one movie (a column
# of ratings_std); the result is a square movie-by-movie similarity matrix.
movie_vectors = ratings_std.to_numpy().T
similarity = cosine_similarity(movie_vectors)
similarity

array([[ 1.        ,  0.70668875,  0.81368151, -0.79941088, -0.02539184,
        -0.91410609],
       [ 0.70668875,  1.        ,  0.72310153, -0.84515425, -0.5189993 ,
        -0.84337386],
       [ 0.81368151,  0.72310153,  1.        , -0.84794611, -0.3799803 ,
        -0.80218063],
       [-0.79941088, -0.84515425, -0.84794611,  1.        ,  0.14803913,
         0.72374686],
       [-0.02539184, -0.5189993 , -0.3799803 ,  0.14803913,  1.        ,
         0.39393939],
       [-0.91410609, -0.84337386, -0.80218063,  0.72374686,  0.39393939,
         1.        ]])

In [7]:
# Label both axes of the similarity matrix with the movie titles so scores can
# be looked up by name.
similarity_df = pd.DataFrame(
    data=similarity,
    index=ratings.columns,
    columns=ratings.columns,
)
similarity_df

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
action1,1.0,0.706689,0.813682,-0.799411,-0.025392,-0.914106
action2,0.706689,1.0,0.723102,-0.845154,-0.518999,-0.843374
action3,0.813682,0.723102,1.0,-0.847946,-0.37998,-0.802181
romantic1,-0.799411,-0.845154,-0.847946,1.0,0.148039,0.723747
romantic2,-0.025392,-0.518999,-0.37998,0.148039,1.0,0.393939
romantic3,-0.914106,-0.843374,-0.802181,0.723747,0.393939,1.0


In [8]:
def get_similarities(movie_name, movie_rating, neutral_rating=2.5, similarity_matrix=None):
    """Score every movie for a viewer who gave `movie_name` the rating `movie_rating`.

    Each movie's similarity to `movie_name` is multiplied by
    ``movie_rating - neutral_rating``: a rating above the neutral point rewards
    similar movies, a rating below it flips the sign so similar movies are
    penalised and dissimilar ones rise.

    Parameters
    ----------
    movie_name : str
        Column label of the rated movie in the similarity table.
    movie_rating : float
        Rating the viewer gave that movie.
    neutral_rating : float, default 2.5
        Rating treated as "no preference"; subtracted from `movie_rating`.
    similarity_matrix : pandas.DataFrame, optional
        Movie-by-movie similarity table. Defaults to the notebook-level
        `similarity_df` when omitted.

    Returns
    -------
    pandas.Series
        Scores indexed by movie title, sorted from highest to lowest.

    Raises
    ------
    KeyError
        If `movie_name` is not a column of the similarity table.
    """
    if similarity_matrix is None:
        # Fall back to the table built earlier in the notebook.
        similarity_matrix = similarity_df
    if movie_name not in similarity_matrix.columns:
        raise KeyError(f"unknown movie: {movie_name!r}")
    weight = movie_rating - neutral_rating
    return (similarity_matrix[movie_name] * weight).sort_values(ascending=False)

In [9]:
# Scores for a viewer who rated 'action1' with 5.
print(get_similarities('action1', 5))

action1      2.500000
action3      2.034204
action2      1.766722
romantic2   -0.063480
romantic1   -1.998527
romantic3   -2.285265
Name: action1, dtype: float64


In [10]:
# Scores for a viewer who rated 'romantic1' with 1; the rating is below the
# 2.5 offset used in get_similarities, so the similarity signs are flipped.
print(get_similarities('romantic1', 1))

action3      1.271919
action2      1.267731
action1      1.199116
romantic2   -0.222059
romantic3   -1.085620
romantic1   -1.500000
Name: romantic1, dtype: float64


In [11]:
# Example viewer profile: (movie title, rating given) pairs.
action_lover = [
    ('action1', 5),
    ('romantic2', 1),
    ('romantic3', 1),
]

# Will hold one score Series per rated movie.
similar_movies_list = list()

In [12]:
# Build the list from scratch inside this cell. Appending to a list created in
# an earlier cell made every re-run of this cell duplicate all entries.
similar_movies_list = [
    get_similarities(movie, rating) for movie, rating in action_lover
]

# Stack the per-movie score Series end to end; titles repeat in the index,
# once for each rated movie.
similar_movies = pd.concat(similar_movies_list)

similar_movies

action1      2.500000
action3      2.034204
action2      1.766722
romantic2   -0.063480
romantic1   -1.998527
romantic3   -2.285265
action2      0.778499
action3      0.569970
action1      0.038088
romantic1   -0.222059
romantic3   -0.590909
romantic2   -1.500000
action1      1.371159
action2      1.265061
action3      1.203271
romantic2   -0.590909
romantic1   -1.085620
romantic3   -1.500000
dtype: float64

In [13]:
# Viewer profile: (movie title, rating given) pairs.
action_lover = [
    ('action1', 5),
    ('romantic2', 1),
    ('romantic3', 1),
]

# One score Series per rated movie, keyed by that movie's title.
similar_movies_dict = {
    movie: get_similarities(movie, rating) for movie, rating in action_lover
}

# orient='index' makes each dict key a row, so every row holds the scores
# contributed by one rated movie and every column is a candidate movie.
similar_movies = pd.DataFrame.from_dict(similar_movies_dict, orient='index')

similar_movies

Unnamed: 0,action1,action3,action2,romantic2,romantic1,romantic3
action1,2.5,2.034204,1.766722,-0.06348,-1.998527,-2.285265
romantic2,0.038088,0.56997,0.778499,-1.5,-0.222059,-0.590909
romantic3,1.371159,1.203271,1.265061,-0.590909,-1.08562,-1.5


In [15]:
 # Add up each candidate movie's scores across all rated movies (column-wise
 # sum) and rank from most to least recommended.
 similar_movies.sum().sort_values(ascending=False)

action1      3.909247
action2      3.810282
action3      3.807445
romantic2   -2.154389
romantic1   -3.306206
romantic3   -4.376174
dtype: float64