In [1]:
# import libraries
import pandas as pd
import numpy as np

In [2]:
# Read the csv file
movies_ak = pd.read_csv('Movie.csv')

In [3]:
movies_ak.head()

Unnamed: 0,userId,movie,rating
0,3,Toy Story (1995),4.0
1,6,Toy Story (1995),5.0
2,8,Toy Story (1995),4.0
3,10,Toy Story (1995),4.0
4,11,Toy Story (1995),4.5


In [4]:
movies_ak.shape

(8992, 3)

In [5]:
len(movies_ak["userId"].unique())

4081

In [6]:
movies_ak["movie"].value_counts()

Toy Story (1995)                      2569
GoldenEye (1995)                      1548
Heat (1995)                           1260
Jumanji (1995)                        1155
Sabrina (1995)                         700
Grumpier Old Men (1995)                685
Father of the Bride Part II (1995)     657
Sudden Death (1995)                    202
Waiting to Exhale (1995)               138
Tom and Huck (1995)                     78
Name: movie, dtype: int64

In [7]:
movies_ak["rating"].describe()

count    8992.000000
mean        3.557162
std         0.967071
min         0.500000
25%         3.000000
50%         3.500000
75%         4.000000
max         5.000000
Name: rating, dtype: float64

In [8]:
movies_ak["rating"].unique()

array([4. , 5. , 4.5, 3. , 1. , 3.5, 1.5, 2. , 2.5, 0.5])

In [9]:
movies_ak["rating"].value_counts()

3.0    2736
4.0    2660
5.0    1394
3.5     679
2.0     542
4.5     374
2.5     277
1.0     212
1.5      61
0.5      57
Name: rating, dtype: int64

In [10]:
movies_ak.sort_values('userId')

Unnamed: 0,userId,movie,rating
2569,1,Jumanji (1995),3.5
3724,2,Grumpier Old Men (1995),4.0
0,3,Toy Story (1995),4.0
5204,4,Heat (1995),3.0
7444,4,GoldenEye (1995),4.0
...,...,...,...
6463,7117,Heat (1995),5.0
2567,7119,Toy Story (1995),5.0
2568,7120,Toy Story (1995),4.5
3723,7120,Jumanji (1995),4.0


In [11]:
# Create User-item interaction matrix
user_movies_ak = movies_ak.pivot(index='userId',
                                 columns='movie',
                                 values='rating')

In [12]:
user_movies_ak.head()

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,,3.5,,,,,
2,,,4.0,,,,,,,
3,,,,,,,,,4.0,
4,,4.0,,3.0,,,,,,
5,,,,,3.0,,,,,


In [13]:
#user_movies_ak.index = movies_ak.userId.unique()

In [14]:
#Impute those NaNs with 0 values
user_movies_ak.fillna(0, inplace=True)

In [15]:
user_movies_ak.head()

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
4,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0


In [16]:
#Calculating Cosine Similarity between Users
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [17]:
user_sim = 1 - pairwise_distances( user_movies_ak.values,
                                  metric='cosine')

In [18]:
user_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.55337157],
       [0.        , 1.        , 0.        , ..., 0.45883147, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       ...,
       [0.        , 0.45883147, 0.45883147, ..., 1.        , 0.45883147,
        0.47607054],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       [0.55337157, 0.        , 0.62254302, ..., 0.47607054, 0.62254302,
        1.        ]])

In [19]:
#Store the results in a dataframe
user_sim_ak = pd.DataFrame(user_sim)

user_sim_df.head()

In [20]:
#Set the index and column names to user ids 
user_sim_ak.index = movies_ak.userId.unique()
user_sim_ak.columns = movies_ak.userId.unique()

In [21]:
user_sim_ak.shape

(4081, 4081)

In [22]:
user_sim_ak.iloc[0:5, 0:5]

Unnamed: 0,3,6,8,10,11
3,1.0,0.0,0.0,0.0,1.0
6,0.0,1.0,0.0,0.0,0.0
8,0.0,0.0,1.0,0.0,0.0
10,0.0,0.0,0.0,1.0,0.0
11,1.0,0.0,0.0,0.0,1.0


In [23]:
np.fill_diagonal(user_sim, 0)
user_sim_ak.iloc[0:5, 0:5]

Unnamed: 0,3,6,8,10,11
3,0.0,0.0,0.0,0.0,1.0
6,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0
11,1.0,0.0,0.0,0.0,0.0


In [24]:
user_sim_ak.shape

(4081, 4081)

In [25]:
#Most Similar Users
user_sim_ak.idxmax(axis=1)[0:10]

3       11
6      168
8       16
10    4047
11       3
12    6676
13    5953
14    4138
16       8
19    3603
dtype: int64

In [26]:
movies_ak[(movies_ak['userId']==6) | (movies_ak['userId']==168)]

Unnamed: 0,userId,movie,rating
1,6,Toy Story (1995),5.0
60,168,Toy Story (1995),4.5
3725,6,Grumpier Old Men (1995),3.0
6464,6,Sabrina (1995),5.0


In [27]:
user_1=movies_ak[movies_ak['userId']==6]

In [28]:
user_2=movies_ak[movies_ak['userId']==168]

In [29]:
pd.merge(user_1,user_2,on='movie',how='outer')

Unnamed: 0,userId_x,movie,rating_x,userId_y,rating_y
0,6,Toy Story (1995),5.0,168.0,4.5
1,6,Grumpier Old Men (1995),3.0,,
2,6,Sabrina (1995),5.0,,


In [30]:
movies_ak[(movies_ak['userId']==3) | (movies_ak['userId']==11)]

Unnamed: 0,userId,movie,rating
0,3,Toy Story (1995),4.0
4,11,Toy Story (1995),4.5
7446,11,GoldenEye (1995),2.5
