# Practicum Problems

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
import math

## Problem 1

##### Load data into dataframe

In [2]:
columns = [['user id','age','gender','occupation','zip code'],
           ['movie id','movie title','release date','video release date','IMDb URL','unknown','Action','Adventure','Animation','Childrens','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance ','Sci-Fi','Thriller','War' ,'Western'],
           ['user id','movie id','rating','timestamp']]

user_data = pd.read_csv('./data/u.user', sep='|', names = columns[0], encoding='latin-1')
item_data = pd.read_csv('./data/u.item', sep='|', names = columns[1], encoding='latin-1')
ratings_data = pd.read_csv('./data/u.data', sep='\t', names = columns[2], encoding='latin-1')

##### Visualize and understand data

In [3]:
print(user_data.shape, item_data.shape, ratings_data.shape)

(943, 5) (1682, 24) (100000, 4)


In [4]:
user_data.head()

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [5]:
item_data.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
ratings_data.head()

Unnamed: 0,user id,movie id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


##### Make a utility matrix and center the data

In [7]:
utility_matrix = ratings_data.pivot(index='user id',columns='movie id',values='rating')
user_means = utility_matrix.mean(axis=1)
utility_centered = utility_matrix - user_means
utility_centered = utility_centered.where((pd.notnull(utility_centered)),0)
utility_centered

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.389706,-0.709677,1.203704,-1.333333,0.125714,1.364929,0.034739,-2.79661,0.727273,-1.206522,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.389706,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,-2.206522,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.389706,-0.709677,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.727273,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.000000,0.000000,0.000000,-2.333333,0.000000,0.000000,0.034739,1.20339,-1.272727,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,1.389706,0.000000,0.000000,0.000000,0.000000,0.000000,0.034739,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### Save user 15 and 200

In [9]:
user_200 = user_data.iloc[199]
user_15 = user_data.iloc[14]
print(user_200)
print('\n',user_15)

user id              200
age                   40
gender                 M
occupation    programmer
zip code           93402
Name: 199, dtype: object

 user id             15
age                 49
gender               F
occupation    educator
zip code         97301
Name: 14, dtype: object


##### Save item 95

In [10]:
item_95 = item_data[94:95]
print(item_95)

    movie id     movie title release date  video release date  \
94        95  Aladdin (1992)  01-Jan-1992                 NaN   

                                             IMDb URL  unknown  Action  \
94  http://us.imdb.com/M/title-exact?Aladdin%20(1992)        0       0   

    Adventure  Animation  Childrens  ...  Fantasy  Film-Noir  Horror  Musical  \
94          0          1          1  ...        0          0       0        1   

    Mystery  Romance   Sci-Fi  Thriller  War  Western  
94        0         0       0         0    0        0  

[1 rows x 24 columns]


##### Select Features for our Item 95

In [11]:
feat_select = item_95.iloc[:, 5:24]
feat_select.head()

Unnamed: 0,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
94,0,0,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0


##### Save item profile

In [12]:
item_profile = item_data.iloc[:,5:24]
item_profile.head()

Unnamed: 0,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


##### Generate User Profile by taking dot product of Item Profile and Utility Matrix

In [13]:
user_profile = np.dot(utility_centered,item_profile)
user_profile_200 = user_profile[199]
user_profile_15 = user_profile[14]
print("\nUser Profile:\n", user_profile)
print("\nUser Profile (200):\n", user_profile_200)
print("\nUser Profile (15):\n", user_profile_15)


User Profile:
 [[  0.06486486 -19.04561424 -23.97711089 ...   0.30691865   5.61156716
   -0.39044204]
 [  0.           0.16987665   2.03863023 ...  -0.51419764  -0.45999396
    0.        ]
 [  0.          -9.90027817   1.20553194 ... -24.45159726  -1.38051487
    0.        ]
 ...
 [  0.           2.68707838   3.3375375  ...   1.02512049   3.50804598
    0.        ]
 [  0.           9.95250865  12.35307555 ...   4.44379932  10.76957541
    2.60729178]
 [  0.           2.49861816  -7.48438072 ...   7.74176264   1.45769492
   -9.91893414]]

User Profile (200):
 [ 0.         53.68782567 35.4495237   7.6489686  14.16588957  8.80920192
  0.20447963 -1.78787879 34.25995913  3.6691485   2.07481203  8.48019395
  8.71830943  2.46827733 25.27766511 36.71256459 22.5206827  13.54465351
  1.98478731]

User Profile (15):
 [  0.         -18.26962989  -5.63815687  -5.52457983 -14.10394259
 -25.84333442  -6.31283599   0.         -24.23614665  -1.76218535
  -0.43397683  -6.48214286  -2.84021164  -6.1972

##### Cosine Similarity

In [14]:
cosine_sim = metrics.pairwise.cosine_similarity(user_profile,feat_select)
print("User 15")
print("Cosine Similarity:", cosine_sim[14])
print("Cosine Distance:", 1-cosine_sim[14])
print("\nUser 200")
print("Cosine Similarity:", cosine_sim[199])
print("Cosine Distance:", 1-cosine_sim[199])

User 15
Cosine Similarity: [-0.43632073]
Cosine Distance: [1.43632073]

User 200
Cosine Similarity: [0.21328933]
Cosine Distance: [0.78671067]


##### In conclusion, the system is more likely to recommend Movie 95 to User 200 as their distance to the recommendation axis is smaller than User 15.

___
## Problem 2

##### Utility Matrix generated and centered in first part will be used further.

##### Save the Users from Centered Utility Matrix

In [15]:
user_1 = utility_centered.iloc[:1,]
user_other = utility_centered.iloc[1:,]

##### Find Top 10 Cosine Similar Users to the User 1:

In [16]:
cosine_sim = metrics.pairwise.cosine_similarity(user_1,user_other)
index = np.argpartition(cosine_sim, -10, axis=1)[:, -10:]
print(index)

[[ 42 274 641 590 755 265 604 455 914 736]]


##### Calculate Expected Rating

In [17]:
c,ratings = 0, [0,0,0,0,0,0,0,0,0,0]

for i in range(10):
    ratings[i] = user_other[508][index[0][i]]
    if ratings[i] != 0.0:
        c += 1

total = math.fsum(ratings)
mean = total / c
print("In Conclusion, the expected rating for the item for user 1 is: ",mean)

In Conclusion, the expected rating for the item for user 1 is:  0.1724137931034484


___