In [89]:
import numpy as np
import pandas as pd
import pickle

In [90]:
# Load the ratings table from the ml-latest-small dataset.
# The preview in the next cell shows the columns userId, movieId, rating, timestamp.
df = pd.read_csv('../offline_datasets/ml-latest-small/ratings.csv')

In [91]:
# Preview the first rows of the ratings table.
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [92]:
# Number of distinct users (nu) and distinct movies (nm) in the ratings table;
# these become the row and column counts of the rating matrix.
nu = df['userId'].unique().size
nm = df['movieId'].unique().size

In [93]:
# Distinct movieId values. The output shows ids far larger than the number of
# distinct movies, so they cannot be used directly as column indices.
df['movieId'].unique()

array([     1,      3,      6, ..., 160836, 163937, 163981])

In [94]:
# Distinct rating values that occur in the data.
df['rating'].unique()

array([4. , 5. , 3. , 2. , 1. , 4.5, 3.5, 2.5, 0.5, 1.5])

In [95]:
# Dense user x movie rating matrix, zero-initialised; cells that never get a
# rating assigned stay at 0.
M = np.zeros(shape=(nu, nm), dtype=float)
M.shape

(610, 9724)

In [96]:
# Check the dtype of the NumPy array that .values produces for this frame.
df.head().values.dtype

dtype('float64')

In [97]:
# Fill the rating matrix M from the ratings table.
#   row    = userId - 1
#   column = order in which the movieId was first seen; the movieId -> column
#            mapping is kept in m2id for later lookups.
# df.values yields float rows, hence the int() casts on the two ids.
m2id = {}
for user_id, movie_id, rating, _ts in df.values:
    row_idx = int(user_id) - 1
    col_idx = m2id.setdefault(int(movie_id), len(m2id))
    M[row_idx, col_idx] = rating

In [98]:
# Reduced SVD of the rating matrix: M = u @ diag(s) @ vh.
# full_matrices=False keeps only min(M.shape) components in u and vh.
u, s, vh = np.linalg.svd(M, full_matrices=False)

In [99]:
# Shapes of the three SVD factors.
u.shape, s.shape, vh.shape

((610, 610), (610,), (610, 9724))

In [100]:
# First ten singular values.
s[:10]

array([534.41989777, 231.23661142, 191.1508762 , 170.42250831,
       154.552948  , 147.33575651, 135.65556768, 122.66302989,
       121.44217651, 113.11144323])

In [101]:
# Keep only the ntop leading SVD components as latent features.
# Reconstruction sanity check: np.allclose(M, np.dot(u * s, vh))
ntop = 30

# Movie side: scale every row of vh by its singular value, then keep the
# first ntop rows, giving one ntop-dimensional column per movie.
smat = np.diag(s)
svh = smat.dot(vh)
V = svh[:ntop]

# User side: the first ntop left-singular vectors (not scaled by s),
# transposed so that each column is one user.
U = u.T[:ntop]

U.shape, V.shape

((30, 610), (30, 9724))

In [102]:
# Inspect the first column of U and the first column of V
# (the ntop-dimensional vectors stored at index 0 of each matrix).
U[:,0], V[:,0]

(array([-0.05555415, -0.06167385,  0.01089745, -0.00082937,  0.0921447 ,
        -0.05250689, -0.00551149,  0.00820931, -0.03297767,  0.02250404,
        -0.05273948,  0.0196212 , -0.03074507,  0.01346326, -0.0693985 ,
         0.04457773, -0.05034111,  0.03149806, -0.03487815,  0.03808566,
        -0.0336458 ,  0.00959982, -0.06731688,  0.03477066, -0.03346122,
         0.11865251, -0.02239713,  0.06447089, -0.02106628,  0.08692598]),
 array([-37.64982757,  -6.38009441,  14.99461719,   0.28694728,
         -1.88714717,  -4.79004995,  -8.80762925,   2.19863593,
         -1.15330456,   1.38756162,  -3.04663882,  -2.40524419,
          6.95834502,   1.36681044,   4.27302317,  -2.77527557,
          0.29953447,   1.19655044,   1.97963825,  -0.59467467,
        -11.58858021,   6.43065938,   2.60715411,  -2.49877637,
         -2.02412581,   0.79009585,  -0.96189455,  -3.02546698,
         -0.84703585,  -0.13809871]))

In [103]:
# Load the movie metadata table from the ml-latest-small dataset.
# The preview in the next cell shows the columns movieId, title, genres.
dfm = pd.read_csv('../offline_datasets/ml-latest-small/movies.csv')

In [104]:
# Preview the first rows of the movie table; genres are '|'-separated.
dfm.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [105]:
# Build V_, the list of (movie column index, genre index) pairs.
#   - movie column index comes from m2id (the column of that movie in M / V)
#   - genre index is assigned in order of first appearance and kept in g2id
# Every '|'-separated token of the genres field counts as a genre.
V_ = []
g2id = {}
for movie_id, _title, genre_field in dfm.values:
    genre_names = genre_field.split('|')
    if movie_id not in m2id:
        # No column for this movie in the rating matrix: report its id and skip.
        print(movie_id)
        continue
    mid = m2id[int(movie_id)]
    V_.extend((mid, g2id.setdefault(name, len(g2id))) for name in genre_names)

len(g2id), len(V_)

1076
2939
3338
3456
4194
5721
6668
6849
7020
7792
8765
25855
26085
30892
32160
32371
34482
85565


(20, 22046)

In [106]:
from numpy.linalg import norm

def sim(a,b):
    """Cosine similarity of two 1-D vectors, clipped below at zero.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length.

    Returns
    -------
    float
        max(0, a . b) / (||a|| * ||b||), a value in [0, 1] up to rounding.
        Returns 0.0 when either vector has zero norm, instead of the NaN
        (and RuntimeWarning) that 0 / 0 would produce.
    """
    denom = norm(a)*norm(b)
    if denom == 0:
        # A zero vector has no direction; treat it as dissimilar to everything.
        return 0.0
    # Negative dot products (vectors pointing apart) are clipped to 0.
    nom = max(0, np.dot(a,b))
    return nom / denom
    
# Compute, for every (movie, genre) pair in V_:
#   sims[pair]      -> similarities of the movie to the first nsam movie columns of V
#   sims_tars[k, i] -> similarity of pair i's movie to the k-th sampled user
n = V.shape[1]
nsam = 500
nusam = 5
# NOTE(review): `sam` is drawn but never used below; the loop compares against
# movie columns 0..nsam-1, not against a random sample. Kept as-is so the stored
# similarities stay aligned with movie column indices; confirm this is intended.
sam = np.random.choice(n, size=nsam, replace=False)
# No seed is set in the visible cells, so the sampled users differ between runs.
usam = np.random.choice(nu, size=nusam, replace=False)
# Rebuild the user features from the SVD factor instead of slicing U in place,
# so re-running this cell does not index into an already-subsampled U.
U = u[:, :ntop].T[:, usam]

sims = dict()
sims_m = dict()   # per-movie cache: a movie appears once per genre in V_
sims_tars = []
for i,v in enumerate(V_):
    if i % 1000 == 0:
        print(i)

    mid = v[0]
    feat = V[:,mid]
    if mid not in sims_m:
        # Comprehension variables stay local, so the SVD singular values `s`
        # are no longer overwritten by a scratch similarity value.
        sims_m[mid] = [sim(feat, V[:,j]) for j in range(nsam)]
    # All pairs of the same movie share one list object.
    sims[v] = sims_m[mid]

    sims_tars.append([sim(feat, U[:,j]) for j in range(nusam)])

# Transpose to (nusam, len(V_)): one row per sampled user.
sims_tars = np.array(sims_tars).T
len(sims), sims_tars.shape

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000


(22046, (5, 22046))

In [109]:
# One dict per row of sims_tars (one row per sampled user), mapping each
# (movie index, genre index) pair in V_ to that user's similarity value.
# Built with a comprehension so no loop variable leaks into the notebook
# namespace; the loop form rebound `s`, the SVD singular values.
sims_tars_d = [dict(zip(V_, user_row)) for user_row in sims_tars]

In [110]:
# Persist the dataset as one pickled tuple:
# (list of (movie, genre) pairs, pair -> movie similarities, per-user pair -> similarity dicts).
with open('../datasets/movie.pkl', 'wb') as fout:
    pickle.dump((V_, sims, sims_tars_d), fout)

In [108]:
# Similarity list stored for the pair (movie index 0, genre index 0),
# computed with sim = max(0, dot) / denom (clipped cosine similarity).
sims[(0,0)]

[1.0000000000000002,
 0.6027895741154934,
 0.634120168183958,
 0.7278068324701631,
 0.7160304377657363,
 0.43740986070445903,
 0.4291039878116272,
 0.7221896459832647,
 0.5510876761405208,
 0.2952253597954802,
 0.5356902817726987,
 0.6115395775083321,
 0.7173244794005422,
 0.7009555093807613,
 0.5919767356257729,
 0.8565319660540996,
 0.7596977921479582,
 0.6183923327944915,
 0.5541702868483027,
 0.512924283925958,
 0.8200076939178396,
 0.5804465064508548,
 0.7679999746363937,
 0.3198497078069995,
 0.45198017327752826,
 0.7342195714238237,
 0.832526443974587,
 0.7795043378293676,
 0.741431891019222,
 0.5229620007169511,
 0.6160235550897859,
 0.6499850404240002,
 0.6663979631196535,
 0.7118689397194067,
 0.7737732006677729,
 0.6761475401498697,
 0.752929084642371,
 0.8114293697392553,
 0.6661278787732492,
 0.6278095310157348,
 0.7926882683270418,
 0.7752573867925032,
 0.839705724853639,
 0.21804723390237393,
 0.7435712306561258,
 0.5822747427896963,
 0.3179201092806741,
 0.2419135114909

In [70]:
# Same lookup under an alternative similarity, (dot + denom) / (2 * denom),
# i.e. cosine similarity rescaled from [-1, 1] to [0, 1].
# NOTE(review): this cell's execution count (70) is lower than every other cell
# shown, and its output does not match the sim() defined in the visible cells;
# it appears to be left over from an earlier variant.
sims[(0,0)]

[1.0,
 0.8013947870577467,
 0.8170600840919789,
 0.8639034162350816,
 0.8580152188828682,
 0.7187049303522295,
 0.7145519939058136,
 0.8610948229916323,
 0.7755438380702604,
 0.6476126798977401,
 0.7678451408863494,
 0.805769788754166,
 0.8586622397002711,
 0.8504777546903807,
 0.7959883678128865,
 0.9282659830270499,
 0.8798488960739791,
 0.8091961663972457,
 0.7770851434241514,
 0.756462141962979,
 0.9100038469589197,
 0.7902232532254274,
 0.8839999873181968,
 0.6599248539034998,
 0.7259900866387642,
 0.8671097857119119,
 0.9162632219872935,
 0.8897521689146839,
 0.8707159455096111,
 0.7614810003584757,
 0.808011777544893,
 0.824992520212,
 0.8331989815598267,
 0.8559344698597033,
 0.8868866003338866,
 0.8380737700749349,
 0.8764645423211855,
 0.9057146848696277,
 0.8330639393866245,
 0.8139047655078674,
 0.8963441341635208,
 0.8876286933962516,
 0.9198528624268195,
 0.609023616951187,
 0.871785615328063,
 0.7911373713948482,
 0.658960054640337,
 0.6209567557454632,
 0.78130305582830