In [12]:
import numpy as np
import pandas as pd
import math
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import StratifiedKFold


## Read data

In [4]:
# Load the numeric matrix stored in data.txt (the recorded output below shows shape (9817, 784)).
# presumably one feature vector per row — TODO confirm against whatever produced data.txt
data = np.loadtxt('data.txt')

In [5]:
# Sanity check: show the dimensions of the array that was just loaded.
data.shape

(9817, 784)

In [7]:
import pickle
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only run this cell on a usr_list.txt that comes from a trusted source.
usr_list = None
# Despite the .txt extension, the file is opened in binary mode and unpickled.
with open ('usr_list.txt', 'rb') as fp:
    usr_list = pickle.load(fp) 
# Show how many entries the unpickled object holds.
len(usr_list)

310

In [23]:
# Read item_list.txt as numbers and cast to int; the values are later passed to
# find_clusters, which uses them to index rows of `data`.
# NOTE(review): `data` has 9817 rows but 9816 indices are listed here — confirm
# that the missing row is excluded on purpose.
item_list = np.loadtxt('item_list.txt').astype(int)
len(item_list)

9816

## Likelihood probability

In [93]:
# likelihood probability

def cal_mean_var(X):
    """Return the column-wise mean and the covariance between columns of ``X``.

    Parameters
    ----------
    X : array-like, 2-D
        Observations in rows, variables in columns (the mean is taken over
        axis 0, and the covariance uses the same convention).

    Returns
    -------
    mean : ndarray
        Mean of every column.
    cov : ndarray
        Covariance matrix between the columns of ``X``.
    """
    mean = np.mean(X, axis=0)
    # np.cov has no ``axis`` keyword (passing one raises TypeError);
    # ``rowvar=False`` is the way to declare that columns are the variables.
    cov = np.cov(X, rowvar=False)
    return mean, cov

def cal_px_y(x, mean, cov):
    """Evaluate the multivariate-normal log-density of ``x`` for ``mean`` and ``cov``.

    NOTE(review): the returned value is a *log* density (``logpdf``), even
    though the name reads like a plain probability p(x|y). ``cal_px`` and
    ``cal_py_x`` combine their inputs with products and ratios, which
    presumably expects non-log values — confirm before feeding this result
    to them.
    """
    log_density = multivariate_normal.logpdf(x, mean=mean, cov=cov)
    return log_density

def cal_px(px_ys, pys):
    """Return the sum of ``px_ys[k] * pys[k]`` over every index of ``pys``.

    The length of ``pys`` drives the iteration: surplus entries in ``px_ys``
    are ignored, and a shorter ``px_ys`` raises IndexError.
    The accumulation starts from ``0.0``, so empty input yields ``0.0``.
    """
    return sum((px_ys[k] * pys[k] for k in range(len(pys))), 0.0)

def cal_py(cluster, n):
    """Return ``1 - len(cluster) / n`` using floating-point division.

    NOTE(review): this is the *complement* of the cluster's share of ``n``;
    if a cluster prior P(y) is intended it would presumably be
    ``len(cluster) / n`` — confirm with the caller before relying on it.
    """
    share = float(len(cluster)) / n
    return 1 - share

def cal_py_x(px, py, px_y):
    """Apply Bayes' rule: return ``px_y * py / px``.

    Argument order is (evidence, prior, likelihood).
    """
    joint = px_y * py
    return joint / px

## Functions

In [84]:
class Cluster:
    """Plain record for the parameters of one mixture component.

    The constructor stores ``mean``, ``cov`` and ``weight`` exactly as given;
    nothing is validated or copied. ``weight`` defaults to 1.
    """

    def __init__(self, mean, cov, weight = 1):
        self.mean, self.cov, self.weight = mean, cov, weight

In [82]:
def cal_log_prob(p_items):
    """Return the sum of the natural logarithms of the values in ``p_items``."""
    log_terms = np.log(p_items)
    return np.sum(log_terms)

def cal_p_item_cluster(feature_item, cluster):
    """Score one feature vector under ``cluster`` by delegating to ``cal_px_y``.

    Reads ``cluster.mean`` and ``cluster.cov``. The result is whatever
    ``cal_px_y`` returns, which is currently a log-density.
    """
    mu, sigma = cluster.mean, cluster.cov
    return cal_px_y(feature_item, mu, sigma)
    
def cal_p_user_cluster(user, features, cluster):
    """Multiply the per-item scores of every item in ``user`` under ``cluster``.

    ``user`` is iterated for item keys, each of which indexes ``features``.

    NOTE(review): ``cal_p_item_cluster`` ends in a ``logpdf`` call, so this is
    a product of log-densities rather than of probabilities — confirm intent.
    """
    item_scores = []
    for item in user:
        item_scores.append(cal_p_item_cluster(features[item], cluster))
    return np.prod(item_scores)

def cal_p_user(user, features, clusters):
    """Sum ``cluster.weight * cal_p_user_cluster(user, features, cluster)`` over ``clusters``."""
    weighted_terms = []
    for component in clusters:
        user_score = cal_p_user_cluster(user, features, component)
        weighted_terms.append(component.weight * user_score)
    return np.sum(weighted_terms)

def cal_p_item_user(item, features, user, clusters):
    """Mix the per-cluster item scores by each cluster's posterior given ``user``.

    Computes ``sum_c score(item | c) * P(c | user)`` where the posterior is
    obtained with Bayes' rule from the cluster weight, the user's score under
    the cluster, and the user's overall score.

    Parameters
    ----------
    item : key used to index ``features`` for the item being scored.
    features : indexable collection of feature vectors.
    user : iterable of item keys belonging to the user.
    clusters : iterable of objects exposing ``mean``, ``cov`` and ``weight``.

    Note the argument order (item, features, user, clusters) differs from
    ``cal_ptrans_item_user``; it is kept for backward compatibility.

    NOTE(review): the scores come from ``cal_p_item_cluster``, which currently
    returns log-densities; the Bayes arithmetic below presumably expects plain
    probabilities — confirm before using the result.
    """
    # The original referenced an undefined ``p_user`` and multiplied the
    # function object ``cal_p_user_cluster`` by a weight; compute both properly.
    p_user = cal_p_user(user, features, clusters)
    p_item_clusters = np.array([cal_p_item_cluster(features[item], cluster) for cluster in clusters])
    # cal_py_x(px, py, px_y) == px_y * py / px, i.e. P(cluster | user).
    p_clusters_user = np.array([
        cal_py_x(p_user, cluster.weight, cal_p_user_cluster(user, features, cluster))
        for cluster in clusters
    ])
    return np.sum(np.dot(p_item_clusters, p_clusters_user))


In [106]:
#public
def cal_logp_item_cluster(feature_item, cluster):
    """Return the Gaussian log-density of ``feature_item`` under ``cluster``.

    Uses ``cluster.mean`` and ``cluster.cov`` as the distribution parameters.
    The value is a log *density*, so it may be positive.
    """
    mu, sigma = cluster.mean, cluster.cov
    return multivariate_normal.logpdf(feature_item, mean=mu, cov=sigma)

#not public
def cal_logp_user_cluster(user, features, cluster):
    """Sum the log-densities of every item in ``user`` under ``cluster``.

    Each element of ``user`` is used as an index into ``features``.
    """
    per_item_logp = []
    for item in user:
        per_item_logp.append(cal_logp_item_cluster(features[item], cluster))
    return np.sum(per_item_logp)

#public
def cal_ptrans_user(user, features, clusters):
    """Return ``sum_c cluster.weight * cal_logp_user_cluster(user, features, cluster)``.

    NOTE(review): this is a weight-averaged log-likelihood, not the mixture
    log-likelihood ``log(sum_c w_c * p(user | c))``; confirm that the averaged
    form is the score intended.
    """
    weighted_logps = []
    for component in clusters:
        user_logp = cal_logp_user_cluster(user, features, component)
        weighted_logps.append(component.weight * user_logp)
    return np.sum(weighted_logps)

#public
def cal_ptrans_item_user(item, user, features, clusters):
    """Sum, over ``clusters``, the item's log-density plus the user's summed log-density.

    For every cluster the term is
    ``cal_logp_item_cluster(features[item], c) + cal_logp_user_cluster(user, features, c)``.
    Cluster weights are not used in this score.
    """
    per_cluster = []
    for component in clusters:
        item_logp = cal_logp_item_cluster(features[item], component)
        user_logp = cal_logp_user_cluster(user, features, component)
        per_cluster.append(item_logp + user_logp)
    return np.sum(per_cluster)

In [49]:
def find_clusters(items, features, n_cluster, random_state=0):
    '''
    Fit a full-covariance Gaussian mixture to the feature rows selected by ``items``.

    items: integer row indices into ``features`` (any sequence; converted to
        a numpy array so it can be boolean-masked).
    features: array of feature vectors indexed by row; ``features[items]`` is
        the training data.
    n_cluster: number of mixture components.
    random_state: seed used for both the random initial partition and the
        mixture itself. Defaults to 0, the value the mixture previously had
        hard-coded, so existing calls keep the same estimator settings.

    Returns the fitted GaussianMixture; read ``means_``, ``covariances_`` and
    ``weights_`` from it.
    '''
    items = np.asarray(items)

    # Random hard assignment of every item to a component; it is only used to
    # derive the initial component means. The original called ``len(item)``,
    # an undefined name, and drew from the unseeded global RNG.
    rng = np.random.RandomState(random_state)
    label_init = rng.randint(0, n_cluster, len(items))
    means_init = np.array([features[items[label_init == i]].mean(axis=0) for i in range(n_cluster)])

    gmm = GaussianMixture(n_components=n_cluster, covariance_type='full', max_iter=100,
                          random_state=random_state, means_init=means_init)
    gmm.fit(features[items])
    return gmm

In [110]:
def validate(gmm, usr_list, features, verbose=True):
    """Score every user against a fitted mixture and return the mean score.

    Parameters
    ----------
    gmm : fitted mixture exposing ``means_``, ``covariances_`` and ``weights_``.
    usr_list : iterable of users; each user is an iterable of item keys that
        index ``features``.
    features : indexable collection of feature vectors.
    verbose : when True (default, matching the previous behaviour) print each
        user's score as it is computed.

    Returns
    -------
    The arithmetic mean of ``cal_ptrans_user`` over all users.
    """
    means, covs, weights = gmm.means_, gmm.covariances_, gmm.weights_
    clusters = [Cluster(means[i], covs[i], weights[i]) for i in range(len(means))]

    p_users = []
    for usr in usr_list:
        p_user = cal_ptrans_user(usr, features, clusters)
        p_users.append(p_user)
        if verbose:
            # Call form of print: valid on Python 3, and on Python 2 it prints
            # the same text as the former ``print p_user`` statement.
            print(p_user)
    return np.array(p_users).mean()

## Run

In [64]:
# Fit a 32-component mixture on the rows of `data` listed in item_list,
# then display the estimator's configuration.
gmm_2 = find_clusters(item_list, data, n_cluster=32)
gmm_2

GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
        means_init=array([[ 0.,  0., ...,  0.,  0.],
       [ 0.,  0., ...,  0.,  0.],
       ...,
       [ 0.,  0., ...,  0.,  0.],
       [ 0.,  0., ...,  0.,  0.]]),
        n_components=32, n_init=1, precisions_init=None, random_state=0,
        reg_covar=1e-06, tol=0.001, verbose=0, verbose_interval=10,
        warm_start=False, weights_init=None)

In [89]:
# Wrap each fitted component's mean, covariance and weight in a Cluster so the
# cal_* helpers can be called by hand below (validate() builds the same list itself).
means, covs, weights = gmm_2.means_, gmm_2.covariances_, gmm_2.weights_
clusters = [Cluster(means[i], covs[i], weights[i]) for i in range(len(means))]

In [95]:
# Spot check: log-density of the first user's first item under component 0.
# The value is a log density, not a log probability, so it can be positive.
cal_logp_item_cluster(data[usr_list[0][0]], clusters[0])

4526.9292052694336

In [107]:
# Spot check: summed log-density of all of the first user's items under component 0.
cal_logp_user_cluster(usr_list[0], data, clusters[0])

472998.63464479242

In [108]:
# Spot check: weight-summed score of the first user across every component.
cal_ptrans_user(usr_list[0], data, clusters)

392384.31955793931

In [111]:
# Score every user against gmm_2: prints one value per user, then shows the mean.
validate(gmm_2, usr_list, data)

392384.319558
314205.810518
299090.478678
202068.424607
230049.645962
390287.088361
263943.913369
403401.389305
445415.490075
33843.1426213
235679.681302
398419.124816
367527.655899
145188.558026
194928.382271
179157.603939
128372.474097
170788.825767
424625.062028
292889.644954
318749.100627
534260.815235
288990.36177
258740.684534
557159.236121
276399.055878
190109.795127
137016.248892
299173.485426
265234.161183
540631.013162
466453.087343
129168.383486
310602.611557
206540.223168
198102.48005
574609.739254
226906.200679
358326.266361
253820.737044
130907.633849
170219.896229
203162.977415
158509.307147
11176.308331
311770.491492
166563.477099
227952.907841
308298.624537
-2010.58397618
255765.877661
256191.582401
251610.417371
331866.298278
216815.610887
221326.520837
61904.0880482
569548.813299
364566.645232
381854.883014
231577.524168
302694.560673
109561.536853
270706.013756
379052.044851
353790.4654
157878.774747
173454.49208
551852.795991
280785.452118
261288.668602
-143542.151

252148.74166698457