In [1]:
import numpy as np
import pandas as pd
import math
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import StratifiedKFold

## Read data

In [2]:
features = np.loadtxt('../../data/usr_his_sample/features.txt')

In [3]:
features.shape

(9817, 784)

In [4]:
item_list = np.loadtxt('../../data/usr_his_sample/item_list.txt').astype(int)
len(item_list)

9816

In [5]:
import pickle
usr_list = None
with open ('../../data/usr_his_sample/usr_list_5-20.txt', 'rb') as fp:
    usr_list = pickle.load(fp) 
len(usr_list)

44709

In [6]:
def cnt_subset(usr_list, item_list, thres = 5):
    cnt = list()
    n_usr = 0
    for usr in usr_list:
        n_item = np.sum([item in item_list for item in usr])
        if n_item > 0:
            if n_item > thres:
                cnt.append(n_item)
                n_usr += 1
    print np.array(cnt).mean(), np.max(cnt), n_usr
    
def get_subset(usr_list, item_list, thres = 5):
    subset_usrlist = []
    for usr in usr_list:
        n_item = np.sum([item in item_list for item in usr])
        if n_item > 0:
            subset_usrlist.append(usr)
    return subset_usrlist

In [7]:
cnt_subset(usr_list, item_list[:2000], thres = 5)
item_list_small1 = item_list[:2000]
usr_list_small1 = get_subset(usr_list, item_list_small1, thres = 5)

6.52314474651 11 1361


In [8]:
usr_list_train = usr_list_small1[:1000]
usr_list_test = usr_list_small1[1000:]

## Functions

In [9]:
class  Cluster:
    def __init__(self, mean, cov, weight = 1):
        self.mean = mean
        self.cov = cov
        self.weight = weight

In [11]:
#public
def cal_logp_item_cluster(feature_item, cluster):
    return multivariate_normal.logpdf(feature_item, mean = cluster.mean, cov = cluster.cov)

#not public
def cal_logp_user_cluster(user, features, cluster):
    return np.sum([cal_logp_item_cluster(features[item], cluster) for item in user])

#public
def cal_ptrans_user(user, features, clusters):
    return np.sum([cluster.weight * cal_logp_user_cluster(user, features, cluster) for cluster in clusters])

#public
def cal_ptrans_item_user(item, user, features, clusters):
    return np.sum([cal_logp_item_cluster(features[item], cluster) + cal_logp_user_cluster(user, features, cluster)\
                   for cluster in clusters])

In [12]:
import sys
sys.path.append("..")
from clustering.clustering import kmeans_cal

In [13]:
def  find_clusters(items, features, n_cluster):
    '''
        itemset: list indexes of images corresponding to video_img. Discard indexes where feature is None.
        usr_list: |U| users, each user is a sets of items.
        feature: gist or lab feature coressponding image in video_img.
    '''
    kmeans = kmeans_cal(features[items], n_cluster)
    label_init = kmeans.labels_
#     label_init = np.random.randint(0, n_cluster, len(items))

    gmm = GaussianMixture(n_components=n_cluster, covariance_type = 'full', max_iter = 100, random_state=0)
    gmm.means_init = np.array([features[items[label_init == i]].mean(axis=0) for i in range(n_cluster)])
    gmm.fit(features[items])
    label_pred = gmm.predict(features[items])
    clusters = [np.where(label_pred == i)[0] for i in range(n_cluster)]

#     means = gmm.means_
#     covs = gmm.covariances_
#     weights = gmm.weights_
    return gmm

In [85]:
multi_norm = np.vectorize(multivariate_normal.pdf)


(784, 3)

In [29]:
np.cov(features[[0, 1, 2]])

array([[  3.70474708e-04,   6.36349648e-05,   3.38562390e-05],
       [  6.36349648e-05,   2.19750643e-04,   9.06684256e-05],
       [  3.38562390e-05,   9.06684256e-05,   7.73454590e-05]])

In [None]:
np.apply_along_axis(a)

In [None]:
def cal_pdf(x, mean, var):
    return 1.0 / (np.roots(len(X)) * np)

In [48]:
import math 

def norm_pdf_multivariate(x, mu, sigma):
    size = len(x)
    if size == len(mu) and (size, size) == sigma.shape:
        det = np.linalg.det(sigma)
        if det == 0:
            raise NameError("The covariance matrix can't be singular")

        norm_const = 1.0/ ( math.pow((2*pi),float(size)/2) * math.pow(det,1.0/2) )
        x_mu = matrix(x - mu)
        inv = sigma.I        
        result = math.pow(math.e, -0.5 * (x_mu * inv * x_mu.T))
        return norm_const * result
    else:
        raise NameError("The dimensions of the input don't match")

In [158]:

def tmp():
    idx = item_list[:20]
    mean = np.mean(features[[idx]], axis = 0)
    cov = np.cov(features[[idx]], rowvar=0)
    print mean.shape, cov.shape
    print np.apply_along_axis(multivariate_normal.pdf, 1, features[[idx]], mean, cov, True)
tmp()

(784,) (784, 784)
[  1.92380670e+14   1.92380670e+14   1.92380670e+14   1.92380670e+14
   1.92380670e+14   1.92380670e+14   1.92380670e+14   1.92380670e+14
   1.92380670e+14   1.92380670e+14   1.92380670e+14   1.92380670e+14
   1.92380670e+14   1.92380670e+14   1.92380670e+14   1.92380670e+14
   1.92380670e+14   1.92380670e+14   1.92380670e+14   1.92380670e+14]


In [194]:
def cal_meancov(features, items, label, n_cluster):
    mean = [np.mean(features[items[label == i]], axis = 0) for i in range(n_cluster)]
    cov = [np.cov(features[items[label == i]], rowvar = 0) for i in range(n_cluster)]
    return mean, cov

In [274]:
def  my_gmm(items, features, n_cluster):
    kmeans = kmeans_cal(features[items], n_cluster)
    label_init = kmeans.labels_
    mean, cov = cal_meancov(features, items, label_init, n_cluster)
    p_y = np.array([1.0/n_cluster] * n_cluster).reshape(-1, 1)

    iter = 20
    converge = False
    while (iter > 0 and not converge):
        # e step
        p_x_y = np.vstack([np.apply_along_axis(multivariate_normal.pdf, 1, features[[items]], mean[k], cov[k], True) \
                                                for k in range(n_cluster)]).T # N * K
        p_y_x = p_x_y * p_y.reshape(1, -1) # N * K
        p_x = np.sum(p_x_y, axis = 1).reshape(-1, 1) # N * 1
        p_y_x = (p_y_x / p_x).T # N * K
        
        # m step       
        sum_weight = np.sum(p_y_x, axis = 1).reshape(-1, 1) # K x 1
        
        mean = np.dot(p_y_x, features[items]) * (1.0/sum_weight) # K x F
        x_submean = np.array([features[items] - mean[k] for k in range(n_cluster)])

        cov = [np.dot(np.multiply(x_submean[k].T, p_y_x[k]), x_submean[k]) / sum_weight[k] for k in range(n_cluster)]
        p_y = np.dot(p_x_y.T, p_x).reshape(-1, 1) # K x N * N x 1
        
        print iter, np.mean(np.log(p_x))
        iter -= 1
    return p_y, mean, cov

In [13]:
def  validate(gmm, usr_list, features):
    means, covs, weights = gmm.means_, gmm.covariances_, gmm.weights_
    clusters = [Cluster(means[i], covs[i], weights[i]) for i in range(len(means))]
    
    p_users = []
    for usr in usr_list:
        p_user = cal_ptrans_user(usr, features, clusters)
        p_users.append(p_user)
        print p_user
    return np.array(p_users).mean()

## Debug

In [259]:
data = np.array([np.random.normal(loc=10.0, scale=1.0, size=(20, 7))] + [np.random.normal(loc=20.0, scale=1.0, size=(20, 7))] + [np.random.normal(loc=50.0, scale=1.0, size=(20, 7))]).reshape(60, 7)
N, _ = data.shape
K = 3
print data.shape, type(data)

(60, 7) <type 'numpy.ndarray'>


In [270]:
p_y, mean, cov = my_gmm(np.array(range(60)), data, n_cluster = 3)
print mean

20 -12.7604288987
19 -12.472567208
18 -12.3125767144
17 -12.2094962694
16 -12.1386880379
15 -12.1202728674
14 -12.1021861117
13 -12.065644221
12 -12.0482573997
11 -12.0248174028
10 -12.0030535597
9 -12.0014012181
8 -12.0009407176
7 -12.0007598954
6 -12.000684872
5 -12.0006421285
4 -12.000604466
3 -12.0005627467
2 -12.000515067
1 -12.0004633653
[[ 25.8543696   26.82300346  25.66420291  26.52441132  26.66013374
   26.22988166  25.69745427]
 [ 32.52733427  32.35261037  32.54073799  32.28105042  32.1778641
   31.97208452  32.76126487]
 [ 22.85116206  22.48767773  22.75261345  22.40397196  23.30197324
   23.10060802  23.21072158]]


In [275]:
p_y, mean, cov = my_gmm(np.array(range(60)), data, n_cluster = 3)


20 -9.77325395284
19 -9.76872742248
18 -9.76872742248
17 -9.76872742248
16 -9.76872742248
15 -9.76872742248
14 -9.76872742248
13 -9.76872742248
12 -9.76872742248
11 -9.76872742248
10 -9.76872742248
9 -9.76872742248
8 -9.76872742248
7 -9.76872742248
6 -9.76872742248
5 -9.76872742248
4 -9.76872742248
3 -9.76872742248
2 -9.76872742248
1 -9.76872742248


In [276]:
w_y, mean, cov = my_gmm(item_list_small1[:200], features, n_cluster = 32)

20 inf


  


ValueError: array must not contain infs or NaNs

## Run

### Baseline

In [27]:
gmm = find_clusters(item_list_small1, features, n_cluster=32)
gmm

GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
        means_init=array([[ 0.,  0., ...,  0.,  0.],
       [ 0.,  0., ...,  0.,  0.],
       ...,
       [ 0.,  0., ...,  0.,  0.],
       [ 0.,  0., ...,  0.,  0.]]),
        n_components=32, n_init=1, precisions_init=None, random_state=0,
        reg_covar=1e-06, tol=0.001, verbose=0, verbose_interval=10,
        warm_start=False, weights_init=None)

In [28]:
means, covs, weights = gmm.means_, gmm.covariances_, gmm.weights_
clusters = [Cluster(means[i], covs[i], weights[i]) for i in range(len(means))]

In [None]:
validate(gmm, usr_list_train, features)

-14464.7699325
11967.2023176
90.0797264361
12209.6058061
12346.6630891
23901.7393533
7246.52643524
18198.7563252
-1435.56535379
6088.90851677
10829.1549864
-26546.8499828
-52066.1451245
24213.7845474
-33477.2341046
5321.68112101
8154.38468958
10299.7184326
21053.7116404
14190.1849448
-3810.67958563
4842.60712368
17885.4688179
13529.9294485
16164.9407064
17118.5641051
4041.42254556
-1822.98580168
15113.3395535
-13101.4519725
15780.9542663
3828.96561454
8412.9170827
13480.0797003
11721.8427875
24733.688572
11820.587747
11821.4538249
-53782.7900892
-4945.13315772
18934.1635147
13026.3288967
-8100.13375376
18123.5070394
22103.1064673
11480.8570812
-16641.1051322
-8395.9387903
-94782.1859585
14060.5550895
6406.98436968
-34810.6137321
-17369.7484745
29870.7530617
14465.664894
18999.964355
5636.08518467
25302.9033278
3821.70721296
23883.1304035
-2866.98303899
-1685.56100494
-16040.4110154
10181.026481
20667.6041894
-108281.032625
25310.019494
24929.0262989
40464.1182743
-15353.0116436
-36235.