In [49]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
import seaborn as sns
from itertools import product
from itertools import combinations_with_replacement
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import time
import math
%matplotlib inline

In [104]:
from sklearn.cluster import KMeans, SpectralClustering, AffinityPropagation

In [2]:
from utils import *
%load_ext autoreload
%autoreload 2

In [85]:
# gj = pd.read_csv('./filled_active_df.csv')
gj = gj[['ifp_id', 'ctt', 'cond', 'training', 'team', 'user_id', 'value', 'fcast_date']]
gj['fcast_year'] = pd.to_datetime(gj['fcast_date']).dt.year
gj['fcast_week'] = pd.to_datetime(gj['fcast_date']).dt.week
gj['ifp_week'] = gj['fcast_year'].map(str) + gj['fcast_week'].map(str) + gj['ifp_id']
gj = gj.drop('fcast_date', axis=1)
gj = gj.drop_duplicates()
gj.to_csv('./gj_df.csv', index=False)

In [86]:
# adult = pd.read_csv('../labels.txt', delimiter='\t', header=0, names=['user_id','website','rating'])
# trec = pd.read_csv('../trec-rf10-crowd/trec-rf10-data.txt', delimiter='\t')
# gj = pd.read_csv('./filled_active_df.csv')

# best_users = trec.groupby('workerID').count().sort_values('docID', ascending=False)[:150].index
# trec = trec[trec['workerID'].isin(best_users)]

# r = pd.Series([2,3,2,3], index=[1,2,0,-2])
# trec['label_bin'] = trec['label'].map(r)
gj.head(10)

Unnamed: 0,ifp_id,ctt,cond,training,team,user_id,value,fcast_date,fcast_year,fcast_week,ifp_week
0,1244-0,1a,1,a,,51,0.2,2015-01-15,2015,3,201531244-0
1,1244-0,1a,1,a,,51,0.2,2015-01-16,2015,3,201531244-0
2,1244-0,1a,1,a,,51,0.2,2015-01-17,2015,3,201531244-0
3,1244-0,1a,1,a,,51,0.2,2015-01-18,2015,3,201531244-0
4,1244-0,1a,1,a,,51,0.2,2015-01-19,2015,4,201541244-0
5,1244-0,1a,1,a,,51,0.2,2015-01-20,2015,4,201541244-0
6,1244-0,1a,1,a,,51,0.2,2015-01-21,2015,4,201541244-0
7,1244-0,1a,1,a,,51,0.2,2015-01-22,2015,4,201541244-0
8,1244-0,1a,1,a,,51,0.2,2015-01-23,2015,4,201541244-0
9,1244-0,1a,1,a,,51,0.2,2015-01-24,2015,4,201541244-0


In [87]:
# testframe = create_user_task_ids(adult, 'user_id', 'website', 'rating')
testframe = create_user_task_ids(gj, 'user_id', 'ifp_week', 'value', False, True)
testframe.head()

Unnamed: 0,ifp_id,ctt,cond,training,team,user_id,value,fcast_date,fcast_year,fcast_week,ifp_week,bin,task_id,uid
0,1244-0,1a,1,a,,51,0.2,2015-01-15,2015,3,201531244-0,0.2,0,0
1,1244-0,1a,1,a,,51,0.2,2015-01-16,2015,3,201531244-0,0.2,0,0
2,1244-0,1a,1,a,,51,0.2,2015-01-17,2015,3,201531244-0,0.2,0,0
3,1244-0,1a,1,a,,51,0.2,2015-01-18,2015,3,201531244-0,0.2,0,0
4,1244-0,1a,1,a,,51,0.2,2015-01-19,2015,4,201541244-0,0.2,1,0


In [132]:
c = pd.cut(
    testframe['value'],
    [-np.inf, .2, .4, .6, .8, np.inf],
    labels=[2,3,5,7,11]
)
testframe['bin_levels'] = c

In [135]:
mini_tf = testframe[testframe['uid']<1000]

In [96]:
def batcher(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

def split(df):
    train_df, validate_df, test_df = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])
    return train_df, validate_df, test_df

class Model(nn.Module):
    def __init__(self, users, tasks, k=2):
        super(Model, self).__init__()
        self.user_lut = nn.Embedding(users, k)
        self.task_lut = nn.Embedding(tasks, k)

        self.user_bias = nn.Embedding(users, 1)
        self.task_bias = nn.Embedding(tasks, 1)
        self.global_bias = nn.Parameter(torch.FloatTensor(1))
        
    def forward(self, users, jokes):
        user_vectors = self.user_lut(users)
        task_vectors = self.task_lut(jokes)
        user_bias = self.user_bias(users)
        task_bias = self.task_bias(jokes)

        return torch.bmm(user_vectors.unsqueeze(1),
                         task_vectors.unsqueeze(2)).squeeze() \
                         + user_bias.squeeze() + task_bias.squeeze() + self.global_bias.expand_as(user_bias.squeeze())

def val(df, model):
    crit = nn.MSELoss(size_average=False)
    total_loss = 0.
    total_num = 0
    for batch in batcher(df, 100):
        true_rating = Variable(torch.Tensor(batch.bin.values.astype(float)))
        total_num = total_num + true_rating.size(0)
        users = Variable(torch.LongTensor(batch.uid.values))
        tasks = Variable(torch.LongTensor(batch.task_id.values))
        scores = model.forward(users, tasks)
        total_loss += crit(scores, true_rating).data[0]
    return math.sqrt(total_loss/total_num)


def train(train_iter, val_iter, test_iter, model):
    opt = optim.SGD(model.parameters(), lr=0.1)
    crit = nn.MSELoss()

    print("val:", val(validate_df, model))
    for epochs in range(10):
        avg_loss = 0
        total = 0
        for i,batch in enumerate(batcher(train_df, 100)):
            opt.zero_grad()
            rating = Variable(torch.Tensor(batch.bin.values.astype(float)))
            users = Variable(torch.LongTensor(batch.uid.values))
            tasks = Variable(torch.LongTensor(batch.task_id.values))
            scores = model.forward(users, tasks)
            loss = crit(scores, rating)
            #if i % 1000==0:
            #    print (loss.data[0])
            loss.backward()
            avg_loss += loss.data[0]
            total += 1
            opt.step()
        print("train:", math.sqrt(avg_loss / float(total)))
        print("val:", val(validate_df, model))
    return model.user_lut.weight.data, model.user_bias.weight.data

In [97]:
train_df, validate_df, test_df = split(mini_tf)
users = len(mini_tf.uid.unique())
tasks = len(mini_tf.task_id.unique())
model = Model(users, tasks, k=2)
user_vec, user_bias = train(train_df, validate_df, test_df, model)

val: 2.021425214664118
train: 0.667723291625787
val: 0.2707810338877338
train: 0.2438547120450365
val: 0.23030166764296794
train: 0.2255393392086561
val: 0.22288885701801994
train: 0.22124274013772732
val: 0.22046759810117506
train: 0.2196418507688387
val: 0.21940113176434928
train: 0.21887278126535548
val: 0.21882969232214922
train: 0.21843468186871354
val: 0.21847755741050492
train: 0.21815149239896467
val: 0.21823545003058517
train: 0.2179483060409643
val: 0.2180525252680432
train: 0.21778829251129264
val: 0.21790197649609416


In [102]:
user_features = np.empty((user_vec.numpy().shape[0], user_vec.numpy().shape[1]+1))
user_features[:, :user_vec.numpy().shape[1]] = user_vec.numpy()
user_features[:, :-1] = user_bias.numpy()

In [105]:
kmeans = KMeans(n_clusters=200, random_state=0).fit(user_features)
print(kmeans.labels_)

[152  67  85 180  11 132 163  25   9 136 145 110  68 144  85  20 123 161
 177 118 186  43 124  38 124 146  64  27  18 174  72 195   8 156  30 153
  50 153  95  68 138 150  65  58 159  52 199  45 194   8 178 153 108 140
  45 107   8  46  29  92 166  39  50 175 124  15  79  75   4 166  59 197
 134  21 148 148 194  53 176 112   3   6 124  82 106  46  31   8  41  41
 117 140  69 162 178  49 124 163  84  69  30 111  69 142 182 144 119 120
  35  10 101 183 153  31 169  12  68  94 156   6 101 157 135  94 148  20
 117 198  39  69 143 138  30  50  32 103 122 198  35  81 142 132 160 142
 195 137  48  94  50  27 112  64  31 108  88  80   8 126 161  31  40 142
 174  81 162   9  65  12 187 124 182  28 169  86  52 191 125  94  96  94
 106 163 108  46 132 181 105  98 169  25 161 178  92  31 134 191  92  65
  35  69 103  41 176 176 143 119  78 173  67 190  56   4 128  48 116  92
 188  62 145 138   9 192 177  65 156 199  29  24  53  58 127 137 182 108
  25  40   4  90 183 115   8 126 157  12 107 156  8

In [136]:
completed, values, ind = compute_individual_dist(mini_tf, True, False)

In [142]:
start = time.time()
features = np.empty((ind.shape[0], ind.shape[1]**2*ind.shape[0]))
delta_matrices_all = np.empty((ind.shape[0], ind.shape[0], ind.shape[1], ind.shape[1]))
score_matrices_all = np.empty((ind.shape[0], ind.shape[0], ind.shape[1], ind.shape[1]))
for user_index in range(values.shape[0]):
    if user_index%100==0:
        print(user_index)
    #check for full joint distribution or add a prior later
    if np.sum(ind[user_index]==0) > 0:
        continue
    #compute delta matrices with all other users where applicable
    else:
        #create a mask so that other half of tasks can be used later to find score matrix
#         mask = np.random.randint(0,2,values.shape).astype(bool)
        mask = np.ones((values.shape)).astype(bool)
        delta_matrices, t_m_i_1, cluster_img = compute_deltas(user_index, completed, values, ind, mask, False, 20)
        features[user_index,:] = cluster_img.flatten()
        delta_matrices_all[user_index,:,:,:] = delta_matrices
#         score_matrices, t_m_i_2 = compute_deltas(user_index, completed, values, ind, ~mask, True, 20)
#         score_matrices_all[user_index,:,:,:] = score_matrices
#         print(np.sum(t_m_i_1), np.sum(t_m_i_2))
#         if len(np.intersect1d(np.array(np.where(t_m_i_1==True)), np.array(np.where(t_m_i_2==True))))>0:
#             print(np.intersect1d(np.array(np.where(t_m_i_1==True)), np.array(np.where(t_m_i_2==True))))
#             print(regret(score_matrices, delta_matrices, \
#                      np.logical_and((t_m_i_1==True), (t_m_i_2==True))))
print(time.time()-start)

0
100
200
300
400
500
600
700
800
900
127.64959692955017


In [149]:
np.sum(np.isnan(features))

110825

In [151]:
from sklearn.cluster import KMeans

def kmeans_missing(X, n_clusters, max_iter=10):
    """Perform K-Means clustering on data with missing values.

    Args:
      X: An [n_samples, n_features] array of data to cluster.
      n_clusters: Number of clusters to form.
      max_iter: Maximum number of EM iterations to perform.

    Returns:
      labels: An [n_samples] vector of integer labels.
      centroids: An [n_clusters, n_features] array of cluster centroids.
      X_hat: Copy of X with the missing values filled in.
    """

    # Initialize missing values to their column means
    missing = ~np.isfinite(X)
    mu = np.nanmean(X, 0, keepdims=1)
    X_hat = np.where(missing, mu, X)

    for i in range(max_iter):
        print(i)
        if i > 0:
            # initialize KMeans with the previous set of centroids. this is much
            # faster and makes it easier to check convergence (since labels
            # won't be permuted on every iteration), but might be more prone to
            # getting stuck in local minima.
            cls = KMeans(n_clusters, init=prev_centroids)
        else:
            # do multiple random initializations in parallel
            cls = KMeans(n_clusters, n_jobs=-1)

        # perform clustering on the filled-in data
        labels = cls.fit_predict(X_hat)
        centroids = cls.cluster_centers_

        # fill in the missing values based on their cluster centroids
        X_hat[missing] = centroids[labels][missing]

        # when the labels have stopped changing then we have converged
        if i > 0 and np.all(labels == prev_labels):
            break

        prev_labels = labels
        prev_centroids = cls.cluster_centers_

    return labels, centroids, X_hat

In [152]:
labels, centroids, X_hat = kmeans_missing(features, 200, max_iter=5)
print(labels)

0
1


  return_n_iter=True)


[197  15  25 129   1  15  37  59  15  21  24  24   1  49 161  15   1  88
 118 107   1   1  15  46   3   8   6  40  11  55  21 107  37 163  15   1
 128 187  49 141  33  87 118 130 163   1 199  15   1  55  15   1  27  96
 107  55  95  77  59   1   2  13  31  15  15   1  38 144 135  98  52  59
 117  98  15  15  15  17 159  31  15  31  57  52  15  15  57  25  53 118
  72  33  15  55  15 184  31 107 107 128   3  73  15  86  15  59   1   1
  40  40  15  52 107  53  31  53   3   1   1 140 189  23  53 141 141  15
  18  99  39  15  91  59  33 107   1  95  28 113 162  23  53 141  59  31
  15  45  31  27  44 130  31  89  57  31  17  11  81  23  31  15  99  31
 125   5  22  57 128  11  22  86 107   1   1 124 158  38  12   1   1  15
 141  53 141 199  15  42   1   4  15  33   3  15  55  15 134  21  53   3
 154 199  59   0  25 193   4  53 130 175  15  53   1  99 106   1  15  37
   7  81 113   1 141 107  28 169 128  93  17  36   1   1   1 194  15 141
   3  81   1  33  52  81  52  33  55  77  37 118 13

In [184]:
def calc_cluster_matrix_dist(labels, delta_matrices_all):
#     delta_matrices_new=np.zeros((len(np.unique(labels)), len(np.unique(labels)), delta_matrices_all[0][0].shape[0], \
#                                                               delta_matrices_all[0][0].shape[1]))
    delta_matrices_new=np.empty(delta_matrices_all.shape)
    for i in np.unique(labels):
        if i%100==0:
            print(i)
        missing = ~np.isfinite(delta_matrices_all)
        mu = np.nanmean(delta_matrices_all, 0, keepdims=0)
        filled = np.where(missing, mu, delta_matrices_all)
        cluster_i = np.average(filled[labels==i], axis=0)
        for j in np.unique(labels): 
            if np.sum(np.sum(np.isnan(cluster_i), (1,2)))>0:
                print(i,j)
                missing = ~np.isfinite(deltas_used)
                mu = np.nanmean(delta_matrices_all, 1, keepdims=1)
                X_hat = np.where(missing, mu, X)
            cluster_j = np.average(cluster_i[labels==j], axis=0)
#             delta_matrices_new[i,j] = cluster_j
            delta_matrices_new[np.ix_(labels==i,labels==j)] = cluster_j
    return delta_matrices_new

In [185]:
delta_matrices_clust = calc_cluster_matrix_dist(labels, delta_matrices_all)

0
100


In [189]:
print(np.average([[pairwise_distances(score_matrices_clust[i,j], delta_matrices_all[i,j]) for i in range(1000)] for j in range(1000)]))

0.0776220916645


In [190]:
delta_matrices_clust = calc_cluster_matrix_dist(kmeans.labels_, delta_matrices_all)
print(np.average([[pairwise_distances(delta_matrices_clust[i,j], delta_matrices_all[i,j]) for i in range(1000)] for j in range(1000)]))

0
100
0.0770279382517


In [None]:
for k_ in range(2,11):
    model = Model(users, tasks, k=k_)
    user_vec, user_bias = train(train_df, validate_df, test_df, model)
    user_features = np.empty((user_vec.numpy().shape[0], user_vec.numpy().shape[1]+1))
    user_features[:, :user_vec.numpy().shape[1]] = user_vec.numpy()
    user_features[:, :-1] = user_bias.numpy()
    kmeans = KMeans(n_clusters=200, random_state=0).fit(user_features)
    delta_matrices_clust = calc_cluster_matrix_dist(kmeans.labels_, delta_matrices_all)
    avg_dist = np.average([[pairwise_distances(delta_matrices_clust[i,j], delta_matrices_all[i,j]) for i in range(1000)] for j in range(1000)])
    print("K={} score: {}".format(k_, avg_dist))