In [49]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
import seaborn as sns
from itertools import product
from itertools import combinations_with_replacement
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import time
import math
%matplotlib inline

In [2]:
from utils import *
%load_ext autoreload
%autoreload 2

In [10]:
# Load the crowd-label file as a tab-separated table. The preview below shows
# the columns topicID, workerID, docID, gold and label.
# adult = pd.read_csv('../labels.txt', delimiter='\t', header=0, names=['user_id','website','rating'])
trec = pd.read_csv('../trec-rf10-crowd/trec-rf10-data.txt', delimiter='\t')

# Disabled experiment: keep only the 150 workers with the most judgements.
# best_users = trec.groupby('workerID').count().sort_values('docID', ascending=False)[:150].index
# trec = trec[trec['workerID'].isin(best_users)]

# Disabled experiment: remap the raw label values through a lookup Series.
# r = pd.Series([2,3,2,3], index=[1,2,0,-2])
# trec['label_bin'] = trec['label'].map(r)
trec.head(10)

Unnamed: 0,topicID,workerID,docID,gold,label
0,20002,w1,clueweb09-en0000-66-24091,-1,0
1,20002,w1,clueweb09-en0001-31-15410,-1,0
2,20002,w1,clueweb09-en0000-05-22942,-1,0
3,20002,w1,clueweb09-en0000-05-22943,-1,0
4,20002,w1,clueweb09-en0006-85-33191,2,2
5,20002,w2,clueweb09-en0000-66-24091,-1,1
6,20002,w2,clueweb09-en0001-31-15410,-1,1
7,20002,w2,clueweb09-en0000-05-22942,-1,0
8,20002,w2,clueweb09-en0000-05-22943,-1,0
9,20002,w2,clueweb09-en0006-85-33191,2,-2


98453

In [12]:
# Attach integer ids to the raw labels. Judging by the preview below, the
# helper adds `task_id`, `uid` and `bin` columns to the frame.
# NOTE(review): create_user_task_ids is not defined in this notebook; it
# presumably comes from `from utils import *` -- confirm, and confirm what the
# trailing False argument controls.
# testframe = create_user_task_ids(adult, 'user_id', 'website', 'rating')
testframe = create_user_task_ids(trec, 'workerID', 'docID', 'label', False)
testframe.head()

Unnamed: 0,topicID,workerID,docID,gold,label,task_id,uid,bin
0,20002,w1,clueweb09-en0000-66-24091,-1,0,0,0,1
1,20002,w1,clueweb09-en0001-31-15410,-1,0,1,0,1
2,20002,w1,clueweb09-en0000-05-22942,-1,0,2,0,1
3,20002,w1,clueweb09-en0000-05-22943,-1,0,3,0,1
4,20002,w1,clueweb09-en0006-85-33191,2,2,4,0,3


In [52]:
def batcher(seq, size):
    """Lazily yield consecutive slices of ``seq`` holding at most ``size`` items.

    The final slice is shorter when ``len(seq)`` is not a multiple of
    ``size``. Works on anything that supports ``len`` and slicing.
    """
    # The range is built eagerly, so an invalid step (size == 0) fails here
    # rather than on first iteration.
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

def split(df, seed=None):
    """Shuffle ``df`` and cut it into 60% / 20% / 20% train, validation and test frames.

    Args:
      df: DataFrame to partition; it is not modified.
      seed: Optional value forwarded to ``DataFrame.sample(random_state=...)``
        so the shuffle can be reproduced. ``None`` (the default) keeps the
        original unseeded behaviour.

    Returns:
      A ``(train_df, validate_df, test_df)`` tuple of row-disjoint frames that
      together contain every row of ``df`` exactly once.
    """
    shuffled = df.sample(frac=1, random_state=seed)
    n_rows = len(shuffled)
    train_end = int(.6 * n_rows)
    validate_end = int(.8 * n_rows)
    # Positional slicing gives the same cut points as np.split without sending
    # a DataFrame through numpy's array-splitting machinery.
    train_df = shuffled.iloc[:train_end]
    validate_df = shuffled.iloc[train_end:validate_end]
    test_df = shuffled.iloc[validate_end:]
    return train_df, validate_df, test_df

class Model(nn.Module):
    """Matrix-factorisation scorer with user, task and global bias terms.

    The score of a (user, task) pair is
    ``dot(user_vector, task_vector) + user_bias + task_bias + global_bias``.

    Args:
      users: Number of rows in the user embedding tables.
      tasks: Number of rows in the task embedding tables.
      k: Dimension of the latent vectors.
    """

    def __init__(self, users, tasks, k=2):
        super(Model, self).__init__()
        self.user_lut = nn.Embedding(users, k)
        self.task_lut = nn.Embedding(tasks, k)

        self.user_bias = nn.Embedding(users, 1)
        self.task_bias = nn.Embedding(tasks, 1)
        # torch.FloatTensor(1) returns uninitialised memory, so the bias could
        # start at an arbitrary (even non-finite) value; start from zero.
        self.global_bias = nn.Parameter(torch.zeros(1))

    def forward(self, users, jokes):
        """Score each (user, task) index pair.

        Args:
          users: LongTensor of user indices.
          jokes: LongTensor of task indices, same shape as ``users``.

        Returns:
          Tensor of scores with the same shape as ``users``.
        """
        user_vectors = self.user_lut(users)
        task_vectors = self.task_lut(jokes)
        # Row-wise dot product. Only the trailing axis is reduced or squeezed,
        # so a batch of one keeps its batch dimension, which a bare .squeeze()
        # would drop.
        interaction = (user_vectors * task_vectors).sum(dim=-1)
        user_bias = self.user_bias(users).squeeze(-1)
        task_bias = self.task_bias(jokes).squeeze(-1)

        # global_bias has shape (1,) and broadcasts over the batch.
        return interaction + user_bias + task_bias + self.global_bias

def val(df, model):
    """Return the root-mean-squared error of ``model`` on ``df``.

    Args:
      df: DataFrame with ``uid``, ``task_id`` and ``bin`` columns; ``bin`` is
        the regression target.
      model: Callable taking ``(users, tasks)`` LongTensors and returning one
        score per row.

    Returns:
      The RMSE over all rows as a float, or ``nan`` when ``df`` is empty.
    """
    # Sum (not mean) per batch so the total can be divided by the exact row
    # count, even when the last batch is short.
    crit = nn.MSELoss(reduction='sum')
    total_loss = 0.
    total_num = 0
    # Evaluation only: skip autograd bookkeeping.
    with torch.no_grad():
        for batch in batcher(df, 100):
            true_rating = torch.as_tensor(batch['bin'].values.astype(float),
                                          dtype=torch.float32)
            users = torch.as_tensor(batch['uid'].values, dtype=torch.long)
            tasks = torch.as_tensor(batch['task_id'].values, dtype=torch.long)
            # reshape(-1) keeps predictions 1-D even for a one-row batch, so
            # they always line up with the targets.
            scores = model(users, tasks).reshape(-1)
            # .item() replaces .data[0], which fails on 0-dim loss tensors.
            total_loss += crit(scores, true_rating).item()
            total_num += true_rating.size(0)
    if total_num == 0:
        return float('nan')
    return math.sqrt(total_loss / total_num)


def train(train_iter, val_iter, test_iter, model, epochs=30, lr=0.1, batch_size=100):
    """Fit ``model`` with mini-batch SGD on squared error, printing RMSE per epoch.

    Args:
      train_iter: DataFrame of training rows with ``uid``, ``task_id`` and
        ``bin`` columns.
      val_iter: DataFrame of validation rows with the same columns.
      test_iter: Accepted for interface compatibility; not used here.
      model: Module called as ``model(users, tasks)``; updated in place.
      epochs: Number of passes over ``train_iter``.
      lr: SGD learning rate.
      batch_size: Rows per optimisation step.

    Returns:
      The trained ``model``.
    """
    opt = optim.SGD(model.parameters(), lr=lr)
    crit = nn.MSELoss()

    # Use the frames passed in, not notebook globals, so the function trains
    # on whatever the caller supplies.
    print("val:", val(val_iter, model))
    for _ in range(epochs):
        squared_error = 0.
        seen = 0
        for batch in batcher(train_iter, batch_size):
            opt.zero_grad()
            rating = torch.as_tensor(batch['bin'].values.astype(float),
                                     dtype=torch.float32)
            users = torch.as_tensor(batch['uid'].values, dtype=torch.long)
            tasks = torch.as_tensor(batch['task_id'].values, dtype=torch.long)
            # Keep predictions 1-D so a one-row batch matches its target.
            scores = model(users, tasks).reshape(-1)
            loss = crit(scores, rating)
            loss.backward()
            opt.step()
            # Weight each batch's mean loss by its row count so the epoch
            # RMSE is a per-row figure, comparable with val().
            rows = rating.size(0)
            squared_error += loss.item() * rows
            seen += rows
        print("train:", math.sqrt(squared_error / seen) if seen else float('nan'))
        print("val:", val(val_iter, model))
    return model

In [53]:
# Shuffle and partition the labelled frame, size the embedding tables from the
# number of distinct ids, then fit the factorisation model.
train_df, validate_df, test_df = split(testframe)
# NOTE(review): nn.Embedding needs num_embeddings > the largest index, so
# counting distinct ids is only safe if uid/task_id run contiguously from 0 --
# confirm against create_user_task_ids.
users = len(testframe.uid.unique())
tasks = len(testframe.task_id.unique())
model = Model(users, tasks, k=2)
train(train_df, validate_df, test_df, model)

val: 3.0629492165320564
train: 1.7289076184852754
val: 1.538710919677935
train: 1.4723364650047503
val: 1.4414286369074378
train: 1.3993932192983205
val: 1.3961880934423498
train: 1.35787991416051
val: 1.3683860825851535
train: 1.3286123059690955
val: 1.3484143402589266
train: 1.3055272106775073
val: 1.332662450004364
train: 1.286131869542652
val: 1.319518852984899
train: 1.2692084401889832
val: 1.3081546645624285
train: 1.2540751176503102
val: 1.2980891807335073
train: 1.2403077078037839
val: 1.2890159115709454
train: 1.2276209699449618
val: 1.280725353473374
train: 1.2158121166344822
val: 1.2730671205037298
train: 1.204731121300402
val: 1.265929303326758
train: 1.1942636367884238
val: 1.259226471473344
train: 1.1843205513171415
val: 1.252892097524612
train: 1.1748311551882304
val: 1.246873601587481
train: 1.16573847884464
val: 1.2411288006922625
train: 1.156996021454806
val: 1.2356234860170345
train: 1.1485654375750618
val: 1.2303295863451735
train: 1.1404147646158391
val: 1.22522396

In [8]:
from sklearn.cluster import KMeans

def kmeans_missing(X, n_clusters, max_iter=10):
    """Perform K-Means clustering on data with missing values.

    Missing (non-finite) entries are first filled with the mean of the finite
    values in their column. K-Means is then run repeatedly, each time
    re-filling the missing entries from the centroid of the cluster the row
    was assigned to, until the labels stop changing.

    Args:
      X: An [n_samples, n_features] array of data to cluster.
      n_clusters: Number of clusters to form.
      max_iter: Maximum number of EM iterations to perform (at least 1).

    Returns:
      labels: An [n_samples] vector of integer labels.
      centroids: An [n_clusters, n_features] array of cluster centroids.
      X_hat: Copy of X with the missing values filled in.

    Raises:
      ValueError: If ``max_iter`` is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    X = np.asarray(X, dtype=float)

    # Initialize missing values to their column means. The mean is taken over
    # finite entries only, so a stray inf cannot leak into the fill value. A
    # column with no finite entry starts at 0 instead of NaN, which KMeans
    # would reject.
    missing = ~np.isfinite(X)
    finite_counts = (~missing).sum(axis=0, keepdims=True)
    column_sums = np.where(missing, 0.0, X).sum(axis=0, keepdims=True)
    mu = column_sums / np.maximum(finite_counts, 1)
    X_hat = np.where(missing, mu, X)

    prev_labels = None
    prev_centroids = None
    for _ in range(max_iter):
        if prev_centroids is None:
            # First pass: several random initialisations. The n_jobs argument
            # is no longer accepted by KMeans, so it is not passed.
            cls = KMeans(n_clusters=n_clusters, n_init=10)
        else:
            # Initialize KMeans with the previous set of centroids. This is
            # much faster and makes it easier to check convergence (labels
            # are not permuted between iterations), but might be more prone
            # to getting stuck in local minima. Explicit centres only make
            # sense with a single initialisation, hence n_init=1.
            cls = KMeans(n_clusters=n_clusters, init=prev_centroids, n_init=1)

        # perform clustering on the filled-in data
        labels = cls.fit_predict(X_hat)
        centroids = cls.cluster_centers_

        # fill in the missing values based on their cluster centroids
        X_hat[missing] = centroids[labels][missing]

        # when the labels have stopped changing then we have converged
        if prev_labels is not None and np.array_equal(labels, prev_labels):
            break

        prev_labels = labels
        prev_centroids = centroids

    return labels, centroids, X_hat

In [11]:
# Cluster the feature rows into 30 groups, allowing up to 20 refinement passes.
# NOTE(review): features_nonnan is not defined in any cell visible here -- it
# must come from kernel state or from utils; confirm before Restart & Run All.
labels, centroids, X_hat = kmeans_missing(features_nonnan, 30, max_iter=20)
print(labels)

[15 23 13 26 13 13 25 13 13 13 13 13 13 13  0 13 13 12  1 21 12 13 13 12 12
 13 24  7 12 13 12 13 13  6 13 13 29 17 13 12 12 13 27 13 12 13 12 13 11 13
 13 13 13 12 12 13 12 12  9 12 22 28 13 12  3 13 28 19 28 13  5  0 10 14  2
 13 13 18 20 16  0  4  0  0  0  0  8  0]


  return_n_iter=True)


In [12]:
def calc_cluster_scoring_matrices(labels, delta_matrices_all, non_empty_users):
    """Replace each user-pair delta with its cluster-pair mean and binarise the sign.

    The rows and columns listed in ``non_empty_users`` are selected from
    ``delta_matrices_all``. For every ordered pair of clusters ``(i, j)``, the
    entries whose row user is in cluster ``i`` and whose column user is in
    cluster ``j`` are replaced by their mean. Non-finite entries are left out
    of that mean. The result is 1 where the mean is positive and 0 where it
    is zero or negative.

    Args:
      labels: Cluster label per selected user; must have one entry per element
        of ``non_empty_users``, in the same order.
      delta_matrices_all: Array indexed as [user, user, ...]; any trailing
        axes are carried through unchanged. It is not modified.
      non_empty_users: Indices of the users to keep along the first two axes.

    Returns:
      Float array shaped like the selected sub-array, holding 1.0 or 0.0.
      A cluster pair with no finite entry at some position yields NaN there.
    """
    labels = np.asarray(labels)
    # Cast to float so that fractional block means are not truncated when the
    # input has an integer dtype. Indexing with np.ix_ already yields a copy,
    # so the caller's array is left untouched.
    deltas_used = np.array(
        np.asarray(delta_matrices_all)[np.ix_(non_empty_users, non_empty_users)],
        dtype=float)

    cluster_masks = [labels == cluster for cluster in np.unique(labels)]
    # Write into a separate array so every mean is computed from the original
    # deltas rather than from blocks that were already overwritten.
    cluster_means = np.empty_like(deltas_used)
    for row_mask in cluster_masks:
        rows = deltas_used[row_mask]
        for col_mask in cluster_masks:
            block = rows[:, col_mask]
            finite = np.isfinite(block)
            counts = finite.sum(axis=(0, 1))
            totals = np.where(finite, block, 0.0).sum(axis=(0, 1))
            # 0 / 0 (no finite entry at a position) is deliberately left as NaN.
            with np.errstate(invalid='ignore', divide='ignore'):
                block_mean = totals / counts
            cluster_means[np.ix_(row_mask, col_mask)] = block_mean

    score_matrices_new = np.sign(cluster_means)
    score_matrices_new[score_matrices_new == -1] = 0
    return score_matrices_new

In [13]:
# Build the cluster-level scoring matrices from the K-Means labels.
# NOTE(review): delta_matrices_all and neu are not defined in any cell visible
# here -- presumably kernel state or utils; confirm before Restart & Run All.
score_matrices_clust = calc_cluster_scoring_matrices(labels, delta_matrices_all, neu)

19397
