In [31]:
%matplotlib inline
import cPickle as pkl
import pandas as pd
import networkx as nx
import numpy as np
import os,sys
import matplotlib.pyplot as plt
import seaborn as sns
import time

from collections import defaultdict
from collections import Counter

In [133]:
# import custom models
from model.Autoencoder import LSTMAuto
from model.MIPnet import MIPnet
from model.githubHandler import *

In [116]:
# Reload the project modules after editing them on disk.
# The modules are looked up by dotted name instead of through the bare name
# `model`: `from model.X import ...` never binds `model` in this namespace, and
# the Keras cell further down rebinds `model` to the neural network.
# Names pulled in with `from ... import` keep pointing at the old objects, so
# re-run the import cell after this one.
import importlib
for _module_name in ("model.MIPnet", "model.githubHandler", "model.Autoencoder"):
    reload(importlib.import_module(_module_name))

In [124]:
# Load the pre-structured dataset. The `with` block closes the file handle
# even if unpickling fails.
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
with open('data_structuring_150.pkl', 'rb') as pkl_file:
    rdata, udata, data, users, repos = pkl.load(pkl_file)

In [6]:
# Map every GitHub event type onto an interaction category. "content",
# "design" and "consume" are the edge types that show up in the network;
# "none" presumably marks event types that are ignored -- confirm in githubHandler.
label_encodings = {
    # content
    'CreateEvent': "content",
    'DeleteEvent': "content",
    'PullRequestEvent': "content",
    'PushEvent': "content",
    # design
    'CommitCommentEvent': "design",
    'GollumEvent': "design",
    'IssueCommentEvent': "design",
    'IssuesEvent': "design",
    'PublicEvent': "design",
    'PullRequestReviewCommentEvent': "design",
    # consume
    'ForkEvent': "consume",
    'WatchEvent': "consume",
    # none
    'MemberEvent': "none",
    'ReleaseEvent': "none",
}

In [125]:
# Simulate a single time step: build a fresh network and apply only time bucket 1.
# The following cell rebuilds `mip` from scratch and replays all buckets, so this
# cell serves as a quick check of the update path.
mip = MIPnet(users, repos)
interactions_bucket1 = get_all_interactions_for_timebucket(1, data, label_encodings)
mip.update_edges_for_time(interactions_bucket1)

found 16194 interactions


In [126]:
# Replay time buckets 1..8 on a fresh network: apply each bucket's
# interactions, report the graph size, then decay the edge weights before the
# next bucket is applied.
mip = MIPnet(users, repos)
for i in range(1, 9):
    print("Bucket %d" % i)
    mip.update_edges_for_time(
        get_all_interactions_for_timebucket(i, data, label_encodings))
    print("%d Edges between %d Nodes" % (len(mip.mip.edges()),
                                         len(users) + len(repos)))
    mip.decay_weights()

Bucket 1
found 16194 interactions
34048 Edges between 26735 Nodes
Bucket 2
found 23973 interactions
65740 Edges between 26735 Nodes
Bucket 3
found 31620 interactions
95379 Edges between 26735 Nodes
Bucket 4
found 26499 interactions
120711 Edges between 26735 Nodes
Bucket 5
found 38469 interactions
172204 Edges between 26735 Nodes
Bucket 6
found 28763 interactions
199121 Edges between 26735 Nodes
Bucket 7
found 26121 interactions
226551 Edges between 26735 Nodes
Bucket 8
found 25313 interactions
251684 Edges between 26735 Nodes


In [28]:
# Show the attribute dict of every parallel edge between one example user and repo.
for edge_attrs in mip.mip["Angelfirenze"]["deadlyvipers/dojo_rules"].values():
    print(edge_attrs)

{'ntype': 'content', 'weight': 9.850707420300003}
{'ntype': 'design', 'weight': 3.4480423521000008}
{'ntype': 'consume', 'weight': 0.7791456501000001}


## Evaluation Section

### adjusted MIP-DOI

In [127]:
# How much should each weight be worth? Start with equal weights everywhere,
# then derive one variant per interaction type in which the other two
# interaction types are zeroed out.
params_all = {"u-u": 1, "r-r": 1, "content": 1, "design": 1, "consume": 1}
params_content = dict(params_all, design=0, consume=0)
params_design = dict(params_all, content=0, consume=0)
params_consume = dict(params_all, content=0, design=0)
params = [params_all, params_content, params_design, params_consume]
# Ofra's parameters
alpha = 0.5
beta = 0.5

In [128]:
# Test helper functions:

In [129]:
# Spot-check the proximity helper on one user/repo pair. It is given the list of
# four parameter dicts, and the output below has four values.
adamicAdarProximity(mip.mip, "Angelfirenze", "deadlyvipers/dojo_rules", params)
# TODO: rescale both values or learn alpha/beta

array([ 11.68648641,   1.84795528,  10.24145771,   3.05212012])

In [130]:
# Spot-check the users-in-common weight helper on the same user/repo pair.
computeUserInCommonWeight(mip.mip, "Angelfirenze", "deadlyvipers/dojo_rules")

1.9185020100000003

In [131]:
# Spot-check the combined DOI score for the same pair; the helper returns a tuple.
print(computeDOI(mip, "Angelfirenze", "deadlyvipers/dojo_rules", params, alpha, beta))


(5.8435424487595036, 0.0005984888157402558, array([ 11.68648641,   1.84795528,  10.24145771,   3.05212012]))


### Autoencoder feature

In [134]:
# Build the autoencoder inputs (x_ae) and targets (y_ae) from the per-user data.
# user_enc_name is later enumerated alongside the rows of the encoded output, so
# it lists the user names in row order.
# NOTE(review): the trailing 8 is presumably the last time bucket to include -- confirm.
user_enc_name, x_ae, y_ae = constructAutoEncoderData(udata, label_encodings, 8)

62 Number of sequence length for autoencoder


In [135]:
# Autoencoder hyper-parameters and model construction.
# NOTE(review): x_ae.shape[1] is presumably the sequence length and y_ae.shape[2]
# the size of the target vocabulary -- confirm against LSTMAuto's signature.
embsize = 5
encodingsize = 5
autoencoder = LSTMAuto(x_ae.shape[1], embsize, y_ae.shape[2], encodingsize)

In [None]:
# For full models it is best to train for more than 5 epochs (about 4.5 minutes
# per epoch); this cell runs only a single epoch.
autoencoder.train(x_ae, y_ae, epochs=1)

Epoch 1/1
  900/16576 [>.............................] - ETA: 286s - loss: 0.8276

In [99]:
# 3. Encode every user's input sequence. Entry i of user_embs is stored below
#    under user_enc_name[i].
user_embs = autoencoder.encode(x_ae)

In [132]:
# 4. Store each user's encoding on the network object, keyed by user name.
#    user_enc_name and user_embs are matched by position.
for ix, uname in enumerate(user_enc_name):
    mip.userEncoding[uname] = user_embs[ix]

## Test: Predict interactions from only DOI vs. linear model using DOI


## Full Model

In [117]:
# First define the features that are actually being used in the current model.
# One (description, switch) pair per feature slot, in slot order.
# NOTE(review): a switch of 1 presumably enables the feature and 0 masks it
# out -- confirm against construct_X_large / get_X_features.
_FEATURE_SWITCHES = [
    ("API from MIP-DOI", 1),
    ("Distance all", 1),
    ("Distance content", 1),
    ("Distance design", 1),
    ("Distance Consume", 1),
    ("Weighted User Connectedness", 1),
    ("Fraction User Connectedness", 1),
    ("Stars / Forks", 1),
    ("User Embeddings", 1),
    ("Owner yes/no", 1),
    ("previous interaction count content", 1),
    ("previous interaction count design", 1),
    ("previous interaction count consume", 1),
    ("Total weight between user and users in common", 1),
]
features = np.array([switch for _description, switch in _FEATURE_SWITCHES])

In [119]:
# Define Y based on the desired outcome: all/content/design/consume
# Form: {User: {Repo: 1/0}}, possible extension {User: {Repo: {Type: 1/0}}}
def get_Y(goaltype='all', time_as_goal=9):
    """Collect the ground-truth interactions of one time bucket.

    Reads the notebook globals ``data`` and ``label_encodings``.

    Args:
        goaltype: ``'all'`` keeps every interaction; any other value keeps
            only interactions whose third field equals it.
        time_as_goal: time bucket handed to
            ``get_all_interactions_for_timebucket``.

    Returns:
        A ``defaultdict(Counter)`` with ``y[user][repo] == 1`` for every kept
        interaction (first field = user, second field = repo). Repos that
        were not seen read as 0 because the inner mapping is a Counter.
    """
    keep_everything = goaltype == "all"
    y_truths = defaultdict(Counter)
    for interaction in get_all_interactions_for_timebucket(time_as_goal, data, label_encodings):
        # Skip interactions of another type unless every type is wanted.
        if not keep_everything and interaction[2] != goaltype:
            continue
        y_truths[interaction[0]][interaction[1]] = 1
    return y_truths
Y = get_Y()

found 19782 interactions


In [120]:
# Index <-> name lookup tables for the entries of mip.P (users) and mip.R
# (repos), numbered in iteration order.
idx2user = dict(enumerate(mip.P))
user2idx = dict((name, ix) for ix, name in enumerate(mip.P))
idx2repo = dict(enumerate(mip.R))
repo2idx = dict((name, ix) for ix, name in enumerate(mip.R))

In [122]:
# Build the per-user feature structure consumed by the training and evaluation
# cells. They read it as X_large[user_index][0] = positives and
# X_large[user_index][1] = negatives, each a sequence of (feature_vector, repo_index) pairs.
X_large = construct_X_large(Y, mip, mip.mip, mip.P, mip.R, features)

In [123]:
# Dump the structure as a sanity check (an empty dict means no features were built).
print(X_large)

{}


## Model 

In [445]:
from keras.layers import Input, Dense, Activation
from keras.models import Sequential

In [504]:
# Two-layer classifier: tanh hidden layer followed by a 2-way softmax.
# The input width is read from the first positive example of user index 0, so
# that entry has to exist in X_large.
# NOTE: this rebinds the name `model`, which is also the name of the project
# package the custom modules are imported from.
n_features = len(X_large[0][0][0][0])
linear1 = Dense(units=50, input_shape=(n_features,),
                activation='tanh', kernel_initializer='uniform')
linear2 = Dense(2, kernel_initializer='uniform')
act = Activation('softmax')

model = Sequential([linear1, linear2, act])
model.compile(loss='categorical_crossentropy', optimizer='adagrad')

In [505]:
# Train with negative sampling: every positive (user, repo) example forms one
# mini-batch together with k negatives drawn (with replacement) for that user.
k = 14        # negatives sampled per positive example
epochs = 20

total_empty = 0  # user visits skipped because there were positives but no negatives
total_data = 0   # mini-batches actually trained on

# Row 0 is the positive example ([1, 0]); the k negatives follow ([0, 1]).
# The layout never changes, so the target matrix is built once.
targets = np.array([1, 0] + [0, 1] * k).reshape(-1, 2)

for ep in range(epochs):
    for userid, feat in X_large.items():
        positives, negatives = feat[0], feat[1]
        if len(positives) > 0 and len(negatives) == 0:
            # np.random.choice raises ValueError on an empty pool; skip the user.
            total_empty += 1
        else:
            for xtruefeat, xtrueid in positives:
                picks = np.random.choice(len(negatives), k)
                xfalsefeat = [negatives[pick][0] for pick in picks]

                inputs = np.vstack([np.array(xtruefeat), np.array(xfalsefeat)])
                model.train_on_batch(inputs, targets)
                total_data += 1
        # Coarse progress indicator.
        if userid % 5000 == 0:
            print(userid)
    print("Done with Epoch %d" % ep)

0
10000
Done with Epoch 0
0
10000
Done with Epoch 1
0
10000
Done with Epoch 2
0
10000
Done with Epoch 3
0
10000
Done with Epoch 4
0
10000
Done with Epoch 5
0
10000
Done with Epoch 6
0
10000
Done with Epoch 7
0
10000
Done with Epoch 8
0
10000
Done with Epoch 9
0
10000
Done with Epoch 10
0
10000
Done with Epoch 11
0
10000
Done with Epoch 12
0
10000
Done with Epoch 13
0
10000
Done with Epoch 14
0
10000
Done with Epoch 15
0
10000
Done with Epoch 16
0
10000
Done with Epoch 17
0
10000
Done with Epoch 18
0
10000
Done with Epoch 19


In [571]:
# Evaluate the model
def evalModel(max_users=1000, n_sampled_repos=500):
    """Rank each user's positive repos against sampled repos and print metrics.

    For every user with at least one positive repo in ``X_large``, the
    positives plus ``n_sampled_repos`` randomly drawn repo indices (drawn with
    replacement, unseeded) are scored by ``model`` in a single batch. The
    candidates are sorted by descending score and the 1-based rank of every
    positive is recorded.

    Reads the notebook globals ``idx2user``, ``user2idx``, ``idx2repo``,
    ``X_large``, ``mip``, ``features``, ``model`` and ``get_X_features``.

    Args:
        max_users: users are visited in ascending index order and the loop
            stops after the first user whose index exceeds this value.
        n_sampled_repos: number of random repo indices added to the positives
            for each user; scoring every repo would take far too long.

    Returns:
        ``(ranks, top50PerUser)``: per evaluated user, the list of 1-based
        ranks of the positives and the 50 best ``(score, repo_index,
        user_index, is_positive)`` tuples. Both lists can be large; assign the
        result instead of leaving the call as the last expression of a cell.
    """
    ranks = []
    top50PerUser = []
    now = time.time()  # wall-clock time
    for userkey, u in sorted(idx2user.items()):
        # Users without an entry, or without positives, are skipped.
        user_entry = X_large.get(user2idx[u])
        positive_ids = set(pair[1] for pair in user_entry[0]) if user_entry else set()
        # Don't even predict if not a single positive is in
        if positive_ids:
            sampled_ids = set(np.random.choice(len(idx2repo), n_sampled_repos))
            look_at = list(sampled_ids | positive_ids)
            # One predict call per user instead of one per (user, repo) pair.
            batch = np.vstack([
                get_X_features(mip, mip.mip, u, idx2repo[k], features).reshape(1, -1)
                for k in look_at])
            # Column 0 of the softmax output is the "positive" class.
            scores = model.predict(batch)[:, 0]
            all_preds = [(score, k, userkey, 1 if k in positive_ids else 0)
                         for score, k in zip(scores, look_at)]
            all_preds.sort(reverse=True)
            ranks.append([rank for rank, pred in enumerate(all_preds, 1) if pred[3]])
            top50PerUser.append(all_preds[:50])
        if userkey > max_users:
            break
    print("Ran Prediction in %s Seconds" % (time.time() - now))

    flat_ranks = [rank for user_ranks in ranks for rank in user_ranks]
    total_num = len(flat_ranks)
    print("%d Total Number " % total_num)
    if total_num == 0:
        # Nothing was ranked (e.g. X_large is empty); avoid dividing by zero.
        print("No positive examples to evaluate")
        return ranks, top50PerUser
    for cutoff in (5, 10, 20):
        hits = sum(1 for rank in flat_ranks if rank <= cutoff)
        print("%s Within %d" % (float(hits) / total_num, cutoff))
    # Ranks are already 1-based, so no further +1 is applied.
    print("%s Mean Rank" % (float(sum(flat_ranks)) / total_num))
    print("%s MRR" % (sum(1.0 / rank for rank in flat_ranks) / total_num))
    return ranks, top50PerUser

105.829783


## TODO 

1. Autoencoder feature per user (train one until time 8, one until time 9) - DONE
2. Split training and evaluation sets. Methodology: (1) precompute all inputs until time 8 and train the model to predict time 9; (2) update all measures (including the autoencoder) and the input set, predict time 10, and evaluate recall@k and MRR.
3. Implement the mask for use in training and run experiments.
4. Evaluate parameters (linear model or saliency?).