In [1]:
# This pipeline will test collaborative filtering as a means of guessing
# which users will interact in the future.
# Definition of steps:
# Import data,
# create the collaborative filtering scheme, and pass the data through the collaborative filter.
# use the scheme to predict which pairs will interact at a future time.

In [6]:
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('TKAgg')
from matplotlib import pyplot as plt
from scipy.spatial.distance import cosine

from sklearn.decomposition import PCA
from scipy.stats import pearsonr
from scipy.sparse import dok_matrix, csr_matrix
import scipy
from sklearn.cluster import KMeans
from collections import Counter
from sklearn.metrics import roc_curve, auc
from lpproj import LocalityPreservingProjection
from sklearn.decomposition import NMF

In [71]:
dftrain = pd.read_csv('data/txTripletsCounts.txt',
                      header=None,
                      index_col=None,
                      sep=' ',
                      names=['sender','receiver','transaction'])
dftest = pd.read_csv('data/testTriplets.txt', 
                     header=None, 
                     index_col=None, 
                     sep=" ", 
                     names=['sender', 'receiver', 'transaction'])
dftrain['transaction'].describe()

count    3348026.000000
mean           4.725741
std          128.494757
min            1.000000
25%            1.000000
50%            1.000000
75%            2.000000
max        41639.000000
Name: transaction, dtype: float64

In [100]:
dftrain # lets see what our data looks like, we can see that its sender, receiver, transaction counts

Unnamed: 0,sender,receiver,transaction
0,0,1,3
1,0,13,1
2,0,37,1
3,0,51,1
4,0,438481,1
5,1,0,3
6,1,4,354
7,1,10,2602
8,1,11,2689
9,1,12,1


In [5]:
# Our collab filtering pipeline will go as follows (to test)
# first we will construct the count matrix of transmitter2receiver with counts
# then we will apply pca to get the lowest dimensionality representation
# then we will use collaborative filtering to predict if any receiver transmitter pair would have an interaction
# in the future.

In [5]:
# max sender numbe n = 444074
n = 444075
t2r_matrix = csr_matrix((dftrain['transaction'], (dftrain['sender'], dftrain['receiver'])), shape=(n,n), dtype=float)
t2r_matrix[364,:].toarray()

array([[ 0.,  1.,  0., ...,  0.,  0.,  0.]])

In [19]:
# create adjacency-list graph G = (V, E)
N = 444075
G = [[] for i in range(N)]
for (index, sender, receiver, count) in dftrain.itertuples():
    G[sender].append(receiver)


In [20]:
# create transpose adjacency-list graph Gt = (V, E')
Gt = [[] for i in range(N)]
for (index, sender, receiver, count) in dftrain.itertuples():
    Gt[receiver].append(sender)

In [21]:
G_uni = [[] for i in range(N)]
for (index, sender, receiver, count) in dftrain.itertuples():
    G_uni[sender].append(receiver)
    G_uni[receiver].append(sender)

In [22]:
# Lets reduce dimensionality first!
from scipy.sparse.linalg import svds
epsilon = 1e-10
new_dim = 10
u, s, vt = svds(t2r_matrix, k=new_dim, tol=epsilon, which='LM')

In [23]:
v = vt.transpose()

In [24]:
d = 2
def isConnected(S, R, D):
    # S and R are points on the Graph G, and can be indexed via the adjacency-list
    def isConnectedRec(s_un, depthRemaining, isTranspose):
        # iterate through s_un, look at all their neighbors in the corresonding graph
        rec_set = set([])
        if depthRemaining == 0:
            return False
        for v in s_un:
            if isTranspose:
                for sender in Gt[v]:
                    rec_set.add(sender)
            else:
                for child in G[v]:
                    if child == R and depthRemaining % 2 == 1:
                        return True
                    rec_set.add(child)
        return isConnectedRec(rec_set, depthRemaining - 1, ~isTranspose)
    return isConnectedRec(set([S]), D, False)
isConnected(0, 2, 3)

True

In [13]:
from scipy.spatial.distance import cosine

def simConnected(S, R, D):
    threshold = 1.0
    R_vec = v[R]
    S_vec = u[S]
    # S and R are points on the Graph G, and can be indexed via the adjacency-list
    def simConnectedRec(s_un, depthRemaining, isTranspose):
        # iterate through s_un, look at all their neighbors in the corresonding graph
        rec_set = set([])
        if depthRemaining == 0:
            return False
        for vertex in s_un:
            if isTranspose:
                for sender in Gt[vertex]:
                    sender_vec = u[sender]
                    similarity_score = cosine(S_vec, sender_vec)
                    if similarity_score > threshold:
                        rec_set.add(sender)
            else:
                for child in G[vertex]:
                    if child == R and depthRemaining % 2 == 1:
                        return True
                    child_vec = v[child]
                    similarity_score = cosine(R_vec, child_vec)
                    if similarity_score > threshold:
                        rec_set.add(child)
        return simConnectedRec(rec_set, depthRemaining - 1, ~isTranspose)
    return simConnectedRec(set([S]), D, False)
simConnected(1,16,3)

True

In [25]:
import math
mean_score = 100343.0
def volatilityScore(v, isSender):
    # given any vertex v, assess its volatility
    # assumption is that high volatility on a sender or receiver side has implications about their ability to meet
    # or form new connections
    graph = G
    graph_flip = Gt
    if not isSender:
        graph = Gt
        graph_flip = G
        
    v_neigh_list = graph[v]
    score = len(v_neigh_list)
    for neigh in v_neigh_list:
        # you are going to get the indegree
        neigh_list = graph_flip[neigh]
        score = score + len(neigh_list) - 1 # otherwise you count yourself.
    return (score - mean_score)/mean_score
volatilityScore(4000, True)

# calculate mean volatility
# total_score = 0.0
# items = 0.0
# for v in range(len(G)):
#     items = items + 1
#     total_score = total_score + volatilityScore(v, True)

# mean_score = total_score / len(G)


0.9658072810260805

In [None]:
dftest

In [16]:
# # Time to test this crap.
# pred = [isConnected(sender,receiver,3) 
#         for index,sender,receiver,transaction  in dftest.itertuples()]
pred = []
count = 0
for index,sender,receiver,transaction in dftest.itertuples():
    a = isConnected(sender,receiver,3)
    print count
    count = count + 1
    pred.append(a)

label = dftest['transaction']
fpr, tpr, thresholds = roc_curve(label, pred)
roc_auc = auc(fpr, tpr)
print "Area under the ROC curve : %f" % roc_auc
# matplotlib.rcParams['figure.figsize'] = (10, 10)
# plt.plot(fpr, tpr, color='magenta', label='ROC curve (area = %0.2f)' % roc_auc)
# plt.plot([0, 1], [0, 1], 'k--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver operating characteristic example')
# plt.legend(loc="lower right")
# plt.show()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
Area under the ROC curve : 0.504934


In [17]:
# Test baseline volatility score metric
# # Time to test this crap.
# pred = [isConnected(sender,receiver,3) 
#         for index,sender,receiver,transaction  in dftest.itertuples()]
pred = []
count = 0
for index,sender,receiver,transaction in dftest.itertuples():
    a = volatilityScore(sender, True)
    b = volatilityScore(receiver, False)
    if a > -0.25 and b > -0.25:
        pred.append(1)
    else:
        pred.append(0)
#     print count
    count = count + 1

label = dftest['transaction']
fpr, tpr, thresholds = roc_curve(label, pred)
roc_auc = auc(fpr, tpr)
print "Area under the ROC curve : %f" % roc_auc
# matplotlib.rcParams['figure.figsize'] = (10, 10)
# plt.plot(fpr, tpr, color='magenta', label='ROC curve (area = %0.2f)' % roc_auc)
# plt.plot([0, 1], [0, 1], 'k--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver operating characteristic example')
# plt.legend(loc="lower right")
# plt.show()

Area under the ROC curve : 0.627467


In [18]:
# first time trial, we will run pure collaborative content based filtering, and see what happens just to get a
# baseline measure done.
# # this will iterate row wise, so first we will compare the interactions of row 1 to row 2 and so forth.
# pearsonr_corr_matrix = dok_matrix((n,n))
# for i in range(n):
#     for j in range(n):
#         row_dense = np.squeeze((t2r_matrix.tocsr()[i,:]).toarray().flatten().transpose())
#         print row_dense.shape
#         row2_dense = np.squeeze((t2r_matrix.tocsr()[j,:]).toarray().flatten().transpose())
#         print row2_dense.shape
#         pearsonr_corr_matrix[i, j] = pearsonr(row_dense, row2_dense)[0] # grab just the correlation value
#         print i, j
# pearsonr_corr_matrix

In [4]:
# Lets reduce dimensionality first!
from scipy.sparse.linalg import svds
epsilon = 1e-10
new_dim = 10
u, s, vt = svds(t2r_matrix, k=new_dim, tol=epsilon, which='LM')
v = vt.transpose()

In [20]:
# Third pipeline
# Given some R, get the senders it interacts with off the Gt graph
# Get the Rset that are in the interaction set for those senders
# compute the pearson similarity on those set of Receivers.
# take top 10 receivers, and see if S has interacted with them in a maximal fashion
def connectionByGraphSimilarity(S, R):
    R_vec = v[R]
    def getReceiverSet():
        sender_set = Gt[R]
        receiver_set = set()
        for sender in sender_set:
            for r in G[sender]:
                receiver_set.add(r)
        return receiver_set
    
    def computeReceiverSimilarity(receiver_set):
        # (score, index) to store the similarity list
        similarity_list = []
        for r in receiver_set:
            vec = v[r] # get the vector from the SVD matrix
            score = cosine(R_vec, vec)
            similarity_list.append((score, r))
        return sorted(similarity_list, key=lambda tup: tup[0])

    def isConnectedViaSimilar(scores, k, p):
        # take top k vectors, and check if the sender is in the set
        top_k_tuples = scores[-k:]
        connected_count = 0.0
        for (sc, index) in top_k_tuples:
            for sender in Gt[index]:
                if sender == S:
                    connected_count = connected_count + 1.0
#         print connected_count
#         print connected_count / float(k)
        if (connected_count / float(k)) > p:
            return True
        return False
        
    if S == R:
        return True
    
    r_set = getReceiverSet()
    sim_scores = computeReceiverSimilarity(r_set)
#     print sim_scores
    return isConnectedViaSimilar(sim_scores, 10, 0.25)
connectionByGraphSimilarity(8,32443)

False

In [21]:
def numeric_connectedBySimilarity(S,R):
    if connectionByGraphSimilarity(S, R):
        return 1
    else:
        return 0

In [None]:
pred = []
count = 0
for index,sender,receiver,transaction in dftest.itertuples():
    if connectionByGraphSimilarity(sender, receiver):
        pred.append(1)
    else:
        pred.append(0)
#     print count
    count = count + 1

label = dftest['transaction']
fpr, tpr, thresholds = roc_curve(label, pred)
roc_auc = auc(fpr, tpr)
print "Area under the ROC curve : %f" % roc_auc

In [79]:
# Jaccard similarity measure
def jaccardSimilarity(S, R):
    S_neighbors = set(G_uni[S])
    R_neighbors = set(G_uni[R])
    SR_intersection = S_neighbors.intersection(R_neighbors)
    SR_union = S_neighbors.union(R_neighbors)
    return float(len(SR_intersection)) / float(len(SR_union))

In [None]:
total_jaccard = 0.0
for i in range(0, 50):
    for j in range(0, 50):
        total_jaccard = total_jaccard + jaccardSimilarity(i, j)
print total_jaccard / 2500.0

In [None]:
# find the median
jaccard_list = []
for i in range(0,50):
    for j in range(0,50):
        jaccard_list.append(jaccardSimilarity(i, j))

median_jaccard = np.median(jaccard_list)

In [None]:
0.08 * median_jaccard

In [170]:
pred = []
count = 0
for index,sender,receiver,transaction in dftest.itertuples():
    if jaccardSimilarity(sender, receiver) > 0:
        pred.append(1)
    else:
        pred.append(0)
#     print count
    count = count + 1

label = dftest['transaction']
fpr, tpr, thresholds = roc_curve(label, pred)
roc_auc = auc(fpr, tpr)
print "Area under the ROC curve : %f" % roc_auc

Area under the ROC curve : 0.679444


In [26]:
import math
def preferentialScore(S, R):
    S_neighbors = set(G_uni[S])
    R_neighbors = set(G_uni[R])
    return math.log(len(S_neighbors) * len(R_neighbors))

In [77]:
def jaccardAndPreferentialScore(S, R):
    return 0.5 * jaccardSimilarity(S, R) + 0.5 * preferentialScore(S, R)

In [74]:
def jaccardAndVolatilityScore(S, R):
    return 0.5 * jaccardSimilarity(S, R) + 0.5 * volatilityScore(S, R)

In [26]:
def jaccardAndVolatilityAndConnectionViaGraphSimilarity(S,R):
    if connectionByGraphSimilarity(S,R):
        return 0.33 + jaccardAndVolatilityScore(S,R)
    else:
        return jaccardAndVolatilityScore(S,R)

In [None]:
1.39776250554 # median pref + jaccard

In [None]:
0.127677740241 # median volatility + jaccard

In [None]:
median_list = []
for i in range(0, 50):
    for j in range(0, 50):
        median_list.append(jaccardAndVolatilityScore(i, j))
median_prefAndJaccard = np.median(median_list)
print median_prefAndJaccard

In [81]:
median_list = []
for i in range(40, 50):
    for j in range(30, 40):
        median_list.append(jaccardAndPreferentialScore(i, j))
median_prefAndJaccard = np.percentile(median_list, 20)
print median_prefAndJaccard

5.57767215576


In [82]:
pred = []
count = 0
for index,sender,receiver,transaction in dftest.itertuples():
    combined_score = jaccardAndPreferentialScore(sender, receiver)
#     print combined_score
    if combined_score > 5:
        pred.append(1)
    else:
        pred.append(0)
#     print count
    count = count + 1

label = dftest['transaction']
fpr, tpr, thresholds = roc_curve(label, pred)
roc_auc = auc(fpr, tpr)
print "Area under the ROC curve : %f" % roc_auc

Area under the ROC curve : 0.717944


In [27]:
# Building latent structure
def classifyWithJaccart(S, R):
    ub = 1.5
    lb = 0.5
    penalty = 1
    bonus = 1
    outDegree_S = len(G[S])
    outDegree_R = len(G[R])
    inDegree_S = len(Gt[S])
    inDegree_R = len(Gt[R])
    if outDegree_S == 0:
        outDegree_S = ub + 10
    if outDegree_R == 0:
        outDegree_R = ub + 10
    ratio_S = float(inDegree_S) / float(outDegree_S)
    ratio_R = float(inDegree_R) / float(outDegree_R)
#     print ratio_S
#     print ratio_R
    if ratio_S < lb and lb < ratio_R < ub:
        score = bonus * jaccardSimilarity(S, R)
        print score
        return score
    elif lb < ratio_S < ub and not (ratio_R < lb):
        score = bonus * jaccardSimilarity(S, R)
        print score
        return score
    else:
        score = penalty * jaccardSimilarity(S, R)
        print score
        return score
        

In [None]:
pred = []
count = 0
for index,sender,receiver,transaction in dftest.itertuples():
    combined_score = classifyWithJaccart(sender, receiver)
#     print combined_score
    if combined_score > 0.0001:
        pred.append(1)
    else:
        pred.append(0)
#     print count
    count = count + 1

label = dftest['transaction']
fpr, tpr, thresholds = roc_curve(label, pred)
roc_auc = auc(fpr, tpr)
print "Area under the ROC curve : %f" % roc_auc

In [10]:
# Page Rank algorithm
import networkx as nx

G_networkx = nx.DiGraph()
count_list = []
for (index, sender, receiver, count) in dftrain.itertuples():
    count_list.append(count)
    G_networkx.add_weighted_edges_from([(sender, receiver, count)])


In [85]:
G_networkx_undirected = nx.Graph()
for (index, sender, receiver, count) in dftrain.itertuples():
    G_networkx_undirected.add_weighted_edges_from([(sender, receiver, count)])

In [87]:
G_networkx.size()

3348026

In [55]:
pr = nx.pagerank(G_networkx)

In [66]:
# find number of components
nx.number_strongly_connected_components(G_networkx)

9254

In [71]:
scc = nx.strongly_connected_components(G_networkx)
number_nodes = len(G_networkx.nodes())
v_scc = [-1 for i in range(number_nodes)]
index = 0
for c in scc:
    for v in c:
        v_scc[v] = index
    index = index + 1
print v_scc

[1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 354, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1729, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923, 1923

In [73]:
index = 0
for c in scc:
    if index == 1923:
        print len(c)
    index = index + 1

In [56]:
# now that we have calculated the page rank, we should investigate what this page rank represents
pr_median = np.median(pr.values())
print pr_median

6.30384417422e-07


In [57]:
pr_mean = np.mean(pr.values())
print pr_mean

2.25187186849e-06


In [60]:
pr_3rd_percentile = np.percentile(pr.values(), 100)
print pr_3rd_percentile

0.0808518536077


In [61]:
print max(pr_values)

0.0941045460319


In [43]:
pr_values = pr.values()
plt.hist(pr.values(), bins=np.arange(min(pr_values) - 0.0001, 0.01, 0.0001))
plt.show()

In [45]:
# use blind PageRank first
def pageRankLikelihood(S, R):
    prscore_R = pr[R]
    return prscore_R

In [52]:
pred = []
count = 0
for index,sender,receiver,transaction in dftest.itertuples():
    combined_score = pageRankLikelihood(sender, receiver)
#     print combined_score
    if combined_score > 0.0002:
        pred.append(1)
    else:
        pred.append(0)
#     print count
    count = count + 1

label = dftest['transaction']
fpr, tpr, thresholds = roc_curve(label, pred)
roc_auc = auc(fpr, tpr)
print "Area under the ROC curve : %f" % roc_auc

Area under the ROC curve : 0.468750


In [159]:
clf = AdaBoostClassifier()

In [181]:
##### TIME TO CLASSIFY THIS BITCH
import random
from sklearn.ensemble import AdaBoostClassifier

def create_vectors(S, R):
    # first step, create vector for S and R
    # [SCC_ID, in_degree, out_degree, page_rank,...]
#     S_vec = np.concatenate((np.array([v_scc[S], len(G[S]), len(Gt[S]), pr[S]]), u[S]))
#     R_vec = np.concatenate((np.array([v_scc[R], len(G[R]), len(Gt[R]), pr[R]]), v[R]))
    S_vec = np.array([v_scc[S], len(G[S]), len(Gt[S]), pr[S]])
    R_vec = np.array([v_scc[R], len(G[R]), len(Gt[R]), pr[R]])
    vec = np.concatenate((S_vec, R_vec, [jaccardSimilarity(S, R), preferentialScore(S, R)]))
    return vec

def hasInteracted(S, R):
    S_set = G[S]
    for rs in S_set:
        if rs == R:
            return True
    return False

def create_samples():
    sr_pairs = []
    interacts_list = []
    for i in range(600000):
        list_o_matrix = random.random()
        if list_o_matrix > 0.5:
            # draw from the list
            row_length = 3348025
            draw_index = random.randint(0, row_length)
            S_index = dftrain.iloc[draw_index][0]
            R_index = dftrain.iloc[draw_index][1]
            sr_pairs.append((S_index, R_index))
            interacts_list.append(1)
        else:
            S_index = random.randint(0, 444074)
            R_index = random.randint(0, 444074)
            sr_pairs.append((S_index, R_index))
            if hasInteracted(S_index, R_index):
                interacts_list.append(1)
            else:
                interacts_list.append(0)
    return sr_pairs, interacts_list

def train():
    sample_list, hasInteraction_list = create_samples()
    vectors = np.array([create_vectors(S, R) for S, R in sample_list])
    clf.fit(vectors, hasInteraction_list)

In [173]:
def test():
    pred = []
    count = 0
    for index,sender,receiver,transaction in dftest.itertuples():
        combined_score = clf.predict(create_vectors(sender, receiver).reshape(1,-1))
    #     print combined_score
        if combined_score == 1:
            pred.append(1)
        else:
            pred.append(0)
    #     print count
        count = count + 1

    label = dftest['transaction']
    fpr, tpr, thresholds = roc_curve(label, pred)
    roc_auc = auc(fpr, tpr)
    print "Area under the ROC curve : %f" % roc_auc

In [182]:
train()

In [183]:
test()

Area under the ROC curve : 0.665222


In [11]:
t2r_matrix[0, 7]

0.0

In [5]:
# Test Negative Matrix Factorization method
import nimfa

def createNMF():
    print "reached"
    Snmf = nimfa.(t2r_matrix, seed="random_vcol", rank=10, max_iter=12, version='l',
                  eta=1., beta=1e-4, i_conv=10, w_min_change=0)
    print "nmf created"
    Snmf_fit = Snmf()
    print "fit"
    W = Snmf_fit.basis()
    H = Snmf_fit.coef()
    print "returned"
    return W.dot(H)

def predictUsingNMF(S, R, r_matrix):
    return r_matrix[S, R]

In [6]:
def testNMF():
    pred = []
    count = 0
    for index,sender,receiver,transaction in dftest.itertuples():
        combined_score = predictUsingNMF(sender, receiver, recon_matrix)
        print combined_score
        if combined_score >= 1:
            pred.append(1)
        else:
            pred.append(0)
        print count
        count = count + 1

    label = dftest['transaction']
    fpr, tpr, thresholds = roc_curve(label, pred)
    roc_auc = auc(fpr, tpr)
    print "Area under the ROC curve : %f" % roc_auc

In [2]:
from lpproj import LocalityPreservingProjection
lpp = LocalityPreservingProjection(n_components=2)

X_2D = lpp.fit_transform(t2r_matrix.toarray())
print X_2D.shape

NameError: name 't2r_matrix' is not defined

In [7]:
model = NMF(n_components=10, init='random', random_state=0)
W = model.fit_transform(t2r_matrix)
H = model.components_

In [8]:
pred = [np.sum(W[row['sender'],:] * H[:,row['receiver']]) 
        for index,row in dftest.iterrows()]
label = dftest['transaction']
fpr, tpr, thresholds = roc_curve(label, pred)
roc_auc = auc(fpr, tpr)
print "Area under the ROC curve : %f" % roc_auc
matplotlib.rcParams['figure.figsize'] = (10, 10)
plt.plot(fpr, tpr, color='magenta', label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

Area under the ROC curve : 0.650759


In [17]:
# to find the right component count
import sys

for i in range(45, 65, 5):
    print i
    model = NMF(n_components=i, init='random', random_state=0)
    W = model.fit_transform(t2r_matrix)
    H = model.components_
    pred = [np.sum(W[row['sender'],:] * H[:,row['receiver']]) 
        for index,row in dftest.iterrows()]
    label = dftest['transaction']
    fpr, tpr, thresholds = roc_curve(label, pred)
    roc_auc = auc(fpr, tpr)
    print "Area under the ROC curve : %f" % roc_auc
    sys.stdout.flush()

45
Area under the ROC curve : 0.662135
50


KeyboardInterrupt: 