In [1]:
# This pipeline tests collaborative filtering as a means of predicting
# which pairs of users will interact in the future.
# Steps:
#   1. Import the data.
#   2. Create the collaborative filtering scheme and pass the data through it.
#   3. Use the scheme to predict which pairs will interact at a future time.

In [2]:
from collections import Counter

import numpy as np
import pandas as pd
import scipy
from scipy.sparse import dok_matrix, csr_matrix
from scipy.sparse.linalg import svds
from scipy.stats import pearsonr
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [3]:
# Read the space-separated triplet file. It has no header row, so the three
# columns are named explicitly: sender, receiver and transaction count.
dftrain = pd.read_csv(
    'data/txTripletsCounts.txt',
    sep=' ',
    names=['sender', 'receiver', 'transaction'],
    header=None,
    index_col=None,
)
# Summary statistics of the transaction-count column.
dftrain.transaction.describe()

count    3348026.000000
mean           4.725741
std          128.494757
min            1.000000
25%            1.000000
50%            1.000000
75%            2.000000
max        41639.000000
Name: transaction, dtype: float64

In [4]:
# Let's see what our data looks like: each row is (sender, receiver, transaction count).
# Only the first rows are shown; the frame has about 3.3 million rows, so it is not rendered in full.
dftrain.head(10)

Unnamed: 0,sender,receiver,transaction
0,0,1,3
1,0,13,1
2,0,37,1
3,0,51,1
4,0,438481,1
5,1,0,3
6,1,4,354
7,1,10,2602
8,1,11,2689
9,1,12,1


In [5]:
# Our collaborative filtering pipeline (to be tested) goes as follows:
# first, construct the transmitter-to-receiver (t2r) matrix of transaction counts;
# then apply PCA to get a low-dimensional representation;
# then use collaborative filtering to predict whether any transmitter-receiver pair
# will have an interaction in the future.

In [6]:
# The max sender number is 444074 and ids start at 0, so the square
# sender-by-receiver matrix needs n = 444074 + 1 rows and columns.
n = 444075
# Build the sparse count matrix in COO style: entry [sender, receiver] holds the
# transaction count for that pair. (scipy sums entries that share the same
# coordinates, should the input contain duplicates.)
t2r_matrix = csr_matrix((dftrain['transaction'], (dftrain['sender'], dftrain['receiver'])), shape=(n,n), dtype=float)
# print(...) with a single argument gives the same output on Python 2 and Python 3;
# the bare `print x` statement is a SyntaxError on Python 3.
print(t2r_matrix.shape)

(444075, 444075)


In [7]:
# run pca on the t2r_matrix
# pca = PCA()
# pca.fit(t2r_matrix.toarray())
# print pca.explained_variance_ratio_

In [8]:
# for (title, sender, receiver, transaction_count) in dftrain.itertuples():
#     t2r_matrix[sender, receiver] = t2r_matrix[sender, receiver] + transaction_count

# # this should set up our t2r_matrix
# print t2r_matrix

In [9]:
# First trial: run pure collaborative (content-based) filtering and see what happens, just to get a
# baseline measurement.
# # This iterates row-wise: first we compare the interactions of row 1 with row 2, and so forth.
# pearsonr_corr_matrix = dok_matrix((n,n))
# for i in range(n):
#     for j in range(n):
#         row_dense = np.squeeze((t2r_matrix.tocsr()[i,:]).toarray().flatten().transpose())
#         print row_dense.shape
#         row2_dense = np.squeeze((t2r_matrix.tocsr()[j,:]).toarray().flatten().transpose())
#         print row2_dense.shape
#         pearsonr_corr_matrix[i, j] = pearsonr(row_dense, row2_dense)[0] # grab just the correlation value
#         print i, j
# pearsonr_corr_matrix

In [10]:
# Let's reduce dimensionality first!
# A truncated SVD works directly on the sparse matrix and keeps only the
# `new_dim` largest-magnitude singular triplets (which='LM').
# `svds` is imported with the other dependencies in the import cell at the top
# of the notebook, so a fresh "Restart & Run All" does not depend on this cell.
epsilon = 1e-10  # convergence tolerance handed to svds
new_dim = 10     # number of singular values / vectors to keep
# NOTE: svds does not promise a particular ordering of `s`; sort s (and the
# matching columns of u / rows of vt) if the order matters downstream.
u, s, vt = svds(t2r_matrix, k=new_dim, tol=epsilon, which='LM')

In [11]:
# Sanity check: the feature vector of a single sender (row 0 of u) should have new_dim entries.
u[0].shape

(10,)

In [None]:
# use u as the reduced dimensionality features for the senders
# use these features to perform collaborative filtering
#
# The earlier version of this cell called pearsonr() once per (i, j) pair in a
# Python double loop and stored every result in a dok_matrix. With n = 444075
# that is roughly 2e11 calls and n*n stored entries, which can neither finish
# nor fit in memory. The Pearson correlation of two rows equals the dot product
# of the rows after each one is mean-centred and scaled to unit length, so a
# whole block of correlations is obtained with one matrix product, and only the
# strongest neighbours of every sender are stored.


def pearson_topk(features, top_k=50, chunk_size=128):
    """Return a sparse matrix of the top-k row-to-row Pearson correlations.

    Parameters
    ----------
    features : array-like, shape (n_rows, n_features)
        One feature vector per row (here: one row of ``u`` per sender).
    top_k : int
        Number of highest correlations kept per row. The row's own
        self-correlation takes part in the ranking, as it did on the diagonal
        of the full matrix.
    chunk_size : int
        Number of rows correlated against all rows at once. Peak memory is
        about ``chunk_size * n_rows`` floats plus an index array of the same
        shape, so lower it if memory is tight.

    Returns
    -------
    scipy.sparse.csr_matrix, shape (n_rows, n_rows)
        Entry [i, j] is the Pearson correlation of rows i and j when j is one
        of the ``top_k`` best matches of i; everything else is left unstored.
        Rows whose features are constant have an undefined correlation
        (pearsonr yields NaN for them); they get no stored entries and are
        never selected as neighbours.

    Raises
    ------
    ValueError
        If ``features`` is not 2-D, or ``top_k`` / ``chunk_size`` is below 1.
    """
    features = np.asarray(features, dtype=float)
    if features.ndim != 2:
        raise ValueError("features must be a 2-D array")
    if top_k < 1 or chunk_size < 1:
        raise ValueError("top_k and chunk_size must be at least 1")

    n_rows = features.shape[0]
    top_k = min(int(top_k), n_rows)
    chunk_size = int(chunk_size)

    # Mean-centre each row and scale it to unit length; rows without variance
    # stay all-zero and are excluded below.
    centered = features - features.mean(axis=1)[:, np.newaxis]
    norms = np.sqrt((centered ** 2).sum(axis=1))
    has_variance = (features != features[:, :1]).any(axis=1) & (norms > 0)
    unit = np.zeros_like(centered)
    unit[has_variance] = centered[has_variance] / norms[has_variance][:, np.newaxis]

    row_parts, col_parts, val_parts = [], [], []
    kth = n_rows - top_k  # after argpartition, columns kth.. hold the top_k largest
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        block_rows = np.arange(start, stop)[has_variance[start:stop]]
        if block_rows.size == 0:
            continue

        corr = np.dot(unit[block_rows], unit.T)  # shape (len(block_rows), n_rows)
        np.clip(corr, -1.0, 1.0, out=corr)       # guard against rounding outside [-1, 1]
        corr[:, ~has_variance] = -np.inf          # undefined pairs must never be ranked

        keep = np.argpartition(corr, kth, axis=1)[:, kth:]
        vals = corr[np.arange(block_rows.size)[:, np.newaxis], keep].ravel()
        rows = np.repeat(block_rows, keep.shape[1])
        cols = keep.ravel()

        # Drop the -inf placeholders picked up when fewer than top_k rows are valid.
        finite = np.isfinite(vals)
        row_parts.append(rows[finite])
        col_parts.append(cols[finite])
        val_parts.append(vals[finite])

    if not val_parts:
        return csr_matrix((n_rows, n_rows), dtype=float)
    return csr_matrix(
        (np.concatenate(val_parts),
         (np.concatenate(row_parts), np.concatenate(col_parts))),
        shape=(n_rows, n_rows),
        dtype=float,
    )


# Still a heavy computation at this n (every sender is compared with every
# other sender), but it is a few thousand matrix products instead of ~2e11
# Python-level calls. The result is a csr_matrix rather than a dok_matrix.
pearsonr_corr_matrix = pearson_topk(u)
pearsonr_corr_matrix

In [None]:
# NOTE(review): `labels` and `num_clusters` are not defined in any cell visible
# above; presumably they come from a clustering step (KMeans is imported) that
# has to run before this cell - confirm and add that cell to the notebook.
label_counts = Counter(labels) # maps each label to the number of entries carrying it
label_i = [0]*num_clusters  # one counter per cluster; presumably the next free row in that cluster's matrix

# Create smaller matrices: one per cluster, with one row per member of that cluster.
matrix_dict = dict()
for i in range(num_clusters):
    # Size by the population of cluster i itself. Indexing with labels[i] would
    # instead use the cluster that entry i of `labels` belongs to, which gives
    # the wrong row count for most clusters.
    matrix_dict[i] = dok_matrix((label_counts[i], n))

# now we put in rows based on label data
# NOTE(review): this loop is unfinished as written. It runs over
# range(num_clusters) but uses i to index `labels`, so it only visits the first
# num_clusters entries of `labels`, and the last line merely looks up
# matrix_dict[l] without assigning anything, so no rows are copied.

for i in range(num_clusters):
    l = labels[i]  # label stored at position i of `labels`
    matrix_dict[l]  # bare lookup: has no effect (raises KeyError if l is not a key)