In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

train_df = pd.read_csv('data/train/train.csv')

train_df.node1_id = train_df.node1_id.astype('int32')
train_df.node2_id = train_df.node2_id.astype('int32')
train_df.is_chat  = train_df.is_chat.astype('int8')


test_df = pd.read_csv('data/test.csv')


test_df.id = test_df.id.astype('int32')
test_df.node1_id = test_df.node1_id.astype('int32')
test_df.node2_id = test_df.node2_id.astype('int32')

# test_df.info()

grouping_idx = train_df.groupby('node1_id')['node2_id'].count() > 60
train_df = train_df.iloc[grouping_idx[grouping_idx == False].index]


combine_all = pd.concat([train_df[['node1_id','node2_id']], 
                         test_df[['node1_id','node2_id']]], axis=0)

train_df.shape, test_df.shape, combine_all.shape

((3392012, 3), (11776968, 3), (15168980, 2))

In [2]:
del train_df, test_df, grouping_idx
import gc
gc.collect()

32

# Build Graph

In [3]:
import networkx as nx

G = nx.from_pandas_edgelist(combine_all,'node1_id', 'node2_id')#,['is_chat'])#, edge_attr=True)
G.number_of_nodes()

5382233

# Katz Rank features
- weighted
- unweighted

In [4]:
katzing_11 = nx.katz_centrality(G, alpha=0.01, beta=0.05, weight='weight')

katzing_11 = pd.DataFrame(data=[list(katzing_11.keys()), list(katzing_11.values())]).T
katzing_11.columns = ['node1_id','katz1_05']
katzing_11.head()

Unnamed: 0,node1_id,katz1_05
0,4529348.0,0.00049
1,894645.0,0.000477
2,8325853.0,0.000452
3,1305287.0,0.000463
4,7218175.0,0.000427


In [5]:
katzing_22 = nx.katz_centrality(G, alpha=0.01, beta=0.005, weight=None)

katzing_22 = pd.DataFrame(data=[list(katzing_22.keys()), list(katzing_22.values())]).T
katzing_22.columns = ['node1_id','katz2_05']
katzing_22.head()

KeyboardInterrupt: 

In [None]:
katzing_11.to_csv('data/1_katzing_11.csv', index=None)
katzing_22.to_csv('data/1_katzing_22.csv', index=None)

# Unsupervised features
- Resource Allocation
- jaccard Coefficient
- Adamic Adar
- Prefrential attachment

In [6]:
combine_all['resource_alloc'] = combine_all[['node1_id','node2_id']].apply(
    lambda row: next(nx.resource_allocation_index(G,[(row)]))[2], axis=1)


In [7]:
combine_all['jaccard_coeff'] = combine_all[['node1_id','node2_id']].apply(
    lambda row: next(nx.jaccard_coefficient(G,[(row)]))[2], axis=1)


In [8]:
combine_all_new = combine_all[~np.equal(combine_all.node1_id, combine_all.node2_id)]

In [9]:
combine_all_new['adamic_adar'] = combine_all_new[['node1_id','node2_id']].apply(
    lambda row: next(nx.adamic_adar_index(G,[(row)]))[2], axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
combine_all_new['preferential_attach'] = combine_all_new[['node1_id','node2_id']].apply(
    lambda row: next(nx.preferential_attachment(G,[(row)]))[2], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [11]:
combine_all_new1 = pd.merge(combine_all, 
         combine_all_new[['node1_id','node2_id','preferential_attach', 'adamic_adar']],
         on=['node1_id','node2_id'], how='left')

combine_all_new1.fillna(0, inplace=True)

In [None]:
combine_all_new1.resource_alloc = combine_all_new1.resource_alloc/combine_all_new1.resource_alloc.max()
combine_all_new1.jaccard_coeff = combine_all_new1.jaccard_coeff/combine_all_new1.jaccard_coeff.max()
combine_all_new1.adamic_adar = combine_all_new1.adamic_adar/combine_all_new1.adamic_adar.max()
combine_all_new1.preferential_attach = combine_all_new1.preferential_attach/combine_all_new1.preferential_attach.max()


In [None]:
combine_all_new1.to_csv('data/1_combine_all.csv', index=None)

In [None]:
del combine_all_new1, combine_all_new, combine_all
gc.collect()

# Page Rank feature

In [None]:
ranking = nx.pagerank(G, alpha=0.9)

ranking_df = pd.DataFrame(data=[list(ranking.keys()), list(ranking.values())]).T
ranking_df.columns = ['node1_id','page_ranking']
ranking_df.head()

In [None]:
ranking_df.to_csv('data/1_ranking_df.csv', index=None)
ranking_df.shape

# User specific features

In [None]:
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds, eigs

A = nx.to_scipy_sparse_matrix(G)
A = A.astype('float')
u, s, vt = svds(A, k=50)

np.save('data/50_u_vec.npy',u)
# np.save('data/50_vt_vec.npy',vt)
# np.save('data/50_s_vec.npy',s)