In [4]:
import pandas as pd
import numpy as np
import pickle
from matplotlib import pyplot as plt
import networkx as nx
from networkx.algorithms.approximation.clustering_coefficient import average_clustering
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from node2vec import Node2Vec

# from parser_br_comps import BSiParser 

# 1. Load the data
## Transactions Data - Load

In [32]:
folder = '/dfs/scratch2/palovics/shared/'
transactions = pd.read_pickle(folder + '/data.pkl') # data_final = pd.read_pickle(folder + '/data_withInd.pkl')
transactions = transactions.drop_duplicates()

metadata = pickle.load(open(folder + 'meta_data.pkl', "rb"))
metadata = metadata.reset_index().rename({'index': 'name'}, axis='columns')
skds = pd.read_csv(folder + 'skd.tsv', sep='\t')
print(transactions.shape)

(2956956, 10)


## subset for the domestic, non-fin/non-govt

In [34]:
skdl2 = skds[skds.Level == 2].copy()
skdl2.Code = skdl2.Code.astype(int)
metadata = metadata.merge(
    skdl2[['Code', 'Parrent code']].rename({'Parrent code': 'skdl1'}, axis='columns'),
    left_on='skdl2', right_on='Code')
companies_withMetaData = metadata['name']

filter_a = (transactions['source_country'] == 'SI') & (transactions['target_country'] == 'SI')
filter_b = (~transactions['source_gov']) & (~transactions['target_gov'])
filter_c = (~transactions['source_financial']) & (~transactions['target_financial'])
filter_d = ((transactions['source'].isin(companies_withMetaData)) & (transactions['target'].isin(companies_withMetaData))) 


# don't just subset for single instances
print(transactions.shape)
transactions_filtered = transactions[filter_a & filter_b & filter_c ]# & filter_d]
print(transactions_filtered.shape)
transactions_filtered['year'] = pd.DatetimeIndex(pd.to_datetime(transactions_filtered['time'], unit='s')).year
transactions_filtered = transactions_filtered[transactions_filtered['year'] >= 2008]
print(transactions_filtered.shape)
graph = transactions_filtered[['source','target','amount','year']].drop_duplicates()
'''graph = graph_preagg.groupby(['source','target']).sum().reset_index()[['source','target','amount']]
graph.columns = ['source','target','weight']

graph=graph.set_index('year')
print(graph.shape)'''


(2956956, 10)
(859423, 10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(845067, 11)


"graph = graph_preagg.groupby(['source','target']).sum().reset_index()[['source','target','amount']]\ngraph.columns = ['source','target','weight']\n\ngraph=graph.set_index('year')\nprint(graph.shape)"

In [35]:
## get the descriptions we need here

Index(['name', 'dtime', 'default', 'skis', 'skd', 'skisl2', 'skdl2',
       'financial', 'gov', 'Code', 'skdl1'],
      dtype='object')


## 2. Get Node Dataset

In [113]:
data = graph.copy()
networks = {}

for year in np.unique(data["year"]):  
    dir_graph   = nx.from_pandas_edgelist(data[data["year"]==year],create_using=nx.DiGraph())
    undir_graph = nx.from_pandas_edgelist(data[data["year"]==year])

    
    # make a new dataset of identifiers for individual firms
    firm_identifiers = pd.DataFrame()

    # get node attributes
    in_degrees = [val for (node,val) in dir_graph.in_degree()]
    out_degrees = [val for (node,val) in dir_graph.out_degree()]
    nodes = list(dir_graph.nodes())

    # and put in the dataset
    firm_identifiers['names'] = nodes
    firm_identifiers['in_deg'] = in_degrees
    firm_identifiers['out_deg'] = out_degrees
    
    clusters = nx.clustering(undir_graph).values()
    centrals = nx.pagerank(undir_graph).values()
    
    firm_identifiers['clusters'] = clusters
    firm_identifiers['centrals'] = centrals
    firm_identifiers['year']     = year
    networks[year] = firm_identifiers
    

In [114]:
all_years = [networks[2008],networks[2009],networks[2009],networks[2010],
            networks[2011],networks[2012],networks[2013],networks[2014],
            networks[2015],networks[2016],networks[2017]]

all_firm_info = pd.concat(all_years)

Graph is here all unique source-target dyads. We can use this to construct our new dataset.

## Now put in firm embeddings

In [16]:
# make one of these node2vec things for each year and stash it
embeddings = {}
for year in np.unique(data["year"]):
    # Learn embeddings following this towards data science tutorial: https://towardsdatascience.com/node2vec-embeddings-for-graph-data-32a866340fef
    dir_graph   = nx.from_pandas_edgelist(data[data["year"]==year],create_using=nx.DiGraph())

    # Generate walks
    node2vec = Node2Vec(dir_graph, dimensions=20, walk_length=16, num_walks=100)
    model = node2vec.fit(window=10, min_count=1)
    embeddings[year] = model

Computing transition probabilities: 100%|██████████| 13048/13048 [00:46<00:00, 281.89it/s]
Generating walks (CPU: 1): 100%|██████████| 100/100 [04:59<00:00,  3.11s/it]
Computing transition probabilities: 100%|██████████| 10754/10754 [00:28<00:00, 375.00it/s]
Generating walks (CPU: 1): 100%|██████████| 100/100 [04:05<00:00,  2.15s/it]
Computing transition probabilities: 100%|██████████| 10448/10448 [00:23<00:00, 442.09it/s]
Generating walks (CPU: 1): 100%|██████████| 100/100 [03:01<00:00,  1.87s/it]
Computing transition probabilities: 100%|██████████| 10268/10268 [00:24<00:00, 419.45it/s]
Generating walks (CPU: 1): 100%|██████████| 100/100 [03:28<00:00,  1.87s/it]
Computing transition probabilities: 100%|██████████| 9502/9502 [00:25<00:00, 379.55it/s]
Generating walks (CPU: 1): 100%|██████████| 100/100 [03:12<00:00,  1.67s/it]
Computing transition probabilities: 100%|██████████| 8728/8728 [00:21<00:00, 399.43it/s]
Generating walks (CPU: 1): 100%|██████████| 100/100 [03:07<00:00,  1.93s/

In [17]:
with open('node2vec_embeddings_yearly.pickle', 'wb') as node2vec_embeddings_yearly:
    pickle.dump(embeddings, node2vec_embeddings_yearly, protocol=pickle.HIGHEST_PROTOCOL)


In [22]:
# Load data (deserialize)
with open('node2vec_embeddings_yearly.pickle', 'rb') as node2vec_embeddings:
    node2vec_embeddings = pickle.load(node2vec_embeddings)
print(node2vec_embeddings)

{2008: <gensim.models.word2vec.Word2Vec object at 0x7f38303beba8>, 2009: <gensim.models.word2vec.Word2Vec object at 0x7f3984c4d518>, 2010: <gensim.models.word2vec.Word2Vec object at 0x7f383cde51d0>, 2011: <gensim.models.word2vec.Word2Vec object at 0x7f383c9e1470>, 2012: <gensim.models.word2vec.Word2Vec object at 0x7f382e362128>, 2013: <gensim.models.word2vec.Word2Vec object at 0x7f382e2c4f60>, 2014: <gensim.models.word2vec.Word2Vec object at 0x7f384993e400>, 2015: <gensim.models.word2vec.Word2Vec object at 0x7f38326f6400>, 2016: <gensim.models.word2vec.Word2Vec object at 0x7f3834a4f080>, 2017: <gensim.models.word2vec.Word2Vec object at 0x7f383d6f7a20>, 2018: <gensim.models.word2vec.Word2Vec object at 0x7f383230c8d0>}


In [18]:
embeddings = pd.DataFrame()#np.empty((all_firms.shape[0],22))
print(embeddings.shape)
embeddings.loc[:,'firm'] = all_firms.loc[:,'firm']
embeddings.loc[:,'year'] = all_firms.loc[:,'year']

for i in range(20):
    name = 'firm_emb_' + str(i)
    embeddings.loc[:,name] = 0

(0, 0)


In [21]:
print(node2vec_embeddings_yearly)

<_io.BufferedWriter name='node2vec_embeddings_yearly.pickle'>


In [23]:
for i in range(embeddings.shape[0]):
    embeddings.loc[i,2:22] = node2vec_embeddings[embeddings.loc[i,'year']][embeddings.loc[i,'firm']]

  


In [24]:
embeddings.to_pickle('allGraph_embeddings_table_yearly.pkl')

In [112]:
print(embeddings)

                                   firm  year  firm_emb_0  firm_emb_1  \
0       4vy/KO6/HbcIRBxuF7cQkEqlqWS5zfa  2012   -0.009318   -0.020780   
1       Ompokzt6Vla4rjRk3KEDBWz2g5nB8s6  2008    1.895226   -2.698637   
2       Qz9fa1u8Bhx90tCa93se6/FVvYaks5m  2016    0.270055    0.061731   
3       kcQpOUBW2Ydf64B6JTGFEOTt9xyUpzW  2012    2.195605    0.988822   
4       .9XlbUivZWnp0Sf2l8I2TjXExKI6SrO  2015   -0.655221    0.325541   
5       .9XlbUivZWnp0Sf2l8I2TjXExKI6SrO  2016   -0.351116    2.028238   
6       .gehy.5So2iECtqEEP.mWmDhXS1Chiu  2013   -0.480325   -1.438325   
7       /cbE1jAM//3hCIfrk.vHRmFLmLG73ga  2017    3.157319    0.925827   
8       0D4OAQPFUrHpchtvpYMuXPHdTKiqceq  2011    0.015911   -0.021058   
9       1hC0v845KZ.T5.BD2mcbb0ziagQohHi  2012   -1.227352    0.125358   
10      2e51ol2w86okQbABB3tf9/hJ8tSLNLS  2009    0.017948   -0.013580   
11      2e51ol2w86okQbABB3tf9/hJ8tSLNLS  2011    0.017948   -0.013580   
12      5shJlRvjcybJ.Fuyi1foXhKHe2SPtjy  2012    0.

## Now merge embeddings with industry data

In [117]:
firm_ids = metadata[['name','skdl1']]
firm_ids.columns = ['name','Code']
descriptive_labels = firm_ids.merge(skds,on='Code')[['name','Description','Code']]
print(len(np.unique(descriptive_labels.loc[:,'Description'])))
print(len(np.unique(descriptive_labels.loc[:,'Code'])))
print(descriptive_labels.columns)
print(descriptive_labels.shape)

20
20
Index(['name', 'Description', 'Code'], dtype='object')
(37477, 3)


In [118]:
# subset to the firms whose industries we know
firm_identifiers_hasInd = all_firm_info[all_firm_info['names'].isin(companies_withMetaData)]
metadata_of_int = metadata[['name','skdl1']]
firm_identifiers_hasInd['name'] = firm_identifiers_hasInd['names']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [120]:
print(firm_identifiers_hasInd.shape)
firm_inds = firm_identifiers_hasInd.merge(descriptive_labels,on='name')
firm_inds = firm_inds.reset_index(drop=True)
print(firm_inds.shape)

(96633, 7)
(96633, 9)


In [121]:
print(len(np.unique(firm_inds.Code)))

16


In [122]:
translator = pd.DataFrame()
translator['Code'] = np.unique(firm_inds.Code)
translator['Code_numeric'] = translator.index.values
print(firm_inds.Code)

0        M
1        M
2        M
3        M
4        M
5        M
6        M
7        M
8        M
9        G
10       G
11       G
12       G
13       F
14       F
15       F
16       F
17       F
18       F
19       F
20       F
21       F
22       F
23       F
24       G
25       G
26       G
27       G
28       G
29       L
        ..
96603    M
96604    G
96605    R
96606    L
96607    C
96608    H
96609    G
96610    M
96611    H
96612    I
96613    M
96614    F
96615    Q
96616    H
96617    F
96618    R
96619    G
96620    N
96621    C
96622    C
96623    M
96624    M
96625    G
96626    F
96627    M
96628    G
96629    C
96630    G
96631    H
96632    M
Name: Code, Length: 96633, dtype: object


In [123]:
print(firm_inds.shape)
firm_inds = firm_inds.merge(translator)
print(firm_inds.shape)
firm_inds.to_pickle('firm_inds_yearly')


(96633, 9)
(96633, 10)


In [93]:
print(np.unique(firm_inds['Code_numeric']))

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]


In [13]:
embeddings = pd.read_pickle('allGraph_embeddings_table_yearly.pkl') # data_final = pd.read_pickle(folder + '/data_withInd.pkl')


In [125]:
firm_inds = firm_inds.rename(columns={'names': 'firm'})
print(firm_inds.columns)

Index(['firm', 'in_deg', 'out_deg', 'clusters', 'centrals', 'year', 'name',
       'Description', 'Code', 'Code_numeric'],
      dtype='object')


In [126]:
firms = firm_inds.drop('name',axis=1)
final_data = firms.merge(embeddings,on=['firm','year'])



In [127]:
print(final_data.shape)

(96633, 29)


In [128]:
final_data.to_pickle('final_classification_yearly.pkl')