# Graph Neural Nets for Credit Ratings

Credit scoring may be improved by exploiting business linkages, for which we construct a graph, denoted as `CorpNet`. We then apply classification using GNNs on this graph and a tabular feature set for the nodes, to see if we can build a better ML model by further exploiting the information in network relationships. 

Kernel: MXNet 1.6 Python 3.6 CPU Optimized

In [None]:
%pylab inline
import pickle
import numpy as np
import pandas as pd
from numpy.random import exponential

!pip install scipy
from scipy.spatial.distance import cosine
# from utils import get_text_embedding_from_doc2vec, construct_network_data

In [2]:
!pip3 install -U pip
!pip3 install -U setuptools wheel

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [3]:
!pip3 install autogluon

Defaulting to user installation because normal site-packages is not writeable


## Tabular Feature Set

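We read in a file that contains all the data: the financial variables underlying the 5 Altman ratios (A,B,C,D,E), the credit `Rating`, the `industry_code`, and the `MDNA` text. In the cells below, the rating is converted into a class label (stored in the column named by `target_column`) and the industry categories are one-hot encoded.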

In [4]:
# Name of the label column added to rating_df. The name is historical: the
# active labelling code stores four ordinal classes (0-3), not a binary flag.
target_column = "binary_rating"

In [5]:
# Load the full dataset (financial columns, letter rating, industry code, MD&A text)
ccr_raw = pd.read_csv('CCR_data.csv')

# Text data for graph: kept separately as a one-column frame
text_df = ccr_raw[['MDNA']]

# LABELS
# Multicategory labels: AAA/AA/A -> 3, BBB -> 2, BB -> 1, anything else -> 0.
# (A binary alternative would be 1 for AAA/AA/A/BBB and 0 otherwise.)
rating_to_class = {'AAA': 3, 'AA': 3, 'A': 3, 'BBB': 2, 'BB': 1}
class_labels = [rating_to_class.get(rating, 0) for rating in ccr_raw['Rating']]

# Data w/o text: drop the text and TotalAssets columns, then attach the label
rating_df = ccr_raw.drop(columns=['MDNA', 'TotalAssets'])
rating_df[target_column] = class_labels

# Dummies for the categorical column; the raw letter rating is no longer needed
rating_df = pd.get_dummies(rating_df, columns=['industry_code'])
rating_df = rating_df.drop(columns=['Rating']).reset_index(drop=True)
rating_df

Unnamed: 0,CurrentLiabs,TotalLiabs,RetainedEarnings,CurrentAssets,NetSales,EBIT,MktValueEquity,binary_rating,industry_code_B,industry_code_D,industry_code_E,industry_code_F,industry_code_G,industry_code_H,industry_code_I
0,20.868258,50.501544,32.547303,24.871286,14.774910,1.334931,238.571513,3,0,1,0,0,0,0,0
1,19.622967,50.352084,28.760326,29.103598,12.447340,1.137064,215.396468,3,0,1,0,0,0,0,0
2,21.474133,55.007104,29.903571,24.560932,13.074464,1.390076,232.182228,3,0,1,0,0,0,0,0
3,20.135293,50.225308,29.748943,26.070400,15.765115,1.203600,228.732074,3,0,0,0,0,0,0,1
4,17.417811,49.893589,29.480277,24.160795,12.612130,1.189818,236.955918,3,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3281,17.853466,58.028358,25.494603,24.312514,15.200251,1.245472,202.165890,0,0,0,0,0,0,1,0
3282,18.625728,60.543515,25.852383,25.745013,12.280980,1.502558,165.349046,0,0,0,0,0,0,1,0
3283,22.220044,59.374450,28.210000,29.975709,15.388667,1.495376,167.368922,0,0,1,0,0,0,0,0
3284,17.237602,56.490629,25.207663,26.974804,12.162667,1.287359,171.050929,0,0,1,0,0,0,0,0


## Preparing graph and graph statistics as tabular features

In [6]:
%%capture 
!pip install gensim

In [7]:
%pylab inline
import pickle
import numpy as np
import pandas as pd
from numpy.random import exponential
from scipy.spatial.distance import cosine
from utils import construct_network_data

Populating the interactive namespace from numpy and matplotlib


In [8]:
%%time
# Run this once to build the edge lists; on later runs leave it commented out and load the pickle file in the next cell instead.
# src_dst_dict = construct_network_data(text_df, text_column_name="MDNA", embedding_size=300, cutoff=0.5)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.25 µs


In [9]:
# Load the cached src/dst edge lists.
# To (re)create the cache after running the construction cell above:
# with open("TEST_src_dst.p", "wb") as fh:
#     pickle.dump({'src': src_dst_dict['src'], 'dst': src_dst_dict['dst']}, fh)
#
# NOTE: unpickling can execute arbitrary code -- only load a file you produced yourself.
# The context manager closes the file handle, which a bare open() inside
# pickle.load(...) leaves open until garbage collection.
with open("TEST_src_dst.p", "rb") as fh:
    src_dst_dict = pickle.load(fh)

In [10]:
## Make network
import networkx as nx
G = nx.Graph()

# Get edges: src[j] -- dst[j] is the j-th undirected edge, each with unit weight
src = src_dst_dict['src']
dst = src_dst_dict['dst']
assert len(src) == len(dst), "src and dst must have one entry per edge"
e_list = [(u, v, {'weight': 1}) for u, v in zip(src, dst)]

# Find singleton nodes: ids that occur in neither endpoint list.
# Both src AND dst must be subtracted -- a node that only ever appears as a
# destination still has an edge and is not a singleton.
n_nodes = 3286  # node ids are assumed to be 0..n_nodes-1 (one per dataset row) -- TODO confirm
singleton_ids = sorted(set(range(n_nodes)) - set(src) - set(dst))
print('Singleton nodes :', singleton_ids)
# A zero-weight self-loop for each singleton so that the node exists in G.
# NOTE: networkx counts a self-loop towards the node's degree.
s_list = [(j, j, {'weight': 0}) for j in singleton_ids]

# Add all nodes and edges
G.add_edges_from(e_list)
G.add_edges_from(s_list)

# Check stats
print("#nodes =",G.number_of_nodes())
print("#edges =",G.number_of_edges())

Singleton nodes : [2749]
#nodes = 3286
#edges = 179924


In [11]:
%%time
# CREATE NETWORK FEATURES
# Degree for each node
tmp = nx.degree_centrality(G)
G_degree = [tmp[j] for j in range(G.number_of_nodes())]

# Eigen Centrality of each node
tmp = nx.eigenvector_centrality(G)
G_EVcent = [tmp[j] for j in range(G.number_of_nodes())]

# Get clustering coefficient of each node
tmp = nx.clustering(G)
G_ClustCoef = [tmp[j] for j in range(G.number_of_nodes())]

CPU times: user 9.63 s, sys: 3.85 ms, total: 9.64 s
Wall time: 9.64 s


In [12]:
# Attach the per-node graph statistics to the tabular dataset as new columns
graph_features = {
    "G_degree": G_degree,
    "G_EVcent": G_EVcent,
    "G_ClustCoef": G_ClustCoef,
}
for feature_name, feature_values in graph_features.items():
    rating_df[feature_name] = feature_values