In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

In [85]:
# Load the country metadata table. The file is decoded as latin1
# (presumably because it is not valid UTF-8 — confirm against the source file).
df = pd.read_csv('WDICountry.csv', encoding='latin1')

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 32 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Index                                              263 non-null    int64  
 1   Country Code                                       263 non-null    object 
 2   Short Name                                         263 non-null    object 
 3   Table Name                                         263 non-null    object 
 4   Long Name                                          263 non-null    object 
 5   2-alpha code                                       261 non-null    object 
 6   Currency Unit                                      217 non-null    object 
 7   Special Notes                                      150 non-null    object 
 8   Region                                             217 non-null    object 
 9   Income Gro

Unnamed: 0,Index,Country Code,Short Name,Table Name,Long Name,2-alpha code,Currency Unit,Special Notes,Region,Income Group,...,Government Accounting concept,IMF data dissemination standard,Latest population census,Latest household survey,Source of most recent Income and expenditure data,Vital registration complete,Latest agricultural census,Latest industrial data,Latest trade data,Latest water withdrawal data
0,1,ABW,Aruba,Aruba,Aruba,AW,Aruban florin,SNA data for 2000-2011 are updated from offici...,Latin America & Caribbean,High income,...,,,2010,,,Yes,,,2015.0,
1,2,AFG,Afghanistan,Afghanistan,Islamic State of Afghanistan,AF,Afghan afghani,Fiscal year end: March 20; reporting period fo...,South Asia,Low income,...,Consolidated central government,Enhanced General Data Dissemination System (e-...,1979,"Demographic and Health Survey, 2015","Integrated household survey (IHS), 2011",,,,2015.0,2000.0
2,3,AGO,Angola,Angola,People's Republic of Angola,AO,Angolan kwanza,,Sub-Saharan Africa,Lower middle income,...,Budgetary central government,Enhanced General Data Dissemination System (e-...,2014,"Demographic and Health Survey, 2015/16","Integrated household survey (IHS), 2008/09",,,,2015.0,2005.0
3,4,ALB,Albania,Albania,Republic of Albania,AL,Albanian lek,,Europe & Central Asia,Upper middle income,...,Consolidated central government,Enhanced General Data Dissemination System (e-...,2011,"Demographic and Health Survey, 2008/09",Living Standards Measurement Study Survey (LSM...,Yes,2012.0,2013.0,2015.0,2006.0
4,5,AND,Andorra,Andorra,Principality of Andorra,AD,Euro,WB-3 code changed from ADO to AND to align wit...,Europe & Central Asia,High income,...,,,2011. Population data compiled from administra...,,,Yes,,,2014.0,


In [86]:
# Build the edge list for the network: one (source, target) pair per row,
# with source = 'Income Group' and target = 'Short Name'.
source_target = df.loc[:, ['Income Group', 'Short Name']].to_numpy()

# Preview the first five pairs
source_target[:5]

array([['High income', 'Aruba'],
       ['Low income', 'Afghanistan'],
       ['Lower middle income', 'Angola'],
       ['Upper middle income', 'Albania'],
       ['High income', 'Andorra']], dtype=object)

In [87]:
# Build an undirected graph with one edge per (source, target) pair and
# compute a 2-D position for every node.
Q = nx.Graph()
Q.add_edges_from(source_target)

# spring_layout starts from random initial positions; a fixed seed makes the
# coordinates (and the files exported from them) identical on every re-run
# instead of changing each time this cell is executed.
coordinates = nx.spring_layout(Q, seed=42)

# NOTE(review): pairs whose source value is missing show up as edges to a
# single `nan` node in this mapping — confirm that hub is wanted in the layout.
coordinates

{'High income': array([-0.15531415,  0.70793178]),
 'Aruba': array([-0.09188307,  0.89584514]),
 'Low income': array([-0.59964718, -0.1534566 ]),
 'Afghanistan': array([-0.59712142, -0.2565592 ]),
 'Lower middle income': array([-0.28425065, -0.80698978]),
 'Angola': array([-0.22509584, -0.89693952]),
 'Upper middle income': array([ 0.81317364, -0.03755886]),
 'Albania': array([0.9299431, 0.0626572]),
 'Andorra': array([-0.20535625,  0.80546878]),
 nan: array([ 0.01353315, -0.09346997]),
 'Arab World': array([ 0.09914926, -0.01228793]),
 'United Arab Emirates': array([-0.12759529,  0.8280381 ]),
 'Argentina': array([ 0.74112087, -0.10730976]),
 'Armenia': array([-0.34555649, -0.89152561]),
 'American Samoa': array([ 0.77953285, -0.09058089]),
 'Antigua and Barbuda': array([-0.17677302,  0.87604228]),
 'Australia': array([-0.09010532,  0.81758476]),
 'Austria': array([-0.17184162,  0.59815746]),
 'Azerbaijan': array([ 0.99509251, -0.08331958]),
 'Burundi': array([-0.70871148, -0.05385776

In [88]:
# Turn the {node: position} layout mapping into a table with one row per node
# and two columns (X, Y), then export it with the node label written as the
# 'NodeName' column.
df_coordinates = (
    pd.DataFrame(coordinates)
    .transpose()
    .set_axis(['X', 'Y'], axis='columns')
)
df_coordinates.to_csv('CoorFile.csv', index_label='NodeName')
df_coordinates.head()

Unnamed: 0,X,Y
High income,-0.155314,0.707932
Aruba,-0.091883,0.895845
Low income,-0.599647,-0.153457
Afghanistan,-0.597121,-0.256559
Lower middle income,-0.284251,-0.80699


In [89]:
# Build the bridge file: one row per edge endpoint, describing which nodes
# are connected to each other.
# Tableau calculation used to tell the two endpoints apart:
#   IF [idxNodes]/2 = ROUND([idxNodes]/2) THEN 'Source' ELSE 'Target' END

# Flatten the pairs row by row so that every source is immediately followed
# by its target: even positions hold sources, odd positions hold targets.
src_target = source_target.reshape(-1)
print(src_target.shape)

# Connection index: both endpoints of the k-th pair receive the 1-based
# value k, which is meant to be the row's index in its 'main' file.
array_idx = [position // 2 + 1 for position in range(len(src_target))]
print(len(array_idx))

df_nodes = pd.DataFrame({'NodeName': src_target, 'c_index': array_idx})
df_nodes.to_csv('BridgeFile.csv', index_label='idxNodes')
# NOTE:
# - idxNodes (the exported row index) identifies which endpoint is the
#   source and which is the target.
# - c_index labels the connection and is also the join key between this
#   file and the main file.
df_nodes.head(6)

(526,)
526


Unnamed: 0,NodeName,c_index
0,High income,1
1,Aruba,1
2,Low income,2
3,Afghanistan,2
4,Lower middle income,3
5,Angola,3


In [90]:
# Inspect the last six rows of the bridge table (the final three pairs)
df_nodes.tail(6)

Unnamed: 0,NodeName,c_index
520,Upper middle income,261
521,South Africa,261
522,Lower middle income,262
523,Zambia,262
524,Low income,263
525,Zimbabwe,263
