In [2]:
from common import *
from mongodb_helper import *

import pandas as pd

from copy import deepcopy

def cosine_similarity(a,b):
    def standardize(a,b):
        """
        makes sure every key is present in a and b
        """
        a = deepcopy(a)
        b = deepcopy(b)
        
        keys = set(list(a.keys()) + list(b.keys()))
        for k in keys:
            if k not in a:
                a[k] = 0
            if k not in b:
                b[k] = 0
        return a,b
    
    def dot(a,b):
        out = 0
        for k in a:
            out += a[k]*b[k]
        return out
    
    def magnitude(vector):
        out = 0
        for k,v in vector.items():
            out += v ** 2
        
        return out ** 0.5
    
    a,b = standardize(a,b)
    dotab = dot(a,b)
    maga, magb = magnitude(a), magnitude(b)
    
    return dotab / maga / magb if 0 not in [maga, magb] else 0

#query current datasets
db = get_database()
embeddings_countries_collection = db["aggregate.embeddings"]

embeddings = {}
for row in embeddings_countries_collection.find():
    country = row["_id"]
    data = row["data"]
    embeddings[country] = data

df = pd.DataFrame(embeddings)

df2 = df.T

#Asset Quality
df3 = df2['Financial, Financial Soundness Indicators, Core Set, Deposit Takers, Asset Quality, Non-performing Loans to Total Gross Loans, Percent']

#All Countries
all_countries = df3.sort_values().keys()
#Top 20 Countries
top_20 = df3.sort_values().keys()[:10]
#Bottom 20 Countries
bottom_20 = df3.sort_values().keys()[-10:]

print(bottom_20)

ServerSelectionTimeoutError: localhost:27017: [WinError 10061] No connection could be made because the target machine actively refused it

In [18]:
#Top 20 countries Top 10 Similar Features to Non-Performing Loans
df5 = df[top_20].T

country_data = df5.to_dict()
country_data = sorted(country_data.items(), key=lambda x: x[0])
country_data

out = {}

for cname,cvalue in country_data:
    out[cname] = {}
    for dname,dvalue in country_data:
        out[cname][dname] = cosine_similarity(cvalue,dvalue)
    out[cname] = sorted(out[cname].items(), key=lambda x: x[1],reverse=True)[1:10]

print('Top 20 countries Top 10 Similar Features to Non-Performing Loans:')
out['Financial, Financial Soundness Indicators, Core Set, Deposit Takers, Asset Quality, Non-performing Loans to Total Gross Loans, Percent']

Top 20 countries Top 10 Similar Features to Non-Performing Loans:


[('Gini', 0.954761366556288),
 ('Age structure (0-14 years)', 0.9547327471905434),
 ('Male literacy rate', 0.9542361326736056),
 ('Male suicide rate', 0.9540287488792251),
 ('Life expectancy (female)', 0.9530463489443388),
 ('Life expectancy (overall)', 0.952975764810369),
 ('Life expectancy (male)', 0.9527927043857854),
 ('Male to female income ratio', 0.9526342828498704),
 ('Female literacy rate', 0.9523809288361029)]

In [13]:
#Bottom 20 countries Top 10 Similar Features to Non-Performing Loans
df5 = df[bottom_20].T

country_data = df5.to_dict()
country_data = sorted(country_data.items(), key=lambda x: x[0])
country_data

out = {}

for cname,cvalue in country_data:
    out[cname] = {}
    for dname,dvalue in country_data:
        out[cname][dname] = cosine_similarity(cvalue,dvalue)
    out[cname] = sorted(out[cname].items(), key=lambda x: x[1],reverse=True)[1:10]

print('Bottom 20 countries Top 10 Similar Features to Non-Performing Loans:')
out['Financial, Financial Soundness Indicators, Core Set, Deposit Takers, Asset Quality, Non-performing Loans to Total Gross Loans, Percent']

Bottom 20 countries Top 10 Similar Features to Non-Performing Loans:


[('Median female age', 0.9651311776474643),
 ('Median age', 0.9602612981933506),
 ('Median male age', 0.9540756323208196),
 ('Gdp ppp', 0.9528232685773219),
 ('Age structure (over 65 years)', 0.9471048583269582),
 ('Dependency ratio (elderly)', 0.9355234526135),
 ('Proportion of population using improved sanitation', 0.9279024826918515),
 ('Geographical Outreach, Number of Institutions, Insurance corporations',
  0.9226691526970452),
 ('Mortality rate (per 1000 people)', 0.9167659277157774)]

In [6]:
import requests
from bs4 import BeautifulSoup
from pandas_datareader import wb
# matches = wb.search('inflation')

from sklearn import preprocessing

def match(target, iterable):
    target = clean(target)
    bestword = 0
    bestscore = 0
    
    for word in iterable:
        
        if word in target:
            return word, 2

        sim = cosine_similarity(target, word)
        if sim > bestscore:
            bestword, bestscore = word, sim

    return bestword, bestscore

def clean(word):
    def clean_brackets(word):
        for i,ch in enumerate(word):
            if ch in "[{(":
                return word[:i].strip()
        return word.strip()

    def clean_space(word):
        return word.replace(" ", "_")
    
    return clean_space(clean_brackets(word))

def cosine_similarity(a, b):
    """
    input: 2 words
    output: cosine similarity between 2 words
    """
    a, b = a.lower(), b.lower()

    def vectorize(word):
        """
        input: word
        output: vector representation of word
            - unigram and bigram vectors and frequencies
            - eg. apple --> {a:1, p:2, l:1, e:1, ap:1, pp:1, pl:1, le:1}
        """
        out = {}
        for ch in word:
            if ch not in out: out[ch] = 1
            else: out[ch] += 1

        for i in range(len(word)-1):
            bigram = word[i:i+2]

            if bigram not in out: out[bigram] = 1
            else: out[bigram] += 1
        
        return out

    def standardize_vectors(a,b):
        """
        input: vectors a and b
        output: vectors a and b, but standardized
        """

        for k in list(a.keys()) + list(b.keys()):
            if k not in a: a[k] = 0
            if k not in b: b[k] = 0
        
        assert len(a) == len(b)
        return a, b

    def dot(a, b):
        """
        input: standardized vectors a and b (assume keys are the same)
        output: dot product of a and b
        """
        out = 0
        for k in a:
            out += a[k] * b[k]
        return out

    def magnitude(a):
        """
        input: vector a
        output: magnitude of vector
        """
        return sum([i**2 for i in a.values()])**0.5


    a, b = vectorize(a), vectorize(b)
    a, b = standardize_vectors(a, b)

    dot_ab = dot(a,b)
    a, b = magnitude(a), magnitude(b)

    return dot_ab / a / b if 0 not in [a,b] else 0

page = requests.get('http://api.worldbank.org/v2/country?format=json&per_page=1000')

country_details = page.json()[1]

countries = {}
for country in country_details:
    countries[country['name']] = country['iso2Code']
              
country_list = countries.keys()

country_iso = []
wiki_countries = {}
for country in all_countries:
    #match country
    best_match, score = match(country, country_list)
    if score > 0.70:
        wiki_countries[best_match] = country
        country_iso.append(countries[best_match]) 
        


page = requests.get('https://data.worldbank.org/indicator/')
soup = BeautifulSoup(page.content, 'html.parser')

th_all = soup.find_all('a')
all_indicators = {}
for th in th_all:
    try:
        if '/indicator/' in th['href'] and 'view=chart' in th['href']:
            all_indicators[th['href'].replace('/indicator/','').replace('?view=chart','')] = th.get_text()
    except:
        pass
    
indicators = {k: all_indicators[k] for k in list(all_indicators)[:]}

all_results = {}
for tag,description in indicators.items():
    try: 
        #Top 10 countries with Worst Non-Performing Loans
        dat = wb.download(indicator= tag, country=country_iso, start=2019, end=2019)
        print(tag)
        dat = dat[tag].groupby(level=0).mean()
        all_results[description] = dat
    except:
        pass

df = pd.DataFrame(all_results)

dataset = df.dropna(axis='columns')

country_data = dataset.T.to_dict()

country_data = sorted(country_data.items(), key=lambda x: x[0])

def cosine_similarity(a,b):
    def standardize(a,b):
        """
        makes sure every key is present in a and b
        """
        a = deepcopy(a)
        b = deepcopy(b)
        
        keys = set(list(a.keys()) + list(b.keys()))
        for k in keys:
            if k not in a:
                a[k] = 0
            if k not in b:
                b[k] = 0
        return a,b
    
    def dot(a,b):
        out = 0
        for k in a:
            out += a[k]*b[k]
        return out
    
    def magnitude(vector):
        out = 0
        for k,v in vector.items():
            out += v ** 2
        
        return out ** 0.5
    
    a,b = standardize(a,b)
    dotab = dot(a,b)
    maga, magb = magnitude(a), magnitude(b)
    
    return dotab / maga / magb if 0 not in [maga, magb] else 0

out = {}

for cname,cvalue in country_data:
    out[cname] = {}
    for dname,dvalue in country_data:
        out[cname][dname] = cosine_similarity(cvalue,dvalue)

out = pd.DataFrame(out)

# plot correlation matrix
# fig, ax = plt.subplots(figsize=(20, 7))
# sns.heatmap(out, cmap='YlGnBu', linewidths=.7, annot=True, fmt='.5f')
# plt.show()



AG.LND.IRIG.AG.ZS
AG.LND.AGRI.ZS
AG.LND.TRAC.ZS
NV.AGR.TOTL.ZS
AG.LND.ARBL.ZS
AG.LND.ARBL.HA.PC
AG.YLD.CREL.KG
AG.PRD.CROP.XD
SL.AGR.EMPL.FE.ZS
SL.AGR.EMPL.MA.ZS
AG.CON.FERT.ZS
AG.PRD.FOOD.XD
AG.LND.FRST.ZS
AG.LND.FRST.K2
AG.LND.TOTL.K2
AG.LND.CREL.HA
AG.PRD.LVSK.XD
AG.LND.CROP.ZS
SP.RUR.TOTL
SP.RUR.TOTL.ZS
AG.SRF.TOTL.K2
BX.GRT.EXTA.CD.WD
SH.TBS.INCD
SI.DST.FRST.20
SH.STA.MMRT
SH.DYN.MORT
DT.ODA.ODAT.GN.ZS
DT.ODA.ODAT.XP.ZS
DT.ODA.ODAT.GI.ZS
DT.ODA.ODAT.MP.ZS
DT.ODA.ODAT.PC.ZS
SM.POP.NETM
DT.ODA.ODAT.CD
SI.POV.DDAY
SH.STA.ANVC.ZS
SH.DYN.AIDS.ZS
SH.STA.STNT.ZS
SE.PRM.CMPT.ZS
SE.ENR.PRSC.FM.ZS
BX.GRT.TECH.CD.WD
EG.ELC.ACCS.ZS
ER.H2O.FWTL.ZS
ER.H2O.FWTL.K3
EN.ATM.CO2E.KT
EN.ATM.CO2E.PC
IQ.CPA.PUBS.XQ
IC.BUS.EASE.XQ
EG.USE.ELEC.KH.PC
EG.USE.PCAP.KG.OE
AG.LND.EL5M.ZS
EN.ATM.METH.KT.CE
EN.ATM.NOXE.KT.CE
SP.POP.GROW
EN.URB.MCTY.TL.ZS
EN.POP.EL5M.ZS
SP.POP.TOTL
SH.STA.MALN.ZS
EG.ELC.RNEW.ZS
EG.FEC.RNEW.ZS
ER.PTD.TOTL.ZS
EN.ATM.GHGT.KT.CE
SP.URB.TOTL
SP.URB.TOTL.IN.ZS
NY.ADJ.SVNG.GN.ZS
GC.DOD.

In [7]:
def map_fields(init_dict, map_dict):

    res_dict = {}
    for k, v in init_dict.items():
        res_dict[map_dict[k]] = v
    return res_dict

unmapped_countries = dataset.T.to_dict()

mapped_countries = map_fields(unmapped_countries, wiki_countries)

In [14]:
#Top 20 countries Top 10 Similar Features to Non-Performing Loans
df = pd.DataFrame(mapped_countries)

df5 = df[top_20]

country_data = df5.to_dict()
country_data = sorted(country_data.items(), key=lambda x: x[0])
country_data

out = {}

for cname,cvalue in country_data:
    out[cname] = {}
    for dname,dvalue in country_data:
        out[cname][dname] = cosine_similarity(cvalue,dvalue)
    out[cname] = sorted(out[cname].items(), key=lambda x: x[1],reverse=True)[1:10]

out

KeyError: "None of [Index(['Canada', 'Sweden', 'Luxembourg', 'Norway', 'United_States',\n       'Australia', 'United_Kingdom', 'Singapore', 'Israel', 'Finland'],\n      dtype='object')] are in the [columns]"

In [9]:
#Top 20 countries Top 10 Similar Features to Non-Performing Loans
df = pd.DataFrame(mapped_countries)

df5 = df[bottom_20]

country_data = df5.to_dict()
country_data = sorted(country_data.items(), key=lambda x: x[0])
country_data

out = {}

for cname,cvalue in country_data:
    out[cname] = {}
    for dname,dvalue in country_data:
        out[cname][dname] = cosine_similarity(cvalue,dvalue)
    out[cname] = sorted(out[cname].items(), key=lambda x: x[1],reverse=True)[1:10]

out

KeyError: "['Democratic_Republic_of_the_Congo'] not in index"

In [16]:
out.keys()

dict_keys(['Age structure (0-14 years)', 'Age structure (15-64 years)', 'Age structure (over 65 years)', 'Area rank', 'Area total', 'Capital population', 'Cigarette consumption (per year per capita)', 'Co2 emissions (tonnes)', 'Cocaine use', 'Consumer Price Index, All items', 'Dependency ratio (elderly)', 'Dependency ratio (total)', 'Dependency ratio (youth)', 'Female literacy rate', 'Female mean BMI', 'Female suicide rate', 'Fertility rate', 'Financial Development Index', 'Financial Institutions Access Index', 'Financial Institutions Depth Index', 'Financial Institutions Efficiency Index', 'Financial Institutions Index', 'Financial Markets Access Index', 'Financial Markets Depth Index', 'Financial Markets Efficiency Index', 'Financial Markets Index', 'Financial, Financial Soundness Indicators, Core Set, Deposit Takers, Asset Quality, Non-performing Loans to Total Gross Loans, Percent', 'Financial, Financial Soundness Indicators, Core Set, Deposit Takers, Capital Adequacy, Non-performi