In [1]:
import pandas as pd
import os

# Lift pandas' display limits so frames are shown without row, column or
# cell-width truncation and without wrapping wide frames onto several lines.
_DISPLAY_OPTIONS = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.max_colwidth': None,
    'display.width': None,
    'display.expand_frame_repr': False,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

In [2]:
import numpy as np

In [None]:
# Folder that holds the data files used by this notebook.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
folder_loc = 'F:/class/BANA 698/week 8'

# Collect the regular files (sub-directories are skipped) directly inside
# the folder, then print one name per line.
file_list = []
for entry_name in os.listdir(folder_loc):
    entry_path = os.path.join(folder_loc, entry_name)
    if os.path.isfile(entry_path):
        file_list.append(entry_name)

for entry_name in file_list:
    print(entry_name)

In [4]:
# Read the dataset CSV from the data folder into `df`.
file = 'Group1DatasetRaw.onehotted.csv'
csv_path = os.path.join(folder_loc, file)
df = pd.read_csv(csv_path)

In [5]:

# Columns dropped from the frame before correlations are computed.
cols_to_exclude = ['CountryShortName']

In [6]:
def corr_all_lister(outputfile=False, cols_to_exclude=None, folder_loc=None, df=None):
    """List the pairwise correlations between the numeric columns of ``df``.

    Every unordered pair of numeric columns appears once (upper triangle of
    the correlation matrix, diagonal excluded); pairs whose correlation is
    NaN are left out. The table is sorted by absolute correlation, strongest
    first, and a summary of how many pairs reach |r| >= 0.8, 0.9 and 0.99 is
    printed.

    Parameters
    ----------
    outputfile : str or False, default False
        When truthy, the file name (joined onto ``folder_loc``) that the pair
        table is written to as CSV, without the index.
    cols_to_exclude : list of str, optional
        Columns dropped before correlating; names missing from ``df`` are
        ignored. When omitted, the notebook-level ``cols_to_exclude`` is used
        (or no columns if that name is not defined).
    folder_loc : str, optional
        Folder for ``outputfile``. When omitted, the notebook-level
        ``folder_loc`` is used. Only consulted if ``outputfile`` is truthy.
    df : pandas.DataFrame, optional
        Frame to analyse. When omitted, the notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature_1``, ``Feature_2``, ``Correlation`` and
        ``Abs Correlation``. The index is each pair's position in the
        unsorted upper-triangle listing. Empty when fewer than two numeric
        columns remain.
    """
    # Omitted arguments are looked up when the function is CALLED, not when it
    # is defined, so re-running the data-loading cell is picked up without
    # having to re-run the cell that defines this function.
    notebook_ns = globals()
    if df is None:
        df = notebook_ns['df']
    if cols_to_exclude is None:
        cols_to_exclude = notebook_ns.get('cols_to_exclude', [])

    df_filtered = df.drop(columns=cols_to_exclude, errors='ignore')
    corr_matrix = df_filtered.corr(numeric_only=True)

    # Build the pair table from explicit upper-triangle indices (row-major
    # order) instead of relying on DataFrame.stack() to drop the masked cells;
    # NaN correlations are removed explicitly.
    feature_names = corr_matrix.columns.to_numpy()
    row_idx, col_idx = np.triu_indices(len(feature_names), k=1)
    corr_pairs = pd.DataFrame({
        'Feature_1': feature_names[row_idx],
        'Feature_2': feature_names[col_idx],
        'Correlation': corr_matrix.to_numpy(dtype=float)[row_idx, col_idx],
    })
    corr_pairs = corr_pairs.dropna(subset=['Correlation']).reset_index(drop=True)
    corr_pairs['Abs Correlation'] = corr_pairs['Correlation'].abs()
    corr_pairs = corr_pairs.sort_values(by='Abs Correlation', ascending=False)

    total_pairs = corr_pairs.shape[0]
    print(f"There are {total_pairs} pairs")
    print("Number, percentage of pairs with correlation coefficient...")
    for threshold in (0.8, 0.9, 0.99):
        num_at_or_above = int((corr_pairs['Abs Correlation'] >= threshold).sum())
        # Guard against division by zero when there are no pairs at all.
        percent = (num_at_or_above / total_pairs) * 100 if total_pairs else 0.0
        print(f">= {threshold}: {num_at_or_above}, {percent:.2f}%")

    if outputfile:
        if folder_loc is None:
            folder_loc = notebook_ns['folder_loc']
        corr_pairs.to_csv(os.path.join(folder_loc, outputfile), index=False)

    return corr_pairs

In [7]:
# Build the correlation-pair table using the function's default arguments
# (outputfile defaults to False, so no CSV is written), then preview the
# first rows; the table is sorted by absolute correlation, strongest first.
dfcorr = corr_all_lister()
dfcorr.head()

There are 40470 pairs
Number, percentage of pairs with correlation coefficient...
>= 0.8: 1400, 3.46%
>= 0.9: 518, 1.28%
>= 0.99: 55, 0.14%


Unnamed: 0,Feature_1,Feature_2,Correlation,Abs Correlation
30176,Rural population (% of total population),Urban population (% of total population),-1.0,1.0
40205,"Self-employed, total (% of total employment) (modeled ILO estimate)","Wage and salaried workers, total (% of total employment) (modeled ILO estimate)",-1.0,1.0
40156,"Self-employed, female (% of female employment) (modeled ILO estimate)","Wage and salaried workers, female (% of female employment) (modeled ILO estimate)",-1.0,1.0
40181,"Self-employed, male (% of male employment) (modeled ILO estimate)","Wage and salaried workers, male (% of male employment) (modeled ILO estimate)",-1.0,1.0
11311,GNI (current US$),GDP (current US$),0.999903,0.999903


In [8]:
# Keep only the strongly correlated pairs (|r| >= 0.9) so that redundant
# features can be identified and dropped from the model.
strong_pair_mask = dfcorr['Abs Correlation'].ge(0.9)
corr_pairs_90plus = dfcorr.loc[strong_pair_mask]

In [9]:
# Preview the first rows of the |r| >= 0.9 subset.
corr_pairs_90plus.head()

Unnamed: 0,Feature_1,Feature_2,Correlation,Abs Correlation
30176,Rural population (% of total population),Urban population (% of total population),-1.0,1.0
40205,"Self-employed, total (% of total employment) (modeled ILO estimate)","Wage and salaried workers, total (% of total employment) (modeled ILO estimate)",-1.0,1.0
40156,"Self-employed, female (% of female employment) (modeled ILO estimate)","Wage and salaried workers, female (% of female employment) (modeled ILO estimate)",-1.0,1.0
40181,"Self-employed, male (% of male employment) (modeled ILO estimate)","Wage and salaried workers, male (% of male employment) (modeled ILO estimate)",-1.0,1.0
11311,GNI (current US$),GDP (current US$),0.999903,0.999903


In [10]:
# Count how often each feature occurs in the Feature_1 column of the
# |r| >= 0.9 pairs and list the most frequent first.
col_to_sort = 'Feature_1'
sorted_df = (
    corr_pairs_90plus
    .groupby(col_to_sort)
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
print(sorted_df)

                                                                                                                 Feature_1  count
124                                                              Total greenhouse gas emissions including LULUCF (Mt CO2e)     18
40                                                                                GDP, PPP (constant 2021 international $)     16
47                                                    General government final consumption expenditure (constant 2015 US$)     14
46                                                                                      GNI, PPP (current international $)     14
7                                                        Carbon dioxide (CO2) emissions (total) excluding LULUCF (Mt CO2e)     13
41                                                                                      GDP, PPP (current international $)     13
15                                                                                        

In [11]:
# Print the Feature_1 names, most frequent first, as indented quoted entries
# that can be pasted into a Python list literal.
# The column is named explicitly instead of going through the notebook-level
# `col_to_sort`, which a later cell reassigns to 'Feature_2'; relying on it
# would raise a KeyError if the cells were run out of order.
for feature_name in sorted_df['Feature_1'].tolist():
    print(f"    '{feature_name}',")

    'Total greenhouse gas emissions including LULUCF (Mt CO2e)',
    'GDP, PPP (constant 2021 international $)',
    'General government final consumption expenditure (constant 2015 US$)',
    'GNI, PPP (current international $)',
    'Carbon dioxide (CO2) emissions (total) excluding LULUCF (Mt CO2e)',
    'GDP, PPP (current international $)',
    'Cereal production (metric tons)',
    'Methane (CH4) emissions (total) excluding LULUCF (Mt CO2e)',
    'Life expectancy at birth, female (years)',
    'Adjusted net national income (current US$)',
    'Fluorinated greenhouse gases (F-gases) emissions from Industrial Processes (Mt CO2e)',
    'Primary education, pupils',
    'Final consumption expenditure (constant 2015 US$)',
    'Adjusted net national income per capita (current US$)',
    'Number of deaths ages 15-19 years',
    'Self-employed, female (% of female employment) (modeled ILO estimate)',
    'Nitrous oxide (N2O) emissions (total) excluding LULUCF (Mt CO2e)',
    'Self-employed

In [12]:
# Count how often each feature occurs in the Feature_2 column of the
# |r| >= 0.9 pairs and list the most frequent first.
col_to_sort = 'Feature_2'
sorted_df2 = (
    corr_pairs_90plus
    .groupby(col_to_sort)
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
print(sorted_df2)

                                                                                                               Feature_2  count
118                                                            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)     21
80                                                                    Nitrous oxide (N2O) emissions from Waste (Mt CO2e)     16
125                                                                                                     Urban population     16
50                                                                                                    Labor force, total     15
79                                                       Nitrous oxide (N2O) emissions from Transport (Energy) (Mt CO2e)     14
60                                                        Methane (CH4) emissions from Power Industry (Energy) (Mt CO2e)     13
10                                                 Carbon dioxide (CO2) emissions from Power Industry (E

In [13]:
# Print the Feature_2 names, most frequent first, as indented quoted entries
# that can be pasted into a Python list literal.
# The column is named explicitly instead of going through the mutable
# notebook-level `col_to_sort`, so this cell does not depend on which cell
# last assigned that variable.
for feature_name in sorted_df2['Feature_2'].tolist():
    print(f"    '{feature_name}',")

    'Total greenhouse gas emissions excluding LULUCF (Mt CO2e)',
    'Nitrous oxide (N2O) emissions from Waste (Mt CO2e)',
    'Urban population',
    'Labor force, total',
    'Nitrous oxide (N2O) emissions from Transport (Energy) (Mt CO2e)',
    'Methane (CH4) emissions from Power Industry (Energy) (Mt CO2e)',
    'Carbon dioxide (CO2) emissions from Power Industry (Energy) (Mt CO2e)',
    'Carbon dioxide (CO2) emissions from Building (Energy) (Mt CO2e)',
    'GDP (current US$)',
    'GNI (current US$)',
    'Mortality rate, under-5, male (per 1,000 live births)',
    'Nitrous oxide (N2O) emissions from Power Industry (Energy) (Mt CO2e)',
    'GNI, Atlas method (current US$)',
    'Nitrous oxide (N2O) emissions from Agriculture (Mt CO2e)',
    'Nitrous oxide (N2O) emissions (total) excluding LULUCF (Mt CO2e)',
    'Wage and salaried workers, total (% of total employment) (modeled ILO estimate)',
    'Carbon dioxide (CO2) emissions from Transport (Energy) (Mt CO2e)',
    'Nitrous oxid

In [None]:
# Features that occur both as Feature_1 and as Feature_2 among the
# |r| >= 0.9 pairs, de-duplicated and sorted alphabetically.
features_in_both = set(sorted_df['Feature_1']).intersection(sorted_df2['Feature_2'])
identical_values_list = sorted(features_in_both)
for shared_feature in identical_values_list:
    print(shared_feature)

Access to electricity (% of population)
Adjusted net national income (current US$)
Carbon dioxide (CO2) emissions (total) excluding LULUCF (Mt CO2e)
Carbon dioxide (CO2) emissions from Building (Energy) (Mt CO2e)
Carbon dioxide (CO2) emissions from Fugitive Emissions (Energy) (Mt CO2e)
Carbon dioxide (CO2) emissions from Industrial Combustion (Energy) (Mt CO2e)
Carbon dioxide (CO2) emissions from Industrial Processes (Mt CO2e)
Carbon dioxide (CO2) emissions from Power Industry (Energy) (Mt CO2e)
Carbon dioxide (CO2) emissions from Transport (Energy) (Mt CO2e)
Cereal production (metric tons)
Current health expenditure per capita (current US$)
Current health expenditure per capita, PPP (current international $)
Domestic general government health expenditure per capita (current US$)
Employment to population ratio, ages 15-24, total (%) (modeled ILO estimate)
Final consumption expenditure (constant 2015 US$)
Fluorinated greenhouse gases (F-gases) emissions from Industrial Processes (Mt CO2

In [16]:
# Number of features that appear in both the Feature_1 and Feature_2 columns.
print(len(identical_values_list))

95


In [19]:
# (rows, columns): one row per distinct Feature_1 value among the |r| >= 0.9 pairs.
sorted_df.shape

(136, 2)

In [17]:
# (rows, columns): one row per distinct Feature_2 value among the |r| >= 0.9 pairs.
sorted_df2.shape

(133, 2)