In [None]:
import pandas as pd
import os 

# Remove every pandas display limit so long / wide frames print without
# row or column elision, without cell truncation, and without wrapping.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.max_colwidth', None),
    ('display.width', None),
    ('display.expand_frame_repr', False),
):
    pd.set_option(option_name, option_value)

In [27]:
import numpy as np

In [None]:
# Directory that holds the input files used by the cells below.
folder_loc = 'F:/class/BANA 698/week 8'

# Keep only regular files (sub-directories are skipped), then echo each name.
file_list = list(filter(
    lambda entry: os.path.isfile(os.path.join(folder_loc, entry)),
    os.listdir(folder_loc),
))
for file in file_list:
    print(file)

In [29]:
# Load the dataset CSV from the data folder into the working frame `df`.
# NOTE(review): the file name suggests categorical columns are already one-hot encoded - confirm.
file = 'Group1DatasetRaw.onehotted.csv'
df = pd.read_csv(filepath_or_buffer=os.path.join(folder_loc, file))

In [30]:
def _notebook_global(name):
    """Return the value currently bound to the module-level variable ``name``.

    Raises:
        NameError: if no such module-level variable exists yet.
    """
    try:
        return globals()[name]
    except KeyError:
        raise NameError(
            f"corr_coeff_list needs '{name}': pass it explicitly or define it "
            "in an earlier cell"
        ) from None


def corr_coeff_list(exclude_cols, output_file = False, folder_loc = None, df = None):
    """Print the strongest pairwise correlations of ``df`` and summary counts.

    The function drops ``exclude_cols``, computes the correlation matrix of the
    remaining numeric columns, lists every unordered feature pair once (strict
    upper triangle), discards pairs whose correlation is NaN, and sorts the
    pairs by absolute correlation, strongest first. It then prints the top 10
    pairs, optionally writes the full table to CSV, and prints how many pairs
    reach an absolute correlation of 0.8, 0.9 and 0.99 (count and percentage).

    Args:
        exclude_cols: column labels to drop before correlating; labels that
            are not present in ``df`` are ignored.
        output_file: file name for the CSV export of all pairs, or a falsy
            value (default) to skip the export.
        folder_loc: directory the CSV is written to. ``None`` (default) means
            "use the module-level ``folder_loc`` as it is at call time".
        df: frame to analyse. ``None`` (default) means "use the module-level
            ``df`` as it is at call time", so reloading ``df`` in another cell
            is picked up without re-running this definition.

    Returns:
        None. Results are printed (and optionally written to disk).

    Raises:
        NameError: if ``df`` (or ``folder_loc`` when exporting) is neither
            passed nor defined at module level.
    """
    if df is None:
        df = _notebook_global('df')

    df_filtered = df.drop(columns=exclude_cols, errors='ignore')
    corr_matrix = df_filtered.corr(numeric_only=True)

    # Enumerate the strict upper triangle in row-major order so each unordered
    # pair appears exactly once and self-correlations are left out.
    first_idx, second_idx = np.triu_indices(corr_matrix.shape[0], k=1)
    feature_names = corr_matrix.columns.to_numpy()
    corr_pairs = pd.DataFrame({
        'Feature_1': feature_names[first_idx],
        'Feature_2': feature_names[second_idx],
        'Correlation': corr_matrix.to_numpy(dtype=float)[first_idx, second_idx],
    })
    # Undefined correlations (e.g. a constant column) are dropped explicitly
    # instead of relying on version-dependent NaN handling in DataFrame.stack().
    corr_pairs = corr_pairs.dropna(subset=['Correlation']).reset_index(drop=True)
    corr_pairs['AbsCorrelation'] = corr_pairs['Correlation'].abs()
    corr_pairs = corr_pairs.sort_values(by='AbsCorrelation', ascending=False)

    print(corr_pairs.head(10))
    if output_file:
        if folder_loc is None:
            folder_loc = _notebook_global('folder_loc')
        corr_pairs.to_csv(os.path.join(folder_loc, output_file), index=False)

    total_pairs = corr_pairs.shape[0]
    print(f"\nTotal number of pairs: {total_pairs}")

    # The thresholds apply to the absolute correlation, although the printed
    # labels (kept unchanged) just say "correlation".
    for threshold in (0.8, 0.9, 0.99):
        num_pairs = int((corr_pairs['AbsCorrelation'] >= threshold).sum())
        # With fewer than two numeric columns there are no pairs; report 0%
        # rather than dividing by zero.
        percent_pairs = (num_pairs / total_pairs) * 100 if total_pairs else 0.0
        print(f"Number of pairs with correlation >= {threshold}: {num_pairs}")
        print(f"Percentage of pairs with correlation >= {threshold}: {percent_pairs:.2f}%")

In [31]:
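# Debug helper: uncomment to print every column name in alphabetical order
# (useful for copying exact column labels into the exclusion lists below).
#for col in sorted(df.columns): print(col)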

In [32]:
# Baseline exclusion list: only these two columns are dropped before the
# correlation run; every other column is kept.
cols_to_exclude1 = ['CountryShortName', 'Year']

In [33]:
# Extended exclusion list: the same two leading columns as the baseline list,
# followed by the death / mortality indicators, grouped by theme.
cols_to_exclude2 = (
    ['CountryShortName', 'Year']
    # Crude death rate and maternal-mortality indicators
    + [
        'Death rate, crude (per 1,000 people)',
        'Lifetime risk of maternal death (%)',
        'Lifetime risk of maternal death (1 in: rate varies by country)',
        'Maternal mortality ratio (modeled estimate, per 100,000 live births)',
    ]
    # "Mortality rate, ..." indicators
    + [
        'Mortality rate, adult, female (per 1,000 female adults)',
        'Mortality rate, adult, male (per 1,000 male adults)',
        'Mortality rate, infant (per 1,000 live births)',
        'Mortality rate, infant, female (per 1,000 live births)',
        'Mortality rate, infant, male (per 1,000 live births)',
        'Mortality rate, neonatal (per 1,000 live births)',
        'Mortality rate, under-5 (per 1,000 live births)',
        'Mortality rate, under-5, female (per 1,000 live births)',
        'Mortality rate, under-5, male (per 1,000 live births)',
    ]
    # Absolute death counts
    + [
        'Number of deaths ages 10-14 years',
        'Number of deaths ages 15-19 years',
        'Number of deaths ages 20-24 years',
        'Number of deaths ages 5-9 years',
        'Number of infant deaths',
        'Number of maternal deaths',
        'Number of neonatal deaths',
        'Number of under-five deaths',
    ]
    # Age-band probabilities of dying
    + [
        'Probability of dying among adolescents ages 10-14 years (per 1,000)',
        'Probability of dying among adolescents ages 15-19 years (per 1,000)',
        'Probability of dying among children ages 5-9 years (per 1,000)',
        'Probability of dying among youth ages 20-24 years (per 1,000)',
    ]
)

In [34]:
# Correlation run with only the baseline exclusion list applied; the full pair table is saved to CSV.
corr_coeff_list(exclude_cols=cols_to_exclude1, output_file='correlation_all.csv')

                                                                   Feature_1                                                                          Feature_2  Correlation  AbsCorrelation
29892                               Rural population (% of total population)                                           Urban population (% of total population)    -1.000000        1.000000
39921    Self-employed, total (% of total employment) (modeled ILO estimate)    Wage and salaried workers, total (% of total employment) (modeled ILO estimate)    -1.000000        1.000000
39872  Self-employed, female (% of female employment) (modeled ILO estimate)  Wage and salaried workers, female (% of female employment) (modeled ILO estimate)    -1.000000        1.000000
39897      Self-employed, male (% of male employment) (modeled ILO estimate)      Wage and salaried workers, male (% of male employment) (modeled ILO estimate)    -1.000000        1.000000
11027                                                  

In [35]:
# Same correlation run with the mortality-related columns excluded as well; saved to a separate CSV.
corr_coeff_list(exclude_cols=cols_to_exclude2, output_file='correlation_no_mortality_related.csv')

                                                                   Feature_1                                                                          Feature_2  Correlation  AbsCorrelation
26392                               Rural population (% of total population)                                           Urban population (% of total population)    -1.000000        1.000000
33146    Self-employed, total (% of total employment) (modeled ILO estimate)    Wage and salaried workers, total (% of total employment) (modeled ILO estimate)    -1.000000        1.000000
33097  Self-employed, female (% of female employment) (modeled ILO estimate)  Wage and salaried workers, female (% of female employment) (modeled ILO estimate)    -1.000000        1.000000
33122      Self-employed, male (% of male employment) (modeled ILO estimate)      Wage and salaried workers, male (% of male employment) (modeled ILO estimate)    -1.000000        1.000000
9977                                                   