In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

## Data loading and sanity checks

In [None]:
# Forward slashes work on every OS. The previous backslash form only worked
# because "\L" is not a recognised escape sequence, which newer Python
# versions warn about.
df = pd.read_csv('data/Life-Expectancy-Data-Updated.csv')

# Simple renaming to improve readability
df = df.rename(columns={
    'Thinness_ten_nineteen_years': 'Thinness (10-19 years)',
    'Thinness_five_nine_years': 'Thinness (5-9 years)',
    'Economy_status_Developed': 'Developed',
    'Economy_status_Developing': 'Developing',
})

In [None]:
# Column labels of the loaded frame (after the renames applied at load time).
df.columns

In [None]:
# Preview the first rows to sanity-check the parsed values.
df.head()

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

**Confirm that Developed and Developing are complements of each other**

In [None]:
# True only when the two indicator columns sum to exactly 1 on every row.
flags_sum_to_one = (df['Developed'] + df['Developing']) == 1
flags_sum_to_one.all()

In [None]:
# 'Developing' carries no information beyond 'Developed', so keep only one of them.
df = df.drop(columns='Developing')

In [None]:
# Split the rows by economy status; the indicator column is constant within
# each subset, so it is dropped from both.
df_developed = df.loc[df['Developed'] == 1].drop(columns='Developed')
df_developing = df.loc[df['Developed'] == 0].drop(columns='Developed')

df_developed.head()

**Confirm that all countries have entries for every year**

In [None]:
# Distinct years present in the data, in ascending order.
np.sort(df['Year'].unique())

In [None]:
# Derive the expected number of years from the data instead of hard-coding it.
n_years = df['Year'].nunique()

# For each country: how many rows it has, and how many distinct years they cover.
grouped = df.groupby('Country')['Year'].agg(['count', 'nunique'])

# A country is complete when it has exactly one row for every year, i.e. both
# its row count and its distinct-year count equal the total number of years.
bool((grouped == n_years).all().all())

## Trend Analysis

In [None]:
def pivot_table_op(df, groupBy, value, fun):
    '''Aggregate `value` per (`groupBy`, Year) and return it in wide form.

    Parameters
    ----------
    df : DataFrame containing the columns 'Year', `groupBy` and `value`.
    groupBy : name of the column whose distinct values become the output columns.
    value : name of the column being aggregated.
    fun : callable that receives the grouped frame and returns the aggregated
        frame (for example ``lambda g: g.mean()``).

    Returns
    -------
    DataFrame indexed by 'Year' with one column per distinct `groupBy` value.
    '''
    needed_columns = ['Year', groupBy, value]
    by_group_and_year = df[needed_columns].groupby([groupBy, 'Year'])
    aggregated = fun(by_group_and_year)
    # Bring the group keys back as ordinary columns so pivot can address them.
    long_form = aggregated.reset_index()
    return long_form.pivot(index='Year', columns=groupBy, values=value)

Overall, life expectancy is increasing in every region, with Africa considerably below the rest.

In [None]:
# Mean life expectancy per year, one column (and one line) per region.
pivot = pivot_table_op(df, 'Region', 'Life_expectancy', lambda grouped: grouped.mean())

plt.figure(figsize=(8,6))
for region in pivot.columns:
    plt.plot(pivot.index, pivot[region], label=region)

plt.title('Trend in Life expectancy')
plt.xlabel('Year')
plt.ylabel('Average Life expectancy')
plt.legend(fontsize='small')
plt.show()

In [None]:
# Yearly mean and standard deviation of life expectancy, with one column per
# value of the 'Developed' flag (0 and 1).
pivot_mean = pivot_table_op(df, 'Developed', 'Life_expectancy', lambda grouped: grouped.mean())
pivot_std = pivot_table_op(df, 'Developed', 'Life_expectancy', lambda grouped: grouped.std())

mean_developed, std_developed = pivot_mean[1], pivot_std[1]
mean_developing, std_developing = pivot_mean[0], pivot_std[0]

plt.figure(figsize=(8,6))
bands = [
    ('Developed', mean_developed, std_developed),
    ('Developing', mean_developing, std_developing),
]
for band_label, band_mean, band_std in bands:
    # Line for the yearly mean, shaded band for one standard deviation either side.
    plt.plot(pivot_mean.index, band_mean, label=band_label)
    plt.fill_between(pivot_std.index, band_mean - band_std, band_mean + band_std, alpha=0.2)

plt.title('Life expectancy')
plt.xlabel('Year')
plt.ylabel('Life expectancy')
plt.legend(fontsize='small')
plt.show()

In [None]:
# Each density gets an explicit label: without a hue variable seaborn draws no
# legend of its own, and the two groups would be indistinguishable.
ax = sns.kdeplot(data=df_developed, x="Life_expectancy", fill=True, label="Developed")
sns.kdeplot(data=df_developing, x="Life_expectancy", fill=True, label="Developing", ax=ax)

ax.set_title("Distribution of Life expectancy: Developing vs Developed", pad=10)
ax.set_xlabel("Life expectancy")
ax.legend(loc="upper left")
plt.show()

## Outliers

In [None]:
def find_outliers(pivot, threshold):
    '''Return the columns of `pivot` that correlate weakly with the average trend.

    The average trend is the row-wise mean over all columns of `pivot`. Each
    column's Pearson correlation with that trend is computed, and only columns
    whose correlation is strictly below `threshold` are returned.

    Returns a DataFrame with columns 'Country' (the pivot column label) and
    'Correlation', sorted by ascending correlation.
    '''
    trend = pivot.mean(axis=1)
    rows = [
        # corrcoef returns a 2x2 matrix; the off-diagonal entry is the correlation.
        [name, np.corrcoef(pivot[name], trend)[0, 1]]
        for name in pivot.columns
    ]
    ranked = pd.DataFrame(rows, columns=['Country', 'Correlation']).sort_values('Correlation')
    return ranked[ranked['Correlation'] < threshold]

def plot_outliers(pivot, outliers):
    '''Plot the flagged columns of `pivot` together with the average trend.

    `outliers` is a frame with a 'Country' column naming the columns of `pivot`
    to draw (the output format of find_outliers). The average trend is the
    row-wise mean over all columns of `pivot` and is drawn as a dashed line.
    '''
    flagged = pivot[outliers['Country']]
    average = pivot.mean(axis=1)

    plt.figure(figsize=(8,6))
    for name in flagged.columns:
        plt.plot(flagged.index, flagged[name], label=name)

    plt.plot(average.index, average, '--', label='Average Trend')
    plt.legend()
    plt.show()
    

In [None]:
# Life expectancy per year with one column per developed country.
pivot = pivot_table_op(df_developed, 'Country', 'Life_expectancy', lambda x: x.mean())
# threshold=0 keeps only countries whose series is negatively correlated with
# the average over all developed countries.
outliers = find_outliers(pivot, threshold=0)
outliers

In [None]:
# Life expectancy per year with one column per developing country.
pivot = pivot_table_op(df_developing, 'Country', 'Life_expectancy', lambda x: x.mean())
# Flag countries whose correlation with the developing-country average is below 0.5.
outliers = find_outliers(pivot, threshold=0.5)
plot_outliers(pivot, outliers)

## Correlation analysis

In [None]:
def remove_small_entries(df, tol=0.5):
    '''Return a copy of `df` with entries of small magnitude replaced by 0.0.

    Parameters
    ----------
    df : numeric DataFrame (for example a correlation matrix).
    tol : entries with ``abs(value) < tol`` are zeroed; entries whose
        magnitude equals `tol` are kept.

    Missing values are left untouched and the input frame is not modified.
    '''
    # Vectorised replacement; avoids a Python-level call per cell and does not
    # depend on DataFrame.map, which only exists in recent pandas releases.
    return df.mask(df.abs() < tol, 0.0)

def corr_heatmap(df, remove_cols):
    '''Draw an annotated heatmap of the pairwise correlations between columns.

    Parameters
    ----------
    df : DataFrame holding the data.
    remove_cols : column labels to drop before the correlations are computed;
        every remaining column takes part in the correlation matrix.

    The heatmap is drawn on a new 16x8 figure. Nothing is returned.
    '''
    X = df.drop(remove_cols, axis=1)
    # Pearson correlation is unchanged by shifting or rescaling a column, so
    # the data does not need to be standardised before calling corr().
    correlation = X.corr()
    plt.figure(figsize=(16,8))
    sns.heatmap(correlation, annot=True, linewidths=0.25)

# Correlations over the full data set, leaving out the identifier columns.
corr_heatmap(df, ['Country', 'Region', 'Year'])

We note the following are highly correlated: 
- Infant_deaths, Under_five_deaths, and Adult_mortality
- Diphtheria and Polio
- Thinness (5-9 years) and Thinness (10-19 years)

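We therefore keep one representative from each group (Adult_mortality, Polio, and Thinness (10-19 years)) and drop the rest: Infant_deaths, Under_five_deaths, Diphtheria, and Thinness (5-9 years)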

In [None]:
# Remove the redundant members of each highly correlated group.
redundant_columns = ['Infant_deaths', 'Under_five_deaths', 'Diphtheria', 'Thinness (5-9 years)']
df_reduced = df.drop(columns=redundant_columns)

In [None]:
# Same heatmap on the reduced column set, leaving out the identifier columns.
corr_heatmap(df_reduced, ['Country', 'Region', 'Year'])

In [None]:
# Developing
# Developing countries only; 'Developed' is constant in this subset, so it is dropped.
corr_heatmap(
    df_reduced.loc[df_reduced['Developed'] == 0].drop(columns='Developed'),
    ['Country', 'Region', 'Year'],
)

In [None]:
# Developed
# Developed countries only; 'Developed' is constant in this subset, so it is dropped.
corr_heatmap(
    df_reduced.loc[df_reduced['Developed'] == 1].drop(columns='Developed'),
    ['Country', 'Region', 'Year'],
)

## More graphs

In [None]:
# Columns remaining after the redundant ones were dropped.
df_reduced.columns

In [None]:
# 4x3 grid of axes; my_plot (defined below) draws into it through the global `ax`.
fig, ax = plt.subplots(figsize=(12,15), nrows = 4, ncols=3)

def my_plot(row, col, col_name):
    '''Scatter `col_name` against Life_expectancy on the panel at (row, col).

    Reads the module-level frame `df` and the module-level axes grid `ax`.
    Points are coloured by the 'Developed' flag.
    '''
    panel = ax[row, col]
    sns.scatterplot(
        data=df,
        x=col_name,
        y="Life_expectancy",
        hue="Developed",
        palette=['Black','Red'],
        ax=panel,
    )
    # Replace the automatic hue legend with an empty, frameless one.
    panel.legend([],[], frameon=False)

# Columns to compare against life expectancy, filled into the grid row by row.
scatter_columns = [
    'Adult_mortality', 'Alcohol_consumption', 'Hepatitis_B',
    'Measles', 'BMI', 'Polio',
    'Incidents_HIV', 'GDP_per_capita', 'Population_mln',
    'Thinness (10-19 years)', 'Schooling',
]

grid_width = ax.shape[1]
for position, col_name in enumerate(scatter_columns):
    my_plot(position // grid_width, position % grid_width, col_name)

# The grid has more panels than there are columns to plot; hide the leftover
# panels so they are not drawn as empty axes.
for unused_panel in ax.flat[len(scatter_columns):]:
    unused_panel.set_visible(False)

fig.tight_layout()
plt.show()
