This notebook was used for the initial data exploration and can serve as a
starting point for further research. It contains the first round of feature-
and model-selection research.

In [24]:
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression

In [None]:
# Countries that filter_data keeps; every analysis below is restricted to them.
chosen_countries = [
    'Australia', 'Austria', 'Belgium', 'Canada', 'Chile', 'Czechia',
    'Estonia', 'Finland', 'Germany', 'Greece', 'Hungary', 'Iceland',
    'Ireland', 'Italy', 'Korea', 'Latvia', 'Luxembourg', 'Netherlands',
    'New Zealand', 'Norway', 'Portugal', 'Slovak Republic', 'Slovenia',
    'Spain', 'Sweden', 'Türkiye',
]

# Raw tables, read relative to the notebook's location.
life_expectancy_data = pd.read_csv('../data/life_expectancy.csv')
pharma_sales_data = pd.read_csv('../data/pharma_sales_ppp.csv')

# Distinct entries of the "Variable" column of each table, as plain lists.
life_vars = life_expectancy_data["Variable"].unique().tolist()
drug_vars = pharma_sales_data["Variable"].unique().tolist()

In [None]:
def merge_and_plot(filtered_data_x, filtered_data_y, label_x, label_y):
    '''
        Inner-joins the two dataframes on "Country" and shows a scatter plot
        of the "Value" column of the first dataframe (x axis) against the
        "Value" column of the second dataframe (y axis).

        filtered_data_x, filtered_data_y: dataframes that both have a
            "Country" and a "Value" column
        label_x, label_y: names given to the two value columns; also used as
            the axis labels
    '''
    joined = pd.merge(filtered_data_x, filtered_data_y, on="Country", how="inner")

    # pd.merge suffixes the clashing "Value" columns with _x / _y; keep only
    # those and give them their readable labels.
    plot_frame = joined[["Country", "Value_x", "Value_y"]].rename(
        columns={"Value_x": label_x, "Value_y": label_y}
    )

    plt.scatter(np.asarray(plot_frame[label_x]), np.asarray(plot_frame[label_y]))
    plt.xlabel(label_x)
    plt.ylabel(label_y)
    plt.show()

def p_value_correlation(filtered_data_x, filtered_data_y, label_x, label_y, threshold, alpha=0.05):
    '''
        Calculates the Spearman rank correlation between the "Value" columns
        of the two dataframes (paired per country) and tests the null
        hypothesis that there is no correlation between x and y.

        The result is only printed when the absolute correlation is at least
        `threshold`; the correlation and p-value are always returned so the
        caller can collect them.

        Parameters:
            filtered_data_x, filtered_data_y: dataframes that both have a
                "Country" and a "Value" column; they are inner-joined on
                "Country"
            label_x, label_y: names of the two variables, used in the printout
            threshold: minimum absolute correlation for which the result is
                printed
            alpha: significance level of the hypothesis test (default 0.05)

        Returns:
            (corr, p_value) as computed by scipy.stats.spearmanr
    '''
    merged_data = pd.merge(filtered_data_x, filtered_data_y, on="Country", how="inner")

    # After the merge the two "Value" columns are suffixed _x and _y.
    corr, p_value = spearmanr(merged_data["Value_x"], merged_data["Value_y"])

    # Two-sided threshold: strong positive and strong negative correlations
    # are both reported. A NaN correlation never passes this check.
    if abs(corr) >= threshold:

        # p_value is the p-value reported by spearmanr for the null
        # hypothesis that x and y are uncorrelated.
        print("X:", label_x, "Y:", label_y)
        print("correlation between X and Y: ", corr)
        print("p value for correlation: ", p_value)

        if p_value <= alpha:
            print("HYPOTHESIS REJECTED")
        else:
            # NOTE: "accepted" here means the null hypothesis could not be
            # rejected at level alpha; it is not proof of no correlation.
            print("HYPOTHESIS ACCEPTED")

    return corr, p_value

def filter_data(data, year, variable, measure=None, countries=None):
    '''
        Filters a dataframe down to the rows of one year and one variable for
        a set of countries, optionally restricted to one measure.

        Parameters:
            data: dataframe with "Year", "Country" and "Variable" columns
                (and a "Measure" column when `measure` is given)
            year: value the "Year" column must equal
            variable: value the "Variable" column must equal
            measure: optional value the "Measure" column must equal; when it
                is None (or otherwise falsy) the "Measure" column is not used
            countries: optional collection of country names to keep; defaults
                to the notebook-wide `chosen_countries`

        Returns:
            The matching rows of `data`, with their original index.
    '''
    if countries is None:
        # Fall back to the notebook-wide selection so existing calls behave
        # exactly as before.
        countries = chosen_countries

    mask = (
        (data["Year"] == year) &
        (data["Country"].isin(countries)) &
        (data["Variable"] == variable)
    )

    # The measure condition is only added on request, so tables without a
    # "Measure" column can still be filtered.
    if measure:
        mask &= (data["Measure"] == measure)

    return data[mask]


Plotting and correlation between the drug variables (pharma sales) and the life expectancy

In [None]:
# The life expectancy selection does not depend on the drug, so it is filtered
# once per life expectancy variable instead of once per (drug, variable) pair.
life_expectancy_by_variable = {
    life: filter_data(life_expectancy_data, 2014, life, "Years")
    for life in life_vars
}

for drug in drug_vars:
    filtered_pharma_sales = filter_data(pharma_sales_data, 2014, drug)

    for life, filtered_life_expectancy in life_expectancy_by_variable.items():
        # Scatter plot of drug sales against life expectancy, followed by the
        # Spearman test; the test result is only printed when |corr| >= 0.4.
        merge_and_plot(filtered_pharma_sales, filtered_life_expectancy, str(drug), str(life))
        p_value_correlation(filtered_pharma_sales, filtered_life_expectancy, str(drug), str(life), 0.4)


Checking whether the drug variables correlate with each other

In [None]:


# Filter the pharma sales once per drug; every drug is reused in many pairs.
pharma_sales_by_drug = {
    drug: filter_data(pharma_sales_data, 2014, drug)
    for drug in drug_vars
}

# Every ordered pair of different drugs is evaluated, so each combination
# appears twice with the axes swapped.
for drug1, filtered_pharma_sales1 in pharma_sales_by_drug.items():
    for drug2, filtered_pharma_sales2 in pharma_sales_by_drug.items():
        if drug1 != drug2:
            merge_and_plot(filtered_pharma_sales1, filtered_pharma_sales2, str(drug1), str(drug2))
            p_value_correlation(filtered_pharma_sales1, filtered_pharma_sales2, str(drug1), str(drug2), 0.4)

Possible interaction term or extra term in the linear model: alcohol consumption

In [None]:
alcohol_consumption_data = pd.read_csv('../data/alcohol_consump.csv')
alcohol_measures = alcohol_consumption_data["Measure"].unique().tolist()

# The life expectancy selection does not depend on the alcohol measure, so it
# is filtered once per life expectancy variable.
life_expectancy_by_variable = {
    life: filter_data(life_expectancy_data, 2014, life, "Years")
    for life in life_vars
}

for alcohol in alcohol_measures:
    # One variable ("Alcohol consumption"), split by the measure it is
    # reported in.
    filtered_alcohol_consumption = filter_data(alcohol_consumption_data, 2014, "Alcohol consumption", alcohol)

    for life, filtered_life_expectancy in life_expectancy_by_variable.items():
        merge_and_plot(filtered_alcohol_consumption, filtered_life_expectancy, str(alcohol), str(life))
        p_value_correlation(filtered_alcohol_consumption, filtered_life_expectancy, str(alcohol), str(life), 0.4)

Possible interaction term or extra term in the linear model: food variables

In [None]:
food_data = pd.read_csv('../data/food.csv')
# NOTE(review): food_measures is not used below and the food rows are not
# filtered on "Measure". If a food variable is reported in more than one
# measure, those rows end up in the same plot/correlation - confirm.
food_measures = food_data["Measure"].unique().tolist()
food_variables = food_data["Variable"].unique().tolist()

# The life expectancy selection does not depend on the food variable, so it
# is filtered once per life expectancy variable.
life_expectancy_by_variable = {
    life: filter_data(life_expectancy_data, 2014, life, "Years")
    for life in life_vars
}

for food in food_variables:
    filtered_food_data = filter_data(food_data, 2014, food)

    for life, filtered_life_expectancy in life_expectancy_by_variable.items():
        merge_and_plot(filtered_food_data, filtered_life_expectancy, str(food), str(life))
        p_value_correlation(filtered_food_data, filtered_life_expectancy, str(food), str(life), 0.4)

Testing for heteroscedasticity with a multiple regression fitted separately for each gender.

In [None]:
# Variables that had the highest correlation coefficient and for which the correlation was significant
variables = ["A02B-Drugs for peptic ulcer and gastro-oesophageal reflux diseases (GORD)", "N-Nervous system", "N06A-Antidepressants"]
genders = ["Females at age 40", "Males at age 40"]


filtered_pharma_sales = pharma_sales_data[
    (pharma_sales_data["Year"] == 2014) &
    (pharma_sales_data["Country"].isin(chosen_countries)) &
    (pharma_sales_data["Variable"].isin(variables))
]

# One row per country and one column per drug variable. Keeping the country
# as index makes it possible to pair the predictors with the life expectancy
# of the same country below. pivot raises a ValueError if a
# (country, variable) combination occurs more than once.
country_order = filtered_pharma_sales["Country"].drop_duplicates().tolist()
df = (
    filtered_pharma_sales
    .pivot(index="Country", columns="Variable", values="Value")
    .reindex(country_order)  # keep the row order of the source table
)

for gender in genders:
    filtered_life_expectancy = filter_data(life_expectancy_data, 2014, gender, "Years")

    # The regression pairs X and y row by row, so both are first joined on
    # the country. Countries that miss a predictor or the target are dropped.
    life_by_country = filtered_life_expectancy.set_index("Country")["Value"].rename("life_expectancy")
    regression_data = df[variables].join(life_by_country, how="inner").dropna()

    X = regression_data[variables]
    y = regression_data["life_expectancy"]

    model = LinearRegression()
    model.fit(X, y)
    # Kept for interactive inspection of the fitted model.
    intercept = model.intercept_
    coeff = model.coef_

    y_predicted = model.predict(X)
    residuals = y - y_predicted

    # True against predicted values: points close to the diagonal are
    # predicted well.
    plt.scatter(y, y_predicted)
    plt.title(f"The true {gender} vs the predicted {gender} with the linear regression with 3 drugs: a02B, N-Nervous system, N06A-Antidepressants")
    plt.xlabel(f'true life expectancy of {gender}')
    plt.ylabel(f'predicted life expectancy of {gender}')
    plt.show()

    # Residuals against predicted values: a spread that changes with the
    # predicted value points to heteroscedasticity.
    plt.scatter(y_predicted, residuals)
    plt.xlabel(f"predicted life expectancy {gender}")
    plt.ylabel(f"residuals of the predicted {gender}")
    plt.title(f'The residuals of the linear regression with 3 drugs: a02B, N-Nervous system, N06A-Antidepressants for {gender}')
    plt.show()


Plots used in the presentation

In [25]:
plots_directory = '../data_visualization/data_exploration'
os.makedirs(plots_directory, exist_ok=True)

for drug in variables:
    # Sales of this drug per country, taken straight from the source table so
    # the cell does not depend on intermediate frames of earlier cells.
    drug_sales = filter_data(pharma_sales_data, 2014, drug)[["Country", "Value"]]

    for gender in genders:
        filtered_life_expectancy = filter_data(life_expectancy_data, 2014, gender, "Years")

        # The regression pairs X and y row by row, so drug sales and life
        # expectancy are first matched on the country. Countries that miss
        # either value are dropped.
        paired = pd.merge(
            drug_sales,
            filtered_life_expectancy[["Country", "Value"]],
            on="Country",
            how="inner",
            suffixes=("_drug", "_life"),
        ).dropna()

        X = paired[["Value_drug"]].rename(columns={"Value_drug": drug})
        y = paired["Value_life"]

        model = LinearRegression()
        model.fit(X, y)
        # Kept for interactive inspection of the fitted model.
        intercept = model.intercept_
        coeff = model.coef_

        y_predicted = model.predict(X)
        residuals = y - y_predicted

        # True against predicted values, with identical axis ranges so the
        # diagonal corresponds to a perfect prediction.
        plt.scatter(y, y_predicted)
        plt.title(f"The true {gender} vs the predicted {gender} with the linear \n regression with the drug {drug}")
        plt.xlabel(f'true life expectancy of {gender}')
        plt.ylabel(f'predicted life expectancy of {gender}')
        plt.ylim(plt.xlim())
        filename = f"{gender}_{drug}_prediction"
        filepath = os.path.join(plots_directory, filename)
        plt.savefig(filepath)
        plt.close()

        # Residuals against predicted values.
        plt.scatter(y_predicted, residuals)
        plt.xlabel(f"predicted life expectancy {gender}")
        plt.ylabel(f"residuals of the predicted {gender}")
        plt.title(f'The residuals of the linear regression \n with the drug {drug} \n for {gender}')
        filename = f"{gender}_{drug}_residuals"
        filepath = os.path.join(plots_directory, filename)
        plt.savefig(filepath)
        plt.close()