# Styling of the dataframes with Logistic Regression coefficients and odds ratios for all models

@author: Caroline Gasten

This notebook visualises the coefficients and odds ratios obtained from the Logistic Regression models as colour-coded tables.

In [None]:
#import packages
import pandas as pd
import os
import matplotlib
from mycolorpy import colorlist as mcp
import re
import numpy as np
import dataframe_image as dfi
from scipy.stats.distributions import chi2, t

In [None]:
#required paths
#NOTE: the empty strings are placeholders only - fill in both paths before running the cells below
path_statistics = ''  #path to outputs from Logistic Regression Model
path_figures = ''  #path to figures

In [None]:
#model specifications
study_area = ['Turkana', 'Marsabit', 'West Pokot']
#every index name (SPI, SPEI) combined with every numeric suffix (1, 3, 6, 12)
DIs = ['%s-%i' % (index_name, suffix) for index_name in ('SPI', 'SPEI') for suffix in (1, 3, 6, 12)]
#time lags 0..7
lags = list(range(8))
#mean, median and the four percentiles
stats = ['Mean', 'Median'] + ['P%i' % percentile for percentile in (10, 25, 75, 90)]

In [None]:
 def significant_colouring(x):
    """
    The present function receives a value as an input and styles the corresponding cell in an html in a colour-coded way.
    """
    colours = mcp.gen_color(cmap='RdBu', n=16)
    if '*' in x:
        value = float(re.findall(r'^-?\d{1,}\.\d{1,}', x)[0])
        if value < -0.7:
            return f"background-color:{colours[0]}; color:white"
        elif value < -0.6:
            return f"background-color:{colours[1]}; color:white"
        elif value < -0.5:
            return f"background-color:{colours[2]}"
        elif value < -0.4:
            return f"background-color:{colours[3]}"
        elif value <-0.3:
            return f"background-color:{colours[4]}"
        elif value < -0.2:
            return f"background-color:{colours[5]}"
        elif value < -0.1:
            return f"background-color:{colours[6]}"
        elif value < 0:
            return f"background-color:{colours[7]}"
        elif value < 0.1:
            return f"background-color:{colours[8]}"
        elif value < 0.2:
            return f"background-color:{colours[9]}"
        elif value < 0.3:
            return f"background-color:{colours[10]}"
        elif value < 0.4:
            return f"background-color:{colours[11]}"
        elif value < 0.5:
            return f"background-color:{colours[12]}"
        elif value < 0.6:
            return f"background-color:{colours[13]}"
        elif value < 0.7:
            return f"background-color:{colours[14]}; color:white"
        else:
            return f"background-color:{colours[15]}; color:white"

def dataframe_styling(df, caption):
    """
    Build a colour-coded table of coefficients in which significant values carry asterisks.

    Parameters
    ----------
    df : pandas.DataFrame
        Table convertible to float from which the 'coefficient' and 'p-value' column groups are selected.
    caption : str
        Caption set on the styled table.

    Returns
    -------
    pandas Styler
        Coefficients rendered as text with '***' (p<0.01), '**' (p<0.05) or '*' (p<0.1) appended and
        coloured cell by cell through significant_colouring.
    """
    df = df.astype('float').round(decimals=3)
    df_coeff = df.loc[:, 'coefficient']
    df_pval = df.loc[:, 'p-value']

    def stars(p_value):
        #significance marker for one p-value; NaN compares False everywhere and yields no marker
        if p_value < 0.01:
            return '***'
        if p_value < 0.05:
            return '**'
        if p_value < 0.1:
            return '*'
        return ''

    def as_text(value):
        #values of 1000 and above are shortened to scientific notation
        return value if value < 1000 else '{:.2e}'.format(value)

    #the element-wise method is called applymap in older pandas and map from pandas 2.1 on; use whichever exists
    cellwise = 'map' if hasattr(type(df_pval), 'map') else 'applymap'
    asterisk = getattr(df_pval, cellwise)(stars)
    df_asterisk = getattr(df_coeff, cellwise)(as_text).astype('str') + asterisk

    #same rename for the Styler method
    styler = df_asterisk.style
    style_cells = styler.map if hasattr(styler, 'map') else styler.applymap
    df_styled = style_cells(significant_colouring).set_caption(caption)
    return df_styled

In [None]:
def odds_colouring(x):
    """
    Return the CSS style for one table cell that holds an odds factor, optionally followed by asterisks.

    Parameters
    ----------
    x : str
        Cell text: a number (plain, scientific notation, 'inf' or 'nan'), optionally followed by asterisks.

    Returns
    -------
    str
        "background-color:white" for a value of exactly 1, otherwise "background-color:<colour>"
        (plus "; color:white" for the two outermost colour classes at either end).

    Raises
    ------
    ValueError
        If the text in front of the asterisks cannot be converted to a float.
    """
    #strip the asterisks and let float() read the rest, so that scientific notation such as
    #'1.23e+03***' is read as 1230 rather than being cut off at '1.23'
    value = float(x.rstrip('*'))

    #an odds factor of exactly 1 gets a white background
    if value == 1:
        return "background-color:white"

    #upper bounds (exclusive) of colour classes 0..14; values not below any bound (including nan)
    #fall into class 15
    bounds = (1/4.5, 1/4, 1/3.5, 1/3, 1/2.5, 1/2, 1/1.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5)
    position = next((i for i, bound in enumerate(bounds) if value < bound), len(bounds))

    colours = mcp.gen_color(cmap='RdBu_r', n=16)
    style = f"background-color:{colours[position]}"
    #the two outermost classes at either end get white text
    if position in (0, 1, 14, 15):
        style += "; color:white"
    return style

def dataframe_styling_odds(df, caption):
    """
    Build a colour-coded table of odds factors in which significant values carry asterisks.

    Parameters
    ----------
    df : pandas.DataFrame
        Table convertible to float from which the 'odds-factor' and 'p-value_combined' column groups are selected.
    caption : str
        Caption set on the styled table.

    Returns
    -------
    pandas Styler
        Odds factors rendered as text with '***' (p<0.01), '**' (p<0.05) or '*' (p<0.1) appended and
        coloured cell by cell through odds_colouring.
    """
    df = df.astype('float').round(decimals=3)
    df_odds = df.loc[:, 'odds-factor']
    df_pvalodds = df.loc[:, 'p-value_combined']

    def stars(p_value):
        #significance marker for one p-value; NaN compares False everywhere and yields no marker
        if p_value < 0.01:
            return '***'
        if p_value < 0.05:
            return '**'
        if p_value < 0.1:
            return '*'
        return ''

    def as_text(value):
        #values of 1000 and above are shortened to scientific notation
        return value if value < 1000 else '{:.2e}'.format(value)

    #the element-wise method is called applymap in older pandas and map from pandas 2.1 on; use whichever exists
    cellwise = 'map' if hasattr(type(df_pvalodds), 'map') else 'applymap'
    asterisk = getattr(df_pvalodds, cellwise)(stars)
    df_asterisk = getattr(df_odds, cellwise)(as_text).astype('str') + asterisk

    #same rename for the Styler method
    styler = df_asterisk.style
    style_cells = styler.map if hasattr(styler, 'map') else styler.applymap
    df_styled = style_cells(odds_colouring).set_caption(caption)
    return df_styled

In [None]:
def model_styling(model, focus):
    """
    Collect the regression outputs of one model, derive combined coefficients, Wald p-values and odds factors,
    and save the resulting tables as csv, png and LaTeX text.

    For every component of the model (the main effect '' and, for Model2/Model4, each interaction term) the
    function reads one coefficient file and one variance-covariance file per drought indicator, statistic and lag
    from os.path.join(path_statistics, model), writes the assembled table back to that folder and exports the
    styled coefficient and odds tables to path_figures. path_statistics and path_figures are module-level names.

    Parameters
    ----------
    model : str
        Model to be used: one of 'Model1', 'Model2', 'Model3', 'Model4' (the keys of dict_fe / dict_int).
    focus : str
        Whether negative DIs, positive DIs or both (assuming a linear relationship which holds for positive
        and negative DIs) are used: one of '', '_dry', '_wet' (the keys of dict_focus).

    Returns
    -------
    None
        The function has no return statement; its results are the files it writes.
    """
    
    #model specifications
    #study_area is not referenced further down in this function
    study_area = ['Turkana', 'Marsabit', 'West Pokot']
    DIs = ['SPI-1', 'SPI-3', 'SPI-6', 'SPI-12', 'SPEI-1', 'SPEI-3', 'SPEI-6', 'SPEI-12']
    lags = [0, 1, 2, 3, 4, 5, 6, 7]
    stats = ['Mean', 'Median', 'P25', 'P75'] #'P10' and 'P90' are left out here
    
    #model keywords on cross-sectional individuals
    #'' stands for the main effect, every other entry for one interaction term
    if model in ['Model1', 'Model3']:
        components = ['']
    elif model == 'Model2':
        components = ['', 'Turkana', 'West Pokot']
    else:
        components = ['', 'Toposa', 'TurkanaEth', 'Pokot', 'Gabra', 'Dassanetch']

    #dictionaries with keywords for each model
    #dict_focus/dict_fe/dict_int feed the table titles, dict_xint the output file names,
    #dict_stats the row label suffix of the interaction term in the coefficient files
    dict_focus = {'':'', '_dry': 'negative', '_wet': 'positive'}
    dict_fe = {'Model1':'county and year', 'Model2': 'county and year', 'Model3': 'ethnic group and year', 'Model4': 'ethnic group and year'}
    dict_int = {'Model1': '', 'Model2':'and interaction with county', 'Model3': '', 'Model4': 'and interaction with ethnic group'}
    dict_xint = {'':'', 'Turkana':'_xturkana', 'West Pokot': '_xwestpokot', 'TurkanaEth':'_xTU', 'Toposa': '_xTO', 'Pokot': '_xPO', 'Gabra':'_xGA', 'Dassanetch':'_xDA'}
    dict_stats = {'':'', 'Turkana': ':countyTurkana', 'West Pokot': ':countyWest Pokot', 'TurkanaEth': ':eth_groupTurkana', 'Toposa': ':eth_groupToposa', 'Pokot': ':eth_groupPokot', 'Gabra': ':eth_groupGabra', 'Dassanetch': ':eth_groupDassanetch'}
    
    #retrieve keywords from dictionaries
    np_string = dict_focus[focus]
    fe_string = dict_fe[model]
    int_string = dict_int[model]
    
    #loop through cross-sectional individuals
    for comp in components:
        #df is the degrees of freedom handed to the t distribution below (not a dataframe)
        if model in ['Model1', 'Model2']:
            df =2
        elif model in ['Model3', 'Model4']:
            df = 5
        
        #retrieve keywords from dictionaries
        xint_string = dict_xint[comp]
        stats_string = dict_stats[comp]
        
        #title of table
        #xint_string[1:] drops the leading underscore of the interaction keyword
        title_string = 'conflict as a function of %s DI with %s fixed effects %s (DI%s)'%(np_string, fe_string, int_string, xint_string[1:])
        
        #create empty dataframe
        results_df = pd.DataFrame(index = pd.MultiIndex.from_product([DIs, stats, lags], names=['Drought_indicator','stats', 'time_lag [months]']), columns=['coefficient', 'p-value'])
        results_df = results_df.sort_index()
        
        #loop through different drought indicators, statistics and lags
        for DI_name in DIs:
            for stat in stats:
                for dt in lags:
                    #read coefficients and variance-covariance matrices from data
                    #the unnamed first csv column holds the variable names and becomes the index
                    model_instance_coeffs = pd.read_csv(os.path.join(path_statistics, model, 'R_%s_%s%s_%s_lag%i_coeffs2years.csv'%(model, DI_name, focus, stat, dt))).rename(columns={'Unnamed: 0':'variables'}).set_index('variables')
                    model_instance_vcov = pd.read_csv(os.path.join(path_statistics, model, 'R_%s_%s%s_%s_lag%i_vcov2years.csv'%(model, DI_name, focus, stat, dt))).rename(columns={'Unnamed: 0':'variables'}).set_index('variables')
                    
                    #store coefficients in newly created dataframe
                    #row stat+stats_string is the main effect for comp == '' and the interaction term otherwise
                    results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'coefficient'] = model_instance_coeffs.loc[stat+stats_string, 'Estimate']
                    results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'p-value'] = model_instance_coeffs.loc[stat+stats_string, 'Pr(>|t|)']
                    if comp == '':
                        #main effect only: combined coefficient equals the coefficient, SE is the root of its variance
                        results_df.loc[pd.IndexSlice[(DI_name, stat, dt)],'coeff_combined'] = results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'coefficient']
                        results_df.loc[pd.IndexSlice[(DI_name, stat, dt)],'SE_combined'] = np.sqrt(model_instance_vcov.loc[stat, stat])
                        #interval bounds: combined coefficient plus t quantile times standard error
                        #NOTE(review): the lowest quantile is 0.005 while the highest is 0.99 (not 0.995) - confirm this pairing is intended
                        results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'confint_005'] = results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'coeff_combined'] + t.ppf(0.005, df)*results_df.loc[pd.IndexSlice[(DI_name, stat, dt)],'SE_combined']
                        results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'confint_025'] = results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'coeff_combined'] + t.ppf(0.025, df)*results_df.loc[pd.IndexSlice[(DI_name, stat, dt)],'SE_combined']
                        results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'confint_05'] = results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'coeff_combined'] + t.ppf(0.05, df)*results_df.loc[pd.IndexSlice[(DI_name, stat, dt)],'SE_combined']
                        results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'confint_95'] = results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'coeff_combined'] + t.ppf(0.95, df)*results_df.loc[pd.IndexSlice[(DI_name, stat, dt)],'SE_combined']
                        results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'confint_975'] = results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'coeff_combined'] + t.ppf(0.975, df)*results_df.loc[pd.IndexSlice[(DI_name, stat, dt)],'SE_combined']
                        results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'confint_99'] = results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'coeff_combined'] + t.ppf(0.99, df)*results_df.loc[pd.IndexSlice[(DI_name, stat, dt)],'SE_combined']

                    else:
                        #interaction term: combined coefficient is main effect plus interaction coefficient
                        results_df.loc[pd.IndexSlice[(DI_name, stat, dt)],'coeff_combined'] = model_instance_coeffs.loc[stat, 'Estimate'] + results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'coefficient']
                        results_df.loc[pd.IndexSlice[(DI_name, stat, dt)],'SE_combined'] = np.sqrt(model_instance_vcov.loc[stat, stat] + model_instance_vcov.loc[stat+stats_string, stat+stats_string] + 2*model_instance_vcov.loc[stat+stats_string, stat]) #formula for standard error of sum of coefficients: sqrt(var(a) + var(b) + 2*cov(a,b))
                        #same interval bounds as in the main-effect branch (same NOTE(review) on 0.005 vs 0.99 applies)
                        results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'confint_005'] = results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'coeff_combined'] + t.ppf(0.005, df)*results_df.loc[pd.IndexSlice[(DI_name, stat, dt)],'SE_combined']
                        results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'confint_025'] = results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'coeff_combined'] + t.ppf(0.025, df)*results_df.loc[pd.IndexSlice[(DI_name, stat, dt)],'SE_combined']
                        results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'confint_05'] = results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'coeff_combined'] + t.ppf(0.05, df)*results_df.loc[pd.IndexSlice[(DI_name, stat, dt)],'SE_combined']
                        results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'confint_95'] = results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'coeff_combined'] + t.ppf(0.95, df)*results_df.loc[pd.IndexSlice[(DI_name, stat, dt)],'SE_combined']
                        results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'confint_975'] = results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'coeff_combined'] + t.ppf(0.975, df)*results_df.loc[pd.IndexSlice[(DI_name, stat, dt)],'SE_combined']
                        results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'confint_99'] = results_df.loc[pd.IndexSlice[(DI_name, stat, dt)], 'coeff_combined'] + t.ppf(0.99, df)*results_df.loc[pd.IndexSlice[(DI_name, stat, dt)],'SE_combined']
        
        
        #perform Wald-test to check significance
        #test statistic is coefficient / standard error; the p-value is two-sided (survival function of |Wald| times 2)
        results_df.loc[:,'Wald'] = results_df.loc[:,'coeff_combined']/results_df.loc[:,'SE_combined']
        results_df.loc[:, 'p-value_combined'] = t.sf(abs(results_df.loc[:, 'Wald']), df=df) * 2
        
        #calculate odds ratios from coefficients and their respective confidence intervals
        #for '_dry' the reciprocal 1/exp(.) is stored, otherwise exp(.)
        #presumably so that the factor refers to a decrease of the DI - TODO confirm
        if focus=='_dry':
            results_df.loc[:, 'odds-factor'] = 1/np.exp(results_df.loc[:,'coeff_combined'])
            results_df.loc[:, 'oddsconfint_005'] = 1/np.exp(results_df.loc[:,'confint_005'])
            results_df.loc[:, 'oddsconfint_025'] = 1/np.exp(results_df.loc[:,'confint_025'])
            results_df.loc[:, 'oddsconfint_05'] = 1/np.exp(results_df.loc[:,'confint_05'])
            results_df.loc[:, 'oddsconfint_95'] = 1/np.exp(results_df.loc[:,'confint_95'])
            results_df.loc[:, 'oddsconfint_975'] = 1/np.exp(results_df.loc[:,'confint_975'])
            results_df.loc[:, 'oddsconfint_99'] = 1/np.exp(results_df.loc[:,'confint_99'])
        else:
            results_df.loc[:, 'odds-factor'] = np.exp(results_df.loc[:,'coeff_combined'])
            results_df.loc[:, 'oddsconfint_005'] = np.exp(results_df.loc[:,'confint_005'])
            results_df.loc[:, 'oddsconfint_025'] = np.exp(results_df.loc[:,'confint_025'])
            results_df.loc[:, 'oddsconfint_05'] = np.exp(results_df.loc[:,'confint_05'])
            results_df.loc[:, 'oddsconfint_95'] = np.exp(results_df.loc[:,'confint_95'])
            results_df.loc[:, 'oddsconfint_975'] = np.exp(results_df.loc[:,'confint_975'])
            results_df.loc[:, 'oddsconfint_99'] = np.exp(results_df.loc[:,'confint_99'])
        
        #resort dataframe and save
        #unstack() moves the innermost index level (the time lag) into the columns;
        #the reindex puts the rows back into the DIs x stats order that sort_index() changed
        results_df_unstacked = results_df.unstack().reindex(pd.MultiIndex.from_product([DIs, stats]))

        results_df_unstacked.to_csv(os.path.join(path_statistics, model, 'R_%s%s%s_coeffs2years.csv'%(model, focus, xint_string)))
        
        #style dataframes of coefficients and odds and save as figure and string for latex
        results_df_styled = dataframe_styling(results_df_unstacked, title_string)
        dfi.export(results_df_styled, os.path.join(path_figures, 'R_%s%s%s_coeffs2years.png'%(model, focus, xint_string)))

        title_odds_string = 'change in odds (factor) with a %s unit change in %s DI with %s fixed effects %s (DI%s)'%(np_string, np_string, fe_string, int_string, xint_string[1:])
        #bring rows back into the DIs x stats x lags order and fix the column order
        results_df = results_df.reindex(pd.MultiIndex.from_product([DIs, stats, lags], names=['Drought_indicator','stats', 'time_lag [months]']), columns=['coefficient', 'p-value', 'coeff_combined', 'SE_combined','confint_005','confint_025','confint_05','confint_95','confint_975','confint_99' , 'Wald', 'p-value_combined' , 'odds-factor','oddsconfint_005' ,'oddsconfint_025' ,'oddsconfint_05' ,'oddsconfint_95' ,'oddsconfint_975', 'oddsconfint_99'])

        odds_df_styled = dataframe_styling_odds(results_df.unstack().reindex(pd.MultiIndex.from_product([DIs, stats])), title_odds_string)
        dfi.export(odds_df_styled, os.path.join(path_figures, 'R_%s%s%s_odds2years.png'%(model, focus, xint_string)))
        #the LaTeX export is written with a .txt extension; convert_css=True carries the cell colours over
        odds_df_styled.to_latex(os.path.join(path_figures, 'R_%s%s%s_odds2years.txt'%(model, focus, xint_string)), convert_css=True)

In [None]:
#loop through all models for negative and positive DIs and retrieve coefficients, odds ratios, respective confidence intervals and styled tables to visually interpret the results in terms of significance and direction of the results
%%capture
for model in ['Model1', 'Model2', 'Model3', 'Model4']:
    for focus in ['_dry', '_wet']:
        styler = model_styling(model, focus)