In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import kurtosis,skew 

In [None]:
def _interpret_kurtosis(value):
    """Return a short label for a Pearson kurtosis value (normal distribution = 3)."""
    if np.isnan(value):
        # Too few or constant observations: no meaningful interpretation.
        return "undefined"
    # Kurtosis below 3: platykurtic, fewer and less extreme outliers than a normal distribution.
    if value < 2.9:
        return "playkurtic (concentrated)"
    # Kurtosis above 3: leptokurtic, more outliers than a normal distribution.
    if value > 3.1:
        return "leptokurtic (many outliers)"
    return "distribution is normal"


def _interpret_skewness(value):
    """Return a short label telling on which side the tail of the distribution lies."""
    if np.isnan(value):
        return "undefined"
    # Negative skew: the tail extends towards more negative values.
    if value < -0.2:
        return "tail is on the left side of the distribution"
    # Positive skew: the tail extends towards more positive values.
    if value > 0.2:
        return "tail is on the right side of the distribution"
    # Around zero: no noticeable skewness.
    return "distribution is symmetrical"


def _lorenz_gini(data):
    """Return (Lorenz curve ordinates, Gini index) for a 1-D array without NaN."""
    n = data.size
    total = data.sum()
    if n == 0 or total == 0:
        # The cumulative share is undefined: avoid a division by zero.
        return np.zeros(n + 1), np.nan

    # The Lorenz curve starts at 0.
    lorenz = np.append([0], np.cumsum(np.sort(data)) / total)
    # Area under the Lorenz curve (trapezoidal rule with step 1/n): the first
    # and the last ordinates each count for half a segment only.
    auc = (lorenz.sum() - lorenz[-1] / 2 - lorenz[0] / 2) / n
    # Area between the first bisector and the Lorenz curve.
    surface = 0.5 - auc
    return lorenz, 2 * surface


def describeNumCol(df, colName):
    """
    Describe a numeric column with basic stats and graphics.

    Displays, side by side in an ipywidgets HBox, a summary table
    (``describe()`` plus unique values, missing values %, kurtosis, skewness
    and Gini index) and a figure made of five panels: bar chart of the value
    counts, two histogram/KDE panels, a box plot and the Lorenz curve.

    Kurtosis, skewness and the Lorenz/Gini computation are done on the
    non-null values only. Kurtosis is Pearson's definition (normal = 3), which
    is what the interpretation thresholds expect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to describe.
    colName : str
        Name of a numeric column of ``df``.

    Returns
    -------
    None
        Everything is rendered through ``display``.

    Notes
    -----
    Relies on ``sns`` (seaborn), ``widgets`` (ipywidgets) and ``display``
    being available in the notebook namespace; they are not imported here.
    """
    column = df[colName]
    # NaN would propagate through kurtosis, skew and the cumulative sum,
    # so every statistic below is computed on observed values only.
    data = column.dropna().to_numpy(dtype=float)
    n = data.size
    dist_series = column.value_counts()

    # Sample kurtosis. fisher=False selects Pearson's definition (normal = 3);
    # scipy's default is the excess kurtosis (normal = 0).
    gamma1 = kurtosis(data, fisher=False, bias=False)
    interp_gamma1 = _interpret_kurtosis(gamma1)

    # Sample skewness.
    gamma2 = skew(data, bias=False)
    interp_gamma2 = _interpret_skewness(gamma2)

    # Lorenz curve and Gini index.
    lorenz, gini = _lorenz_gini(data)

    # prepare outputs
    graph = widgets.Output()
    table = widgets.Output(layout=widgets.Layout(margin='0 50px 0 0'))
    hLayout = widgets.HBox([table, graph])

    # The outer axes only carry the global title; its own axes are hidden.
    fig, ax = plt.subplots(figsize=(20, 15))
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.set_title(
        f"Variable {colName}",
        fontdict={
            'fontsize': 24,
            'fontweight': "bold",
            'verticalalignment': 'baseline',
            'horizontalalignment': 'center'},
        pad=50,
    )

    # grid specification
    gs = fig.add_gridspec(3, 2)
    ax1 = fig.add_subplot(gs[0, 0])
    # Axis sharing has to be requested at creation time; assigning to
    # ``ax.sharex`` would merely overwrite the method of the same name.
    ax2 = fig.add_subplot(gs[1, 0], sharex=ax1)
    ax3 = fig.add_subplot(gs[0, 1], sharey=ax1)
    ax4 = fig.add_subplot(gs[1, 1])
    ax5 = fig.add_subplot(gs[2, 1])

    # axe 1 ---------------------------------------------
    ax1.bar(dist_series.index, dist_series, color='b')
    ax1.set_title('Distribution of values')
    ax1.set_ylabel("Nb observations")
    ax1.xaxis.set_tick_params(rotation=45)

    # axe 2 ---------------------------------------------
    sns.histplot(x=dist_series.index, kde=True, stat='density', ax=ax2)
    ax2.xaxis.set_visible(False)
    ax2.set_title('Density KDE')
    ax2.set_ylabel(colName)
    ax2.invert_yaxis()

    # axe 3 ---------------------------------------------
    sns.histplot(y=dist_series, kde=True, stat='density', ax=ax3)
    ax3.xaxis.set_visible(False)
    ax3.set_title('Density KDE')
    ax3.set_ylabel("Nb Observation")

    # axe 4 ---------------------------------------------
    sns.boxplot(x=column, ax=ax4, notch=True, color=(1, 0, 1, 0.5))
    ax4.set_xlabel(colName)
    ax4.set_title('Distribution of values Boite a mous')

    # axe 5 -----------------------------------------------
    if n > 0:
        # One segment of width 1/n per individual, plus one additional segment
        # at the origin: the first starts at 0-1/n, the last ends at 1+1/n.
        xaxis = np.linspace(0 - 1 / n, 1 + 1 / n, len(lorenz))
        ax5.plot(xaxis, lorenz, drawstyle='steps-post')
    ax5.plot([0, 1], [0, 1])  # bisector
    ax5.set_title('Lorentz distribution')

    # display ---------------------------------------------
    with graph:
        graph.clear_output()
        plt.show()
    with table:
        table.clear_output()
        full_desc = df[[colName]].describe()
        # Number of distinct non-null values (not the number of rows).
        full_desc.loc["Unique values"] = column.nunique()
        full_desc.loc["Missing values%"] = f"{100*column.isna().sum()/len(column):.2f}"
        full_desc.loc["Kurtosis"] = gamma1
        full_desc.loc["Kurto desc"] = interp_gamma1
        full_desc.loc["Skew"] = gamma2
        full_desc.loc["Skew desc"] = interp_gamma2
        full_desc.loc["Gini"] = gini

        display(full_desc)

    display(hLayout)