In [None]:
import pandas as pd
import numpy as np
import scipy.stats as ss
import seaborn as sns

from matplotlib import pyplot as plt

In [None]:
def creat_log_normal(data, random_state=None):
    """Draw a synthetic sample from a log-normal distribution fitted to ``data``.

    The shape, location and scale parameters are estimated with
    ``scipy.stats.lognorm.fit`` and then used to draw ``len(data)`` random
    variates, so the result has the same size as the observations.

    Parameters
    ----------
    data : array-like
        Observed values the log-normal distribution is fitted to.
    random_state : None, int, numpy.random.Generator or RandomState, optional
        Seed or generator forwarded to ``scipy.stats.lognorm.rvs``. The
        default ``None`` keeps the previous behaviour (NumPy's global random
        state); pass a fixed value to make the sample reproducible.

    Returns
    -------
    numpy.ndarray
        ``len(data)`` random variates drawn from the fitted distribution.
    """
    s, loc, scale = ss.lognorm.fit(data)
    return ss.lognorm.rvs(s, loc, scale, size=len(data), random_state=random_state)

In [None]:
def compare_log_normal_with(data):
    """Overlay KDE curves of ``data`` and of a fitted log-normal sample.

    A synthetic sample is obtained from ``creat_log_normal(data)`` and its
    kernel density estimate is drawn on the same figure as the estimate of
    the observed values, each with its own legend entry. The figure is shown
    with ``plt.show()``; nothing is returned.

    Parameters
    ----------
    data : array-like
        Observed values to compare against the fitted log-normal sample.
    """
    plt.figure(figsize=[12, 6], dpi=300)
    # `fill` is the current name of the deprecated `shade` keyword of kdeplot.
    sns.kdeplot(x=creat_log_normal(data), fill=False, label='theoretical distribution')
    sns.kdeplot(x=data, fill=False, label='observed distribution')
    plt.legend()
    plt.show()

In [None]:
def normalization(data, y, offset=0.0001):
    """Plot ``data`` before and after a log transform on a 2x2 grid.

    The transform is ``np.log(data + offset)``. The four panels are, in
    order: scatter of ``data + offset`` against ``y``, KDE of ``data``,
    scatter of the transformed values against ``y``, and KDE of the
    transformed values. The figure is shown with ``plt.show()``; nothing is
    returned.

    Parameters
    ----------
    data : array-like
        Observed values to transform. Must support ``data + offset``.
    y : array-like
        Values plotted on the vertical axis of both scatter panels.
    offset : float, optional
        Constant added to ``data`` before taking the logarithm. Defaults to
        ``0.0001``, the value that was previously hard-coded.
    """
    _, axes = plt.subplots(nrows=2, ncols=2, figsize=[24, 8], dpi=300)
    axes = axes.ravel()
    shifted = data + offset
    transformed = np.log(shifted)

    # `fill` is the current name of the deprecated `shade` keyword of kdeplot.
    sns.scatterplot(x=shifted, y=y, label='before', ax=axes[0])
    sns.kdeplot(x=data, fill=True, label='before', ax=axes[1])
    sns.scatterplot(x=transformed, y=y, label='after', ax=axes[2])
    sns.kdeplot(x=transformed, fill=True, label='after', ax=axes[3])

    # Axes has no `set_legend` method; `legend()` is what draws the legend.
    for ax in axes:
        ax.legend()
    plt.show()

In [None]:
def quantile_quantile_plot(data, dist):
    """Draw a quantile-quantile plot of ``data`` and ``log(data)``.

    Percentiles of a freshly drawn standard-normal sample (same length as
    ``data``) are used as the horizontal axis. Percentiles of ``data`` and of
    ``np.log(data)`` are plotted against them with ``sns.regplot``. The
    figure is shown with ``plt.show()``; nothing is returned.

    Parameters
    ----------
    data : array-like with a ``shape`` attribute
        Observed values. ``np.log`` is applied without any offset, so
        non-positive entries produce ``-inf``/``nan`` percentiles.
    dist : object
        Currently unused; kept so existing callers keep working.
    """
    bins = np.linspace(0, 100, data.shape[0])
    # The reference quantiles come from a random sample, so they vary per call.
    std_norm = np.percentile(np.random.standard_normal(data.shape[0]), bins)
    input_data = np.percentile(data, bins)
    # `bins` is the percentile argument; it must not be passed to np.log.
    trans_input_data = np.percentile(np.log(data), bins)

    plt.figure(figsize=[12, 6], dpi=300)
    sns.regplot(x=std_norm, y=input_data, label='observed')
    sns.regplot(x=std_norm, y=trans_input_data, label='log-transformed')
    plt.xlabel('theoretical distribution')
    plt.ylabel('observed distribution')
    plt.legend()
    plt.show()

In [None]:
# Read the training table indexed by `id` and keep only the label-based
# column slice from 'cont0' through 'cont13'.
tb_cont_data = (
    pd.read_csv('../../data/train.csv', index_col='id')
    .loc[:, 'cont0':'cont13']
)
tb_cont_data