**1. Load the data**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import scipy.stats as stats
from tools.datasets import get_prudential

# Load the Prudential dataset. The call returns seven values: the feature frame,
# the target labels, and (judging by how they are used below) lists of column
# names per feature type. The seventh value is not used in this notebook.
data, labels, continuous, discrete, dummy, categorical, _ = get_prudential()

In [None]:
# Distinct values taken by the target variable.
labels.unique()

In [None]:
# Summary statistics of the target variable.
labels.describe()

Check the distribution of the target variable.

In [None]:
# Bar chart of the number of samples in each target class.
# The vector is passed as `x=` by keyword: recent seaborn releases made the
# plot variables keyword-only, so the positional form no longer means "x".
# The trailing `;` suppresses the Axes repr in the cell output.
sns.countplot(x=labels);

From the competition description (link to source), we know that the target is an ordinal variable with values from 1 to 8 and that the evaluation metric is quadratic weighted kappa. Even though the ordering ... (TODO: this sentence was left unfinished.)

In [None]:
# Number of missing values in each categorical feature.
categorical_frame = data[categorical]
categorical_frame.isnull().sum()

In [None]:
from statistics.cramer import categorical_relation_with

# Relation of every categorical and dummy feature with the target.
# NOTE(review): `top` is indexed as a pair of Series below; presumably
# (Cramer's V, p-value) given the module name - confirm in statistics.cramer.
top = categorical_relation_with(data, 'Response', categorical + dummy)

# Names of the ten features ranked highest by the first statistic.
ranked_first = top[0].sort_values(ascending=False)
top_categorical = ranked_first.index[:10]

# Display the ten largest values of the second statistic.
ranked_second = top[1].sort_values(ascending=False)
ranked_second[:10]

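Medical Keywords 32 and 45 seem to be unrelated to `Response` (TODO: cite the supporting value from the table above; the original sentence was left unfinished).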

In [None]:
from statistics.cramer import cross_categorical

# Pairwise association between the top-ranked categorical features and the
# target, drawn as an annotated heatmap of the first returned matrix.
heatmap_columns = sorted(top_categorical) + ['Response']
v, p = cross_categorical(data, heatmap_columns)
sns.heatmap(v, annot=True)

Medical history 33 is strongly correlated with medical keyword 23; maybe the keyword is always present for patients with something specific in their medical history. We can drop the keyword too. We can also see that medical history 4 and keywords 15 and 3 have a significant correlation with Response.

**3. Discrete**

In [None]:
# Number of missing values in each discrete feature.
data[discrete].isnull().sum()

In [None]:
# Summary statistics of the discrete features.
data[discrete].describe()

Quite a lot compared to the dataset size (59k): except for medical history 1, most of the values are missing or not present. Maybe these features refer to some specific illness or test that not everyone needed to take. Before doing anything about them, let's check the distributions.

In [None]:
# Distribution plot (histogram + KDE) of every discrete feature, three per row.
n_cols = 3
# Ceiling division; at least one row so plt.subplots always gets a valid grid.
n_rows = max(1, (len(discrete) + n_cols - 1) // n_cols)
# squeeze=False keeps `axes` two-dimensional even when a single row is enough,
# so the [row][col] indexing below also works for three or fewer features.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 12), squeeze=False)
for i, attr in enumerate(discrete):
    ax = axes[i // n_cols][i % n_cols]
    clean = data[attr].dropna()  # plot only the observed values
    sns.distplot(clean, bins=40, ax=ax).set_title(attr)
    # Fixed y-range; presumably chosen so the low-density body of the
    # distribution stays readable next to the tall peaks.
    ax.set_ylim(0.00001, 0.005)
# Hide grid cells that did not receive a plot.
for ax in axes.flat[len(discrete):]:
    ax.set_visible(False)

There are two significant peaks at the edges; maybe the values were clipped to 240. Let's check whether 0 and 240 alone account for most of these samples or whether the mass is spread over neighbouring values.

In [None]:
# For each discrete feature, print the number of missing entries followed by
# its 20 most frequent values and their counts.
for attr in discrete:
    column = data[attr]
    n_missing = column.isnull().sum()
    most_common = column.value_counts().nlargest(20)
    print("{}, nan: {}".format(attr, n_missing))
    print(dict(most_common))

In [None]:
from config import BINNER_CONFIG
from transformers.custom_binner import CustomBinner

# Apply the project's binner, configured by BINNER_CONFIG.
binner = CustomBinner(BINNER_CONFIG)

# NOTE: `data` is rebound to the transformed frame, so re-running this cell
# feeds the already-transformed frame to the binner again.
data = binner.transform(data)

# Columns added by the binner are those not among the original columns.
# Walking data.columns (instead of converting a set difference to a list)
# keeps them in column order, so the list is the same on every kernel restart.
known_cols = set(discrete + categorical + dummy + continuous + ['Id', 'Response'])
new_cols = [col for col in data.columns if col not in known_cols]

# Check the new columns for a relation with the target.
top = categorical_relation_with(data, 'Response', new_cols)
top_categorical = top[0].sort_values(ascending=False).index[0:10]
# Display the first statistic in ascending order (weakest first).
top[0].sort_values()

Medical history 15 might be useful. The three lowest are not relevant (to delete).

In [None]:
from statistics.cat_to_num import apply_across

# Alternative kept for reference:
# apply_across(data, new_cols, ['Response'], stats.spearmanr)

# Spearman rank correlation between the target and each discrete feature,
# with missing values replaced by 300 before ranking.
# NOTE(review): the original comment broke off mid-sentence ("experimentally
# it seems that filling missing ..."); the intended conclusion is not recorded.
spearman_rho = [
    stats.spearmanr(data['Response'], data[col].fillna(300))[0]  # [0] = correlation coefficient
    for col in discrete
]
pd.Series(spearman_rho, index=discrete).sort_values()

Binning seems to work, but filling missing values makes sense too. Let's check it later on the model.

In [None]:
# Run scipy's Kruskal-Wallis test through the project's apply_across helper on
# the target and the discrete features, with missing values encoded as -1.
discrete_filled = data[['Response'] + discrete].fillna(-1)
kruskal_result = apply_across(discrete_filled, ['Response'], discrete, stats.kruskal)
print(kruskal_result)

All seem to have some influence (p < 0.05).

**4. Continuous**

In [None]:
# Summary statistics of the continuous features. Left as the cell's last
# expression (instead of print) so the notebook renders the DataFrame as a table.
data[continuous].describe()

There are some missing values among the variables; we will handle them later. All seem to be normalized to the range 0-1.

In [None]:
# Distribution plot (histogram + KDE) of every continuous feature, three per row.
n_cols = 3
# Ceiling division; at least one row so plt.subplots always gets a valid grid.
n_rows = max(1, (len(continuous) + n_cols - 1) // n_cols)
# squeeze=False keeps `axes` two-dimensional even when a single row is enough,
# so the [row][col] indexing below also works for three or fewer features.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 12), squeeze=False)
for i, attr in enumerate(continuous):
    ax = axes[i // n_cols][i % n_cols]
    clean = data[attr].dropna()  # plot only the observed values
    sns.distplot(clean, bins=20, ax=ax).set_title(attr)
# Hide grid cells that did not receive a plot.
for ax in axes.flat[len(continuous):]:
    ax.set_visible(False)

In [None]:
# Simplest approach for now: impute the continuous features and add NaN flags.
from transformers.fill_missing_transformer import FillNaTransformer

# Two features are filled with zero; every other continuous feature is
# filled with its median.
fill_with_zero = ['Employment_Info_4', 'Insurance_History_5']
fill_with_median = [col for col in continuous if col not in fill_with_zero]

filler = FillNaTransformer(
    median=fill_with_median,
    zero=fill_with_zero,
    nan_flag=continuous,
)
filler.fit(data)
data = filler.transform(data)


In [None]:
# Plot to check the result of the imputation: histogram only (no KDE) of
# every continuous feature, three per row.
n_cols = 3
# Ceiling division; at least one row so plt.subplots always gets a valid grid.
n_rows = max(1, (len(continuous) + n_cols - 1) // n_cols)
# squeeze=False keeps `axes` two-dimensional even when a single row is enough,
# so the [row][col] indexing below also works for three or fewer features.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 12), squeeze=False)
for i, attr in enumerate(continuous):
    ax = axes[i // n_cols][i % n_cols]
    clean = data[attr].dropna()
    sns.distplot(clean, bins=20, ax=ax, kde=False).set_title(attr)
# Hide grid cells that did not receive a plot.
for ax in axes.flat[len(continuous):]:
    ax.set_visible(False)

Doesn't look that good, but let's leave it like that for now. Most of the variables are more or less normally distributed. Some of the features are evidently skewed; let's measure the skewness and fix it.

In [None]:

# Sample skewness of every continuous feature, computed column by column.
data[continuous].apply(stats.skew)


In [None]:
from transformers.box_cox import BoxCoxTransformer
# Per-column parameters handed to the project's BoxCoxTransformer (presumably
# the Box-Cox lambda for each feature - confirm in transformers.box_cox).
# The values are hand-picked and still need tuning.
lambdas_per_column = {
    'Product_Info_4': 0.5,
    'Ht': 1.2,
    'Wt': 0.5,
    'BMI': 0.6,
    'Employment_Info_1': 0.5,
    'Employment_Info_4': 0.5,
    'Employment_Info_6': 0.5,
    'Insurance_History_5': 0.5,
    'Family_Hist_2': 0.7,
    'Family_Hist_3': 2,
    'Family_Hist_4': 0.7,
    'Family_Hist_5': 2
}
boxcox = BoxCoxTransformer(lambdas_per_column)
# NOTE: `data` is rebound to the transformed frame, so re-running this cell
# passes the already-transformed frame to the transformer again.
data = boxcox.transform(data)


In [None]:
# Histogram (no KDE) of every continuous feature after the Box-Cox step,
# three per row.
n_cols = 3
# Ceiling division; at least one row so plt.subplots always gets a valid grid.
n_rows = max(1, (len(continuous) + n_cols - 1) // n_cols)
# squeeze=False keeps `axes` two-dimensional even when a single row is enough,
# so the [row][col] indexing below also works for three or fewer features.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 12), squeeze=False)
for i, attr in enumerate(continuous):
    ax = axes[i // n_cols][i % n_cols]
    clean = data[attr].dropna()
    sns.distplot(clean, bins=20, ax=ax, kde=False).set_title(attr)
# Hide grid cells that did not receive a plot.
for ax in axes.flat[len(continuous):]:
    ax.set_visible(False)

In [None]:

# Sample skewness of every continuous feature, computed column by column.
data[continuous].apply(stats.skew)

Looks much better, still not perfect, but quite ok.

In [None]:
from statistics.cat_to_num import spearman_with

# Relation of every continuous feature with the target, computed by the
# project's spearman_with helper and collected into a Series keyed by the
# helper's result keys.
# (An apply_across + heatmap variant was tried here earlier and is not used.)
spearman_by_feature = spearman_with(data, 'Response', continuous)
correlation = pd.Series(dict(spearman_by_feature.items()))

In [None]:
# Show the correlations sorted in ascending order.
correlation.sort_values()

Most correlated are BMI, Wt, Ins_Age, Ht and Product_Info_4.