---
## Load datasets
---

In [None]:
# import basic packages
import numpy as np
import pandas as pd

In [None]:
# read the main dataset into a dataframe (read_csv defaults, i.e. comma-separated)
data = pd.read_csv(filepath_or_buffer='./input_data/DataPD.txt')

# preview the first five rows
data.head(n=5)

In [None]:
# read the table describing the variables (read_csv defaults, i.e. comma-separated)
desc = pd.read_csv(filepath_or_buffer='./input_data/Description.txt')

# preview the first five rows
desc.head(n=5)

In [None]:
# display full dataframe with descriptions
# The display limits are lifted only inside this block, so that no row,
# column or long text cell gets truncated; the global pandas options are
# restored automatically when the block ends.
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.max_colwidth', None):
    display(desc)

In [None]:
# alternative way of showing the whole description table: render it as plain text
from tabulate import tabulate

printout = tabulate(tabular_data=desc, headers='keys', tablefmt='plain')
print(printout)

# Note: other useful values for 'tablefmt':
# >> 'latex'
# >> 'github'
# >> 'grid'
# >> and many more...

In [None]:
# if package is missing, try installing with pip
pip install tabulate

---
## Analyze datasets
---

In [None]:
# description table ordered by variable type, to see which kinds of variables we have
desc.sort_values(by=['Type'])

In [None]:
# distinct values found in the 'Type' column of the description table
variable_types = pd.unique(desc['Type'])
print(variable_types)

In [None]:
# check how many values are missing in the main 'data'
#
# One row per column of 'data':
#   Name  - column name
#   Valid - number of non-missing entries
#   Empty - number of missing entries
#
# The counts are computed for all columns at once and the frame is built in a
# single step. Growing a DataFrame row by row with DataFrame.append is
# quadratic, and that method no longer exists in pandas 2.0 and later.
data_stats = pd.DataFrame({
    'Name': list(data.columns),
    'Valid': data.notna().sum().to_numpy(),
    'Empty': data.isna().sum().to_numpy(),
})

# check the dataframe
data_stats.head()

In [None]:
# remove 'ID' and 'deflag' (dependent variable) from analysis
# Rows are selected by name rather than by row label, so the cell can be
# re-run safely: dropping labels [0, 1] a second time would raise a KeyError.
excluded_columns = ['ID', 'deflag']
data_stats = data_stats[~data_stats['Name'].isin(excluded_columns)]

In [None]:
# display data stats as a plain-text table ('simple' format), using the
# dataframe's column names as headers
print(tabulate(data_stats, headers='keys', tablefmt='simple'))

In [None]:
# per-column mean and standard deviation of 'data', each stored as a
# two-column dataframe: 'Name' (column name) plus the statistic
average = data.mean().rename_axis('Name').reset_index(name='Average')
std_dev = data.std().rename_axis('Name').reset_index(name='Std_Dev')

In [None]:
# quantiles of every column at five evenly spaced levels: 0, 0.25, 0.5, 0.75, 1
quantile_levels = np.linspace(0.00, 1.00, 5)
quant = data.quantile(q=quantile_levels)
quant.head()

In [None]:
# reshape the quantile table: one row per variable, one column per quantile
quant_1 = quant.T

# readable labels for the five quantile levels
quantile_labels = ['Min', 'Q_0.25', 'Q_0.50', 'Q_0.75', 'Max']
quant_1.columns = quantile_labels

# move the variable names from the index into a regular 'Name' column
quant_1 = quant_1.reset_index().rename(columns={'index': 'Name'})

# verify
quant_1.head()

In [None]:
# gather every statistic in a single dataframe: start from the valid/empty
# counts and left-join each further table on 'Name', which must be unique on
# both sides (validate='one_to_one')
data_stats_1 = data_stats.copy()
for stats_part in (average, std_dev, quant_1):
    data_stats_1 = data_stats_1.merge(stats_part, how='left', on='Name', validate='one_to_one')
display(data_stats_1)

In [None]:
# some simple checks on 'var2_AQ': number of missing entries and number of
# entries greater than or equal to 5 (the second line used to be mislabelled
# as "NA values")
print(f"==> NA values: {data['var2_AQ'].isna().sum()}")
print(f"==> values >= 5: {(data['var2_AQ'] >= 5).sum()}")

---
## Plot distributions
---

In [None]:
# import plotting package
import matplotlib.pyplot as plt

In [None]:
# check one of the variables: histogram of 'var1_AQ' with 100 bins
# (the trailing ';' suppresses the textual output of plt.hist in the notebook)
plt.hist(data['var1_AQ'], bins=100);

In [None]:
# check all variables at once: one histogram per variable on a 5 x 6 grid
fig, axes = plt.subplots(5, 6, figsize=(20,20), sharex=False, sharey=False)


# some fancier colors - one per variable type; the palette is reused
# cyclically, so having more types than colors cannot cause a KeyError
color_list = ['tab:red', 'tab:blue', 'tab:green', 'tab:purple', 'tab:olive', 'tab:brown', 'tab:orange']
type_list = desc['Type'].unique()
type_color_dict = {
    var_type: color_list[idx % len(color_list)]
    for idx, var_type in enumerate(type_list)
}


# variables to plot: every column except the first two
# (other cells treat those two as 'ID' and 'deflag' - TODO confirm)
plot_columns = data.columns[2:]
flat_axes = axes.flatten()

# one variable per axis; zip stops at the shorter of the two sequences
for ax, col in zip(flat_axes, plot_columns):

    # prepare values for the histogram
    values = data[col]

    # decide on the color: look up the variable's type in the description table
    var_type = desc[desc.Criteria == col].Type.values[0]
    plot_color = type_color_dict[var_type]

    ## remove some outliers from histograms: keep the 1st..99th percentile range
    ## (missing values fail both comparisons, so they are dropped as well)
    cond_min = values >= values.quantile(q=0.01)
    cond_max = values <= values.quantile(q=0.99)
    values = values[ cond_min & cond_max ]

    ax.hist(values, alpha=0.8, bins=100, density=True, stacked=True, label=col, color=plot_color)
    ax.set_title(col)

# hide the axes of the grid that did not receive any variable
for ax in flat_axes[len(plot_columns):]:
    ax.set_visible(False)

In [None]:
# pairwise Pearson correlation (the pandas default) between the columns of 'data'
correlation_matrix = data.corr(method='pearson')

In [None]:
# plot correlation matrix as an image, one cell per pair of columns
# (the trailing ';' suppresses the textual output of plt.matshow in the notebook)
plt.matshow(correlation_matrix);

In [None]:
# correlation matrix drawn on an explicit figure/axes pair, with a color scale next to it
fig, ax = plt.subplots();
cax = ax.matshow(correlation_matrix, interpolation='nearest');
fig.colorbar(cax);

---
## Regression
---

In [None]:
# clean dataset - remember: there are NAs in 'X' dataset - need to be resolved
# Work on a copy so that the raw 'data' stays untouched and this cell can be re-run.
data_regression = data.copy()

# clean dataset - 99 as NA (there are many '99' entries across variables, maybe NaN?)
# 'ID' is excluded from the rule: an identifier equal to 99 is a legitimate value.
cond_99 = data_regression == 99.0
cond_99['ID'] = False

# replace the flagged entries with NaN
data_regression = data_regression.mask(cond_99)

# fill every NaN with the median of its column; the medians are taken from the
# masked frame, so the 99 placeholders declared missing above do not distort them
data_regression = data_regression.fillna(value=data_regression.median())

In [None]:
# regression inputs taken from the cleaned dataset

# dependent variable: the 'deflag' column
target_column = 'deflag'
y = data_regression[target_column]

# independent variables: every column except the identifier and the target
X = data_regression.drop(columns=['ID', target_column])

In [None]:
# build the training and test samples

# splitting helper from sklearn
from sklearn.model_selection import train_test_split

# 80% of the rows go to training; the split is stratified on y and uses a
# fixed seed so that it is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=123,
)

In [None]:
# dimensions of each part of the split
for label, part in (('X_train: ', X_train), ('y_train: ', y_train),
                    ('X_test:  ', X_test), ('y_test:  ', y_test)):
    print(f'{label}{part.shape}')

# sum of the target values relative to the sample size, in percent
train_default_pct = 100*sum(y_train)/len(y_train)
test_default_pct = 100*sum(y_test)/len(y_test)
print(f'default ratio - train: {train_default_pct:6.4f}%')
print(f'default ratio - test:  {test_default_pct:6.4f}%')

In [None]:
# perform regression

# import the classifier (imported here so the cell runs on a fresh kernel)
from sklearn.linear_model import LogisticRegression

# define classifier (default hyper-parameters)
clf = LogisticRegression()

# perform fit on the training sample
clf.fit(X_train, y_train)

In [None]:
# check if classes are ok (in our case: should be just 0 and 1)
# classes_ holds the class labels the classifier saw during fit
clf.classes_

In [None]:
# check fitted coefficients of the features in the decision function
# (presumably one value per column of X_train - verify against X_train.columns)
clf.coef_

In [None]:
# study some cases - true labels of the first four test entries
y_test.head(4)

In [None]:
# study some cases - predicted labels for the first four test entries
clf.predict(X_test.head(4))

In [None]:
# study some cases - check scores (decision-function values) for the first
# four test entries; the scores are computed from the features (X_test),
# not from the labels (y_test)
clf.decision_function(X_test.iloc[0:4])

In [None]:
# study some cases - class probabilities for the first four test entries
clf.predict_proba(X_test.head(4))

In [None]:
# plot ROC curve - import dedicated packages
from sklearn.metrics import RocCurveDisplay, roc_curve

In [None]:
# ROC curve - decision-function scores for the whole test sample
y_score = clf.decision_function(X_test)

# ROC curve - false/true positive rates, treating the second entry of clf.classes_ as positive
positive_label = clf.classes_[1]
fp_rate, tp_rate, thresholds = roc_curve(y_test, y_score, pos_label=positive_label)

# ROC curve - draw it
roc_display = RocCurveDisplay(fpr=fp_rate, tpr=tp_rate)
roc_display.plot();

In [None]:
# confusion matrix - helpers from sklearn
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# confusion matrix - predicted labels for the test sample
y_pred = clf.predict(X_test)

# confusion matrix - counts of true vs. predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# confusion matrix - draw it (plot() returns the display object)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm).plot()