<a href="https://colab.research.google.com/github/pratzz/Diabetes-Readmission-Prediction/blob/main/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pandas_profiling import ProfileReport
import os
import time

In [None]:
# Timestamp (YYYYMMDD-HHMMSS) captured once when this cell runs.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Folder layout, relative to the notebook's working directory.
output_path = "../output"
data_path = "../data"
data_exploration_path = os.path.join(output_path, "data_exploration")
processed_data_path = os.path.join(data_path, "processed_data")

# exist_ok=True creates any missing folders and is a no-op when they already
# exist, which avoids the check-then-create race of a separate exists() test.
os.makedirs(data_exploration_path, exist_ok=True)
os.makedirs(processed_data_path, exist_ok=True)

In [None]:
# Locate the raw train/test CSV files under the data folder, then load each
# one into its own DataFrame.
raw_train_csv = os.path.join(data_path, "raw_data/assignment_train.csv")
raw_test_csv = os.path.join(data_path, "raw_data/assignment_test.csv")

train_data = pd.read_csv(raw_train_csv)
test_data = pd.read_csv(raw_test_csv)

In [None]:
# Preview the first ten rows of the raw training data.
train_data.head(10)

### Data Profiling & Analysis 


In [None]:
# Build a pandas-profiling report over the raw training frame.
profile_report = ProfileReport(train_data, title="Pandas Profiling Report")
# Render the report inline in the notebook as widgets.
profile_report.to_widgets()

In [None]:
# Write the profiling report to a timestamped HTML file inside the
# data-exploration output folder.
report_filename = f"profile_report_1__{timestr}.html"
profile_report.to_file(os.path.join(data_exploration_path, report_filename))

## Dropping columns by variance inflation factor (VIF)

from statsmodels.stats.outliers_influence import variance_inflation_factor    

def calculate_vif_(X, thresh=5.0):
    """Iteratively drop the column with the highest variance inflation factor.

    On each pass the VIF of every remaining column is computed with
    ``variance_inflation_factor``. If the largest VIF exceeds ``thresh`` that
    column is removed and the pass is repeated; otherwise the loop stops.
    Each drop and the final set of columns are printed.

    Args:
        X: DataFrame whose columns are the candidate variables. Its values are
            handed to statsmodels as a plain array.
        thresh: A column is dropped only while the largest VIF is strictly
            greater than this value.

    Returns:
        ``X`` restricted to the columns that were not dropped, in their
        original order. A frame with no columns is returned unchanged.
    """
    variables = list(range(X.shape[1]))

    # Looping on ``variables`` (instead of a flag) means a frame with no
    # columns, or one where every column has been dropped, ends cleanly
    # rather than failing on max() of an empty list.
    while variables:
        candidates = X.iloc[:, variables]
        # Materialise the array once per pass instead of once per column.
        values = candidates.values
        vif = [variance_inflation_factor(values, ix)
               for ix in range(values.shape[1])]

        worst = max(vif)
        # Written as "not >" so a NaN maximum stops the loop, exactly as the
        # strict "max(vif) > thresh" comparison did.
        if not worst > thresh:
            break

        maxloc = vif.index(worst)
        # f-string formatting also works for non-string column labels.
        print(f"dropping '{candidates.columns[maxloc]}' at index: {maxloc}")
        del variables[maxloc]

    remaining = X.iloc[:, variables]
    print('Remaining variables:')
    print(remaining.columns)
    return remaining

# calculate_vif_(x_train, 5)

# Feature columns to keep; used to subset x_train / x_val and as the list of
# columns that get log-transformed.
# NOTE(review): presumably the "Remaining variables" output of an earlier
# calculate_vif_ run -- confirm these names match the dataset in use.
vif_keep_col = [
    "7days_all_gap_days",
    "all_gap_7days_last_vs_previous",
    "30days_all_gap_days",
    "all_gap_30days_last_vs_previous",
    "mtd_all_gap_days",
    "all_gap_mtd_previous_days",
    "all_last_day",
    "all_last30_stable",
    "all_last30_inc_count",
    "all_consistency_index",
    "avg_all_gap_days_d1_10",
    "avg_all_gap_days_d11_20",
    "all_gap_days_d1_10_thisvsprev",
    "all_gap_days_d11_20_thisvsprev",
    "all_gap_days_d20_31_thisvsprev",
    "all_7days_min_thisvs4w",
    "all_7days_trend_vs4weeks",
    "all_7days_trend_vs10weeks",
    "all_7days_vslast_month7days",
    "all_7days_max_thisvs10w",
    "all_ystrday_vsmin10d",
    "all_ystrday_trend_vs10d",
    "all_ystrday_vsdaybfr",
    "all_mrr_trend_vs6M",
    "all_lst30days_vsprvmnth",
    "all_mtd_vs_min_lst3M",
    "all_trend_mtdvs3M_sameday",
    "all_norm_growth_m1",
    "all_norm_growth_m2",
    "all_norm_growth_m3",
    "all_norm_growth_m4",
    "all_norm_growth_m5",
    "all_norm_growth_m6",
    "all_norm_growth_index_last",
    "all_gtv_last12Months_m12",
    "all_gtv_last10days_d3",
    "all_gtv_last10days_d4",
    "all_gtv_last10days_d6",
    "all_gtv_last10days_d7",
    "all_gtv_last10days_d8",
    "all_gtv_last10days_d9",
]


# Keep only the VIF-selected feature columns in both splits.
# .copy() makes each result an independent frame, so assigning into its
# columns afterwards does not raise pandas' SettingWithCopyWarning.
# NOTE(review): x_train / x_val must already exist when this runs -- confirm
# the cell execution order.
x_train = x_train[vif_keep_col].copy()
x_val = x_val[vif_keep_col].copy()

x_val.shape

## Log transformation: $\log\bigl(x - (\min(x) - 1)\bigr)$

def _shifted_log(column):
    """Return log(x - (min(x) - 1)) for every value x of ``column``.

    Subtracting ``min(x) - 1`` moves the smallest value to 1 before the
    logarithm is taken.
    """
    return np.log(column - (min(column) - 1))


# Transform every selected feature of the training split in place.
for name in vif_keep_col:
    x_train[name] = _shifted_log(x_train[name])
x_train.shape

# Same transformation for the validation split.
# NOTE(review): the shift uses the validation column's own minimum, not the
# training minimum, so the two splits are not mapped identically -- confirm
# this is intended.
for name in vif_keep_col:
    x_val[name] = _shifted_log(x_val[name])
x_val.shape

##### Splitting data into train and validation datasets 


In [None]:
target = 'business_risk'

# Separate the predictors from the label column. ``columns=`` is used because
# passing the axis positionally (``drop([target], 1)``) is rejected by
# pandas 2.0 and later.
X = train_data.drop(columns=[target])
y = train_data[target]
# Divide the given training data into train and validation frames: 20% is
# held out, with a fixed random_state so the split is reproducible.
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

##### Saving data on disk

In [None]:
# Output file name -> object to persist; each is written to the
# processed-data folder as CSV without the index column.
processed_outputs = {
    "x_train.csv": x_train,
    "x_val.csv": x_val,
    "y_train.csv": y_train,
    "y_val.csv": y_val,
    "x_test.csv": test_data,
}
for csv_name, dataset in processed_outputs.items():
    dataset.to_csv(os.path.join(processed_data_path, csv_name), index=False)