In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, power_transform
from imblearn.over_sampling import SMOTE

def lower(word:str):
    print(word.lower())

def unique_values(database ,label:str):
    
    label = label.lower()
    n_obs = database.shape[0]
    nunique_label = database[label].nunique()

    print(f"n° unique values is {nunique_label} of {n_obs}")

def clean_cardinality(df):
    
    """Drops all columns with high and low cardinality"""
    n_unique_values = df.nunique()

    high = n_unique_values[n_unique_values == df.shape[0]].index
    low = n_unique_values[n_unique_values == 1 ].index
    total = [*high, *low]

    df.drop(columns=total, inplace=True)
    
    return df

def clean_missing(df, threshold=0.3):
    
    """drops all columns above a certain threshold"""

    # mysterious treatment.
    #df["DAYS_EMPLOYED"].replace({365243: np.nan}, inplace=True)

    # now, my treatment
    nulls = df.isnull().sum()/df.shape[0]
    #nulls.sort_values(ascending=False, inplace=True)
    null_columns = list(nulls[nulls>=threshold].index)
    df.drop(columns = null_columns, inplace=True)
    
    return df

def drop_correlated_features(X, threshold=0.5):

    """Find correlated columns in a DataFrame and drop them 
    
    Arguments:
        df (DataFrame): Data to analize.
        threshold (float): Minimun correlation value considered to decide whether
        two columns are correlated or not.
     
    Rreturns:
        A list with non-correlated columns."""
    
    numeric_features = list(X.select_dtypes(include=['int', 'float']).columns)
    numeric_data= X[numeric_features].copy()
    # Creating correlation matrix and getting their absolute values.
    corr = numeric_data.corr().abs()
    # Select upper triangle of correlation matrix
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    # Find features with correlation greater than 0.95
    the_drop = [column for column in upper.columns if any(upper[column] >= threshold)]

    no_correlated_columns = list(numeric_data.drop(the_drop, axis=1).columns)

    return no_correlated_columns

def get_features(X, type:str, get="all"):

    """Extract categorical or numeric features from a DataFrame

    Arguments:
        df (dataframe): Data to analize
        type (str): {"c, "n"} Whether the desired features is categorical(c)
                    or numeric(n).
        get (str): Whether extract only binary, no binary or all 
        categorical features {"all", "binary", "no_binary"}
        
    Return:
     list of all, binary or no binary categorical features.
    """
    if type=="c":
        # getting a table with only categorical features an their n° of unique values.
        cat_feat = X.select_dtypes(include=['O']).nunique()

        # from the object features: filtering the binary features.
        bin_cat_feat = cat_feat[cat_feat == 2].index
        # from the object features: filtering the non-binary features.
        no_bin_cat_feat = cat_feat[cat_feat != 2].index

        if get=="all":
            col = list(cat_feat.index)
        elif get=="binary":
            col= list(bin_cat_feat)
        elif get=="no_binary":
            col=list(no_bin_cat_feat)
        else:
            raise Exception("'get' must be in {all, binary, no_binary}")
    elif type =="n":
        # getting the names of the numerical features
        num_feat = X.select_dtypes(include=['int', 'float']).nunique()

        if get=="all":
            col = list(num_feat.index)
        elif get=="binary":
            bin_num_feat = num_feat[num_feat==2].index
            col= list(bin_num_feat)
        elif get=="no_binary":
            no_bin_num_feat = num_feat[num_feat!=2].index
            col=list(no_bin_num_feat)
        else:
            raise Exception("'get' must be in {all, binary, no_binary}")
    else:
        raise Exception("'type' must be in {c, n}")
    
    return col


def val_col(df, columns_to_select):
    valid_columns = [column for column in columns_to_select if column in df.columns]
    return valid_columns

def sel(df, columns_to_select):
    col = val_col(df, columns_to_select)
    return df[col]


def preprocess_no_bin(X_train, X_test, strategy):
    if strategy == "simple":
        si = SimpleImputer(strategy='median')
        si.fit(X_test)
        X_test = si.transform(X_test)
        X_train = si.transform(X_train)
    elif strategy == "knn":
        knn = KNNImputer(n_neighbors=8)
        knn.fit(X_test)
        X_test = knn.transform(X_test)
        X_train = knn.transform(X_train)    


    ss = StandardScaler()
    ss.fit(X_test)
    X_test=ss.transform(X_test)
    X_train=ss.transform(X_train)

    X_test = power_transform(X_test, method='yeo-johnson')
    X_train = power_transform(X_train, method='yeo-johnson')

    return X_train, X_test

def preprocess_bin(X_train, X_test):

    si = SimpleImputer(strategy='most_frequent')
    si.fit(X_test)
    X_test = si.transform(X_test)
    X_train = si.transform(X_train)

    return X_train, X_test

def preprocess(df, w):
    target = f"r{w}hosp1y"

    #print(f"n° features original data: {w1.shape[1]}")
    no_corr_columns = drop_correlated_features(df, threshold=0.4)
    w1_no_correlated = df[no_corr_columns].copy()
    #print(f"n° features after drop_correlated: {w1_no_correlated.shape[1]}")
    w1_clean= clean_cardinality(w1_no_correlated)
    #print(f"n° features after clean_missing & clean_cardinality: {w1_clean.shape[1]}")
    # dropping the nans in the target values by row-wise
    w1_clean.dropna(subset=[target], inplace=True)
    #print(w1_clean.shape)

    X = w1_clean.drop(columns=target).copy()
    y = w1_clean[target].copy()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    num_bin = get_features(X_train, "n", 'binary')
    #X_train_num_bin = X_train[num_bin]

    num_no_bin = get_features(X_train, "n", 'no_binary')
    #X_train_num_no_bin = X_train[num_no_bin]

    ################# nothing here ######################
    X_train_cat_bin = get_features(X_train, "c", 'binary') # nothing
    X_train_cat_no_bin = get_features(X_train, "c", 'no_binary') # nothing

    X_train[num_no_bin], X_test[num_no_bin] = preprocess_no_bin(X_train[num_no_bin], X_test[num_no_bin], strategy="simple")
    X_train[num_bin], X_test[num_bin] = preprocess_bin(X_train[num_bin], X_test[num_bin])

    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)
    X_test, y_test = smote.fit_resample(X_test, y_test)
    return X_train, X_test, y_train, y_test




In [2]:
w1 = pd.read_csv("data/waves_norm/wave1_norm.csv")
w2 = pd.read_csv("data/waves_norm/wave2_norm.csv")
w3 = pd.read_csv("data/waves_norm/wave3_norm.csv")
w4 = pd.read_csv("data/waves_norm/wave4_norm.csv")
w5 = pd.read_csv("data/waves_norm/wave5_norm.csv")


In [4]:
X_train_1, X_test_1, y_train_1, y_test_1 = preprocess(w1, 1)
X_train_2, X_test_2, y_train_2, y_test_2 = preprocess(w2, 2)
X_train_3, X_test_3, y_train_3, y_test_3 = preprocess(w3, 3)
X_train_4, X_test_4, y_train_4, y_test_4 = preprocess(w4, 4)
X_train_5, X_test_5, y_train_5, y_test_5 = preprocess(w5, 5)


In [5]:
from sklearn.utils import class_weight

cw1 = class_weight.compute_sample_weight(class_weight='balanced', y=y_train_1)
cw2 = class_weight.compute_sample_weight(class_weight='balanced', y=y_train_2)
cw3 = class_weight.compute_sample_weight(class_weight='balanced', y=y_train_3)
cw4 = class_weight.compute_sample_weight(class_weight='balanced', y=y_train_4)
cw5 = class_weight.compute_sample_weight(class_weight='balanced', y=y_train_5)

Looking at the target variable temporal distribution

In [6]:
y_train_list =[y_train_1, y_train_2, y_train_3, y_train_4, y_train_5]
y_test_list = [y_test_1, y_test_2, y_test_3, y_test_4, y_test_5]

for i in y_train_list:
    print(round(i.value_counts(normalize=True), 2))

print("="*100)

for i in y_test_list:
    print(round(i.value_counts(normalize=True), 2))

r1hosp1y
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64
r2hosp1y
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64
r3hosp1y
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64
r4hosp1y
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64
r5hosp1y
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64
r1hosp1y
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64
r2hosp1y
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64
r3hosp1y
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64
r4hosp1y
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64
r5hosp1y
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64


# 1. Modeling

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score


def logistic_reg(X_train, y_train, w_0, w_1):
    log = LogisticRegression(random_state=42, 
                            max_iter=1000, 
                            n_jobs=-1, 
                            class_weight={0: w_0, 1: w_1})
    return log.fit(X_train, y_train)

def random_forest(X_train, y_train):
    rf = RandomForestClassifier(n_estimators=100,
                                random_state=42, 
                                n_jobs=-1)
    return rf.fit(X_train, y_train)

def get_performance(model, X_train, X_test, y_train, y_test):
      y_pred = model.predict(X_test)

      roc_auc_train = roc_auc_score(y_train, model.predict(X_train))
      # Validation ROC AUC Score
      acc = accuracy_score(y_test, y_pred)
      roc_auc_val = roc_auc_score(y_test, y_pred)
      print( f"Model Accuracy: {acc}",
            f"Train roc_auc: {roc_auc_train}", 
            f"Test roc_auc: {roc_auc_val}",
            sep='\n', end='\n\n')
      print(confusion_matrix(y_test, y_pred),
      classification_report(y_test, y_pred),
      sep='\n')

def proba_dist(model, X_test):
    with plt.style.context('fivethirtyeight'):
        sns.histplot(model.predict_proba(X_test), bins=50)
        plt.title("Probability distribution for each Target category")
        plt.xlabel("Probability")
        plt.ylabel("Frequency")
        plt.show()

## 1.1. Logistic regression.

In [8]:
lr_1 = logistic_reg(X_train_1, y_train_1, 0.5, 0.5)
get_performance(lr_1, X_train_1, X_test_1, y_train_1, y_test_1)


Model Accuracy: 0.7241442767758557
Train roc_auc: 0.7817402407880335
Test roc_auc: 0.7241442767758557

[[2090  627]
 [ 872 1845]]
              precision    recall  f1-score   support

         0.0       0.71      0.77      0.74      2717
         1.0       0.75      0.68      0.71      2717

    accuracy                           0.72      5434
   macro avg       0.73      0.72      0.72      5434
weighted avg       0.73      0.72      0.72      5434



## 1.2. Random forest.

In [9]:
rf_1 = random_forest(X_train_1, y_train_1)
get_performance(rf_1, X_train_1, X_test_1, y_train_1, y_test_1)

Model Accuracy: 0.9344865660655134
Train roc_auc: 1.0
Test roc_auc: 0.9344865660655133

[[2625   92]
 [ 264 2453]]
              precision    recall  f1-score   support

         0.0       0.91      0.97      0.94      2717
         1.0       0.96      0.90      0.93      2717

    accuracy                           0.93      5434
   macro avg       0.94      0.93      0.93      5434
weighted avg       0.94      0.93      0.93      5434



## Bagging for XGBoost

In [10]:
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier

xgbm = XGBClassifier()

xgb_model_1=xgbm.fit(X_train_1, y_train_1)
xgb_model_2=xgbm.fit(X_train_2, y_train_2)
xgb_model_3=xgbm.fit(X_train_3, y_train_3)
xgb_model_4=xgbm.fit(X_train_4, y_train_4)

#models = [xgb_model_1, xgb_model_2, xgb_model_3, xgb_model_4]


In [11]:
# Combine models using BaggingClassifier
bagging_model = BaggingClassifier(estimator=xgbm, n_estimators=20, n_jobs=-1, random_state=42)

# Train the bagging model
bagging_model.fit(X_train_5, y_train_5)



In [12]:
get_performance(bagging_model, X_train_5, X_test_5, y_train_5, y_test_5)

Model Accuracy: 0.6791044776119403
Train roc_auc: 0.9642614312425635
Test roc_auc: 0.6791044776119404

[[1109 1839]
 [  53 2895]]
              precision    recall  f1-score   support

         0.0       0.95      0.38      0.54      2948
         1.0       0.61      0.98      0.75      2948

    accuracy                           0.68      5896
   macro avg       0.78      0.68      0.65      5896
weighted avg       0.78      0.68      0.65      5896



## Bagging for RandomForest

In [13]:
rfm=RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

rfm_1=rfm.fit(X_train_1, y_train_1)
rfm_2=rfm.fit(X_train_2, y_train_2)
rfm_3=rfm.fit(X_train_3, y_train_3)
rfm_4=rfm.fit(X_train_4, y_train_4)

b_rf =  BaggingClassifier(estimator=rfm, n_estimators=20, n_jobs=-1, random_state=42)
b_rf.fit(X_train_5, y_train_5)

In [14]:
get_performance(b_rf, X_train_5, X_test_5, y_train_5, y_test_5)

Model Accuracy: 0.923337856173677
Train roc_auc: 0.9930732619411865
Test roc_auc: 0.9233378561736771

[[2876   72]
 [ 380 2568]]
              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93      2948
         1.0       0.97      0.87      0.92      2948

    accuracy                           0.92      5896
   macro avg       0.93      0.92      0.92      5896
weighted avg       0.93      0.92      0.92      5896



## 1.3 Votting Classifier

In [15]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

clf1 = LogisticRegression(max_iter=1000)
clf2 = DecisionTreeClassifier(random_state=42)
clf3 = SVC(random_state=42)
clf4 = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

classifiers = [
    ('Logistic Regression', clf1), 
    ('Decision Tree', clf2), 
    ('SVM', clf3), 
    ('Random Forest', clf4)
    ]

voting_clf = VotingClassifier(estimators=classifiers, voting='hard', n_jobs=-1)

v1 = voting_clf.fit(X_train_1, y_train_1)
v2 = voting_clf.fit(X_train_2, y_train_2)
v3 = voting_clf.fit(X_train_3, y_train_3)
v4 = voting_clf.fit(X_train_4, y_train_4)

b_voting =  BaggingClassifier(estimator=voting_clf, n_estimators=20, n_jobs=-1, random_state=42)
b_voting.fit(X_train_5, y_train_5)



In [16]:
get_performance(b_voting, X_train_5, X_test_5, y_train_5, y_test_5)

Model Accuracy: 0.8264925373134329
Train roc_auc: 0.9671086180520143
Test roc_auc: 0.8264925373134329

[[2839  109]
 [ 914 2034]]
              precision    recall  f1-score   support

         0.0       0.76      0.96      0.85      2948
         1.0       0.95      0.69      0.80      2948

    accuracy                           0.83      5896
   macro avg       0.85      0.83      0.82      5896
weighted avg       0.85      0.83      0.82      5896

