In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
w1 = pd.read_csv("data/waves_norm/wave1_norm.csv")
w2 = pd.read_csv("data/waves_norm/wave2_norm.csv")
w3 = pd.read_csv("data/waves_norm/wave3_norm.csv")
w4 = pd.read_csv("data/waves_norm/wave4_norm.csv")
w5 = pd.read_csv("data/waves_norm/wave5_norm.csv")


In [3]:
w1.shape

(26839, 370)

In [4]:
w1.head()

Unnamed: 0,r1tr8_m,r1dresshlp,r1quitsmok,r1rorgnz,r1walkra,r1cncrmeds,r1oopmd1y,r1livsib,r1rarcarehr,r1stroklmt,...,r1walkhlp,r1iothr,r1rpfcaredpm,r1lost,r1iadlfourm,r1adla,r1doctim1y,r1rfcaredpmm,r1iqscore9,r1paina
0,10.333333,,15.0,,0.0,0.0,775.0,2.0,,,...,,0.0,,,0.0,0.0,4.0,,,0.0
1,13.333334,,,,0.0,0.0,500.0,5.0,,,...,,0.0,,,0.0,0.0,1.0,,,
2,13.0,,,,0.0,0.0,380.0,16.0,,,...,,0.0,,,0.0,0.0,2.0,,,1.0
3,10.666666,,,,0.0,0.0,0.0,6.0,,,...,,0.0,,,0.0,0.0,0.0,,,
4,8.0,,3.0,,0.0,0.0,2250.0,9.0,,,...,,0.0,,,0.0,0.0,0.0,,,


In [5]:
def lower(word:str):
    print(word.lower())

def unique_values(database ,label:str):
    
    label = label.lower()
    n_obs = database.shape[0]
    nunique_label = database[label].nunique()

    print(f"n° unique values is {nunique_label} of {n_obs}")

def clean_cardinality(df):
    
    """Drops all columns with high and low cardinality"""
    n_unique_values = df.nunique()

    high = n_unique_values[n_unique_values == df.shape[0]].index
    low = n_unique_values[n_unique_values == 1 ].index
    total = [*high, *low]

    df.drop(columns=total, inplace=True)
    
    return df

def clean_missing(df, threshold=0.3):
    
    """drops all columns above a certain threshold"""

    # mysterious treatment.
    #df["DAYS_EMPLOYED"].replace({365243: np.nan}, inplace=True)

    # now, my treatment
    nulls = df.isnull().sum()/df.shape[0]
    #nulls.sort_values(ascending=False, inplace=True)
    null_columns = list(nulls[nulls>=threshold].index)
    df.drop(columns = null_columns, inplace=True)
    
    return df

def drop_correlated_features(X, threshold=0.5):

    """Find correlated columns in a DataFrame and drop them 
    
    Arguments:
        df (DataFrame): Data to analize.
        threshold (float): Minimun correlation value considered to decide whether
        two columns are correlated or not.
     
    Rreturns:
        A list with non-correlated columns."""
    
    numeric_features = list(X.select_dtypes(include=['int', 'float']).columns)
    numeric_data= X[numeric_features].copy()
    # Creating correlation matrix and getting their absolute values.
    corr = numeric_data.corr().abs()
    # Select upper triangle of correlation matrix
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    # Find features with correlation greater than 0.95
    the_drop = [column for column in upper.columns if any(upper[column] >= threshold)]

    no_correlated_columns = list(numeric_data.drop(the_drop, axis=1).columns)

    return no_correlated_columns

def get_features(X, type:str, get="all"):

    """Extract categorical or numeric features from a DataFrame

    Arguments:
        df (dataframe): Data to analize
        type (str): {"c, "n"} Whether the desired features is categorical(c)
                    or numeric(n).
        get (str): Whether extract only binary, no binary or all 
        categorical features {"all", "binary", "no_binary"}
        
    Return:
     list of all, binary or no binary categorical features.
    """
    if type=="c":
        # getting a table with only categorical features an their n° of unique values.
        cat_feat = X.select_dtypes(include=['O']).nunique()

        # from the object features: filtering the binary features.
        bin_cat_feat = cat_feat[cat_feat == 2].index
        # from the object features: filtering the non-binary features.
        no_bin_cat_feat = cat_feat[cat_feat != 2].index

        if get=="all":
            col = list(cat_feat.index)
        elif get=="binary":
            col= list(bin_cat_feat)
        elif get=="no_binary":
            col=list(no_bin_cat_feat)
        else:
            raise Exception("'get' must be in {all, binary, no_binary}")
    elif type =="n":
        # getting the names of the numerical features
        num_feat = X.select_dtypes(include=['int', 'float']).nunique()

        if get=="all":
            col = list(num_feat.index)
        elif get=="binary":
            bin_num_feat = num_feat[num_feat==2].index
            col= list(bin_num_feat)
        elif get=="no_binary":
            no_bin_num_feat = num_feat[num_feat!=2].index
            col=list(no_bin_num_feat)
        else:
            raise Exception("'get' must be in {all, binary, no_binary}")
    else:
        raise Exception("'type' must be in {c, n}")
    
    return col


def val_col(df, columns_to_select):
    valid_columns = [column for column in columns_to_select if column in df.columns]
    return valid_columns

def sel(df, columns_to_select):
    col = val_col(df, columns_to_select)
    return df[col]

In [6]:
print(f"n° features original data: {w1.shape[1]}")
no_corr_columns = drop_correlated_features(w1, threshold=0.4)
w1_no_correlated = w1[no_corr_columns].copy()
print(f"n° features after drop_correlated: {w1_no_correlated.shape[1]}")
w1_clean= clean_cardinality(w1_no_correlated)
#w1_clean = w1_no_correlated[w1_clean_columns].copy()
print(f"n° features after clean_missing & clean_cardinality: {w1_clean.shape[1]}")


n° features original data: 370
n° features after drop_correlated: 121
n° features after clean_missing & clean_cardinality: 95


In [7]:
w1_clean.dropna(subset=["r1hosp1y"], inplace=True)
w1_clean.shape


(15150, 95)

In [8]:
w1_clean.corr()["r1hosp1y"].sort_values(ascending=False)

r1hosp1y       1.000000
r1oophosf1y    0.263367
r1oopmd1y      0.203817
r1dresshlp     0.186922
r1doctor1y     0.166849
                 ...   
r1smoken      -0.037122
r1wthh        -0.041767
r1rarcarehr   -0.052259
r1alone       -0.141456
r1iwstat            NaN
Name: r1hosp1y, Length: 95, dtype: float64

In [9]:
from imblearn.over_sampling import SMOTE
target = "r1hosp1y"
X = w1_clean.drop(columns=target).copy()
y = w1_clean[target].copy()

In [10]:
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

df_smote = pd.concat([X_smote, y_smote], axis=1)

ValueError: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
num_bin = get_features(X_train, "n", 'binary')
#X_train_num_bin = X_train[num_bin]

num_no_bin = get_features(X_train, "n", 'no_binary')
#X_train_num_no_bin = X_train[num_no_bin]

################# nothing here ######################
X_train_cat_bin = get_features(X_train, "c", 'binary') # nothing
X_train_cat_no_bin = get_features(X_train, "c", 'no_binary') # nothing

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, power_transform

def preprocess_no_bin(X_train, X_test):

    si = SimpleImputer(strategy='median')
    si.fit(X_test)
    X_test = si.transform(X_test)
    X_train = si.transform(X_train)

    ss = StandardScaler()
    ss.fit(X_test)
    X_test=ss.transform(X_test)
    X_train=ss.transform(X_train)

    X_test = power_transform(X_test, method='yeo-johnson')
    X_train = power_transform(X_train, method='yeo-johnson')

    return X_train, X_test

def preprocess_bin(X_train, X_test):

    si = SimpleImputer(strategy='most_frequent')
    si.fit(X_test)
    X_test = si.transform(X_test)
    X_train = si.transform(X_train)

    return X_train, X_test

X_train[num_no_bin], X_test[num_no_bin] = preprocess_no_bin(X_train[num_no_bin], X_test[num_no_bin])
X_train[num_bin], X_test[num_bin] = preprocess_bin(X_train[num_bin], X_test[num_bin])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold


log = LogisticRegression(random_state=42, 
                         max_iter=1000, 
                         n_jobs=-1, 
                         class_weight={0:0.1, 1:0.9})
log.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
y_pred = log.predict(X_test)

roc_auc_train = roc_auc_score(y_train, log.predict(X_train))
# Validation ROC AUC Score
roc_auc_val = roc_auc_score(y_test, y_pred)
print(f"roc_auc for train data: {roc_auc_train}", 
      f"roc_auc for test data: {roc_auc_val}",
      sep='\n')

acc = accuracy_score(y_test, y_pred)
print(confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n')

In [None]:
with plt.style.context('fivethirtyeight'):
    sns.histplot(log.predict_proba(X_test), bins=100)
    plt.title("Probability distribution for each Target category")
    plt.xlabel("Probability")
    plt.ylabel("Frequency")
    plt.show()

In [None]:
w5.head(15)

In [None]:
def impute_missing_values(df, columns):
    from sklearn.impute import KNNImputer
    from sklearn.compose import ColumnTransformer
    
    num_imputer = KNNImputer(n_neighbors=2)
    
    imputer = ColumnTransformer([("num_imputer",num_imputer,columns)])
    
    filled_columns = imputer.fit_transform(df)
    df_filled = pd.DataFrame(filled_columns, columns=columns)
    
    return df_filled

# columns_to_impute = w5.columns.tolist()
# w5_filled = impute_missing_values(w5, columns_to_impute)
# w5_filled.head(15)

roc_auc for train data: 0.7617286962427524
roc_auc for test data: 0.7155879264505463
[[2109  608]
 [ 108  205]]
              precision    recall  f1-score   support

         0.0       0.95      0.78      0.85      2717
         1.0       0.25      0.65      0.36       313

    accuracy                           0.76      3030
   macro avg       0.60      0.72      0.61      3030
weighted avg       0.88      0.76      0.80      3030