In [189]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [190]:
X_PATH = "./data/Train.csv"
Y_PATH = "./data/y.csv"

X = pd.read_csv(X_PATH)
y = pd.read_csv(Y_PATH).values.ravel()

In [191]:
len(X)

4000

In [192]:
X.head()

Unnamed: 0,ALP_first,ALP_last,ALT_first,ALT_last,AST_first,AST_last,Age,Albumin_first,Albumin_last,BUN_first,...,TroponinT_first,TroponinT_last,UrineOutputSum,WBC_first,WBC_last,Weight,Weight_first,Weight_last,pH_first,pH_last
0,,,,,,,54.0,,,13.0,...,,,,11.2,9.4,,,,,
1,,,,,,,76.0,,,16.0,...,,,5.0,7.4,13.3,76.0,80.6,81.6,7.45,7.37
2,127.0,105.0,91.0,75.0,235.0,164.0,44.0,2.7,2.3,8.0,...,,,14.0,4.2,6.2,56.7,56.7,56.7,7.51,7.47
3,105.0,105.0,12.0,12.0,15.0,15.0,68.0,4.4,4.4,23.0,...,,,,11.5,7.9,84.6,84.6,84.6,,
4,,,,,,,88.0,3.3,3.3,45.0,...,,,,3.8,4.8,,,,,


## Removing Outliers

Taken from https://www.kaggle.com/code/jonaspalucibarbosa/removing-outliers-within-a-pipeline

In [193]:
def CustomSampler_IQR (X, y):
    
    features = X.columns
    df = X.copy()
    df['Outcome'] = y
    
    indices = [x for x in df.index]    
    out_indexlist = []
        
    for col in features:
       
        #Using nanpercentile instead of percentile because of nan values
        Q1 = np.nanpercentile(df[col], 25.)
        Q3 = np.nanpercentile(df[col], 75.)
        
        cut_off = (Q3 - Q1) * 1.5
        upper, lower = Q3 + cut_off, Q1 - cut_off
                
        outliers_index = df[col][(df[col] < lower) | (df[col] > upper)].index.tolist()
        outliers = df[col][(df[col] < lower) | (df[col] > upper)].values        
        out_indexlist.extend(outliers_index)
        
    #using set to remove duplicates
    out_indexlist = list(set(out_indexlist))
    
    clean_data = np.setdiff1d(indices,out_indexlist)

    return X.loc[clean_data], y[clean_data]

In [194]:
X,y = CustomSampler_IQR(X, y)

We can see that very small amount of samples is left.

In [195]:
len(X)

594

In [196]:
print(np.sum(y == 0))
print(np.sum(y == 1))

550
44


## Dropping Nan values

### Method1: Median

In [175]:
imputer = SimpleImputer(strategy = "median")
param_grid = {}

In [176]:
X = pd.DataFrame(imputer.fit_transform(X), index=X.index, columns=X.columns)

### Method2: KNNImpute

In [185]:
imputer = KNNImputer()
param_grid = {"imputer__n_neighbors" : [1, 3, 7], "imputer__weights": ["uniform", "distance"] }

## Fitting to a model

We can see that the accuracy metric is pretty high, especially when removing outliers. However, the f1 score is quite low and more so with removed outliers. These are due to the following reasons.

1. The data is imbalanced, so high accuracy can be achieved only with correct guesing of the class1
2. 

In [186]:
scoring_metric = "f1"

model = LogisticRegression(penalty = "l2", max_iter = 500)
pipe = Pipeline([("imputer", imputer), ('scaler', StandardScaler()), ('model', model)])
grid = GridSearchCV(pipe, param_grid, scoring = scoring_metric, verbose = 3)
grid.fit(X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END imputer__n_neighbors=1, imputer__weights=uniform;, score=0.447 total time=   7.6s
[CV 2/5] END imputer__n_neighbors=1, imputer__weights=uniform;, score=0.378 total time=   9.9s
[CV 3/5] END imputer__n_neighbors=1, imputer__weights=uniform;, score=0.323 total time=   7.9s
[CV 4/5] END imputer__n_neighbors=1, imputer__weights=uniform;, score=0.360 total time=   7.3s
[CV 5/5] END imputer__n_neighbors=1, imputer__weights=uniform;, score=0.349 total time=   7.6s
[CV 1/5] END imputer__n_neighbors=1, imputer__weights=distance;, score=0.447 total time=   8.0s
[CV 2/5] END imputer__n_neighbors=1, imputer__weights=distance;, score=0.378 total time=   7.6s
[CV 3/5] END imputer__n_neighbors=1, imputer__weights=distance;, score=0.323 total time=   7.5s
[CV 4/5] END imputer__n_neighbors=1, imputer__weights=distance;, score=0.360 total time=   7.6s
[CV 5/5] END imputer__n_neighbors=1, imputer__weights=distance;, score=0.349 tota

In [187]:
grid.best_score_

0.37140863555128056

In [179]:
grid.best_params_

{}