In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, plot_roc_curve, plot_precision_recall_curve, accuracy_score

from imblearn.over_sampling import RandomOverSampler

from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

In [2]:
df_init = pd.read_csv('data.csv', sep=';')
df_init.head()

Unnamed: 0,Id,Result,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,...,Feature_109,Feature_110,Feature_111,Feature_112,Feature_113,Feature_114,Feature_115,Feature_116,Feature_117,Feature_118
0,1,2,2,56,12,1,7.0,4.0,0,1.0,...,1.0,0.0,1,0,1,1,1,1,1,0
1,2,2,2,69,19,1,6.0,4.0,0,1.0,...,1.0,0.0,1,0,1,1,1,0,1,1
2,3,2,1,66,8,1,4.0,4.0,0,1.0,...,1.0,0.0,1,0,1,1,1,0,1,0
3,4,2,2,62,16,1,,3.0,0,1.0,...,1.0,0.0,1,0,1,1,0,0,1,1
4,5,2,2,67,30,1,,4.0,0,1.0,...,1.0,0.0,1,0,1,1,1,0,1,0


In [3]:
df = df_init.copy()

target_feature = 'Result'
num_features = ['Feature_39', 'Feature_40', 'Feature_41', 'Feature_42', 'Feature_43', 'Feature_44', 'Feature_45', 'Feature_46',
                'Feature_47', 'Feature_48', 'Feature_49', 'Feature_50', 'Feature_51', 'Feature_53', 'Feature_55', 'Feature_57', 
                'Feature_58', 'Feature_59', 'Feature_64', 'Feature_70', 'Feature_71', 'Feature_72', 'Feature_73']

for col in df.columns:
    if col in num_features:
        df[col].fillna(df[col].median(), inplace=True)
        df[col].astype('float64')
    else:
        df[col].fillna(-1, inplace=True)
        df[col].astype('int64')

golden_features = ['Feature_3', 'Feature_4', 'Feature_35']

drop_susp_features = True
trash_features = []
susp_features = ['Feature_29', 'Feature_30', 'Feature_32', 'Feature_101'] if drop_susp_features else []
all_features = df.columns.drop([target_feature] + golden_features + susp_features + trash_features + ['Id']).tolist()

df = df[[target_feature] + all_features]

# Result=0 - alive, Result=1 died
df.loc[df[target_feature] == 1, target_feature] = 1
df.loc[df[target_feature] == 2, target_feature] = 0

In [4]:
df.head()

Unnamed: 0,Result,Feature_1,Feature_2,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,Feature_11,...,Feature_109,Feature_110,Feature_111,Feature_112,Feature_113,Feature_114,Feature_115,Feature_116,Feature_117,Feature_118
0,0,2,56,7.0,4.0,0,1.0,1.0,0.0,0.0,...,1.0,0.0,1,0,1,1,1,1,1,0
1,0,2,69,6.0,4.0,0,1.0,1.0,0.0,1.0,...,1.0,0.0,1,0,1,1,1,0,1,1
2,0,1,66,4.0,4.0,0,1.0,1.0,1.0,0.0,...,1.0,0.0,1,0,1,1,1,0,1,0
3,0,2,62,-1.0,3.0,0,1.0,1.0,0.0,1.0,...,1.0,0.0,1,0,1,1,0,0,1,1
4,0,2,67,-1.0,4.0,0,1.0,1.0,0.0,0.0,...,1.0,0.0,1,0,1,1,1,0,1,0


In [5]:
def get_best_model(X, y):
    models = {
        0: {'name': 'LogisticRegression', 'estimator': LogisticRegression()},
        1: {'name': 'RidgeClassifier', 'estimator': RidgeClassifier()},
        2: {'name': 'SGDClassifier', 'estimator': SGDClassifier()},
        3: {'name': 'LinearSVC', 'estimator': LinearSVC()},
        4: {'name': 'DecisionTreeClassifier', 'estimator': DecisionTreeClassifier()},
        5: {'name': 'ExtraTreeClassifier', 'estimator': ExtraTreeClassifier()},
        6: {'name': 'GradientBoostingClassifier', 'estimator': GradientBoostingClassifier()},
    }

    scalers = {
        0: {'name': 'None', 'estimator': None},
        1: {'name': 'StandardScaler', 'estimator': StandardScaler()},
        2: {'name': 'Normalizer', 'estimator': Normalizer()},
        3: {'name': 'MinMaxScaler', 'estimator': MinMaxScaler()}
    }

    for i in scalers.keys():
        X_ = X.copy()
        if scalers[i]['estimator'] is not None:
            X_ = scalers[i]['estimator'].fit_transform(X_)

        print(f"\nScaler: {scalers[i]['name']}")
        for k in models.keys():
            score = cross_val_score(
                models[k]['estimator'],
                X_,
                y,
                scoring="accuracy",
                cv=5,
                # cv=StratifiedKFold(
                #     n_splits=5,
                #     shuffle=True,
                #     random_state=0
                # ),
            )
            
            print(f"\t{models[k]['name']}: {round(score.mean(),2)}")

In [6]:
X = df[all_features]
y = df[target_feature]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    test_size=0.1,
    random_state=100,
    stratify=y
)

over_sampler = RandomOverSampler(
    random_state=0
)
X_bal_over, y_bal_over = over_sampler.fit_resample(
    X_train,
    y_train,
)

get_best_model(X_bal_over, y_bal_over)


Scaler: None
	LogisticRegression: 0.78
	RidgeClassifier: 0.86
	SGDClassifier: 0.6
	LinearSVC: 0.74
	DecisionTreeClassifier: 0.88
	ExtraTreeClassifier: 0.84
	GradientBoostingClassifier: 0.94

Scaler: StandardScaler
	LogisticRegression: 0.91
	RidgeClassifier: 0.82
	SGDClassifier: 0.92
	LinearSVC: 0.88
	DecisionTreeClassifier: 0.89
	ExtraTreeClassifier: 0.86
	GradientBoostingClassifier: 0.93

Scaler: Normalizer
	LogisticRegression: 0.63
	RidgeClassifier: 0.62
	SGDClassifier: 0.57
	LinearSVC: 0.64
	DecisionTreeClassifier: 0.91
	ExtraTreeClassifier: 0.82
	GradientBoostingClassifier: 0.94

Scaler: MinMaxScaler
	LogisticRegression: 0.87
	RidgeClassifier: 0.9
	SGDClassifier: 0.9
	LinearSVC: 0.92
	DecisionTreeClassifier: 0.86
	ExtraTreeClassifier: 0.82
	GradientBoostingClassifier: 0.93
