In [109]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression
from sklearn.discriminant_analysis import StandardScaler
import numpy as np
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, ParameterGrid
from keras.callbacks import TensorBoard, EarlyStopping, LearningRateScheduler
from keras.models import Model
from keras.layers import Activation, Dense, LSTM, Input
from keras.optimizers import Adam, RMSprop, SGD
from scikeras.wrappers import KerasClassifier
import tensorflow as tf
from os import path

Method to load the dataset

In [110]:
def load_dataset(columns_drop) -> tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]):
    train = pd.read_csv("C:\\Projects\\kaggle\\competitions\\identify-age-related-conditions\\data\\train.csv")
    greeks = pd.read_csv("C:\\Projects\\kaggle\\competitions\\identify-age-related-conditions\\data\\greeks.csv")
    test = pd.read_csv("C:\\Projects\\kaggle\\competitions\\identify-age-related-conditions\\data\\test.csv")
    columns_drop = ['Id'] + columns_drop
    id_list = test["Id"]
    train.drop(columns_drop, inplace=True, axis=1)
    test.drop(columns_drop, inplace=True, axis=1)
    print(len(train.columns))
    return (train, greeks, test, id_list)

Method to split the data in validation and train set randomly

In [111]:
def split_data(df: pd.DataFrame, split)->tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]):
    X = df.loc[:, df.columns != "Class"]
    y = df.loc[:, "Class"]
    return train_test_split(X, y, test_size=split, random_state=42)

Method to build a Tensorflow model

In [112]:
def build_tensorflow_model(input_shape:int, output_shape:int, units1: int, units2: int, units3: int, activation1: str, 
                activation2: str, activation3: str, optimizer: tf.keras.optimizers.Optimizer, learning_rate: float) -> Model:
    input = Input(shape=input_shape)
    x = Dense(units=units1, activation=activation1)(input)
    x = Dense(units=units2, activation=activation2)(x)
    x = Dense(units=units3, activation=activation3)(x)
    output = Dense(units=output_shape, activation="softmax")(x)
    model = Model(inputs=[input], outputs=[output])
    
    model.compile(loss="categorical_crossentropy",
              optimizer=optimizer(learning_rate=learning_rate),
              metrics=["accuracy"])   
    return model

Method to plot the accuracy of the model

In [113]:
def plot_acc_tf_model(history:Model):
    # summarize history for accuracy
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()
    
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

Method to fit the Tensorflow model with ES Callback

In [114]:
es_callback = EarlyStopping(
        monitor="val_accuracy",
        patience=5,
        verbose=1,
        restore_best_weights=True,
        min_delta=0.005
    )
 
def fit_model(model: Model, x: np.ndarray, y: np.ndarray, epochs: int, split: float) -> Model:
    #split train and validation
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=split, random_state=42)
    #fit the model
    history = model.fit(x_train, y_train, epochs=epochs, validation_data=(x_val,y_val), callbacks=[es_callback])
    return history 

Method for GridSearch of Tensorflow model

In [115]:
grid_params = {"units1": [8], "units2": [16,32], "units3": [64,128], "activation1": ["relu"], "activation2": ["relu"], 
          "activation3": ["relu"], "optimizer": [Adam, RMSprop, SGD], "learning_rate": [0.001, 0.0007]}

#GridSearch
def grid_search_tf_model(X_train: pd.DataFrame, y_train: pd.DataFrame)->Model:
    grid = ParameterGrid(param_grid = grid_params)
    results = []
    input_shape = len(X_train[1])
    output_shape = 2
    for idx,params in enumerate(grid):
        model = build_tensorflow_model(input_shape=input_shape, output_shape=output_shape, **params)
        history = fit_model(model, X_train, y_train, 100, 0.2)
        val_loss = history.history['val_loss'][-1]    
        val_acc = history.history['val_accuracy'][-1]
        results.append([val_loss, val_acc])
        
    val_accuracies = [i[1] for i in results]
    val_losses= [i[0] for i in results]
    best_acc = val_accuracies.index(max(val_accuracies))
    best_loss = val_losses.index(min(val_losses))
    print(f"best acc at index {best_acc}: {max(val_accuracies)}")
    print(f"best loss at index {best_loss}: {min(val_losses)}")
    print(grid[best_acc])
    
    model = build_tensorflow_model(input_shape=input_shape, output_shape=output_shape, **grid[best_acc])
    history = fit_model(model, X_train, y_train, 100, 0.2)
    #plot_acc_tf_model(history)
    return model

Method to build a preprocessing pipeline

In [116]:
def build_preprocessing_pipeline(df: pd.DataFrame) -> ColumnTransformer:
    # Preprocessing for numerical data    
    numerical_transformer = Pipeline(steps=[
        ('imputer',SimpleImputer(strategy='constant')),
        ('scaler', StandardScaler())])
    
    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Bundle preprocessing for numerical and categorical data
    numerical_cols = [cname for cname in df.columns if df[cname].dtype in ["int64", "float64"]]
    categorical_cols = [cname for cname in df.columns if df[cname].nunique() < 10]
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])
    print(f"Number of columns: {len(df.columns)}")
    return preprocessor

Method to generate the Mutual Info scores and plot them

In [117]:

def make_mi_scores(X: pd.DataFrame, y: pd.DataFrame, discrete_features: list):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.figure(dpi=100, figsize=(16, 16))
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")

Display MI scores that are beneath 0.01

In [118]:
#get the mutual information of features
train, greeks, test, id_list = load_dataset(columns_drop=[])
print(f"dataset shape: {train.shape}")
X_train, X_valid, y_train, y_valid = split_data(train, 0.3)
print(f"X shape: {X_train.shape} and y shape: {y_train.shape}")

cols = list(X_train.columns)
cols.append("1")
X_transformed = X_train.fillna(0)
X_transformed["EJ"].replace(['A', 'B'], [0, 1], inplace=True)
discrete_features = X_transformed.dtypes == int
mi_scores = make_mi_scores(X_transformed, y_train, discrete_features)
bad_scores = list(mi_scores.index[i] for i, score in zip(range(len(mi_scores)),mi_scores) if score < 0.01)
#plot_mi_scores(mi_scores)
print(f" Columns with MI equal zero: {bad_scores} --> total length: {len(bad_scores)}")

57
dataset shape: (617, 57)
X shape: (431, 56) and y shape: (431,)
 Columns with MI equal zero: ['AY', 'DN', 'CC', 'DY', 'EG', 'CH', 'DF', 'CF', 'CD ', 'CU', 'AR', 'GB', 'DE', 'EJ', 'EL', 'DV', 'AZ', 'FD ', 'BN', 'CL', 'BD ', 'CB'] --> total length: 22


Create a model and preprocessor

In [119]:
#Newly load the dataset with columns drop and create preprocessing pipeline
train, greeks, test, id_list = load_dataset(columns_drop=bad_scores)
print(f"dataset shape: {train.shape}")
X_train, X_valid, y_train, y_valid = split_data(train, 0.3)
preprocessor = build_preprocessing_pipeline(X_train)


# Define models for the pipeline
preprocessor.fit(X_train)
X_preprocessed = preprocessor.transform(X_train)
y_train_ohe = pd.get_dummies(y_train, columns = ['Class'])
y_valid_ohe = pd.get_dummies(y_valid, columns = ['Class'])
model_keras = KerasClassifier(model=grid_search_tf_model(X_train=X_preprocessed, y_train=y_train_ohe), epochs=0)
model_rf = RandomForestClassifier(n_estimators=100, random_state=22)
model_xgb = XGBClassifier(n_estimators=500)

35
dataset shape: (617, 35)


Number of columns: 34
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
 1/11 [=>............................] - ETA: 0s - loss: 0.3964 - accuracy: 0.8750Restoring model weights from the end of the best epoch: 1.
Epoch 6: early stopping
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
 1/11 [=>............................] - ETA: 0s - loss: 0.2606 - accuracy: 0.9688Restoring model weights from the end of the best epoch: 1.
Epoch 6: early stopping
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
 1/11 [=>............................] - ETA: 0s - loss: 0.1730 - accuracy: 0.9375Restoring model weights from the end of the best epoch: 6.
Epoch 11: early stopping
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
 1/11 [=>.........

Build the final pipeline with preprocessor and model, fit it and display accuracy score

In [120]:
# Building Pipeline
def fit_pipeline(X_train: pd.DataFrame, y_train: pd.DataFrame, preprocessor, model) -> Pipeline:
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                            ('model', model)
                            ])
    #Fit the Model and make preds
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_valid)
    """ score = cross_val_score(pipeline, X_valid, y_valid, cv=5, scoring='accuracy')
    print(f"Accuracy of {score}") """
    correct_answers = 0
    for y_pred,y_true in zip(preds,y_valid_ohe.to_numpy()):
        if(y_pred[0] == y_true[0]):correct_answers+=1
    print(correct_answers/len(preds))
    return pipeline

Print results of the model

In [121]:
pipeline_keras = fit_pipeline(X_train, y_train_ohe, preprocessor, model_keras)
pipeline_rf = fit_pipeline(X_train, y_train_ohe, preprocessor, model_rf)
pipeline_xgb = fit_pipeline(X_train, y_train_ohe, preprocessor, model_xgb)

0.8225806451612904
0.9247311827956989
0.946236559139785


Combine the fitted models to look if the accuracy improves

In [122]:
preds1 = pipeline_keras.predict(X_valid)
preds2 = pipeline_rf.predict(X_valid)
preds3 = pipeline_xgb.predict(X_valid)
correct_answers = 0
preds = [[0,0] for _ in range(len(preds1))]
for y_pred1,y_pred2,y_pred3, i in zip(preds1,preds2,preds3, range(len(preds1))):
    count_class1 = y_pred1[0] + y_pred2[0] + y_pred3[0]
    count_class2 = y_pred1[1] + y_pred2[1] + y_pred3[1]
    if(count_class1 > count_class2):
        preds[i][0] = 1
        preds[i][1] = 0
    else:
        preds[i][0] = 0
        preds[i][1] = 1
for y_pred,y_true in zip(preds,y_valid_ohe.to_numpy()):
    if(y_pred[0] == y_true[0]):correct_answers+=1
print(correct_answers/len(preds))

0.9193548387096774


Submission

In [123]:
""" submission = pd.DataFrame()
prediction = model.predict(x_test)
submission.insert(0, "Id", id_number, False)
submission.insert(1, "class_0", [round(1-i[0],2) for i in prediction], True)
submission.insert(2, "class_1", [round(i[0],2) for i in prediction], True)
submission.to_csv("/kaggle/working/submission.csv",index = False) """

' submission = pd.DataFrame()\nprediction = model.predict(x_test)\nsubmission.insert(0, "Id", id_number, False)\nsubmission.insert(1, "class_0", [round(1-i[0],2) for i in prediction], True)\nsubmission.insert(2, "class_1", [round(i[0],2) for i in prediction], True)\nsubmission.to_csv("/kaggle/working/submission.csv",index = False) '