# **Introduction**
ICR - Identifying Age-Related Conditions is a competition in which measurements of health characteristics are used to solve critical problems in bioinformatics. The data comprises over fifty anonymized health characteristics linked to three age-related conditions. Our goal is to predict whether a subject has or has not been diagnosed with one of these conditions, which makes this a binary classification problem. <br>
<br>
**Our Dataset**<br>
* **train.csv - The training set.**
    * Id Unique identifier for each observation.<br>
    * AB-GL Fifty-six anonymized health characteristics. All are numeric except for EJ, which is categorical.
    * Class A binary target: 1 indicates the subject has been diagnosed with one of the three conditions, 0 indicates they have not.
* **test.csv - The test set. Your goal is to predict the probability that a subject in this set belongs to each of the two classes.**
* **greeks.csv - Supplemental metadata, only available for the training set.**
    * Alpha Identifies the type of age-related condition, if present.
    * A No age-related condition. Corresponds to class 0.
    * B, D, G The three age-related conditions. Correspond to class 1.
    * Beta, Gamma, Delta Three experimental characteristics.
    * Epsilon The date the data for this subject was collected. Note that all of the data in the test set was collected after the training set was collected.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample

In [None]:
# Load the competition training set and preview the first rows.
df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
df.head()

In [None]:
# Bar chart of how many rows fall into each target class.
count = df['Class'].value_counts()
x, y = count.index, count.values
sns.barplot(x=x, y=y)
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()

In [None]:
# Exact number of rows per target class.
df['Class'].value_counts()

Our dataset is highly imbalanced.

In [None]:
# Column dtypes and non-null counts.
df.info()

Id and EJ are the only columns with the object data type.

In [None]:
# Distinct values taken by the categorical feature EJ.
df['EJ'].unique()

In [None]:
# Set the identifier column aside and remove it from the feature frame.
id_var = df.pop('Id')

In [None]:
# Encoding Ej
# One-hot encode the categorical EJ column into one indicator column per category.
encoder = OneHotEncoder()
encoded_columns = encoder.fit_transform(df[['EJ']])
# encoded_df is built without an explicit index, so it gets a default 0..n-1 index.
encoded_df = pd.DataFrame(encoded_columns.toarray(), columns=encoder.get_feature_names_out(['EJ']))
# Column-wise concat aligns rows on the index labels of df and encoded_df.
df = pd.concat([df, encoded_df], axis=1)
# The original string column is no longer needed once the indicators exist.
df.drop(['EJ'], axis=1, inplace=True)

In [None]:
# Move the target to the last column position (the new EJ_* columns were appended after it).
classes = df.pop('Class')
df['Class'] = classes

In [None]:
# Count missing values per column and show only the columns that have any.
missing = df.isnull().sum()
missing[missing>0]

Checking the mean and standard deviation to decide which measure of central tendency to use for imputing the missing values.

In [None]:
# Summary statistics, transposed so that each column of df is shown as a row.
df.describe().T

In [None]:
# NOTE(review): the array returned by fit_transform is not assigned anywhere,
# so df itself stays unscaled; only the fitted `scaler` object is kept.
# The scaler is fitted on every column of df, including the target `Class`.
scaler = MinMaxScaler()
scaler.fit_transform(df)

| Features | Mean      | Std       |
| -------- | --------- | --------- |
| BQ       | 98.328737 | 96.479371 |
| CB       | 77.104151 | 159.049302|
| CC       | 0.688801  | 0.263994  |
| DU       | 1.802900  | 9.034721  |
| EL       | 69.582596 | 38.555707 |
| FC       | 71.341526 | 165.551545|
| FL       |  5.433199 | 11.496257 |
| FS       |  0.421501 | 1.305365  |
| GL       |  8.530961 | 10.327010 |

From this, we can see that CC, DU, FL, FS and GL do not have a very high standard deviation, so we can replace their missing values with the mean. For the remaining features (BQ, CB, EL and FC) the standard deviation is high, so we use the median instead of the mean.



In [None]:
# Columns imputed with their mean vs. their median.
mean_features = ['CC', 'DU', 'FL', 'FS', 'GL']
median_features = ['BQ', 'CB', 'EL', 'FC']

# Fill each group in one shot; fillna matches the per-column statistic by column name.
df[mean_features] = df[mean_features].fillna(df[mean_features].mean())
df[median_features] = df[median_features].fillna(df[median_features].median())

In [None]:
# Horizontal box plot of every column, used to eyeball outliers.
plt.figure(figsize=(10, 13))
ax = sns.boxplot(data=df, orient = "h")
plt.title('Outlier Detection')
plt.show()

In [None]:
def detect_outliers(df,n,features):
    """Return index labels of rows that are IQR outliers in more than ``n`` columns.

    For each column in ``features`` a value is an outlier when it lies more
    than 1.5 * IQR below the 25th percentile or above the 75th percentile.
    The labels are returned in the order in which they were first flagged.
    """
    flag_counts = Counter()
    for name in features:
        values = df[name]
        lower_q, upper_q = np.percentile(values, [25, 75])
        fence = 1.5 * (upper_q - lower_q)

        # Rows of this column that fall outside the two fences.
        outside = (values < lower_q - fence) | (values > upper_q + fence)
        flag_counts.update(values[outside].index)

    # Keep only the rows flagged in more than n columns.
    return [label for label, hits in flag_counts.items() if hits > n]

# detect outliers from numeric features
# The target and the one-hot EJ_* indicators are binary, not measurements.
# For a binary column whose minority share is below 25% the IQR is 0, so every
# minority row would count as an "outlier" — for `Class` that means each
# positive case would receive one extra outlier vote. Screen only real features.
numeric_features = [col for col in df.columns if col != 'Class' and not col.startswith('EJ_')]
outliers_to_drop = detect_outliers(df, 2, numeric_features)

In [None]:
# Inspect the rows that were flagged for removal.
df.loc[outliers_to_drop]

In [None]:
# Remove the flagged rows in place, addressing them by index label.
df.drop(index=outliers_to_drop, inplace=True)

In [None]:
# Shape after the flagged rows were dropped.
df.shape

In [None]:
# Class counts after the flagged rows were dropped.
df['Class'].value_counts()

In [None]:
# Split the frame by class so the minority class can be upsampled.
df_majority = df[df['Class'] == 0]
df_minority = df[df['Class'] == 1]

# Upsample the minority class with replacement until it matches the majority
# class. The target size is derived from the data instead of being hard-coded,
# so it stays correct if the outlier filtering above changes the class counts.
# NOTE(review): this runs before the train/test split further down, so copies
# of the same minority row can end up on both sides of that split.
df_minority_upsampled = resample(
    df_minority,
    replace=True,                   # sample with replacement
    n_samples=len(df_majority),     # to match majority class
    stratify=df_minority,           # presumably spreads draws across distinct rows — TODO confirm
    random_state=0,                 # reproducible results
)

# Combine majority class with upsampled minority class
df = pd.concat([df_minority_upsampled, df_majority])

In [None]:
# Class counts after upsampling.
df['Class'].value_counts()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calc_vif(X):
    """Return a two-column frame: each column name of ``X`` and its VIF.

    The variance inflation factor is computed per column position with
    ``variance_inflation_factor`` on the underlying value matrix of ``X``.
    """
    matrix = X.values
    scores = [variance_inflation_factor(matrix, position) for position in range(X.shape[1])]
    return pd.DataFrame({"variables": X.columns, "VIF": scores})

# Every column except the last one (Class was moved to the end earlier) is a predictor.
X = df.iloc[:,:-1]
calc_vif(X)

Only GL, EJ_A and EJ_B have high VIF.

In [None]:
# Remove the columns reported as having a high VIF.
df.drop(columns=['GL', 'EJ_A', 'EJ_B'], inplace=True)

In [None]:
# Separate the predictors from the target.
X = df.drop(columns='Class')
Y = df['Class']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; stratify=Y keeps the class ratio the same in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2,stratify = Y, random_state = 0)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [None]:
def custom_objective(y_true, y_pred):
    """Binary logistic-loss objective for gradient boosting.

    Custom objectives receive the model's raw margin scores, not
    probabilities, so the sigmoid link is applied here before the
    derivatives are taken. Clipping the margin to (0, 1) as if it were a
    probability would confine the learned scores to that interval.

    Args:
        y_true: array-like of 0/1 labels.
        y_pred: array-like of raw margin scores, same length as ``y_true``.

    Returns:
        Tuple ``(gradient, hessian)`` of the log loss with respect to the
        margin: ``p - y`` and ``p * (1 - p)`` where ``p = sigmoid(margin)``.
    """
    eps = 1e-15
    labels = np.asarray(y_true, dtype=float)
    margin = np.asarray(y_pred, dtype=float)

    # sigmoid(m) = exp(-log(1 + exp(-m))); logaddexp keeps it overflow-free.
    prob = np.exp(-np.logaddexp(0.0, -margin))

    gradient = prob - labels
    # Floor the curvature so saturated predictions never yield a zero hessian.
    hessian = np.maximum(prob * (1.0 - prob), eps)
    return gradient, hessian

# Custom scorer for balanced log loss
def balanced_log_loss(y_true, y_pred):
    """Average of the per-class mean log losses for binary labels.

    Inputs are converted to NumPy arrays first, so plain lists work and
    pandas objects are compared by position rather than by index label.

    Args:
        y_true: array-like of 0/1 labels. Both classes must be present;
            otherwise the per-class normalisation divides by zero.
        y_pred: array-like of predicted probabilities for class 1.

    Returns:
        The mean of the class-0 and class-1 average log losses.
    """
    eps = 1e-15
    y_true = np.asarray(y_true)
    # Keep probabilities away from 0 and 1 so the logarithms stay finite.
    y_pred = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)

    negative_term = np.sum((1 - y_true) * np.log(1 - y_pred)) / np.sum(y_true == 0)
    positive_term = np.sum(y_true * np.log(y_pred)) / np.sum(y_true == 1)
    return -(negative_term + positive_term) / 2

In [None]:
from sklearn.metrics import make_scorer
# greater_is_better=False marks the metric as a loss (lower is better);
# needs_proba=True requests predicted probabilities instead of class labels.
# NOTE(review): `needs_proba` was deprecated in scikit-learn 1.4 in favour of
# `response_method="predict_proba"` — confirm against the installed version.
balanced_log_loss_scorer = make_scorer(balanced_log_loss, greater_is_better=False, needs_proba=True)

In [None]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

In [None]:
# Candidate hyperparameter values sampled by the randomized search for XGBoost.
xgb_params = {
    'n_estimators': [100, 300, 500, 700, 1000],
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.02, 0.05]
        }

# Candidate hyperparameter values for CatBoost.
catboost_params = {
    'iterations': [100, 300, 500, 700, 1000],
    'learning_rate': [0.01, 0.02, 0.05],
    'depth': [3, 4, 5],
    'subsample': [0.6, 0.8, 1.0]
}

# Candidate hyperparameter values for LightGBM.
lgbm_params = {
    'n_estimators': [100, 300, 500, 700, 1000],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.02, 0.05]
}

# Candidate hyperparameter values for the support vector classifier.
svm_params = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

In [None]:
# Base estimators to tune.
xgb = XGBClassifier(objective=custom_objective)
cat = CatBoostClassifier(loss_function='Logloss', verbose=0)
lgbm = LGBMClassifier()
svm = SVC(probability=True)

# Search budget: number of CV folds and number of sampled parameter settings.
folds = 2
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=0)

# Settings shared by all four searches; each search gets its own fold iterator.
_search_kwargs = dict(
    n_iter=param_comb,
    scoring=balanced_log_loss_scorer,
    n_jobs=4,
    random_state=0,
)

random_search_xgb = RandomizedSearchCV(
    xgb, param_distributions=xgb_params, cv=skf.split(X_train, Y_train), **_search_kwargs
)
random_search_xgb.fit(X_train, Y_train)

random_search_cat = RandomizedSearchCV(
    cat, param_distributions=catboost_params, cv=skf.split(X_train, Y_train), **_search_kwargs
)
random_search_cat.fit(X_train, Y_train)

random_search_lgbm = RandomizedSearchCV(
    lgbm, param_distributions=lgbm_params, cv=skf.split(X_train, Y_train), **_search_kwargs
)
random_search_lgbm.fit(X_train, Y_train)

random_search_svm = RandomizedSearchCV(
    svm, param_distributions=svm_params, cv=skf.split(X_train, Y_train), **_search_kwargs
)
random_search_svm.fit(X_train, Y_train)

In [None]:
# Take the best estimator found by each randomized search.
xgb_best = random_search_xgb.best_estimator_
cat_best = random_search_cat.best_estimator_
lgbm_best = random_search_lgbm.best_estimator_
svm_best = random_search_svm.best_estimator_

In [None]:
from sklearn.ensemble import StackingClassifier
# Meta-learner that combines the predictions of the tuned base models.
final_estimator = LGBMClassifier()

# (name, estimator) pairs for the first stacking layer.
_base_learners = [
    ('xgb', xgb_best),
    ('catboost', cat_best),
    ('lgbm', lgbm_best),
    ('svm', svm_best),
]
_stacking_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)

stacking_model = StackingClassifier(
    estimators=_base_learners,
    final_estimator=final_estimator,
    cv=_stacking_cv,
)

stacking_model.fit(X_train, Y_train)

In [None]:
from sklearn.base import clone

# Cross-validated loss of the stacking configuration.
# A fresh, unfitted copy of the stack is trained on each fold's training part,
# so the validation part is unseen by the model that scores it. Scoring the
# already-fitted `stacking_model` would not do this: it was trained on all of
# X_train, so both parts of every fold would be training data.
log_loss_list = []
for train_index, val_index in skf.split(X_train, Y_train):
    X_fit, Y_fit = X_train.iloc[train_index], Y_train.iloc[train_index]
    X_val, Y_val = X_train.iloc[val_index], Y_train.iloc[val_index]

    fold_model = clone(stacking_model)
    fold_model.fit(X_fit, Y_fit)

    log_loss_list.append((
        balanced_log_loss(Y_fit, fold_model.predict_proba(X_fit)[:, 1]),
        balanced_log_loss(Y_val, fold_model.predict_proba(X_val)[:, 1]),
    ))

print('Loss on training:', sum(row[0] for row in log_loss_list) / len(log_loss_list))
print('Loss on validation:', sum(row[1] for row in log_loss_list) / len(log_loss_list))

In [None]:
# Balanced log loss of the fitted stack on the 20% hold-out split.
# NOTE(review): the minority class was upsampled with replacement before the
# train/test split, so X_test may contain copies of rows that are also in
# X_train — this figure is likely optimistic.
print("Evaluation on unseen data: ", balanced_log_loss(Y_test, stacking_model.predict_proba(X_test)[:, 1]))

In [None]:
# Load the competition test set and preview the first rows.
test_df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
test_df.head()

In [None]:
# Fill missing numeric values with the column median.
# numeric_only=True is required because test_df still holds the string columns
# Id and EJ; taking the median over those raises on pandas >= 2.0.
test_df.fillna(test_df.median(numeric_only=True), inplace=True)

In [None]:
# Build the submission from the stacking model's class probabilities.
id_feature = test_df['Id']

# Drop the columns that were also removed from the training features.
# The features are deliberately NOT rescaled: the model was fitted on unscaled
# training data, and re-fitting a MinMaxScaler on the test set would put the
# inputs on a different scale from the one the model learned. Keeping a
# DataFrame also preserves the column names seen at fit time.
test_features = test_df.drop(['Id', 'GL', 'EJ'], axis=1)

predictions = stacking_model.predict_proba(test_features)
submission = pd.DataFrame(id_feature)
submission[['class_0', 'class_1']] = predictions
# index=False keeps the row index out of the submission file.
submission.to_csv('/kaggle/working/submission.csv', index=False)
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('/kaggle/working/submission.csv', index=False)