In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv
/kaggle/input/icr-identify-age-related-conditions/greeks.csv
/kaggle/input/icr-identify-age-related-conditions/train.csv
/kaggle/input/icr-identify-age-related-conditions/test.csv


In [2]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
df_train = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/train.csv")
y_df_train = df_train.pop('Class')

df_test = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/test.csv")


In [4]:
print("Total size of  train dataset "+ str(df_train.size))
print("Shape of train dataset "+ str(df_train.shape))
print("Total records in train dataset "+ str(df_train.shape[0]))

Total size of  train dataset 35169
Shape of train dataset (617, 57)
Total records in train dataset 617


In [5]:
print("Total size of  test dataset "+ str(df_test.size))
print("Shape of test dataset "+ str(df_test.shape))
print("Total records in test dataset "+ str(df_test.shape[0]))

Total size of  test dataset 285
Shape of test dataset (5, 57)
Total records in test dataset 5


**if the column is of object type convert it to catgory type as algorithms work with numeric data only**

In [6]:
for column in df_train.loc[:, df_train.columns != 'Id']:
    if(df_train[column].dtype == 'O'):
       df_train[column] = df_train[column].astype('category')
       df_train[column] = df_train[column].cat.codes
    df_train[column].fillna(value=df_train[column].mean(), inplace=True)
    
    
for column in df_test.loc[:, df_test.columns != 'Id']:
    if(df_test[column].dtype == 'O'):
       df_test[column] = df_test[column].astype('category')
       df_test[column] = df_test[column].cat.codes
    df_test[column].fillna(value=df_test[column].mean(), inplace=True)

**move the data points outside of the 25 and 75 percentile range to the boundary.
<br> Reference : https://www.analyticsvidhya.com/blog/2022/09/dealing-with-outliers-using-the-iqr-method/**

In [7]:
for column in df_train.loc[:, df_train.columns != 'Id']:
    iqr = df_train[column].quantile(0.75) - df_train[column].quantile(0.25)
    cutoff = iqr * 1.5
    lowerLimit = df_train[column].quantile(0.25) - cutoff
    higherLimit = df_train[column].quantile(0.75) + cutoff
    df_train[column].replace(to_replace = df_train[column][df_train[column]>higherLimit].tolist(),
                       value = higherLimit,
                       inplace=True)
    df_train[column].replace(to_replace = df_train[column][df_train[column]<lowerLimit].tolist(),
                       value = lowerLimit,
                       inplace=True)
    
    
for column in df_test.loc[:, df_test.columns != 'Id']:
    iqr = df_test[column].quantile(0.75) - df_test[column].quantile(0.25)
    cutoff = iqr * 1.5
    lowerLimit = df_test[column].quantile(0.25) - cutoff
    higherLimit = df_test[column].quantile(0.75) + cutoff
    df_test[column].replace(to_replace = df_test[column][df_test[column]>higherLimit].tolist(),
                       value = higherLimit,
                       inplace=True)
    df_test[column].replace(to_replace = df_test[column][df_test[column]<lowerLimit].tolist(),
                       value = lowerLimit,
                       inplace=True)

**Applying standardScaler on the data**

In [8]:
print("number of train records before scaling "+str(df_train.shape[0]))
print("number of test records before scaling "+str(df_test.shape[0]))

number of train records before scaling 617
number of test records before scaling 5


In [9]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
df_train_scaled = sc.fit_transform(df_train.loc[:, df_train.columns != 'Id'])

#from sklearn.preprocessing import StandardScaler
#sc = StandardScaler()
df_test_scaled = sc.transform(df_test.loc[:, df_test.columns != 'Id'])

In [10]:
print("number size of records after scaling "+str(df_train_scaled.size))
print("number size of records after scaling "+str(df_test_scaled.size))

print("number of records after scaling "+str(df_train_scaled.shape[0]))
print("number of records after scaling "+str(df_test_scaled.shape[0]))

print("shape of records after train records scaling "+str(df_train_scaled.shape))
print("shape of records after test records scaling "+str(df_test_scaled.shape))



number size of records after scaling 34552
number size of records after scaling 280
number of records after scaling 617
number of records after scaling 5
shape of records after train records scaling (617, 56)
shape of records after test records scaling (5, 56)


**principal component analysis to identify best features impacting the result**

In [11]:
from sklearn.decomposition import PCA
principal=PCA(n_components=0.99)
principal.fit(df_train_scaled)
X_train=principal.transform(df_train_scaled)



In [12]:
X_test=principal.transform(df_test_scaled)

In [13]:
print("number of train records after PCA "+str(len(X_train)))
print("number of test records after PCA "+str(len(X_test)))

number of train records after PCA 617
number of test records after PCA 5


In [14]:
print("size of scaled train dataset "+str(X_train.size))
print("size of scaled test dataset "+str(X_test.size))

size of scaled train dataset 29616
size of scaled test dataset 240


In [15]:
print("shape of scaled train dataset "+str(X_train.shape))
print("shape of scaled test dataset "+str(X_test.shape))

shape of scaled train dataset (617, 48)
shape of scaled test dataset (5, 48)


**adding back 'Id' after rescaling and PCA**

In [16]:
reshaped_arr = df_train['Id'].to_numpy()[:,np.newaxis]
result_array_train = np.hstack((reshaped_arr, X_train))

reshaped_arr = df_test['Id'].to_numpy()[:,np.newaxis]
result_array_test = np.hstack((reshaped_arr, X_test))

In [17]:
print("Total records in result train dataset "+ str(result_array_train.shape))
print("Total records in result test dataset "+ str(result_array_test.shape))


Total records in result train dataset (617, 49)
Total records in result test dataset (5, 49)


**split data into training and test**

In [18]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(result_array_train, y_df_train, test_size=0.2)

In [19]:
print("Total size in training dataset "+ str(x_train.size))
print("Total size in test dataset "+ str(x_test.size))

print("Total records in training dataset "+ str(x_train.shape[0]))
print("Total records in test dataset "+ str(x_test.shape[0]))

Total size in training dataset 24157
Total size in test dataset 6076
Total records in training dataset 493
Total records in test dataset 124


In [20]:

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth':[None,3,5,8,10],
    'n_estimators': [200,500,800,1000],
    'subsample':[0.5,0.8],
    'colsample_bytree': [0.5,0.8]
}

xgb = xgb.XGBClassifier()
# Instantiate the grid search model
xgb_grid_search = GridSearchCV(estimator = xgb, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)

xgb_grid_search.fit(x_train[:,1:], y_train)
print('Best Score: %s' % xgb_grid_search.best_score_)
print('Best Hyperparameters: %s' % xgb_grid_search.best_params_)



from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import xgboost as xgb

xgb = xgb.XGBClassifier(
    objective='binary:logistic',  # Binary classification task
    colsample_bytree = xgb_grid_search.best_params_.get("colsample_bytree"), 
    max_depth = xgb_grid_search.best_params_.get("max_depth"), 
    n_estimators= xgb_grid_search.best_params_.get("n_estimators"), 
    subsample= xgb_grid_search.best_params_.get("subsample"),              
)

# Train the model on the training data
xgb.fit(x_train[:,1:], y_train)

print("score on test(xgb): " + str(xgb.score(x_test[:,1:], y_test)))
print("score on train(xgb): "+ str(xgb.score(x_train[:,1:], y_train)))

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV] END colsample_bytree=0.5, max_depth=None, n_estimators=200, subsample=0.5; total time=   0.2s
[CV] END colsample_bytree=0.5, max_depth=None, n_estimators=200, subsample=0.5; total time=   0.2s
[CV] END colsample_bytree=0.5, max_depth=None, n_estimators=200, subsample=0.5; total time=   0.2s
[CV] END colsample_bytree=0.5, max_depth=None, n_estimators=200, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.5, max_depth=None, n_estimators=200, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.5, max_depth=None, n_estimators=500, subsample=0.5; total time=   0.5s
[CV] END colsample_bytree=0.5, max_depth=None, n_estimators=500, subsample=0.5; total time=   0.4s
[CV] END colsample_bytree=0.5, max_depth=None, n_estimators=500, subsample=0.5; total time=   0.4s
[CV] END colsample_bytree=0.5, max_depth=None, n_estimators=500, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.5, max_depth=None, 

Best Score: 0.8965986394557823
Best Hyperparameters: {'colsample_bytree': 0.8, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.5}


score on test(xgb): 0.8709677419354839
score on train(xgb): 1.0


In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Create the parameter grid based on the results of random search 
param_grid = {
    'min_samples_split': [3,5,10,15],
    'max_depth': [None,3,5,8,10],
    'n_estimators': [200,500,800,1000,1500]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
rf_grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)
rf_grid_search.fit(x_train[:,1:], y_train)

# n_estimators = number of decision trees
rf = RandomForestClassifier(bootstrap = False, 
                            max_depth = rf_grid_search.best_params_.get("max_depth"), 
                            min_samples_split = rf_grid_search.best_params_.get("min_samples_split"), 
                            n_estimators = rf_grid_search.best_params_.get("n_estimators"))

rf.fit(x_train[:,1:], y_train)
print("score on test(rf): " + str(rf.score(x_test[:,1:], y_test)))
print("score on train(rf): "+ str(rf.score(x_train[:,1:], y_train)))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END colsample_bytree=0.8, max_depth=8, n_estimators=500, subsample=0.8; total time=   0.5s
[CV] END colsample_bytree=0.8, max_depth=8, n_estimators=500, subsample=0.8; total time=   0.5s
[CV] END colsample_bytree=0.8, max_depth=8, n_estimators=800, subsample=0.5; total time=   0.6s
[CV] END colsample_bytree=0.8, max_depth=8, n_estimators=800, subsample=0.5; total time=   0.7s
[CV] END colsample_bytree=0.8, max_depth=8, n_estimators=800, subsample=0.5; total time=   0.7s
[CV] END colsample_bytree=0.8, max_depth=8, n_estimators=800, subsample=0.8; total time=   0.7s
[CV] END colsample_bytree=0.8, max_depth=8, n_estimators=800, subsample=0.8; total time=   0.8s
[CV] END colsample_bytree=0.8, max_depth=8, n_estimators=1000, subsample=0.5; total time=   0.8s
[CV] END colsample_bytree=0.8, max_depth=8, n_estimators=1000, subsample=0.5; total time=   0.8s
[CV] END colsample_bytree=0.8, max_depth=8, n_estimators=1000, subsampl

score on test(rf): 0.8629032258064516
score on train(rf): 1.0


In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

#adaboost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())

# Define the parameter grid to search over
dtc_param_grid = {
    'min_samples_split': [5,10,15,20],
    'min_samples_leaf': [3,5,8],
    'max_depth': [3,5,8,10,12],
    'criterion':['entropy','gini']
}

dtc_grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), 
                           param_grid=dtc_param_grid, 
                           cv=5, scoring='accuracy', verbose=2)
dtc_grid_search.fit(x_train[:,1:], y_train)

print('Best Score: %s' % dtc_grid_search.best_score_)
print('Best Hyperparameters: %s' % dtc_grid_search.best_params_)


from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=dtc_grid_search.best_params_.get("max_depth"),
                             min_samples_split=dtc_grid_search.best_params_.get("min_samples_split"),
                             min_samples_leaf=dtc_grid_search.best_params_.get("min_samples_leaf"),
                             criterion=dtc_grid_search.best_params_.get("criterion")
                            )
dtc.fit(x_train[:,1:], y_train)
print("score on test(dtc): "  + str(dtc.score(x_test[:,1:], y_test)))
print("score on train(dtc): " + str(dtc.score(x_train[:,1:], y_train)))

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV] END criterion=entropy, max_depth=3, min_samples_leaf=3, min_samples_split=5; total time=   0.0s
[CV] END criterion=entropy, max_depth=3, min_samples_leaf=3, min_samples_split=5; total time=   0.0s
[CV] END criterion=entropy, max_depth=3, min_samples_leaf=3, min_samples_split=5; total time=   0.0s
[CV] END criterion=entropy, max_depth=3, min_samples_leaf=3, min_samples_split=5; total time=   0.0s
[CV] END criterion=entropy, max_depth=3, min_samples_leaf=3, min_samples_split=5; total time=   0.0s
[CV] END criterion=entropy, max_depth=3, min_samples_leaf=3, min_samples_split=10; total time=   0.0s
[CV] END criterion=entropy, max_depth=3, min_samples_leaf=3, min_samples_split=10; total time=   0.0s
[CV] END criterion=entropy, max_depth=3, min_samples_leaf=3, min_samples_split=10; total time=   0.0s
[CV] END criterion=entropy, max_depth=3, min_samples_leaf=3, min_samples_split=10; total time=   0.0s
[CV] END criterion=entro

Best Score: 0.8681302824159968
Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 8, 'min_samples_split': 5}


score on test(dtc): 0.7983870967741935
score on train(dtc): 0.8945233265720081


In [23]:
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import GridSearchCV
adaboost = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=dtc_grid_search.best_params_.get("max_depth"),
                             min_samples_split=dtc_grid_search.best_params_.get("min_samples_split"),
                             criterion=dtc_grid_search.best_params_.get("criterion")))

# Define the parameter grid to search over
param_grid = {
      'n_estimators': [500,800,1000,1500,2000]
}

# Initialize GridSearchCV for hyperparameter search
ada_grid_search = GridSearchCV(estimator=adaboost, 
                           param_grid=param_grid, 
                           cv=5, scoring='accuracy', verbose=2)
ada_grid_search.fit(x_train[:,1:], y_train)
print('Best Score: %s' % ada_grid_search.best_score_)
print('Best Hyperparameters: %s' % ada_grid_search.best_params_)

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
adb = AdaBoostClassifier(DecisionTreeClassifier(max_depth=dtc_grid_search.best_params_.get("max_depth"),
                             min_samples_split=dtc_grid_search.best_params_.get("min_samples_split"),
                             criterion=dtc_grid_search.best_params_.get("criterion")),
                        learning_rate=0.5,
                        n_estimators=ada_grid_search.best_params_.get("n_estimators"))

adb.fit(x_train[:,1:], y_train)
print("score on test (adb): " + str(adb.score(x_test[:,1:], y_test)))
print("score on train (adb): "+ str(adb.score(x_train[:,1:], y_train)))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END ...................................n_estimators=500; total time=   7.0s
[CV] END ...................................n_estimators=500; total time=   6.3s
[CV] END ...................................n_estimators=500; total time=   6.2s
[CV] END ...................................n_estimators=500; total time=   6.5s
[CV] END ...................................n_estimators=500; total time=   6.4s
[CV] END ...................................n_estimators=800; total time=  10.8s
[CV] END ...................................n_estimators=800; total time=  10.2s
[CV] END ...................................n_estimators=800; total time=  10.2s
[CV] END ...................................n_estimators=800; total time=  10.9s
[CV] END ...................................n_estimators=800; total time=  10.2s
[CV] END ..................................n_estimators=1000; total time=  12.7s
[CV] END ..................................n_esti

Best Score: 0.8600494743351887
Best Hyperparameters: {'n_estimators': 800}


score on test (adb): 0.8467741935483871
score on train (adb): 1.0


In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bg = BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=dtc_grid_search.best_params_.get("max_depth"),
                             min_samples_split=dtc_grid_search.best_params_.get("min_samples_split"),
                             criterion=dtc_grid_search.best_params_.get("criterion")))

# Define the parameter grid to search over
bag_param_grid = {
    'n_estimators': [5,10,20,50,80],
    'max_features': [0.5,0.8],
    'max_samples':  [0.5,0.8]
}

# Initialize GridSearchCV for hyperparameter search
bag_grid_search = GridSearchCV(estimator=bg, 
                           param_grid=bag_param_grid, 
                           cv=5, scoring='accuracy', verbose=2)
bag_grid_search.fit(x_train[:,1:], y_train)
print('Best Score: %s' % bag_grid_search.best_score_)
print('Best Hyperparameters: %s' % bag_grid_search.best_params_)

bg=BaggingClassifier(DecisionTreeClassifier(max_depth=dtc_grid_search.best_params_.get("max_depth"),
                             min_samples_split=dtc_grid_search.best_params_.get("min_samples_split"),
                             criterion=dtc_grid_search.best_params_.get("criterion")),
                                            max_samples=bag_grid_search.best_params_.get("max_samples"),
                                            max_features=bag_grid_search.best_params_.get("max_features"),
                                            n_estimators=bag_grid_search.best_params_.get("n_estimators"))
bg.fit(x_train[:,1:], y_train)
print("score on test (bg): " + str(bg.score(x_test[:,1:], y_test)))
print("score on train (bg): "+ str(bg.score(x_train[:,1:], y_train)))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END ..max_features=0.5, max_samples=0.5, n_estimators=5; total time=   0.0s
[CV] END ..max_features=0.5, max_samples=0.5, n_estimators=5; total time=   0.0s
[CV] END ..max_features=0.5, max_samples=0.5, n_estimators=5; total time=   0.0s
[CV] END ..max_features=0.5, max_samples=0.5, n_estimators=5; total time=   0.0s
[CV] END ..max_features=0.5, max_samples=0.5, n_estimators=5; total time=   0.0s
[CV] END .max_features=0.5, max_samples=0.5, n_estimators=10; total time=   0.1s
[CV] END .max_features=0.5, max_samples=0.5, n_estimators=10; total time=   0.1s
[CV] END .max_features=0.5, max_samples=0.5, n_estimators=10; total time=   0.0s
[CV] END .max_features=0.5, max_samples=0.5, n_estimators=10; total time=   0.0s
[CV] END .max_features=0.5, max_samples=0.5, n_estimators=10; total time=   0.1s
[CV] END .max_features=0.5, max_samples=0.5, n_estimators=20; total time=   0.1s
[CV] END .max_features=0.5, max_samples=0.5, n_

Best Score: 0.8600907029478458
Best Hyperparameters: {'max_features': 0.8, 'max_samples': 0.5, 'n_estimators': 80}


score on test (bg): 0.8306451612903226
score on train (bg): 0.896551724137931


In [25]:
from sklearn.svm import SVC

svm=SVC(probability=True) 
svm.fit(x_train[:,1:], y_train)
print("score on test (svm): " + str(svm.score(x_test[:,1:], y_test)))
print("score on train (svm): "+ str(svm.score(x_train[:,1:], y_train)))

score on test (svm): 0.9193548387096774
score on train (svm): 0.9858012170385395


In [26]:
from sklearn.svm import LinearSVC

linearsvm=LinearSVC(C=0.01)
linearsvm.fit(x_train[:,1:], y_train)
print("score on test (linearsvm): " + str(linearsvm.score(x_test[:,1:], y_test)))
print("score on train (linearsvm): "+ str(linearsvm.score(x_train[:,1:], y_train)))

score on test (linearsvm): 0.9032258064516129
score on train (linearsvm): 0.9371196754563894


In [27]:
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression(solver='liblinear')
log_reg = logisticRegr.fit(x_train[:,1:], y_train)
print("score on test (log_reg): " + str(log_reg.score(x_test[:,1:],y_test)))
print("score on train (log_reg): "+ str(log_reg.score(x_train[:,1:],y_train)))

score on test (log_reg): 0.8790322580645161
score on train (log_reg): 0.9553752535496958


In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import VotingClassifier

import numpy as np


voting_clf=VotingClassifier(estimators=[('xgb',xgb),
                                 ('rf',rf),
                                 ('adb',adb),
                                 ('dtc',dtc),
                                 ('bg',bg),
                                 ('svm',svm),
                                 ('log_reg',log_reg)],voting='soft')
#calibrated = CalibratedClassifierCV(evc, method='sigmoid', cv=5)
voting_clf.fit(x_train[:,1:], y_train)
predictions = np.round(voting_clf.predict_proba(result_array_test[:,1:]),decimals=1)
#evc.fit(x_train, y_train)
#print("score on test (VC): " + str(calibrated.score(x_test[:,1:], y_test)))
#print("score on train (VC): "+ str(calibrated.score(x_train[:,1:], y_train)))


In [29]:
reshaped_arr = result_array_test[:,0][:,np.newaxis]
result_array = np.hstack((reshaped_arr, predictions))

In [30]:
import time
timestr = time.strftime("%Y%m%d-%H%M%S")


header = "Id,class_0,class_1"
filename = "submission.csv"

with open(filename, 'w') as file:
    file.write(header + '\n')

with open(filename, 'a') as file:
    np.savetxt(file, result_array, delimiter=',', fmt='%s')

19