<a href="https://www.kaggle.com/code/nyagami/kidney-stones-prediction?scriptVersionId=130511112" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<strong style="font-size:26px">KIDNEY STONES PREDICTION</strong>

# 1. Introduction
## 1.1 Evaluation
Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target.

## 1.2 Submission File
For each id in the test set, you must predict the probability of target (likelihood of the presence of a kidney stone). The file should contain a header and have the following format:

| id | target |
|----|--------|
| 414 | 0.5    |
| 415 | 0.1    |
| 416 | 0.9    |


## 1.3 Dataset Description
The dataset for this competition (both train and test) was generated from a deep learning model trained on the Kidney Stone Prediction based on Urine Analysis dataset. 

# 2. Import Libraries and Preview Datasets

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.preprocessing import StandardScaler

import os
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Load both competition files, using the `id` column as the row index
train = pd.read_csv('/kaggle/input/playground-series-s3e12/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/playground-series-s3e12/test.csv', index_col='id')

# Split the label off the training frame; `pop` returns the column and
# removes it, leaving only the feature columns in `train`
y_train = train.pop('target')

In [None]:
# Preview the first rows of the training features (target already removed)
train.head()

In [None]:
# Preview the first rows of the test set
test.head()

In [None]:
# Column dtypes and non-null counts for the training features
train.info()

In [None]:
# Column dtypes and non-null counts for the test set
test.info()

# 3. Exploratory Data Analysis

In [None]:
# Summary statistics for each training feature
train.describe()

## 3.1 Pairplot

In [None]:
# Scatter plots for every pair of features, with per-feature distributions
# on the diagonal
pair_grid = sns.pairplot(data=train)
plt.show()

## 3.2 Correlation heatmap
There seems to be a strong positive correlation between `urea` and `osmo`.

In [None]:
# Pairwise correlations between the training features
corrs = train.corr()
# Mask the upper triangle (diagonal included) so each pair is shown only once
mask = np.triu(np.ones_like(corrs, dtype=bool))
# Reuse the matrix computed above instead of calling train.corr() a second time
sns.heatmap(corrs, mask=mask, cmap='BrBG', annot=True, square=True)
plt.title('Correlation between variables')
plt.show()

# 4. Preprocessing
## 4.1 Missing values
Neither the train set nor the test set has any missing values.

In [None]:
# Number of missing values in each training column
train.isna().sum()

In [None]:
# Number of missing values in each test column
test.isna().sum()

## 4.2. Checking for variance
The variables have vastly different variances that could affect generalization. For instance, the variance of `osmo` is almost 55,000 times greater than that of `gravity`. Standardization of the variables will be done to convert them to the same scale.

In [None]:
# Per-column variance of the training features
train.var()

## 4.3 Standardization

In [None]:
scaler = StandardScaler()
# Learn the per-feature mean and standard deviation from the train set only,
# then scale the train set with them
train[train.columns] = scaler.fit_transform(train[train.columns])
X_train = train
# Scale the test set with the statistics learned from the train set.
# Re-fitting the scaler on the test set (fit_transform) would put the two sets
# on different scales and make the fitted models' inputs inconsistent.
test[train.columns] = scaler.transform(test[train.columns])
X_test = test

# 5. Feature Selection
We will evaluate how much each variable contributes to predicting the target variable.

In [None]:
# Use scikit-learn's built-in ROC AUC scorer. It scores the estimator's
# predicted probabilities / decision values, which is what the competition
# metric requires. `make_scorer(roc_auc_score)` with default arguments would
# instead score the hard 0/1 labels returned by `predict`.
roc_score = 'roc_auc'
# Shared 6-fold splitter, seeded so every model is evaluated on the same folds
kf = KFold(n_splits=6, shuffle=True, random_state=987)

## 5.1 Feature selection with Random Forests
Using random forests, all variables make a meaningful contribution to the model. `calc` has the highest feature importance, at about 32%, while each of the remaining variables has an importance of between 12% and 15%. Based on this model, no further feature selection will be done, since all variables contribute significantly.

In [None]:
# Seeded (same seed as the rest of the notebook) so the importances are
# reproducible between runs
rf = RandomForestClassifier(random_state=987)

# Fit the classifier on the full training set
rf.fit(X_train, y_train)

# Retrieve the feature importances, sorted ascending so the most important
# feature is drawn as the top bar
rf_importance = pd.Series(rf.feature_importances_, index=rf.feature_names_in_)
rf_importance = rf_importance.sort_values(ascending=True)

# Visualize the feature importances on an explicit Axes
fig, ax = plt.subplots(figsize=(7, 3))
rf_importance.plot(kind='barh', width=0.4, ax=ax)
ax.set_xlabel('Feature importance')
ax.set_title('Random forest feature importances')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Seeded so the cross-validated scores are reproducible between runs
rf = RandomForestClassifier(random_state=987)

# Recursive feature elimination: start from every available feature and drop
# one at a time, cross-validating each subset size. The starting count is
# taken from the data rather than hard-coded.
n_total_features = X_train.shape[1]
for features in range(n_total_features, 0, -1):
    rfe = RFE(estimator=rf, n_features_to_select=features, step=1)
    results = cross_val_score(rfe, X_train, y_train, cv=kf, scoring=roc_score)
    print(f'Average roc_score is {np.mean(results)} with {features} features')

## 5.2 Feature selection with xgboost
Using xgboost, all features make a meaningful contribution to predicting the target variable. `calc` and `pH` have the highest importance scores, while the contributions of the other variables remain significant to the model.

In [None]:
# Booster settings: binary classification with a logistic objective
params = {'objective': 'binary:logistic'}

# xgboost's native training API takes its data wrapped in a DMatrix
kidney_matrix = xgb.DMatrix(data=X_train, label=y_train)

# Train a small booster (10 rounds) purely to inspect feature importances
xgb_clf = xgb.train(params=params, dtrain=kidney_matrix, num_boost_round=10)

# Bar chart of the booster's feature importances
fig, ax = plt.subplots(figsize=(7, 3))
xgb.plot_importance(xgb_clf, ax=ax, height=0.4)
plt.show()

In [None]:
xgb_cl = xgb.XGBClassifier(objective='binary:logistic')

# Recursive feature elimination from 6 features down to 1, cross-validating
# each subset size with the shared folds and scorer
for n_keep in range(6, 0, -1):
    selector = RFE(estimator=xgb_cl, n_features_to_select=n_keep, step=1)
    fold_scores = cross_val_score(selector, X_train, y_train, cv=kf, scoring=roc_score)
    print(f'Average roc_score is {fold_scores.mean()} with {n_keep} features')


## 5.3 Feature selection with Logistic Regression
In logistic regression, the size of each coefficient determines the magnitude of its variable's contribution (the features were standardized above, so the coefficients are directly comparable). Coefficients close to zero contribute little, while large coefficients indicate that the variable has a strong effect on the target variable. In our problem, the `calc` variable has the largest coefficient.

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# One coefficient per feature, indexed by feature name and sorted ascending
logreg_coeffs = (
    pd.DataFrame({'coeffs': logreg.coef_[0], 'features': logreg.feature_names_in_})
    .set_index('features')
    .sort_values(by='coeffs')
)

fig, ax = plt.subplots(figsize=(7, 3))
# legend=False suppresses the legend directly instead of drawing one and
# blanking it afterwards with plt.legend("")
logreg_coeffs.plot(kind='barh', ax=ax, legend=False)
ax.grid(True)
ax.set_xlabel('Coefficients')
ax.set_title('Coefficients of logistic regression')
plt.show()

In [None]:
logreg = LogisticRegression()

# Recursive feature elimination from 6 features down to 1, cross-validating
# each subset size with the shared folds and scorer
for n_keep in range(6, 0, -1):
    selector = RFE(estimator=logreg, n_features_to_select=n_keep, step=1)
    fold_scores = cross_val_score(selector, X_train, y_train, cv=kf, scoring=roc_score)
    print(f'Average roc_score is {fold_scores.mean()} with {n_keep} features')


# 6. Model Prediction

In [None]:
# Base learners for the soft-voting ensemble (logistic regression is not
# included in this list).
# GradientBoostingClassifier is left on its default loss: the explicit
# 'deviance' name is deprecated and rejected by newer scikit-learn releases,
# and the default selects the same loss.
classifiers = [
    ('random forest', RandomForestClassifier()),
    ('xgboost', xgb.XGBClassifier(objective='binary:logistic')),
    ('gradient boosting', GradientBoostingClassifier()),
    ('svc', SVC(probability=True)),
    ('ada', AdaBoostClassifier()),
]

In [None]:
# Soft-voting ensemble built from the `classifiers` list defined in the
# previous cell (the name is rebound to a longer list just below)
vc = VotingClassifier(estimators=classifiers, voting='soft', n_jobs=-1)

# Every individual model plus the ensemble, all with default hyperparameters.
# GradientBoostingClassifier uses its default loss: the explicit 'deviance'
# name is deprecated and rejected by newer scikit-learn releases.
classifiers = [
    ('logistic regression', LogisticRegression()),
    ('random forest', RandomForestClassifier()),
    ('xgboost', xgb.XGBClassifier(objective='binary:logistic')),
    ('gradient boosting', GradientBoostingClassifier()),
    ('svc', SVC(probability=True)),
    ('ada', AdaBoostClassifier()),
    ('voting classifier', vc),
]

# Cross-validate each candidate on the shared folds and report its mean score
for name, clf in classifiers:
    results = cross_val_score(clf, X_train, y_train, cv=kf, scoring=roc_score)
    print(f'Average roc_score is {np.mean(results)} for {name} classifier')

# 7. Hyperparameter Tuning
## 7.1. Logistic Regression

In [None]:
logreg = LogisticRegression(n_jobs=-1)
# Grid of regularization strengths to search
param_grid = {
    'C': [0.1]
}

logreg_cv = GridSearchCV(logreg, param_grid=param_grid, cv=kf, scoring=roc_score)
logreg_cv.fit(X_train, y_train)
# best_score_ is the mean CV score of the selected candidate, so it always
# matches best_params_ (indexing mean_test_score[0] only does so while the
# grid contains a single candidate)
print('Average score', logreg_cv.best_score_)
print(logreg_cv.best_params_)

# Best configuration, refit on the whole training set by GridSearchCV
logreg_tuned = logreg_cv.best_estimator_

## 7.2. Random forest

In [None]:
rf = RandomForestClassifier(n_jobs=-1, random_state=987)
# Grid of forest size, tree depth and per-tree sample fraction to search
param_grid = {
    'n_estimators': [200], 'max_depth': [2], 'max_samples': [0.4]
}

rf_cv = GridSearchCV(rf, param_grid=param_grid, cv=kf, scoring=roc_score)
rf_cv.fit(X_train, y_train)
# best_score_ is the mean CV score of the selected candidate, so it always
# matches best_params_ (indexing mean_test_score[0] only does so while the
# grid contains a single candidate)
print('Average score', rf_cv.best_score_)
print(rf_cv.best_params_)

# Best configuration, refit on the whole training set by GridSearchCV
rf_tuned = rf_cv.best_estimator_

## 7.3 Xgboost

In [None]:
xgb_cl = xgb.XGBClassifier(random_state=987, n_jobs=-1)
# Grid of objective, number of boosting rounds and L1 regularization to search
param_grid = {
    'objective': ['binary:logistic'], 'n_estimators': [10], 'reg_alpha': [0.001]
}

xgb_cl_cv = GridSearchCV(xgb_cl, param_grid=param_grid, cv=kf, scoring=roc_score)
xgb_cl_cv.fit(X_train, y_train)
# best_score_ is the mean CV score of the selected candidate, so it always
# matches best_params_ (indexing mean_test_score[0] only does so while the
# grid contains a single candidate)
print('Average score', xgb_cl_cv.best_score_)
print(xgb_cl_cv.best_params_)

# Best configuration, refit on the whole training set by GridSearchCV
xgb_tuned = xgb_cl_cv.best_estimator_

## 7.4 Gradient boosting

In [None]:
gradient = GradientBoostingClassifier(random_state=987)

# Grid of boosting rounds, row subsampling and pruning strength to search.
# The loss is left at the estimator's default: the explicit 'deviance' name is
# deprecated and rejected by newer scikit-learn releases, and the default
# selects the same loss.
param_grid = {
    'n_estimators': [40], 'subsample': [0.4], 'ccp_alpha': [0.001]
}

gradient_cv = GridSearchCV(gradient, param_grid=param_grid, cv=kf, scoring=roc_score)
gradient_cv.fit(X_train, y_train)

# best_score_ is the mean CV score of the selected candidate, so it always
# matches best_params_ (indexing mean_test_score[0] only does so while the
# grid contains a single candidate)
print('Average score', gradient_cv.best_score_)
print(gradient_cv.best_params_)

# Best configuration, refit on the whole training set by GridSearchCV
gradient_tuned = gradient_cv.best_estimator_

## 7.5 SVC

In [None]:
# probability=True enables predict_proba, which soft voting needs later on
svc = SVC(probability=True, random_state=987)

# Grid of regularization strengths to search
param_grid = {
    'C': [1]
}

svc_cv = GridSearchCV(svc, param_grid=param_grid, cv=kf, scoring=roc_score)
svc_cv.fit(X_train, y_train)

# best_score_ is the mean CV score of the selected candidate, so it always
# matches best_params_ (indexing mean_test_score[0] only does so while the
# grid contains a single candidate)
print('Average score', svc_cv.best_score_)
print(svc_cv.best_params_)

# Best configuration, refit on the whole training set by GridSearchCV
svc_tuned = svc_cv.best_estimator_

## 7.6. AdaBoost

In [None]:
# The tuned random forest is the base learner. It is passed positionally
# because the keyword was renamed from `base_estimator` to `estimator` in
# newer scikit-learn releases (the old name was later removed); the first
# positional argument is the base learner under both names.
ada = AdaBoostClassifier(rf_tuned, random_state=987)

# Grid of learning rate and number of boosting rounds to search
param_grid = {
    'learning_rate': [0.2], 'n_estimators': [40]
}

ada_cv = GridSearchCV(ada, param_grid=param_grid, cv=kf, scoring=roc_score, verbose=1)
ada_cv.fit(X_train, y_train)

# best_score_ is the mean CV score of the selected candidate, so it always
# matches best_params_ (indexing mean_test_score[0] only does so while the
# grid contains a single candidate)
print('Average score', ada_cv.best_score_)
print(ada_cv.best_params_)

# Best configuration, refit on the whole training set by GridSearchCV
ada_tuned = ada_cv.best_estimator_

# 8. Predictions

In [None]:
# Tuned estimators that make up the final soft-voting ensemble
classifiers = [
    ('xgboost', xgb_tuned),
    ('ada', ada_tuned),
    ('random forest', rf_tuned),
    ('gradient boosting', gradient_tuned),
    ('svc', svc_tuned),
    ('logistic regression', logreg_tuned),
]

# Soft voting combines the members' predicted class probabilities
vc_tuned = VotingClassifier(
    estimators=classifiers,
    voting='soft',
    n_jobs=-1,
)

In [None]:
# Every tuned model plus the tuned voting ensemble
classifiers = [
    ('xgboost', xgb_tuned),
    ('ada', ada_tuned),
    ('random forest', rf_tuned),
    ('gradient boosting', gradient_tuned),
    ('svc', svc_tuned),
    ('logistic regression', logreg_tuned),
    ('voting classifier', vc_tuned),
]

# One record per model: its name and mean cross-validated score
df_list = [
    {
        'classifier': name,
        'roc_score': np.mean(
            cross_val_score(clf, X_train, y_train, cv=kf, scoring=roc_score)
        ),
    }
    for name, clf in classifiers
]

# Leaderboard, best model first
df = pd.DataFrame(df_list).sort_values(by='roc_score', ascending=False)
df

In [None]:
# Fit the tuned ensemble on the full training set, then score the test set.
# Column 1 of predict_proba is the second class in `classes_` (the positive
# class, assuming a 0/1 target).
test_proba = vc_tuned.fit(X_train, y_train).predict_proba(X_test)
y_preds = test_proba[:, 1]

# Submission frame in the competition's format: id, target
submission_df = pd.DataFrame({'id': X_test.index, 'target': y_preds})
submission_df.head()

In [None]:
# Write the submission file to the Kaggle working directory; index=False
# keeps the row index out of the CSV, since `id` is already a column
submission_df.to_csv('/kaggle/working/submission.csv', index = False)