In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline 
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score
import joblib 

In [2]:
df=pd.read_csv(r"Train.csv")

In [3]:
dffeature=df.copy()

In [4]:
dff=dffeature.drop(['feature_7','feature_10'],axis=1)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X=dff.drop("labels",axis=1)
y=dff.labels

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [8]:
import xgboost
from xgboost import XGBClassifier
model1 = XGBClassifier()

In [9]:
model1.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [10]:
predtrain=model1.predict(X_train)
predtest=model1.predict(X_test)

In [11]:
def performa(y,y_pred):
    confusion=confusion_matrix(y, y_pred)
    accuracy=accuracy_score(y, y_pred)
    precision=precision_score(y, y_pred)
    recall=recall_score(y, y_pred)
    ROC_AUC=roc_auc_score(y, y_pred)
    print("confusion_matrix\n",confusion)
    print("accuracy: ",accuracy)
    print("precision: ",precision)
    print("recall: ",recall)
    print("ROC_AUC: ",ROC_AUC)

# Train

In [12]:
performa(y_train,predtrain)

confusion_matrix
 [[23715   241]
 [  812  2358]]
accuracy:  0.9611811546118115
precision:  0.9072720277029627
recall:  0.7438485804416404
ROC_AUC:  0.8668942351198017


# Test

In [13]:
performa(y_test,predtest)

confusion_matrix
 [[5775  210]
 [ 426  371]]
accuracy:  0.9062223532881156
precision:  0.6385542168674698
recall:  0.465495608531995
ROC_AUC:  0.7152039446168748


# Tuning 

In [14]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [15]:
## Hyper Parameter Optimization

params={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
    
}

In [16]:
classifier=xgboost.XGBClassifier()


In [17]:
random_search=RandomizedSearchCV(classifier,param_distributions=params,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)

In [18]:
random_search.fit(X_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   28.5s finished


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                                           reg_lambda=None,
                                           scale_pos_weight=None,
                                           subsample=No

In [19]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.3, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=10,
              min_child_weight=7, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [20]:
model2=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.3, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=5, missing=np.nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [21]:
model2.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.3, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [22]:
predtrain=model2.predict(X_train)
predtest=model2.predict(X_test)

# Train

In [23]:
performa(y_train,predtrain)

confusion_matrix
 [[23746   210]
 [  874  2296]]
accuracy:  0.9600383396003834
precision:  0.9162011173184358
recall:  0.7242902208201892
ROC_AUC:  0.8577620748448916


# Test 

In [24]:
performa(y_test,predtest)

confusion_matrix
 [[5809  176]
 [ 446  351]]
accuracy:  0.9082866411088174
precision:  0.6660341555977229
recall:  0.44040150564617314
ROC_AUC:  0.7054973275933456


In [25]:
from sklearn.model_selection import cross_val_score
cross_val_score(model2, X_train, y_train,cv=5).mean()

0.9058466947841168

In [26]:
cross_val_score(model2, X_test, y_test,cv=5).mean()

0.907696136932284

In [27]:
accuracy=[0.9082866411088174,0.9600383396003834]
precision=[0.6660341555977229,0.9162011173184358]
recall=[0.44040150564617314,0.7242902208201892]
ROC=[0.7054973275933456,0.8577620748448916]
cross_val=[0.907696136932284,0.9058466947841168]
df_final_performa=pd.DataFrame({"accuracy":accuracy,
                               "precision":precision,
                               "recall":recall,
                               "ROC":ROC,
                               "cross_val":cross_val},index={"test","train"})

In [28]:
df_final_performa

Unnamed: 0,accuracy,precision,recall,ROC,cross_val
train,0.908287,0.666034,0.440402,0.705497,0.907696
test,0.960038,0.916201,0.72429,0.857762,0.905847


# ---------------------------------------------------------------------------------------------------------------

# Final output 

In [3]:
dff=pd.read_csv(r"Test.csv")

In [30]:
dff.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15
0,0.571051,0.406843,0.984523,0.011016,-0.569351,-0.411453,-0.25194,0,1,1,0,0,0,0,11,3
1,-1.12408,-0.166935,0.503892,-0.322932,0.721811,0.547323,0.182198,0,2,1,0,0,0,0,5,1
2,0.476877,0.145079,-0.577529,-0.691828,-0.24656,-0.411453,-0.25194,0,1,1,0,0,0,0,1,3
3,1.606965,-0.447419,1.825628,-0.983062,7.177616,-0.411453,-0.25194,1,1,0,0,1,0,0,5,3
4,-0.935732,-0.364653,-1.178318,-0.322932,0.07623,-0.411453,-0.25194,8,2,1,0,1,0,2,8,3


In [31]:
final_data=dff.drop(['feature_7','feature_10'],axis=1)

In [32]:
dff["Pred_labels"]=model2.predict(final_data)

In [33]:
dff.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,Pred_labels
0,0.571051,0.406843,0.984523,0.011016,-0.569351,-0.411453,-0.25194,0,1,1,0,0,0,0,11,3,1
1,-1.12408,-0.166935,0.503892,-0.322932,0.721811,0.547323,0.182198,0,2,1,0,0,0,0,5,1,0
2,0.476877,0.145079,-0.577529,-0.691828,-0.24656,-0.411453,-0.25194,0,1,1,0,0,0,0,1,3,0
3,1.606965,-0.447419,1.825628,-0.983062,7.177616,-0.411453,-0.25194,1,1,0,0,1,0,0,5,3,0
4,-0.935732,-0.364653,-1.178318,-0.322932,0.07623,-0.411453,-0.25194,8,2,1,0,1,0,2,8,3,0


In [34]:
dff.to_csv("Submission_file.csv",index=False)