### Import Required Librabries

In [64]:
import pyforest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings('ignore')
from sklearn import metrics
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score,classification_report, cohen_kappa_score


### Read Dataset

In [2]:
train = pd.read_csv("challenge2_dataset.csv")
test = pd.read_csv("challenge2_prediction.csv")

In [3]:
train.head()

Unnamed: 0,1,2,3,Target
0,0.6,0.31,13.0,False
1,0.74,0.33,21.0,False
2,0.54,0.19,,True
3,0.46,,9.0,False
4,0.77,0.15,15.0,True


In [72]:
train['Target'].value_counts()

0    1557
1     420
Name: Target, dtype: int64

In [78]:
print(train['Target'].sum()/len(train.Target)*100)


21.2443095599393


# The Data Is Truly Imbalanced

In [26]:
test.head()

Unnamed: 0,1,2,3
0,0.74,0.21,22.0
1,0.43,0.15,14.0
2,0.77,0.1,15.0
3,0.57,0.31,15.0
4,0.8,0.31,20.0


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1995 entries, 0 to 1994
Data columns (total 4 columns):
1         1992 non-null float64
2         1992 non-null float64
3         1993 non-null float64
Target    1984 non-null object
dtypes: float64(3), object(1)
memory usage: 62.4+ KB


In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 855 entries, 0 to 854
Data columns (total 3 columns):
1    851 non-null float64
2    850 non-null float64
3    851 non-null float64
dtypes: float64(3)
memory usage: 20.1 KB


Drop Null values as there were only few.

In [6]:
train.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)

Label Encoding to convert string/boolean to integer

In [7]:
from sklearn import preprocessing 
label_encoder = preprocessing.LabelEncoder()  
train['Target']= label_encoder.fit_transform(train['Target']) 
train['Target'].unique() 

array([0, 1], dtype=int64)

Creating a base model before applying to real test data

In [8]:
X=train.iloc[:,:-1]
y=train.loc[:,'Target']

In [9]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)

Setting up a pipeline 

In [10]:
pipeline_Lr=Pipeline([('scalar',StandardScaler()),('Lr',LogisticRegression())])

In [11]:
pipeline_Dt=Pipeline([('Dt',DecisionTreeClassifier())])

In [12]:
pipeline_Rf=Pipeline([('Rf',RandomForestClassifier())])

In [13]:
pipeline_Xg=Pipeline([('Xg',XGBClassifier())])

In [14]:
pipelines = [pipeline_Lr, pipeline_Dt, pipeline_Rf,pipeline_Xg]

In [15]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [16]:
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest', 3: 'XGB'}

# Fit the pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train)



In [17]:
for i,model in enumerate(pipelines):
    print("{} Train Accuracy: {}".format(pipe_dict[i],model.score(X_train,y_train)))

Logistic Regression Train Accuracy: 0.7903109182935647
Decision Tree Train Accuracy: 1.0
RandomForest Train Accuracy: 1.0
XGB Train Accuracy: 0.9992769342010123


In [18]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i],model.score(X_test,y_test)))

Logistic Regression Test Accuracy: 0.7777777777777778
Decision Tree Test Accuracy: 0.9511784511784511
RandomForest Test Accuracy: 0.9545454545454546
XGB Test Accuracy: 0.9629629629629629


In [19]:
for i,model in enumerate(pipelines):
    if model.score(X_test,y_test)>best_accuracy:
        best_accuracy=model.score(X_test,y_test)
        best_pipeline=model
        best_classifier=i
print('Classifier with best accuracy : {}'.format(pipe_dict[best_classifier]))

Classifier with best accuracy : XGB


In [71]:
print(metrics.confusion_matrix(y_test, y_predict))
print(metrics.classification_report(y_test, y_predict))


[[461   1]
 [ 40  92]]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       462
           1       0.99      0.70      0.82       132

    accuracy                           0.93       594
   macro avg       0.95      0.85      0.89       594
weighted avg       0.94      0.93      0.93       594



Predicting Test data

In [20]:
Xgb_Pred = pipeline_Xg.predict(test)

In [21]:
Xgb_Pred

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,

In [25]:
np.where(Xgb_Pred==1,'True','False')

array(['False', 'False', 'True', 'False', 'False', 'False', 'False',
       'False', 'False', 'False', 'False', 'False', 'False', 'False',
       'False', 'True', 'False', 'False', 'False', 'False', 'False',
       'True', 'False', 'False', 'False', 'False', 'True', 'False',
       'False', 'False', 'False', 'False', 'False', 'False', 'False',
       'False', 'True', 'False', 'False', 'False', 'False', 'True',
       'False', 'False', 'False', 'False', 'True', 'False', 'False',
       'False', 'False', 'True', 'False', 'True', 'True', 'False',
       'False', 'False', 'False', 'False', 'False', 'False', 'False',
       'False', 'True', 'False', 'False', 'False', 'False', 'False',
       'False', 'True', 'True', 'True', 'False', 'False', 'False',
       'False', 'True', 'True', 'False', 'False', 'False', 'False',
       'True', 'True', 'True', 'False', 'False', 'False', 'False',
       'False', 'False', 'False', 'False', 'False', 'True', 'True',
       'False', 'True', 'False', 'False',

In [28]:
sub = pd.DataFrame({'1' : test['1'],
                    '2' : test['2'],
                    '3' : test['3'],
                   'Target': Xgb_Pred})
sub.to_csv('submission_Xgb_classification.csv', index=False)

In [38]:
estimator = XGBClassifier(
    objective= 'binary:logistic',
    nthread=4,
    seed=42
)

In [48]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
   'n_estimators': range(60, 180, 40),
    'learning_rate': [0.1, 0.01, 0.05]
        }

In [49]:
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    scoring = 'roc_auc',
    n_jobs = 5,
    cv = 5,
    verbose=True
)

In [50]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 3645 candidates, totalling 18225 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:    9.0s
[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed:   17.6s
[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed:   29.5s
[Parallel(n_jobs=5)]: Done 790 tasks      | elapsed:   45.5s
[Parallel(n_jobs=5)]: Done 1240 tasks      | elapsed:  1.1min
[Parallel(n_jobs=5)]: Done 1790 tasks      | elapsed:  1.5min
[Parallel(n_jobs=5)]: Done 2440 tasks      | elapsed:  2.0min
[Parallel(n_jobs=5)]: Done 3190 tasks      | elapsed:  2.6min
[Parallel(n_jobs=5)]: Done 4040 tasks      | elapsed:  3.3min
[Parallel(n_jobs=5)]: Done 4990 tasks      | elapsed:  4.2min
[Parallel(n_jobs=5)]: Done 6040 tasks      | elapsed:  5.0min
[Parallel(n_jobs=5)]: Done 7190 tasks      | elapsed:  6.0min
[Parallel(n_jobs=5)]: Done 8440 tasks      | elapsed:  7.0min
[Parallel(n_jobs=5)]: Done 9790 tasks      | elapsed:  8.2min
[Parallel(n_jobs=5)]: Done 11240 tasks      | elapsed:  9.4mi

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constrai...
                                     validate_parameters=False,
                                     verbosity=None),
             iid='warn', n_jobs=5,
             param_grid={'colsample_bytree': [0.6, 0.8, 1.0],
                         'gamma': [0.5, 1, 1.5, 2, 5],
                         'learning_rate': [0.1, 0.0

In [51]:
grid_search.best_estimator_

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=0.5, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=140, n_jobs=4, nthread=4, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=42, subsample=0.6,
              tree_method=None, validate_parameters=False, verbosity=None)

In [52]:
tunned_xgb = grid_search.score(X_test,y_test)

In [53]:
tunned_xgb

0.9961629279811098

In [55]:
tunned_Xgb = grid_search.predict(test)

In [56]:
np.where(tunned_Xgb==1,'True','False')

array(['False', 'False', 'True', 'False', 'False', 'False', 'False',
       'False', 'False', 'False', 'False', 'False', 'False', 'False',
       'False', 'False', 'False', 'False', 'False', 'False', 'False',
       'True', 'False', 'False', 'False', 'False', 'True', 'False',
       'False', 'False', 'False', 'False', 'False', 'False', 'False',
       'False', 'True', 'False', 'False', 'False', 'False', 'True',
       'False', 'False', 'False', 'False', 'True', 'False', 'False',
       'False', 'False', 'True', 'False', 'True', 'True', 'False',
       'False', 'False', 'False', 'False', 'False', 'False', 'False',
       'False', 'True', 'False', 'False', 'False', 'False', 'False',
       'False', 'True', 'True', 'True', 'False', 'False', 'False',
       'False', 'True', 'True', 'False', 'False', 'False', 'False',
       'True', 'True', 'True', 'False', 'False', 'False', 'False',
       'False', 'False', 'False', 'False', 'False', 'False', 'True',
       'False', 'True', 'False', 'False

In [57]:
sub = pd.DataFrame({'1' : test['1'],
                    '2' : test['2'],
                    '3' : test['3'],
                   'Target': tunned_Xgb})
sub.to_csv('submission__tunned_Xgb_classification.csv', index=False)

In [61]:
from sklearn.pipeline import make_pipeline

In [62]:
# Create a pipeline
pipe = make_pipeline((RandomForestClassifier()))
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"randomforestclassifier": [RandomForestClassifier()],
                 "randomforestclassifier__n_estimators": [10, 100, 1000],
                 "randomforestclassifier__max_depth":[5,8,15,25,30,None],
                 "randomforestclassifier__min_samples_leaf":[1,2,5,10,15,100],
                 "randomforestclassifier__max_leaf_nodes": [2, 5,10]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)

In [63]:
best_model.score(X_test,y_test)

0.930976430976431

In [67]:
y_predict = best_model.predict(X_test)

In [68]:
print(metrics.confusion_matrix(y_test, y_predict))
print(metrics.classification_report(y_test, y_predict))


[[461   1]
 [ 40  92]]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       462
           1       0.99      0.70      0.82       132

    accuracy                           0.93       594
   macro avg       0.95      0.85      0.89       594
weighted avg       0.94      0.93      0.93       594



The Target variable being imbalanced, we need to perform sampling such as SMOTE