In [19]:
# Mount Google Drive at /content/drive (Colab only); the CSV files read below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [21]:
# Load the train and test CSVs from the mounted Drive folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/train.csv',
        '/content/drive/MyDrive/test.csv',
    )
)

In [22]:
# Drop the row identifier so it is not used as a feature.
# errors='ignore' keeps this cell re-runnable: train_df/test_df are rebound here,
# so a second run would otherwise raise KeyError because 'ID' is already gone.
train_df = train_df.drop(columns='ID', errors='ignore')
test_df = test_df.drop(columns='ID', errors='ignore')
train_objs_num = len(train_df)

# One-hot encode train and test together so both get the same dummy columns.
df = pd.concat(objs=[train_df, test_df], axis=0, ignore_index=True)
dataset_preprocessed = pd.get_dummies(df, drop_first=True)

# Split back by position: the first `train_objs_num` rows came from train_df.
train_preprocessed = dataset_preprocessed.iloc[:train_objs_num]
test_preprocessed = dataset_preprocessed.iloc[train_objs_num:]

# Features / target for modelling; Xtest is the feature matrix of the test rows.
X = train_preprocessed.drop(columns='Is_Lead')
Xtest = test_preprocessed.drop(columns='Is_Lead')
y = train_preprocessed['Is_Lead']

In [23]:
# import library
from imblearn.over_sampling import SMOTE
from collections import Counter

# Seed the oversampler so the synthetic minority rows are reproducible between runs.
smote = SMOTE(random_state=0)

# fit predictor and target variable
x_smote, y_smote = smote.fit_resample(X, y)

print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_smote))

# NOTE(review): the resampled arrays are replaced by the original X, y on the next
# line, so the SMOTE output is only used for the two prints above. The later cells
# visible here work from X / y directly -- confirm this is intended.
x_smote, y_smote = X , y



Original dataset shape Counter({0.0: 187437, 1.0: 58288})
Resample dataset shape Counter({0.0: 187437, 1.0: 187437})


In [25]:

from sklearn.model_selection import train_test_split

# Hold out 30% of X / y for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [26]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from xgboost.sklearn import XGBClassifier

In [27]:
# Scratch notes: XGBClassifier() options used or considered in the cells below.
#   n_jobs=-1
#   verbosity=2
#   silent=0
#   tree_method = 'gpu_hist'

In [28]:
# Hyperopt search space for XGBClassifier.
# For hp.choice parameters, fmin() reports the *index* of the chosen option, so the
# order of each option list must stay in sync with any table used to decode the result.
space = {
    'booster': hp.choice('booster', ['gbtree', 'gblinear', 'dart']),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.7, 1),
    'colsample_bynode': hp.uniform('colsample_bynode', 0.7, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.7, 1),
    'gamma': hp.uniform('gamma', 0, 0.5),
    # quniform yields round(uniform(low, high) / q) * q. With low=1 and q=5 a draw
    # below 2.5 rounds to 0, i.e. a positive-class weight of zero. Starting at 5
    # keeps the same 5, 10, ..., 100 grid and only removes that degenerate 0.
    'scale_pos_weight': hp.quniform('scale_pos_weight', 5, 100, 5),
    'learning_rate': hp.choice('learning_rate', [0.001, 0.01, 0.1, 0.3]),
    'max_delta_step': hp.choice('max_delta_step', [0, 1, 2]),
    'max_depth': hp.choice('max_depth', [3, 5, 7, 9]),
    'min_child_weight': hp.choice('min_child_weight', [1, 3, 5]),
    'objective': hp.choice('objective', ['binary:logistic', 'binary:logitraw', 'binary:hinge']),
    'reg_alpha': hp.choice('reg_alpha', [1e-5, 1e-2, 0.1, 1, 10]),
    'reg_lambda': hp.choice('reg_lambda', [1e-5, 1e-2, 0.1, 1, 10]),
    'subsample': hp.choice('subsample', [0.5, 0.7, 0.9, 1]),
    'eval_metric': hp.choice('eval_metric', ['auc']),
}

In [32]:
def objective(space):
    """Score one sampled hyperparameter configuration for hyperopt.

    Builds an XGBClassifier from the values in ``space`` and evaluates it with
    repeated stratified 10-fold cross-validation (3 repeats) on
    ``X_train`` / ``y_train`` using ROC AUC.

    Returns a dict with ``loss`` set to the negated mean AUC (hyperopt minimises
    the loss) and ``status`` set to ``STATUS_OK``.
    """
    sampled_names = (
        'booster',
        'colsample_bylevel',
        'colsample_bynode',
        'colsample_bytree',
        'gamma',
        'scale_pos_weight',
        'learning_rate',
        'max_delta_step',
        'max_depth',
        'min_child_weight',
        'objective',
        'reg_alpha',
        'reg_lambda',
        'subsample',
        'eval_metric',
    )
    sampled_params = {name: space[name] for name in sampled_names}

    # Fixed (non-searched) settings are passed alongside the sampled ones.
    model = XGBClassifier(
        tree_method='gpu_hist',
        n_jobs=-1,
        verbosity=1,
        **sampled_params
    )

    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    fold_scores = cross_val_score(
        model,
        X_train,
        y_train,
        n_jobs=-1,
        verbose=1,
        scoring='roc_auc',
        cv=folds,
    )

    # Negate the mean AUC so that minimising the loss maximises the AUC.
    return {'loss': -fold_scores.mean(), 'status': STATUS_OK}

In [33]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

# Run 30 TPE-guided evaluations of `objective`; `trials` records every evaluation.
trials = Trials()
search_kwargs = dict(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=30,
    trials=trials,
)
best = fmin(**search_kwargs)
best

  0%|          | 0/30 [00:00<?, ?it/s, best loss: ?]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.



  0%|          | 0/30 [00:51<?, ?it/s, best loss: ?]


KeyboardInterrupt: ignored

In [13]:
# li = ['auc', 'error' , 'logloss']
# dic ={}
# for i,v in enumerate(li):
#   dic[i] = v
# dic

In [None]:
# Decode tables for the fmin() result: each maps an option index to its value,
# in the same order as the matching hp.choice list in `space`.
# NOTE(review): `objective` below rebinds the name of the objective function defined
# earlier in the notebook; re-run that cell before calling fmin() again.
booster = dict(enumerate(['gbtree', 'gblinear', 'dart']))
objective = dict(enumerate(['binary:logistic', 'binary:logitraw', 'binary:hinge']))
learning_rate = dict(enumerate([0.001, 0.01, 0.1, 0.3]))

max_delta_step = dict(enumerate([0, 1, 2]))
max_depth = dict(enumerate([3, 5, 7, 9]))
min_child_weight = dict(enumerate([1, 3, 5]))

reg_alpha = dict(enumerate([1e-05, 0.01, 0.1, 1, 10]))
reg_lambda = dict(enumerate([1e-05, 0.01, 0.1, 1, 10]))
subsample = dict(enumerate([0.5, 0.7, 0.9, 1]))

eval_metric = dict(enumerate(['auc', 'error', 'logloss']))


In [None]:
# Print the decoded value of every index-valued entry in `best`, one per line.
decode_order = (
    ('booster', booster),
    ('learning_rate', learning_rate),
    ('objective', objective),
    ('max_delta_step', max_delta_step),
    ('max_depth', max_depth),
    ('min_child_weight', min_child_weight),
    ('reg_alpha', reg_alpha),
    ('reg_lambda', reg_lambda),
    ('subsample', subsample),
    ('eval_metric', eval_metric),
)
for param_name, table in decode_order:
    print(table[best[param_name]])

In [None]:
from sklearn.metrics import accuracy_score,classification_report,roc_auc_score

In [None]:
# Turn the hyperopt result into concrete XGBClassifier arguments: index-valued
# entries are decoded through the lookup tables, continuous ones are used as-is.
best_params = dict(
    booster=booster[best['booster']],
    colsample_bylevel=best['colsample_bylevel'],
    colsample_bynode=best['colsample_bynode'],
    colsample_bytree=best['colsample_bytree'],
    gamma=best['gamma'],
    scale_pos_weight=best['scale_pos_weight'],
    learning_rate=learning_rate[best['learning_rate']],
    max_delta_step=max_delta_step[best['max_delta_step']],
    max_depth=max_depth[best['max_depth']],
    min_child_weight=min_child_weight[best['min_child_weight']],
    objective=objective[best['objective']],
    reg_alpha=reg_alpha[best['reg_alpha']],
    reg_lambda=reg_lambda[best['reg_lambda']],
    subsample=subsample[best['subsample']],
    eval_metric=eval_metric[best['eval_metric']],
    tree_method='gpu_hist',
    n_jobs=-1,
    verbosity=2,  # the deprecated `silent` flag is dropped; verbosity supersedes it
)

# Hold-out evaluation must use a model that has never seen X_test. Fitting on the
# full X, y and then scoring on X_test (a subset of X) leaks the hold-out rows
# into training and inflates every metric printed below.
holdout_model = XGBClassifier(**best_params).fit(X_train, y_train)
predictionforest = holdout_model.predict(X_test)
print(accuracy_score(y_test, predictionforest))
print(classification_report(y_test, predictionforest))
# ROC AUC is a ranking metric: score it on the positive-class column of
# predict_proba rather than on hard 0/1 labels.
print(roc_auc_score(y_test, holdout_model.predict_proba(X_test)[:, 1]))

# Final model used for the predictions on Xtest: refit on all of X, y.
trainedforest = XGBClassifier(**best_params).fit(X, y)

In [None]:
# Two-column predict_proba output for the test rows; column 1 is the score for
# the positive class (Is_Lead == 1) and is what the submission uses.
pred = trainedforest.predict_proba(Xtest)

# Plot only the positive-class column. Passing the whole two-column array to
# sns.distplot pooled both classes' scores into one picture, and distplot is
# deprecated in seaborn.
fig, ax = plt.subplots()
ax.hist(pred[:, 1], bins=50)
ax.set(
    title='Predicted Is_Lead score on the test set',
    xlabel='predicted score for Is_Lead = 1',
    ylabel='number of rows',
)
plt.show()

In [33]:
# Re-read the test file to recover the 'ID' column, which was dropped from the
# in-memory test_df during preprocessing.
test_df = pd.read_csv('/content/drive/MyDrive/test.csv')
result = pd.DataFrame(test_df.ID, columns=['ID'])

# Every test row must have exactly one prediction before the columns are paired up.
assert len(result) == len(pred), 'number of predictions does not match number of test rows'

# Take the positive-class column in one vectorised slice instead of a Python loop.
result['Is_Lead'] = pred[:, 1]
result.to_csv('result8.csv', index=False)