In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load the raw train / test CSVs from the notebook's working directory.
train = pd.read_csv(filepath_or_buffer='train_s3TEQDk.csv')
test = pd.read_csv(filepath_or_buffer='test_mSzZ8RL.csv')

In [3]:
# (rows, columns) of each raw frame, displayed as a pair.
(train.shape, test.shape)

((245725, 11), (105312, 10))

In [22]:
# Tag every row with the frame it came from so the stacked frame can be
# split back into train / test after the shared preprocessing.
train = train.assign(source='train')
test = test.assign(source='test')

# Stack train on top of test under a fresh 0..n-1 index, then show the first 10 rows.
frames = [train, test]
data = pd.concat(frames, ignore_index=True)
data.head(10)

Unnamed: 0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead,source
0,0,73,18,1,2,43,0,13.860193,0,0.0,train
1,0,30,27,2,0,32,0,13.274205,0,0.0,train
2,0,56,18,3,2,26,0,14.210464,1,0.0,train
3,1,34,20,2,0,19,0,13.061453,0,0.0,train
4,0,30,32,2,0,33,0,13.69536,0,0.0,train
5,1,56,11,3,0,32,0,13.207004,1,0.0,train
6,1,62,32,1,2,20,1,13.870709,1,1.0,train
7,0,48,15,3,2,13,0,13.005209,1,0.0,train
8,0,40,33,3,1,38,0,14.057895,0,0.0,train
9,0,55,18,3,1,49,1,14.515752,0,0.0,train


In [7]:
# (rows, columns) of the combined train + test frame.
data.shape

(351037, 12)

In [8]:
# Per-column count of missing values in the combined frame.
data.isna().sum()

ID                          0
Gender                      0
Age                         0
Region_Code                 0
Occupation                  0
Channel_Code                0
Vintage                     0
Credit_Product          41847
Avg_Account_Balance         0
Is_Active                   0
Is_Lead                105312
source                      0
dtype: int64

In [9]:
# Fill missing Credit_Product values with 'Yes'.
# The result is assigned back to the column instead of calling an in-place
# method on `data['Credit_Product']`: that selection can be a temporary object
# (always so under pandas Copy-on-Write), in which case `inplace=True` would
# leave `data` untouched and the NaNs would silently survive.
# NOTE(review): imputing 'Yes' is kept from the original; confirm that treating
# "unknown" as 'Yes' is intended rather than giving it its own category.
data['Credit_Product'] = data['Credit_Product'].fillna('Yes')

In [10]:
# Replace Avg_Account_Balance with its natural logarithm.
# The cell reads and overwrites the same column, so running it a second time
# applies the log twice; re-run from the data-loading cell instead.
raw_balance = data['Avg_Account_Balance']
data['Avg_Account_Balance'] = np.log(raw_balance)

In [11]:
# Split the combined frame back into its train / test parts and drop the
# columns that must not reach the model:
#   - 'source' was only a bookkeeping tag for this split,
#   - 'ID' is a row identifier,
#   - 'Is_Lead' is the target and is removed from the test frame only.
# `.drop(columns=...)` returns a new, independent frame, so nothing is mutated
# in place on a slice of `data` (the original `inplace=True` drops ran on
# `.loc` selections, which is the SettingWithCopy pattern).
train = data.loc[data['source'] == 'train'].drop(columns=['source', 'ID'])
test = data.loc[data['source'] == 'test'].drop(columns=['Is_Lead', 'source', 'ID'])



In [12]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical columns of both frames.
# The encoder is fitted ONCE per column on the union of train and test values
# and then applied to both frames. Fitting it separately on each frame (as the
# original did) can assign different integer codes to the same category
# whenever the two frames do not contain exactly the same set of values, which
# makes the test features inconsistent with what the model was trained on.
le = LabelEncoder()
var_mod = ['Gender', 'Region_Code', 'Occupation', 'Channel_Code', 'Credit_Product', 'Is_Active']
for col in var_mod:
    le.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])


In [13]:
# Separate the target column from the predictor columns.
y = train['Is_Lead']
X = train.drop(columns='Is_Lead')


In [14]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

def cross_val(X, y, model, params, folds=9):
    """Run stratified K-fold cross-validation and print the ROC AUC of each fold.

    For every fold a fresh ``model(**params)`` is fitted on the training part,
    with the held-out part passed as ``eval_set`` and early stopping after 100
    rounds; the held-out ROC AUC is then printed.

    Parameters
    ----------
    X : indexable with ``.iloc`` (e.g. a pandas DataFrame) holding the features.
    y : indexable with ``.iloc`` (e.g. a pandas Series) holding the labels.
    model : estimator class; instantiated as ``model(**params)``. Its ``fit``
        must accept ``eval_set``, ``early_stopping_rounds`` and ``verbose``,
        and it must provide ``predict_proba``.
    params : dict of keyword arguments for the estimator constructor.
    folds : number of stratified splits (shuffled, ``random_state=21``).

    Returns
    -------
    The estimator fitted on the LAST fold only; models from earlier folds are
    discarded.

    NOTE(review): the same held-out fold drives early stopping and is scored,
    so the printed AUC may be slightly optimistic.
    """
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=21)
    separator = "-" * 50

    for fold, (fit_rows, val_rows) in enumerate(splitter.split(X, y)):
        print(f"Fold: {fold}")
        x_fit, x_val = X.iloc[fit_rows], X.iloc[val_rows]
        y_fit, y_val = y.iloc[fit_rows], y.iloc[val_rows]

        # Fresh estimator per fold so no state leaks between folds.
        alg = model(**params)
        fit_options = {
            "eval_set": [(x_val, y_val)],
            "early_stopping_rounds": 100,
            "verbose": 400,
        }
        alg.fit(x_fit, y_fit, **fit_options)

        # Probability of the positive class (second column of predict_proba).
        val_proba = alg.predict_proba(x_val)[:, 1]
        roc_score = roc_auc_score(y_val, val_proba)
        print(f"roc_auc_score: {roc_score}")
        print(separator)

    return alg



In [15]:
# Constructor arguments for LGBMClassifier, passed through cross_val.
# n_estimators acts as an upper bound: cross_val fits with
# early_stopping_rounds=100.
# NOTE(review): LightGBM's parameter docs say bagging (`subsample`) only takes
# effect when `subsample_freq` > 0, which is not set here; confirm whether
# subsampling is meant to be active.
lgb_params = dict(
    learning_rate=0.045,
    n_estimators=20000,
    max_bin=94,
    num_leaves=10,
    max_depth=27,
    reg_alpha=8.457,
    reg_lambda=6.853,
    subsample=0.749,
)

In [16]:
from lightgbm import LGBMClassifier

# Cross-validate LightGBM; the returned object is the model fitted on the
# last fold (see cross_val).
lgb_model = cross_val(X, y, model=LGBMClassifier, params=lgb_params)


Fold: 0
Training until validation scores don't improve for 100 rounds
[400]	valid_0's binary_logloss: 0.378501
[800]	valid_0's binary_logloss: 0.37827
Early stopping, best iteration is:
[802]	valid_0's binary_logloss: 0.378264
roc_auc_score: 0.8539183817995556
--------------------------------------------------
Fold: 1
Training until validation scores don't improve for 100 rounds
[400]	valid_0's binary_logloss: 0.382606
Early stopping, best iteration is:
[524]	valid_0's binary_logloss: 0.382519
roc_auc_score: 0.8500343931608946
--------------------------------------------------
Fold: 2
Training until validation scores don't improve for 100 rounds
[400]	valid_0's binary_logloss: 0.384762
Early stopping, best iteration is:
[607]	valid_0's binary_logloss: 0.38465
roc_auc_score: 0.847290061663613
--------------------------------------------------
Fold: 3
Training until validation scores don't improve for 100 rounds
[400]	valid_0's binary_logloss: 0.383285
Early stopping, best iteration is:


In [21]:
# Positive-class probability for every test row from the LightGBM model.
lgb_test_proba = lgb_model.predict_proba(test)
pred_test_lgb = lgb_test_proba[:, 1]

# Put the predictions into the Is_Lead column of the sample submission and
# write the result out without the index column.
sample_submission = pd.read_csv('sample_submission_eyYijxG.csv').assign(Is_Lead=pred_test_lgb)
sample_submission.to_csv('submission.csv', index=False)


In [17]:
# Constructor arguments for XGBClassifier, passed through cross_val.
# n_estimators acts as an upper bound: cross_val fits with
# early_stopping_rounds=100.
xgb_params = dict(
    n_estimators=20000,
    max_depth=6,
    learning_rate=0.0201,
    reg_lambda=29.326,
    subsample=0.818,
    colsample_bytree=0.235,
    colsample_bynode=0.820,
    colsample_bylevel=0.453,
)


In [18]:
from xgboost import XGBClassifier

# Cross-validate XGBoost; the returned object is the model fitted on the
# last fold (see cross_val).
xgb_model = cross_val(X, y, model=XGBClassifier, params=xgb_params)

Fold: 0
[0]	validation_0-logloss:0.68616
[400]	validation_0-logloss:0.39841
[800]	validation_0-logloss:0.38836
[1200]	validation_0-logloss:0.38522
[1600]	validation_0-logloss:0.38395
[2000]	validation_0-logloss:0.38290
[2400]	validation_0-logloss:0.38231
[2800]	validation_0-logloss:0.38185
[3200]	validation_0-logloss:0.38163
[3600]	validation_0-logloss:0.38129
[4000]	validation_0-logloss:0.38109
[4400]	validation_0-logloss:0.38085
[4461]	validation_0-logloss:0.38084
roc_auc_score: 0.852306070779921
--------------------------------------------------
Fold: 1
[0]	validation_0-logloss:0.68608
[400]	validation_0-logloss:0.40060
[800]	validation_0-logloss:0.39111
[1200]	validation_0-logloss:0.38838
[1600]	validation_0-logloss:0.38720
[2000]	validation_0-logloss:0.38626
[2400]	validation_0-logloss:0.38573
[2800]	validation_0-logloss:0.38534
[3200]	validation_0-logloss:0.38511
[3600]	validation_0-logloss:0.38477
[4000]	validation_0-logloss:0.38460
[4400]	validation_0-logloss:0.38440
[4800]	val

KeyboardInterrupt: 

In [None]:
# Constructor arguments for CatBoostClassifier, passed through cross_val.
# n_estimators acts as an upper bound: cross_val fits with
# early_stopping_rounds=100.
cat_params = dict(
    n_estimators=20000,
    depth=4,
    learning_rate=0.023,
    colsample_bylevel=0.655,
    bagging_temperature=0.921,
    l2_leaf_reg=10.133,
)


In [None]:
from catboost import CatBoostClassifier

# Cross-validate CatBoost; the returned object is the model fitted on the
# last fold (see cross_val).
cat_model = cross_val(X, y, model=CatBoostClassifier, params=cat_params)

In [None]:
# Positive-class probabilities on the test frame from each of the three models.
pred_test_lgb = lgb_model.predict_proba(test)[:, 1]
pred_test_xgb = xgb_model.predict_proba(test)[:, 1]
pred_test_cat = cat_model.predict_proba(test)[:, 1]

# Equal-weight blend: element-wise mean of the three probability vectors.
blend_total = pred_test_lgb + pred_test_cat + pred_test_xgb
prediction = blend_total / 3

In [None]:
# Load the CSV used as the template for the blended (lgb + xgb + cat) submission.
# NOTE(review): in the visible cells `prediction` is never written into this
# frame or saved; confirm the remaining steps exist further down.
sample_submission = pd.read_csv(r'sample_submission_lgb_xgb_cat.csv')