### Loading Important Libraries 

In [1]:
import pandas as pd 
import numpy as np 
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import KBinsDiscretizer , LabelEncoder
from scipy import stats
from sklearn.metrics import roc_auc_score
import lightgbm
from lightgbm import LGBMClassifier

### Loading Data

In [2]:
#Loading train data 
train_data = pd.read_csv("train.csv")
#Loading test data 
test_data =  pd.read_csv("test.csv")

In [3]:
#Numerical columns
num_cols = ['Age', 'Vintage', 'Avg_Account_Balance']
#Categorical columns
cat_cols = ['Gender', 'Region_Code', 'Occupation', 'Channel_Code', 'Credit_Product', 'Is_Active']
#Target
target = 'Is_Lead'

### Data Preprocessing 

Missing Value Imputation 

In [4]:
## Filling missing values with another level, treating them as a separate category
## "UA" = Unavailable
train_data = train_data.fillna("UA")
test_data = test_data.fillna("UA")

Outlier Removal

In [5]:
## Removing 'Avg Account Balance' in train data,
## Which are larger than max value of 'Avg Account Balance' in test data.

train_data = train_data[train_data['Avg_Account_Balance']<=max(test_data.Avg_Account_Balance)].reset_index(drop=True)

### Feature Extraction / Feature Engineering 

Numerical Features 

In [6]:
## Log Transforming 'Avg Account Balance'
train_data['Avg_Account_Balance_log'] = np.log(train_data['Avg_Account_Balance'])
test_data['Avg_Account_Balance_log'] = np.log(test_data['Avg_Account_Balance'])

In [7]:
## Boxcox transformation for 'Vintage' column
fitted_data, fitted_lambda = stats.boxcox(np.abs(train_data['Vintage']))
train_data['Vintage_box'] = fitted_data
fitted_data, fitted_lambda = stats.boxcox(np.abs(test_data['Vintage']))
test_data['Vintage_box'] = fitted_data

In [8]:
## Binning 'Age' 
bins = KBinsDiscretizer(n_bins=8, encode='ordinal', strategy='quantile')
bins.fit(train_data['Age'].values.reshape(-1,1))
train_data['Age_quantiles'] = bins.transform(train_data['Age'].values.reshape(-1,1)).astype(int)
test_data['Age_quantiles'] = bins.transform(test_data['Age'].values.reshape(-1,1)).astype(int)

In [9]:
## Binning 'Avg Account Balance' 
bins = KBinsDiscretizer(n_bins=8, encode='ordinal', strategy='quantile')
bins.fit(train_data['Vintage'].values.reshape(-1,1))
train_data['Vintage_quantiles'] = bins.transform(train_data['Vintage'].values.reshape(-1,1)).astype(int)
test_data['Vintage_quantiles'] = bins.transform(test_data['Vintage'].values.reshape(-1,1)).astype(int)

Categorical Features

In [10]:
# Label Encoding the categorical features 

## Custom fuction for label encoding 

def df_lbl_enc(df):
    for c in cat_cols:
        lbl = LabelEncoder()
        df[c] = lbl.fit_transform(df[c])
    return df

## Label Encoding Categorical Columns in train data 

train_data = df_lbl_enc(train_data)

## Label Encoding Categorical Columns in test data 

test_data = df_lbl_enc(test_data)

In [11]:
# Frequency Encoding the 'Region Code'.
## Train Data 
dic = train_data['Region_Code'].value_counts(normalize=True).to_dict()
train_data['Region_Code'] = train_data['Region_Code'].map(dic)
## Test Data
dic = test_data['Region_Code'].value_counts(normalize=True).to_dict()
test_data['Region_Code'] = test_data['Region_Code'].map(dic)

In [19]:
## New Categorical Columns After Feature Extraction 

new_cat_cols = ['Gender','Occupation','Channel_Code', 'Credit_Product', 'Is_Active', 'Age_quantiles','Vintage_quantiles']

### Train/Test Split 

In [20]:
## Preparing Train data 
## Dropping few columns
X_train = train_data.drop(['ID','Is_Lead','Vintage', 'Avg_Account_Balance'], axis=1)
y_train = train_data[target].values
## Preparing Test data 
## Dropping few columns
ID = test_data['ID']
X_test = test_data.drop(['ID', 'Vintage', 'Avg_Account_Balance'], axis=1)

### Model Building 

In [40]:
from catboost import CatBoostClassifier

Cat_model = CatBoostClassifier(loss_function = 'Logloss',
                               depth = 8,
                               learning_rate =0.1,
                               iterations=300,
                               )

Cat_model.fit(X_train,y_train,cat_features = new_cat_cols)

0:	learn: 0.6144556	total: 196ms	remaining: 58.8s
1:	learn: 0.5532944	total: 364ms	remaining: 54.3s
2:	learn: 0.5055481	total: 559ms	remaining: 55.4s
3:	learn: 0.4691851	total: 712ms	remaining: 52.7s
4:	learn: 0.4419488	total: 911ms	remaining: 53.7s
5:	learn: 0.4212151	total: 1.11s	remaining: 54.6s
6:	learn: 0.4059975	total: 1.24s	remaining: 52s
7:	learn: 0.3936128	total: 1.42s	remaining: 51.7s
8:	learn: 0.3839357	total: 1.58s	remaining: 51s
9:	learn: 0.3766059	total: 1.78s	remaining: 51.7s
10:	learn: 0.3706626	total: 1.95s	remaining: 51.3s
11:	learn: 0.3659076	total: 2.17s	remaining: 52s
12:	learn: 0.3620484	total: 2.35s	remaining: 51.8s
13:	learn: 0.3595186	total: 2.43s	remaining: 49.6s
14:	learn: 0.3572005	total: 2.59s	remaining: 49.2s
15:	learn: 0.3552708	total: 2.79s	remaining: 49.5s
16:	learn: 0.3537208	total: 3.04s	remaining: 50.6s
17:	learn: 0.3524185	total: 3.26s	remaining: 51s
18:	learn: 0.3513302	total: 3.5s	remaining: 51.8s
19:	learn: 0.3507556	total: 3.64s	remaining: 51s
2

<catboost.core.CatBoostClassifier at 0x2c2c7c4ddc0>

### Prediction / Train Score 

In [41]:
#Predicting on train data 
train_pred = Cat_model.predict_proba(X_train)[:,1]
# Getting ROC_AUC Score 
Train_score_LGB = roc_auc_score(y_train,train_pred)
# Printing Score 
print(Train_score_LGB)

0.8824517705694443


### Predicting on test data and final submission 

In [42]:
#Predicting on test data 
test_pred = Cat_model.predict_proba(X_test)[:,1]

In [43]:
# Predictions 
res = pd.DataFrame(test_pred)
ID = pd.DataFrame(ID)
res = res.rename(columns={res.columns[0]: 'Is_Lead'})
submission = pd.concat([ID,res], axis = 1)

submission.to_csv("Catboost_last.csv",index = False)