In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import catboost as cb
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import ( StratifiedKFold, RepeatedStratifiedKFold,
                                     GridSearchCV, KFold )
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, roc_auc_score 

from collections import Counter
import optuna 

In [2]:
warnings.simplefilter(action='ignore')
%matplotlib inline
plt.style.use('seaborn-notebook')
plt.rcParams["figure.figsize"] = (12, 6)
sns.set(context="paper", font="monospace") 

In [3]:
def check_stats(data):
    stats = []
    for col in data.columns:
        stats.append((col, data[col].nunique(), data[col].isnull().sum() * 100 / data.shape[0],
                    round((data[col].value_counts(normalize=True, dropna=False).values[0] * 100),2), 
                    data[col].dtype))
        
    stats_df = pd.DataFrame(stats, columns=['Feature', 'Unique_values', 'missing values %', 
                                            'biggest category %', 'type']) 
    return stats_df.sort_values('missing values %', ascending=False) 

In [4]:
train_df = pd.read_csv('Train.csv')
test_df = pd.read_csv('Test.csv')
sub_df = pd.read_csv('SampleSubmission.csv') 
print(train_df.shape, test_df.shape) 

(1568, 27) (672, 26)


In [5]:
# a = train_df[train_df['Year_of_Birth'] < 1940].index 
# train_df.drop(a, axis=0, inplace=True) 

In [6]:
# train_df.reset_index(drop=True, inplace=True) 

In [7]:
train_df['Marital_Status'].value_counts() 

Married     615
Together    409
Single      329
Divorced    159
Widow        50
Alone         3
YOLO          2
Absurd        1
Name: Marital_Status, dtype: int64

In [8]:
test_df['Marital_Status'].value_counts() 

Married     249
Together    171
Single      151
Divorced     73
Widow        27
Absurd        1
Name: Marital_Status, dtype: int64

In [9]:
for i in ['Absurd', 'YOLO']: 
    train_df.loc[train_df['Marital_Status'] == i, ['Marital_Status']] = 'others'
    test_df.loc[test_df['Marital_Status'] == i, ['Marital_Status']] = 'others'

In [10]:
value = train_df['Disposable_Income'].median()
train_df.fillna(value, inplace=True)
test_df.fillna(value, inplace=True)

In [11]:
# train_df['Disposable_Income'] = train_df['Disposable_Income'].interpolate(method="linear")
# test_df['Disposable_Income'] = test_df['Disposable_Income'].interpolate(method="linear")

In [12]:
ntrain = train_df.shape[0]
ntest = test_df.shape[0]

data = pd.concat((train_df, test_df)).reset_index(drop=True)
data['Date_Customer'] = pd.to_datetime(data['Date_Customer'])
data['date_year'] = data['Date_Customer'].dt.year
data['date_month'] = data['Date_Customer'].dt.month
data['date_quat'] = data['Date_Customer'].dt.quarter 

data['rwp'] = data['Recency'] / (data['WebVisitsMonth'] + 1)
data['cata'] = data['Amount_on_MeatProducts'] / (data['CatalogPurchases'] + 1)
data['yearly_expenses'] = (data[['Amount_on_Fruits', 'Amount_on_MeatProducts', 'Amount_on_FishProducts',
                                'Amount_on_SweetProducts', 'Amount_on_GoldProds']].sum(axis=1)) / 3
data['spending_ratio'] = data['Disposable_Income'] / data['yearly_expenses'] 


In [13]:
# drop_list = [f'Cmp{i}Accepted' for i in range(1,6)]
# # drop_list = drop_list + ['Any_Complain', 'Disposable_Income','WebVisitsMonth','Year_of_Birth'] 
# drop_list = drop_list + ['Any_Complain'] 
# data.drop(drop_list, axis=1, inplace=True) 

In [14]:
# data['Disposable_Income'] = data['Disposable_Income'].interpolate(method="linear")

In [15]:
# data = pd.get_dummies(data, columns=['Education_Level', 'Marital_Status'], drop_first=True)

In [16]:
for feat in ['Marital_Status', 'Education_Level']:
    data[feat] = pd.factorize(data[feat])[0] 

In [17]:
data.drop(['ID', 'Date_Customer'], axis=1, inplace=True)

In [18]:
train = data[:ntrain]
test = data[ntrain:]
target = train['Response'] 
print(train.shape, test.shape, target.shape) 

(1568, 32) (672, 32) (1568,)


In [19]:
train.drop('Response', axis=1, inplace=True)
test.drop('Response', axis=1, inplace=True)
print(train.shape, test.shape, target.shape) 

(1568, 31) (672, 31) (1568,)


### Xgb 

In [20]:
weight = Counter(target)[0] / Counter(target)[1]
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=24) 
xgb_ = xgb.XGBClassifier(scale_pos_weight=weight, n_jobs=-1, max_depth=4, 
                        random_state=64, sub_sample=0.8, n_estimators=500,
                        gamma=1, min_child_weight=10, colsample_bytree=0.8)


In [21]:
mean_train, mean_test_val = [], []
test_pred = np.zeros(test.shape[0])
val_pred = np.zeros(train.shape[0])


for fold, (train_index, test_index) in enumerate(skf.split(train, target)):
    X_train, X_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    print(f'=====================Fold{fold +1}=====================')
    xgb_.fit(X_train, y_train, early_stopping_rounds=200, eval_metric='auc',
            eval_set=[(X_test, y_test)], verbose=250)
    train_predict = xgb_.predict_proba(X_train)[:,1]
    test_predict = xgb_.predict_proba(X_test)[:,1]
    val_pred[test_index] = test_predict
    test_pred += xgb_.predict_proba(test)[:,1]

    print(f'\nTrain score: {roc_auc_score(y_train, train_predict)}')
    print(f'\nTest score: {roc_auc_score(y_test, test_predict)}')

    mean_train.append(roc_auc_score(y_train, train_predict))
    mean_test_val.append(roc_auc_score(y_test, test_predict))

test_pred = test_pred / 10
print(f'Average Train score for 10 folds split: {np.mean(mean_train)}') 
print(f'Average Test score for 10 folds split: {np.mean(mean_test_val)}')
print(f'Standard deviation for 10 folds split: {np.std(mean_test_val)}') 

Parameters: { "sub_sample" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-auc:0.79511
[243]	validation_0-auc:0.89850

Train score: 0.995613284592051

Test score: 0.9069548872180451
Parameters: { "sub_sample" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-auc:0.77976
[229]	validation_0-auc:0.88659

Train score: 0.9900871120790231

Test score: 0.9119674185463658
Parameters: { "sub_sample" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or so

In [22]:
pd.Series(test_pred).describe() 

count    672.000000
mean       0.237126
std        0.253954
min        0.019728
25%        0.047505
50%        0.114212
75%        0.348421
max        0.967661
dtype: float64

In [50]:
threshold = 0.4
predictions = [1 if p > threshold else 0 for p in test_pred] 

In [51]:
sub_df['Response'] = predictions
sub_df['Response'].value_counts() 

0    523
1    149
Name: Response, dtype: int64

In [49]:
sub_df.to_csv('csv/test-xgboost.csv', index=False) 

In [40]:
sub_df['Response'] = test_pred 
sub_df.to_csv('csv/to-weight-test-xgboost.csv', index=False) 

In [43]:
print(523/672)
print(1329/(1329 + 239)) 

0.7782738095238095
0.8475765306122449


### LightGBM

In [44]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2020)
lgb_ = lgb.LGBMClassifier(random_state=2021, scale_pos_weight=weight, n_estimators=1500,
                        colsample_bytree=0.8, min_child_samples=10, subsample=0.7,
                        subsample_freq=5, num_leaves=120, metric='auc', learning_rate=0.01,
                        max_depth=7) 

In [45]:
mean_train, mean_test_val = [], []
test_pred = np.zeros(test.shape[0])
val_pred = np.zeros(train.shape[0])


for fold, (train_index, test_index) in enumerate(skf.split(train, target)):
    X_train, X_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    print(f'=====================Fold{fold +1}=====================')
    lgb_.fit(X_train, y_train, early_stopping_rounds=200, eval_metric='auc',
            eval_set=[(X_test, y_test)], verbose=250)
    train_predict = lgb_.predict_proba(X_train)[:,1]
    test_predict = lgb_.predict_proba(X_test)[:,1]
    val_pred[test_index] = test_predict
    test_pred += lgb_.predict_proba(test)[:,1]

    print(f'\nTrain score: {roc_auc_score(y_train, train_predict)}')
    print(f'\nTest score: {roc_auc_score(y_test, test_predict)}')

    mean_train.append(roc_auc_score(y_train, train_predict))
    mean_test_val.append(roc_auc_score(y_test, test_predict))

test_pred = test_pred / 10
print(f'Average Train score for 10 folds split: {np.mean(mean_train)}') 
print(f'Average Test score for 10 folds split: {np.mean(mean_test_val)}')
print(f'Standard deviation for 10 folds split: {np.std(mean_test_val)}') 

Training until validation scores don't improve for 200 rounds
[250]	valid_0's auc: 0.878759
Early stopping, best iteration is:
[240]	valid_0's auc: 0.880326

Train score: 0.9970230224780274

Test score: 0.8803258145363408
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2]	valid_0's auc: 0.932018

Train score: 0.9193766041844911

Test score: 0.9320175438596492
Training until validation scores don't improve for 200 rounds
[250]	valid_0's auc: 0.854323
[500]	valid_0's auc: 0.870927
[750]	valid_0's auc: 0.880013
[1000]	valid_0's auc: 0.886591
Early stopping, best iteration is:
[1006]	valid_0's auc: 0.887845

Train score: 0.9998658318425762

Test score: 0.8878446115288221
Training until validation scores don't improve for 200 rounds
[250]	valid_0's auc: 0.890038
[500]	valid_0's auc: 0.892857
Early stopping, best iteration is:
[325]	valid_0's auc: 0.89599

Train score: 0.9987069300770008

Test score: 0.8959899749373434
Training until validati

In [47]:
pd.Series(test_pred).describe() 

count    672.000000
mean       0.179398
std        0.186587
min        0.035637
25%        0.050163
50%        0.088352
75%        0.225735
max        0.773399
dtype: float64

In [65]:
threshold = 0.27
predictions = [1 if p > threshold else 0 for p in test_pred]  

In [66]:
sub_df['Response'] = predictions
sub_df['Response'].value_counts()

0    527
1    145
Name: Response, dtype: int64

In [67]:
sub_df.to_csv('csv/test-lgbm.csv', index=False)  

In [68]:
sub_df['Response'] = test_pred 
sub_df.to_csv('csv/to-weight-test-lgbm.csv', index=False) 

### Weighted

In [87]:
xgb_pred = pd.read_csv('csv/to-weight-test-xgboost.csv')
lgb_pred = pd.read_csv('csv/to-weight-test-lgbm.csv') 

pred = ((0.6 * xgb_pred['Response'] + 0.4 * lgb_pred['Response']) + (0.65 * xgb_pred['Response'] + 0.35 * lgb_pred['Response'])) / 2 

In [88]:
pd.Series(pred).describe() 

count    672.000000
mean       0.215478
std        0.227295
min        0.025694
25%        0.048987
50%        0.107061
75%        0.307521
max        0.892554
Name: Response, dtype: float64

In [95]:
threshold = 0.27
predictions = [1 if p > threshold else 0 for p in pred]  

In [96]:
sub_df['Response'] = predictions
sub_df['Response'].value_counts() 

0    472
1    200
Name: Response, dtype: int64

In [97]:
sub_df.to_csv('csv/test-weighted.csv', index=False)  