# First models

In [88]:
import pandas as pd
import numpy as np
import category_encoders as ce
import catboost as cb
import xgboost as xgb
import lightgbm as lgb

import sys
import json

from collections import defaultdict
from tqdm import tqdm_notebook as tqdm

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import precision_score, confusion_matrix, f1_score, make_scorer, explained_variance_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier

sys.path.append('../src')
from converters import MLConverter, CatBoostConverter

In [89]:
df = pd.read_csv('../data/train.txt', sep=' ')
df = df.reset_index(drop=True)
print(f'Columns: {len(df.columns)}')
df.head()

Columns: 231


Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230,class
0,,,,,,931.0,7.0,,,,...,catzS2D,LM8l689qOp,,ELof,szEZ,ZI9m,ib5G6X1eUxUn6,,,0
1,,,,,,245.0,7.0,,,,...,bTV7qqc,LM8l689qOp,,,fKCe,RAYp,F2FyR07IdsN7I,,,0
2,,,,,,791.0,7.0,,,,...,2JfQ3DB,jySVZNlOJy,,ELof,7aLG,RAYp,F2FyR07IdsN7I,am7c,,0
3,,,,,,1036.0,7.0,,,,...,hHJsvbM,LM8l689qOp,,,Qcbd,6fzt,SbOd7O8ky1wGNxp0Arj0Xs,,,0
4,,,,,,518.0,7.0,,,,...,APgVoGr,LM8l689qOp,,,kwS7,02N6s8f,xwM2aC7IdeMC0,,,0


In [90]:
data_json= {}

In [91]:
max_categories = 2000
fit_na_threshold = 0.1
fill_na_threshold = 0.5

In [92]:
train_df, test_df = train_test_split(df, test_size=0.3)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
na_cols = train_df.columns[train_df.isna().all()].tolist()
train_df = train_df.drop(na_cols, axis=1)
test_df = test_df.drop(na_cols, axis=1)
data_json['na_columns'] = na_cols
print(f'Columns: {len(train_df.columns)}, {len(test_df.columns)}')

Columns: 213, 213


In [93]:
len(train_df), len(test_df), all(train_df.columns == test_df.columns)

(28000, 12000, True)

### Drop NA columns

In [94]:
na_cols = train_df.columns[train_df.isna().all()].tolist()
train_df = train_df.drop(na_cols, axis=1)
test_df = test_df.drop(na_cols, axis=1)
data_json['na_columns'] = na_cols
print(f'Columns: {len(train_df.columns)}, {len(test_df.columns)}')

Columns: 213, 213


### Find categorical columns


In [95]:
cat_cols = train_df.select_dtypes(object).columns.tolist()  # categorical columns
unique_values = sorted([(col_name, len(train_df[col_name].unique())) for col_name in cat_cols], key=lambda x: x[1])
chosen_cat_cols = [v[0] for v in unique_values if v[1] < max_categories]  # cat columns with less than MAX_CAT unique cats
many_cat_cols = list(set(cat_cols) - set(chosen_cat_cols))  # cat columns with more than MAX_CAT unique cats
train_df = train_df.drop(many_cat_cols, axis=1)
test_df = test_df.drop(many_cat_cols, axis=1)
data_json['category_columns'] = unique_values

# encoder = ce.OrdinalEncoder(cols=chosen_cat_cols)
# encoder.fit(train_df)
# train_df = encoder.transform(train_df)
# test_df = encoder.transform(test_df)

encoder = ce.TargetEncoder(cols=chosen_cat_cols, smoothing=300)
encoder.fit(train_df, train_df.loc[:, 'class'])
train_df = encoder.transform(train_df)
test_df = encoder.transform(test_df)


print(f'Columns: {len(train_df.columns)}, {len(test_df.columns)}')

Columns: 205, 205


### Columns which may be used for fitting

In [96]:
data_json['na_percent'] = sorted(train_df.isna().mean().to_dict().items(), key=lambda x: x[1])

In [97]:
fit_cols = train_df.columns[train_df.isna().mean() < fit_na_threshold].tolist()
fit_cols = list(set(fit_cols).union(set(chosen_cat_cols)))
print(len(fit_cols))

34


### Other columns with more than ?% NA 

In [98]:
# na_cols = train_df.columns[train_df.isna().mean() > fill_na_threshold]
# train_df = train_df.drop(na_cols, axis=1)
# test_df = test_df.drop(na_cols, axis=1)
# print(f'Columns: {len(train_df.columns)}')

In [99]:
to_fill_cols = train_df.columns[train_df.isna().mean() < fill_na_threshold]
# to_fill_cols = set(to_fill_cols) - set(fit_cols)
# to_fill_cols = set(df.columns) - set(fit_cols)
print(f'To fill columns: {len(to_fill_cols)}')

To fill columns: 72


### Testing fillers

In [100]:
scores = {}

for fill_col in tqdm(to_fill_cols):
    non_na_idx = ~train_df[fill_col].isna()
    tmp_fit_cols = ~train_df[fit_cols].columns.isin([fill_col, 'class'])
    # model =  lgb.sklearn.LGBMRegressor(n_estimators=200)
    # model = cb.CatBoostRegressor(iterations=300, task_type="GPU", logging_level='Silent')
    model = xgb.sklearn.XGBRegressor(n_estimators=100, n_jobs=-1)
    X = train_df.loc[non_na_idx, tmp_fit_cols]
    y = train_df.loc[non_na_idx, fill_col]
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)
    train_X.reset_index(drop=True), train_y.reset_index(drop=True) 
    test_X.reset_index(drop=True), test_y.reset_index(drop=True) 
    model.fit(train_X, train_y) #, categorical_feature=chosen_cat_cols)
    pred_y = model.predict(test_X)
    score = explained_variance_score(test_y, pred_y)
    scores[fill_col] = score

HBox(children=(IntProgress(value=0, max=72), HTML(value='')))




In [101]:
sorted(scores.items(), key=lambda x: x[1])

[('Var74', -7.8203801428132085),
 ('Var44', -0.04849636220187015),
 ('Var143', -0.02752007606228446),
 ('Var173', -0.00906780344427216),
 ('Var196', -0.008266437549029781),
 ('Var57', -0.0021810325874314085),
 ('Var195', -0.0011698509113944322),
 ('Var208', 0.0),
 ('Var219', 6.255427256696144e-05),
 ('Var203', 0.00012221435247827817),
 ('Var205', 0.0002705952016610613),
 ('Var94', 0.007203802202929044),
 ('Var197', 0.011500012042159002),
 ('Var218', 0.019853549152461736),
 ('Var125', 0.019856466283390328),
 ('class', 0.032587289664636754),
 ('Var221', 0.03853017690054217),
 ('Var192', 0.04552200245153726),
 ('Var207', 0.04553573700778324),
 ('Var204', 0.04700375619945585),
 ('Var227', 0.058434971799780744),
 ('Var223', 0.07768010663099334),
 ('Var78', 0.09075889986908381),
 ('Var126', 0.11395548167507963),
 ('Var181', 0.12635013465367018),
 ('Var226', 0.12776618463453826),
 ('Var193', 0.131940896305893),
 ('Var211', 0.13592954205037233),
 ('Var228', 0.13849800336122864),
 ('Var212', 0.

In [102]:
chosen_to_fill = [k for k in scores if scores[k] > 0.8]
rest = [k for k in scores if scores[k] < 0.8]

In [103]:
len(fit_cols), len(rest), len(chosen_to_fill), len(train_df.columns), len(test_df.columns)

(34, 57, 15, 205, 205)

In [104]:
train_df[chosen_to_fill].describe()

Unnamed: 0,Var6,Var7,Var13,Var21,Var22,Var24,Var25,Var28,Var35,Var38,Var119,Var153,Var160,Var213,Var215
count,24834.0,24824.0,24824.0,24834.0,25116.0,23896.0,25116.0,25114.0,25116.0,25116.0,24834.0,25116.0,25116.0,28000.0,28000.0
mean,1307.633486,6.81671,1251.28553,231.091568,285.933867,4.428356,95.799331,223.469824,0.727027,2553521.0,898.884996,6140802.0,38.45477,0.072797,0.072658
std,2568.988161,6.319764,2920.999731,528.492834,657.521844,9.492922,197.306871,94.987921,3.042433,3016627.0,2004.275881,4349993.0,94.54411,0.002007,0.003644
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-46.8,0.0,0.0,0.0,0.0,0.0,0.059801,0.043131
25%,518.0,0.0,0.0,112.0,135.0,0.0,16.0,166.56,0.0,7444.5,425.0,1224142.0,10.0,0.073107,0.073107
50%,854.0,7.0,232.0,144.0,180.0,2.0,48.0,220.08,0.0,1184787.0,560.0,7970520.0,22.0,0.073107,0.073107
75%,1421.0,7.0,1612.0,228.0,285.0,6.0,120.0,265.76,0.0,4522248.0,895.0,10371530.0,42.0,0.073107,0.073107
max,131761.0,140.0,197872.0,24940.0,31175.0,494.0,7432.0,3071.68,110.0,18846900.0,105060.0,13875160.0,4030.0,0.073107,0.073107


In [105]:
for fill_col in tqdm(chosen_to_fill):
    non_na_idx = ~train_df[fill_col].isna()
    tmp_fit_cols = ~train_df[fit_cols].columns.isin([fill_col, 'class'])
    model = xgb.sklearn.XGBRegressor(n_estimators=100, n_jobs=-1)
    # model =  lgb.sklearn.LGBMRegressor(n_estimators=200)
    train_X = train_df.loc[non_na_idx, tmp_fit_cols]
    train_y = train_df.loc[non_na_idx, fill_col]
    model.fit(train_X, train_y) #, categorical_feature=chosen_cat_cols)
    pred_X = train_df.loc[:, tmp_fit_cols]
    pred_y = model.predict(pred_X)
    train_df.loc[:, fill_col] = train_df.loc[:, fill_col].fillna(pd.Series(pred_y))
    pred_X = test_df.loc[:, tmp_fit_cols]
    pred_y = model.predict(pred_X)
    test_df.loc[:, fill_col] = test_df.loc[:, fill_col].fillna(pd.Series(pred_y))

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [106]:
train_df.loc[:, chosen_to_fill].isna().mean()

Var6      0.0
Var7      0.0
Var13     0.0
Var21     0.0
Var22     0.0
Var24     0.0
Var25     0.0
Var28     0.0
Var35     0.0
Var38     0.0
Var119    0.0
Var153    0.0
Var160    0.0
Var213    0.0
Var215    0.0
dtype: float64

In [115]:
means = train_df.loc[:, rest].mean()
train_df.loc[:, rest] = train_df.loc[:, rest].fillna(means)
test_df.loc[:, rest] = test_df.loc[:, rest].fillna(means)

In [108]:
len(train_df.columns)

205

In [109]:
# df.to_csv('../data/features.csv', index=False)

# PART II

In [110]:
# df = pd.read_csv('../data/features.csv')

In [111]:
# len(df.columns)

In [116]:
train_X = train_df.drop('class', axis=1)
train_y = train_df['class']

test_X = test_df.drop('class', axis=1)
test_y = test_df['class']

In [86]:
# model = cb.CatBoostClassifier(iterations=1000, depth=8, eval_metric="AUC", task_type='GPU', logging_level='Silent', l2_leaf_reg=6)
# model.fit(train_X, train_y, cat_features=chosen_cat_cols)

# model = lgb.sklearn.LGBMClassifier()
# model.fit(train_X, train_y, categorical_feature=chosen_cat_cols) 

# model = xgb.sklearn.XGBClassifier(n_estimators=100)
# model.fit(train_X, train_y)


<catboost.core.CatBoostClassifier at 0x7fc6468ce160>

In [117]:
model = xgb.sklearn.XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=100, reg_lambda=3)
model.fit(train_X, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=3, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [118]:
def custom_score(y_true, y_pred):
    ind = np.argsort(y_pred)[::-1]
    check_size = int(np.ceil(0.1 * len(y_true)))
    y_true = y_true[ind][:check_size]
    return np.mean(y_true)

y_preds_proba = model.predict_proba(test_X)[:,1]
c_score = custom_score(test_y, y_preds_proba)
auc = roc_auc_score(test_y, y_preds_proba)
c_score, auc

(0.39666666666666667, 0.8490104964675271)

In [23]:
idx = np.argsort(model.feature_importances_)[::-1][:64]
best_features = np.array(model.feature_names_)[idx]
np.sum(model.feature_importances_[idx])

97.18913484805886

In [24]:
model = cb.CatBoostClassifier(iterations=1000, depth=8, eval_metric="AUC", task_type='GPU', logging_level='Silent', l2_leaf_reg=3)
                             # one_hot_max_size=32)
model.fit(train_X[best_features], train_y, cat_features=set(chosen_cat_cols).intersection(set(best_features)))

<catboost.core.CatBoostClassifier at 0x7f67f22d7160>

In [25]:
y_preds_proba = model.predict_proba(test_X[best_features])[:,1]
c_score = custom_score(test_y.values, y_preds_proba)
auc = roc_auc_score(test_y.values, y_preds_proba)
c_score, auc

(0.41, 0.869078457125441)

In [63]:
# model = cb.CatBoostClassifier(iterations=500, task_type='GPU', logging_level='Silent')
model = RandomForestRegressor()
scorers = {
    'custom': make_scorer(custom_score, greater_is_better=True, needs_proba=True),
    'roc_auc': make_scorer(roc_auc_score, greater_is_better=True, needs_proba=True),
}
scores = cross_validate(model, X, y, scoring=scorers, cv=3)

In [64]:
scores



{'fit_time': array([30.34519362, 29.9092176 , 29.61446047]),
 'score_time': array([0.22051287, 0.21404934, 0.22123337]),
 'test_custom': array([0.42728636, 0.39130435, 0.39805097]),
 'train_custom': array([0.57030371, 0.54105737, 0.56730409]),
 'test_roc_auc': array([0.87386933, 0.85345332, 0.85991854]),
 'train_roc_auc': array([0.94899372, 0.93676128, 0.94856453])}