In [234]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split


In [158]:
train, target = pd.read_csv('./input/orange_small_churn_data.train'), \
pd.read_csv('./input/orange_small_churn_labels.train', header=-1)

In [159]:
X, X_holdout, y, y_holdout = train_test_split(train, target, test_size=5000, random_state=42, stratify=target)

Preprocessing

In [160]:
# выставим порог отсева пустых значений на 95%
X = X[X.columns[(X.isnull().sum() / X.shape[0] ) < 0.95]].copy()

In [161]:
X.dtypes.value_counts()

float64    42
object     34
int64       1
dtype: int64

In [162]:
f_cat = X.columns[X.dtypes == "object"]
f_num = X.columns[X.dtypes != "object"]

In [163]:
# for linear model 
medians = X[f_num].median()
X_lin = X[f_num].fillna(medians).copy()

cat_dummies = pd.get_dummies(X[f_cat].fillna(-1))


In [167]:
# сразу отсеим совсем бесполезные фичи по порогу значимости chi2
from sklearn.feature_selection import chi2
chi2_pval = chi2(cat_dummies, y)[1]

X_lin = pd.concat([X_lin, cat_dummies.loc[:, chi2_pval < 0.05]], axis=1)

In [194]:
# for tree-based model

X_tree = X.fillna(-1).copy()
for c in f_cat:
    X_tree[c] = pd.factorize(X[c])[0]

Cross-validation

In [235]:
def cross_val(X, y, model, kf):
    X, y = np.array(X), np.array(y).reshape(-1)
    cv_scores = np.zeros((5,4), dtype=np.float32)

    for i, (train_index, val_index) in enumerate(kf.split(X, y)):

        print( "Fold ", i)

        y_train, y_val = y[train_index].copy(), y[val_index].copy()
        X_train, X_val = X[train_index, :].copy(), X[val_index, :].copy()
        
        fit_model = model.fit(X_train, y_train)
        pred = fit_model.predict(X_val)

        cv_scores[i, :] = [f1_score(y_val, pred), 
                           precision_score(y_val, pred), 
                           recall_score(y_val, pred), 
                           roc_auc_score(y_val, pred)]
        
    return cv_scores


In [236]:
def print_metrics(cv_scores):
    metrics = ['f1', 'precision', 'recall', 'roc auc']
    cvmean = cv_scores.mean(0)
    for i in range(4):
        print("{} = {:.5f}".format(metrics[i], cvmean[i]))

In [238]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

In [239]:
# random forest 
model = RandomForestClassifier(n_estimators=100, max_depth=6, class_weight='balanced')
cv_scores = cross_val(X_tree,y,model,kf)
print_metrics(cv_scores)

Fold  0
Fold  1
Fold  2
Fold  3
Fold  4
f1 = 0.22007
precision = 0.13440
recall = 0.60713
roc auc = 0.64639


In [240]:
# log reg
model = LogisticRegression(class_weight='balanced')
cv_scores = cross_val(X_lin,y,model,kf)
print_metrics(cv_scores)

Fold  0
Fold  1
Fold  2
Fold  3
Fold  4
f1 = 0.18241
precision = 0.10792
recall = 0.58909
roc auc = 0.59883


In [241]:
model = XGBClassifier(n_estimators=100, max_depth=6)
cv_scores = cross_val(X_tree,y,model,kf)
print_metrics(cv_scores)

Fold  0
Fold  1
Fold  2
Fold  3
Fold  4
f1 = 0.05012
precision = 0.63787
recall = 0.02612
roc auc = 0.51241
