In [1]:
import numpy as np
import pandas as pd
import os
import sklearn
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, RepeatedKFold
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, RFECV, SelectFromModel
from imblearn.over_sampling import SMOTE
from boruta import BorutaPy
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

Using TensorFlow backend.


## Read Data

In [2]:
def read_data():
    """Load the raw competition tables from the ``../input`` directory.

    Returns:
        tuple: ``(train_orig, test_orig)`` DataFrames read from ``train.csv``
        and ``test.csv`` respectively.
    """
    print("############# Read Data #############")

    csv_paths = ('../input/train.csv', '../input/test.csv')
    train_frame, test_frame = (pd.read_csv(path) for path in csv_paths)

    return train_frame, test_frame

## Preprocess Data

In [3]:
def preprocess_data(train_orig, test_orig, scale_features = True):
    """Separate the target from the features and optionally standardise them.

    Args:
        train_orig: Raw training frame; must contain the 'target' and 'id' columns.
        test_orig: Raw test frame; must contain the 'id' column.
        scale_features: When True, standardise the features with a
            ``StandardScaler`` whose statistics are learned from the training
            features only and then re-used for the test features.

    Returns:
        tuple: ``(train, test, target)`` where ``train``/``test`` are new feature
        frames (the inputs are left untouched) and ``target`` is the 'target'
        column of ``train_orig``.
    """
    print("############# Preprocess Data #############")

    target = train_orig['target'].copy()
    # `columns=` instead of a positional axis: the positional form is rejected by pandas 2.x.
    train = train_orig.drop(columns = ['target', 'id'])
    test = test_orig.drop(columns = ['id'])

    if scale_features:
        scaler = StandardScaler()
        # Keep the original labels/index so later cells can still select features by name.
        train = pd.DataFrame(scaler.fit_transform(train), columns = train.columns, index = train.index)
        # Only transform here: re-fitting on the test set would scale it with different
        # statistics than the ones the model was trained on.
        test = pd.DataFrame(scaler.transform(test), columns = test.columns, index = test.index)

    return train, test, target

## SMOTE

In [4]:
def smote(train, target):
    """Resample the training data with imblearn's ``SMOTE``.

    The sampler is created with ``sampling_strategy = 'minority'`` and no
    explicit random state.

    Args:
        train: Feature DataFrame; its column labels are re-applied to the output.
        target: Labels aligned with ``train``.

    Returns:
        tuple: ``(train, target)`` as DataFrames holding the resampled data.
    """
    print("############# SMOTE #############")

    feature_names = train.columns
    resampled_x, resampled_y = SMOTE(sampling_strategy = 'minority').fit_resample(train, target)

    balanced_train = pd.DataFrame(resampled_x)
    balanced_target = pd.DataFrame(resampled_y)
    # Restore the original feature labels on the resampled frame.
    balanced_train.columns = feature_names
    return balanced_train, balanced_target

## Feature Engineering

In [5]:
def create_features(train, test, add_pca = False, add_kmeans = False):
    """Append row-wise summary statistics of the raw features to both frames.

    Eight columns are added to each frame: 'sum', 'mean', 'std', 'min', 'max',
    'var', 'skew' and 'kurtosis'. Every statistic is computed over the raw
    feature columns only; previously each one was computed over a frame that
    already contained the earlier aggregates (e.g. 'mean' included 'sum').
    Columns that already carry one of these names are excluded from the
    computation, so calling the function again on its own output yields the
    same values.

    Args:
        train: Training feature frame; modified in place.
        test: Test feature frame; modified in place.
        add_pca: Accepted for interface compatibility; currently unused.
        add_kmeans: Accepted for interface compatibility; currently unused.

    Returns:
        tuple: the same ``(train, test)`` objects with the new columns added.
    """
    print("############# Feature Engineering #############")

    # Each name is both the new column label and the DataFrame reduction to call.
    row_stats = ('sum', 'mean', 'std', 'min', 'max', 'var', 'skew', 'kurtosis')

    for frame in (train, test):
        raw_columns = [col for col in frame.columns if col not in row_stats]
        # Selecting with a list yields a separate frame, so adding columns to
        # `frame` below cannot leak into the statistics.
        raw = frame[raw_columns]
        for stat in row_stats:
            frame[stat] = getattr(raw, stat)(axis = 1)

    return train, test

## Looking at the feature distributions

In [6]:
# sns.kdeplot(atrain['3'], bw = 0.5, label = "train")
# sns.kdeplot(test['3'], bw = 0.5, label = "test")

In [7]:
# np.mean(train.var())

## Feature Selection

In [8]:
def get_feature_importances(data, target, shuffle = False, seed = 42):
    """Fit a LightGBM model in 'rf' boosting mode and report per-feature importances.

    Args:
        data: Feature DataFrame; its columns become the 'feature' entries.
        target: Labels for ``data``.
        shuffle: When True the labels are permuted with ``sample(frac = 1.0)``
            (no random state is passed) before fitting, which is how the
            null-importance runs are produced.
        seed: Value forwarded as the LightGBM 'seed' parameter.

    Returns:
        DataFrame with one row per feature and the columns 'feature',
        'importance_gain', 'importance_split' and 'trn_score' (the ROC AUC of
        the fitted model on the data it was trained on, repeated on every row).
    """
    feature_names = list(data.columns)

    # Either the real labels or a permuted copy of them.
    labels = target.copy().sample(frac = 1.0) if shuffle else target.copy()

    rf_params = {
        'objective': 'binary',
        'boosting_type': 'rf',
        'subsample': 0.623,
        'colsample_bytree': 0.7,
        'num_leaves': 127,
        'max_depth': 6,
        'seed': seed,
        'bagging_freq': 1,
        'n_jobs': -1
    }

    lgb_data = lgb.Dataset(data, labels, free_raw_data = False, silent = True)
    booster = lgb.train(params = rf_params, train_set = lgb_data, num_boost_round = 200)

    return pd.DataFrame({
        "feature": feature_names,
        "importance_gain": booster.feature_importance(importance_type = 'gain'),
        "importance_split": booster.feature_importance(importance_type = 'split'),
        'trn_score': roc_auc_score(labels, booster.predict(data)),
    })

def get_feature_scores(true_df, noise_df):
    """Score each feature by how often its real importance beats the null runs.

    Args:
        true_df: Importances from the real target, with the columns 'feature',
            'importance_gain' and 'importance_split'.
        noise_df: Importances from shuffled-target runs, same columns.

    Returns:
        list of ``(feature, split_score, gain_score)`` tuples, in the order the
        features first appear in ``true_df``. Each score is the percentage of
        null importances strictly below the real importance.
    """
    def pct_null_below(feature_name, column):
        # Share (in percent) of null-run values that fall below the real value.
        null_values = noise_df.loc[noise_df['feature'] == feature_name, column].values
        real_values = true_df.loc[true_df['feature'] == feature_name, column].values
        return 100 * (null_values < real_values).sum() / null_values.size

    return [
        (
            feature_name,
            pct_null_below(feature_name, 'importance_split'),
            pct_null_below(feature_name, 'importance_gain'),
        )
        for feature_name in true_df['feature'].unique()
    ]

def get_imp_features_using_thresholding(correlation_scores, data, target):
    """Pick the gain-score threshold whose surviving features give the best CV AUC.

    For each threshold in a fixed grid, the features whose gain score is at
    least the threshold are evaluated with 5-fold LightGBM cross-validation,
    and the feature list with the highest mean AUC is returned.

    Thresholds that leave no feature are skipped instead of being passed to
    LightGBM with an empty column list. The split-score feature sets are no
    longer cross-validated: their result never influenced the return value,
    so evaluating them only doubled the runtime.

    Args:
        correlation_scores: Iterable of ``(feature, split_score, gain_score)``
            tuples as produced by ``get_feature_scores``.
        data: Feature DataFrame containing every listed feature.
        target: Labels for ``data``.

    Returns:
        list: Feature names of the best-scoring gain-based selection, or an
        empty list when no threshold selects any feature.
    """

    # Fit LightGBM
    def score_feature_selection(data, train_features, target):
        dtrain = lgb.Dataset(data[train_features], target, free_raw_data = False, silent = True)
        lgb_params = {
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'learning_rate': .1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'num_leaves': 31,
            'max_depth': 5,
            'seed': 13,
            'n_jobs': 4,
            'min_split_gain': .00001,
            'reg_alpha': .00001,
            'reg_lambda': .00001,
            'metric': 'auc'
        }

        # Fit the model
        hist = lgb.cv(
            params = lgb_params,
            train_set = dtrain,
            num_boost_round = 2000,
            nfold = 5,
            stratified = True,
            shuffle = True,
            early_stopping_rounds = 50,
            verbose_eval = 0,
            seed = 17
        )

        # Get the last mean / std values
        return hist['auc-mean'][-1], hist['auc-stdv'][-1]

    best_features_gain = []
    best_gain_auc = float('-inf')
    print('\n')
    for threshold in [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99]:
        gain_feats = [_f for _f, _, _score in correlation_scores if _score >= threshold]

        print('Threshold %3d' % threshold)
        if not gain_feats:
            # Nothing survived this threshold; there is no model to fit.
            continue

        mean_auc, _ = score_feature_selection(data = data, train_features = gain_feats, target = target)
        if mean_auc > best_gain_auc:
            best_features_gain = gain_feats
            best_gain_auc = mean_auc

    return best_features_gain

def feature_selector(train, target, best_params = None, num_features = 100, num_permutations = 100, method = "rfe", model_name = "rforest"):
    """Select a subset of feature columns using one of several strategies.

    Args:
        train: Feature DataFrame.
        target: Labels for ``train`` (Series or single-column DataFrame).
        best_params: Accepted for interface compatibility; currently unused.
        num_features: Number of features to keep for the 'rfe' and 'lasso' methods.
        num_permutations: Number of shuffled-target runs for 'null_importances'.
        method: One of 'rfe', 'boruta', 'lasso' or 'null_importances'.
        model_name: Estimator wrapped by RFE: 'logistic', 'rforest' or 'xgb'.
            Only used when ``method == 'rfe'``, but still validated for every method.

    Returns:
        The selected feature labels (a pandas Index, or a list for
        'null_importances'), or None when ``model_name`` is not recognised.

    Raises:
        ValueError: If ``method`` is not recognised, or ``num_permutations`` is
            below 1 for the 'null_importances' method.
    """
    print("############# Feature Selection #############")

    # Backward-compatible behaviour: an unknown model name yields None for every method.
    if model_name not in ("logistic", "rforest", "xgb"):
        return None

    known_methods = ("rfe", "boruta", "lasso", "null_importances")
    if method not in known_methods:
        # Previously this fell through to an UnboundLocalError at the return statement.
        raise ValueError("Unknown method {!r}; expected one of {}".format(method, known_methods))

    if method == "rfe":
        if model_name == 'logistic':
            model = LogisticRegression(solver = "liblinear", penalty = 'l1')
        elif model_name == "rforest":
            model = RandomForestClassifier(n_estimators = 500, max_depth = 5, random_state = 42, class_weight = "balanced")
        else:
            model = xgb.XGBClassifier(n_estimators = 500, random_state = 42, max_depth = 5)
        # n_features_to_select must be passed by keyword in current scikit-learn.
        selector = RFE(model, n_features_to_select = num_features)
        selector.fit(train, target.values.ravel())
        selected_features = train.columns[selector.get_support()]
    elif method == "boruta":
        boruta_model = RandomForestClassifier(max_depth = 5, random_state = 42, class_weight = "balanced", n_jobs = -1)
        boruta = BorutaPy(boruta_model, n_estimators = 'auto', verbose = 5)
        # ravel() so a single-column DataFrame target is passed as a 1-D array.
        boruta.fit(train.values, target.values.ravel())
        selected_features = train.columns[boruta.support_]
    elif method == "lasso":
        lasso = Lasso(alpha = 0.0335, selection = "random", tol = 0.01, random_state = 42)
        # threshold = -inf disables the importance cut-off so exactly the
        # top `num_features` coefficients are kept.
        lasso_selector = SelectFromModel(lasso, threshold = -np.inf, max_features = num_features)
        lasso_selector.fit(train, target)
        selected_features = train.columns[lasso_selector.get_support()]
    else:  # method == 'null_importances'
        if num_permutations < 1:
            raise ValueError("num_permutations must be at least 1 for the 'null_importances' method")

        true_imp = get_feature_importances(data = train, target = target)

        # Calculate the permutation null distribution; collect the runs and
        # concatenate once instead of growing a DataFrame inside the loop.
        null_runs = []
        for i in range(num_permutations):

            print(i, end = " ")

            # Get current run importances
            imp_df = get_feature_importances(data = train, target = target, shuffle = True)
            imp_df['run'] = i + 1
            null_runs.append(imp_df)
        null_imp_df = pd.concat(null_runs, axis = 0)

        # Get feature scores
        scores = get_feature_scores(true_df = true_imp, noise_df = null_imp_df)
        selected_features = get_imp_features_using_thresholding(correlation_scores = scores,
                                                              data = train,
                                                              target = target)

    return selected_features

## GridSearchCV

In [9]:
def grid_search(train, target, selected_features = None, cv = 5, model_name = "logistic"):
    """Run a ROC-AUC grid search for the requested model family.

    Args:
        train: Feature DataFrame.
        target: Labels for ``train``.
        selected_features: Columns to train on; all columns when None.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.
        model_name: 'logistic', 'svm' or 'lasso'.

    Returns:
        dict: The best parameter combination found, or None when
        ``model_name`` is not one of the supported names.
    """
    print("############# Grid Search #############")

    feature_subset = train.columns if selected_features is None else selected_features

    # Each builder returns (estimator, parameter grid); built lazily so only the
    # requested family is instantiated.
    def logistic_setup():
        return LogisticRegression(random_state = 42), {
            'class_weight' : ['balanced'],
            'penalty' : ['l1'],
            'solver': ['liblinear'],
            'C' : np.arange(0.01, 0.1, 0.01)
        }

    def svm_setup():
        return SVC(random_state = 42), {
            'C': np.arange(0.02, 0.03, 0.001),
            'class_weight': ['balanced'],
            'gamma': ['auto'],
            'probability': [True],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
        }

    def lasso_setup():
        return Lasso(random_state = 42), {
            'alpha' : [0.022, 0.021, 0.02, 0.019, 0.023, 0.024, 0.025, 0.026, 0.027, 0.029, 0.031],
            'tol'   : [0.0013, 0.0014, 0.001, 0.0015, 0.0011, 0.0012, 0.0016, 0.0017]
        }

    setups = {"logistic": logistic_setup, "svm": svm_setup, "lasso": lasso_setup}
    if model_name not in setups:
        return

    estimator, search_space = setups[model_name]()

    searcher = GridSearchCV(estimator = estimator, cv = cv, param_grid = search_space, scoring = 'roc_auc', verbose = 1, n_jobs = -1)
    searcher.fit(train[feature_subset], target)

    print("Best Score:" + str(searcher.best_score_))
    print("Best Parameters: " + str(searcher.best_params_))

    return searcher.best_params_

## Train Models

In [10]:
def _build_model(model_name, params):
    """Instantiate the estimator registered under ``model_name`` with ``params``."""
    if model_name == "logistic":
        return LogisticRegression(**params)
    if model_name == "svm":
        return SVC(**params)
    if model_name == "lasso":
        return Lasso(**params)
    return Ridge(**params)


def _positive_class_scores(model, features, model_name):
    """Return one continuous score per row; higher means more likely positive."""
    if model_name == "logistic":
        return model.predict_proba(features)[:, 1]
    if model_name == "svm":
        # Hard 0/1 labels collapse the ROC curve to a single point, so use
        # probabilities when the SVC was fitted with probability = True and
        # fall back to the signed margin otherwise.
        if hasattr(model, "predict_proba"):
            return model.predict_proba(features)[:, 1]
        return model.decision_function(features)
    # Regressors (lasso / ridge): clip the raw output into [0, 1].
    return model.predict(features).clip(0, 1)


def train_model(train,
                target,
                test,
                best_params,
                selected_features = None,
                n_folds = 11,
                n_repeats = 15,
                stratify = True,
                model_name = 'logistic'):
    """Fit a model on repeated K-fold splits and average its test predictions.

    Args:
        train: Training feature DataFrame.
        target: Labels for ``train`` (Series, single-column DataFrame or array).
        test: Test feature DataFrame with the same columns as ``train``.
        best_params: Keyword arguments for the estimator; None means defaults.
        selected_features: Columns to use; all columns of ``train`` when None.
        n_folds: Number of folds per repeat.
        n_repeats: Number of times the K-fold split is repeated.
        stratify: Use ``RepeatedStratifiedKFold`` when True, else ``RepeatedKFold``.
        model_name: 'logistic', 'svm', 'lasso' or 'ridge'.

    Returns:
        tuple: ``(test_predictions, coefs)`` where ``test_predictions`` has shape
        ``(len(test), 1)`` and holds the mean prediction over all fitted folds,
        and ``coefs`` holds each fold's ``coef_`` (None for models that do not
        expose one, e.g. an SVC with a non-linear kernel). Returns None when
        ``model_name`` is not recognised.
    """
    print("############# Train Model #############")

    if selected_features is None:
        selected_features = train.columns

    train = train[selected_features]
    test = test[selected_features]

    # Validate once up front instead of inside the fold loop.
    if model_name not in ("logistic", "svm", "lasso", "ridge"):
        return None

    params = best_params or {}
    # 1-D label array so Series, single-column DataFrames and arrays all work.
    labels = np.ravel(target)

    test_predictions = np.zeros((test.shape[0], 1))

    splitter_cls = RepeatedStratifiedKFold if stratify else RepeatedKFold
    cv = splitter_cls(n_splits = n_folds, random_state = 420, n_repeats = n_repeats)

    cv_scores = []
    coefs = []
    for fold, (train_idx, valid_idx) in enumerate(cv.split(train, labels), start = 1):
        xtrain, xvalid = train.iloc[train_idx], train.iloc[valid_idx]
        ytrain, yvalid = labels[train_idx], labels[valid_idx]

        model = _build_model(model_name, params)
        model.fit(xtrain, ytrain)
        # SVC only exposes coef_ for the linear kernel; record None otherwise
        # instead of aborting the whole run.
        coefs.append(getattr(model, "coef_", None))

        valid_preds = _positive_class_scores(model, xvalid, model_name)
        scr = roc_auc_score(yvalid, valid_preds)
        cv_scores.append(scr)
        print("Fold = {}. AUC = {}.".format(fold, scr))

        test_predictions += _positive_class_scores(model, test, model_name).reshape(-1, 1)

    # One test prediction was accumulated per fitted fold (n_folds * n_repeats in total).
    test_predictions = test_predictions * 1. / len(cv_scores)
    print("Mean Score: {}. Std Dev: {}".format(np.mean(cv_scores), np.std(cv_scores)))

    return test_predictions, coefs

In [11]:
# Read the raw train/test CSVs
train_orig, test_orig = read_data()

# Split off the target and drop the id column; scaling is disabled here (scale_features = False)
train, test, target = preprocess_data(train_orig, test_orig, scale_features = False)

# Feature engineering: add the row-wise summary statistics to both frames
train, test = create_features(train, test)

############# Read Data #############
############# Preprocess Data #############
############# Feature Engineering #############


In [12]:
# Find different sets of best features
# best_features_rfe_rforest = feature_selector(train, target, method = "rfe", num_features = 30, model_name = "rforest")
# best_features_rfe_xgb = feature_selector(train, target, method = "rfe", num_features = 30, model_name = 'xgb')
# best_features_rfe_logistic = feature_selector(train, target, method = "rfe", num_features = 30, model_name = 'logistic')
# best_features_boruta = feature_selector(train, target, method = "boruta")
# best_features_null_importances = feature_selector(train, target, method = "null_importances", num_permutations = 100)

In [13]:
# Keep 10 features using the Lasso-based selector (method = "lasso")
best_features_lasso = feature_selector(train, target, method = "lasso", num_features = 10)
# Bare expression so the notebook displays the selected column labels
best_features_lasso

############# Feature Selection #############


Index(['33', '65', '73', '80', '91', '117', '199', '217', '226', '295'], dtype='object')

In [14]:
# best_features_rfe_logistic = ['33', '42', '43', '65', '67', '69', '73', '82', '90', '91', '95', '101',
#        '108', '117', '130', '132', '134', '149', '165', '168', '183', '199',
#        '217', '239', '258', '259', '261', '272', '293', '295']

# best_features_rfe_rforest = ['33', '42', '43', '65', '67', '69', '73', '82', '90', '91', '95', '101',
#        '108', '117', '130', '132', '134', '149', '165', '168', '183', '199',
#        '217', '239', '258', '259', '261', '272', '293', '295']

# best_features_rfe_xgb = ['9', '16', '17', '30', '33', '35', '48', '51', '65', '91', '100', '102',
#        '106', '117', '118', '131', '134', '157', '214', '217', '219', '237',
#        '249', '250', '268', '282', 'sum', 'std', 'min', 'var']

# best_features_boruta = ['17', '24', '33', '65', '80', '91', '117', '217', 'sum']

# best_features_null_importances = ['7',
#  '9',
#  '16',
#  '24',
#  '33',
#  '43',
#  '45',
#  '48',
#  '50',
#  '65',
#  '70',
#  '73',
#  '82',
#  '83',
#  '91',
#  '101',
#  '104',
#  '108',
#  '117',
#  '127',
#  '131',
#  '133',
#  '141',
#  '147',
#  '151',
#  '157',
#  '164',
#  '165',
#  '176',
#  '179',
#  '183',
#  '189',
#  '194',
#  '198',
#  '199',
#  '211',
#  '217',
#  '225',
#  '226',
#  '227',
#  '258',
#  '295',
#  '298',
#  'sum',
#  'mean',
#  'min',
#  'max',
#  'var']

In [15]:
# best_features_rfe_rforest = ['16', '17', '24', '30', '33', '39', '63', '65', '70', '73', '82', '91', 
#                              '101', '117', '164', '183', '189', '194', '199', '201', '217', '230', '231', 
#                              '237', '272', '295']
# best_features_rfe_rforest = [ '65', '33', '15', '69', '73', '79', '91', '82', '46', '201', '217', '295', '289', 
#                              '258', '249', '281', '285', '164', '117', '100', '198', '101', '237', '165', '115', 
#                              '199', '146', '119', '134']

# best_features_rfe_rforest = ['33', '65']

In [16]:
# Find best params
# best_params_logistic = grid_search(train, target, cv = 3, selected_features = best_features, model_name = 'logistic')
# best_params_svm = grid_search(train, target, cv = 3, selected_features = best_features, model_name = 'svm')
# best_params_lasso = grid_search(train, target, cv = 10, selected_features = best_features, model_name = 'lasso')

## Logistic Regression

In [17]:
# best_params_logistic = {'C': 0.2, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear', 'random_state': 42}
# predictions_logistic = train_model(train = train, 
#                                    target = target, 
#                                    test = test, 
#                                    best_params = best_params_logistic, 
#                                    n_folds = 10, 
#                                    n_repeats = 3, 
#                                    selected_features = best_features_rfe_rforest,
#                                    model_name = "logistic")

## SVM

In [18]:
# best_params_svm = {'C': 0.007, 'class_weight': 'balanced', 'kernel': 'rbf', 'gamma': 'auto'}
# predictions_svm = train_model(train = train, 
#                             target = target, 
#                             test = test, 
#                             best_params = best_params_svm, 
#                             n_folds = 10, 
#                             n_repeats = 3, 
#                             selected_features = None,
#                             model_name = "svm")

## Lasso Regression

In [19]:
# Hand-set Lasso hyper-parameters (not taken from the grid_search call, which is commented out above)
best_params_lasso = {'alpha': 0.0445, 'tol': 0.01, 'selection': 'random', 'random_state': 42, 'max_iter': 1000}
# 10 folds x 3 repeats = 30 fitted models on the Lasso-selected features;
# predictions_lasso_2 is the fold-averaged test prediction used for the submission below
predictions_lasso_2, coefs = train_model(train = train, 
                                        target = target, 
                                        test = test, 
                                        best_params = best_params_lasso, 
                                        n_folds = 10, 
                                        n_repeats = 3,
                                        stratify = True,
                                        selected_features = best_features_lasso,
                                        model_name = "lasso")

############# Train Model #############
Fold = 1. AUC = 0.8125.
Fold = 2. AUC = 0.8402777777777778.
Fold = 3. AUC = 0.875.
Fold = 4. AUC = 0.8888888888888888.
Fold = 5. AUC = 0.9027777777777778.
Fold = 6. AUC = 0.861111111111111.
Fold = 7. AUC = 0.7222222222222222.
Fold = 8. AUC = 0.8611111111111112.
Fold = 9. AUC = 0.923611111111111.
Fold = 10. AUC = 0.8402777777777779.
Fold = 11. AUC = 0.9166666666666667.
Fold = 12. AUC = 0.7013888888888888.
Fold = 13. AUC = 0.8750000000000001.
Fold = 14. AUC = 0.8888888888888888.
Fold = 15. AUC = 0.9791666666666667.
Fold = 16. AUC = 0.8194444444444445.
Fold = 17. AUC = 0.8958333333333333.
Fold = 18. AUC = 0.8819444444444445.
Fold = 19. AUC = 0.8125.
Fold = 20. AUC = 0.9097222222222222.
Fold = 21. AUC = 0.9305555555555556.
Fold = 22. AUC = 0.8402777777777778.
Fold = 23. AUC = 0.9375.
Fold = 24. AUC = 0.7847222222222222.
Fold = 25. AUC = 0.9652777777777778.
Fold = 26. AUC = 0.7430555555555556.
Fold = 27. AUC = 0.9583333333333333.
Fold = 28. AUC = 0.67

In [20]:
# predictions_lasso_1, coefs = train_model(train = train, 
#                                         target = target, 
#                                         test = test, 
#                                         best_params = best_params_lasso, 
#                                         n_folds = 10, 
#                                         n_repeats = 3,
#                                         stratify = True,
#                                         selected_features = best_features_rfe_logistic,
#                                         model_name = "lasso")

# predictions_lasso_2, coefs = train_model(train = train, 
#                                         target = target, 
#                                         test = test, 
#                                         best_params = best_params_lasso, 
#                                         n_folds = 10, 
#                                         n_repeats = 3,
#                                         stratify = True,
#                                         selected_features = best_features_rfe_rforest,
#                                         model_name = "lasso")

# predictions_lasso_3, coefs = train_model(train = train, 
#                                         target = target, 
#                                         test = test, 
#                                         best_params = best_params_lasso, 
#                                         n_folds = 10, 
#                                         n_repeats = 3,
#                                         stratify = True,
#                                         selected_features = best_features_rfe_xgb,
#                                         model_name = "lasso")

# predictions_lasso_4, coefs = train_model(train = train, 
#                                         target = target, 
#                                         test = test, 
#                                         best_params = best_params_lasso, 
#                                         n_folds = 10, 
#                                         n_repeats = 3,
#                                         stratify = True,
#                                         selected_features = best_features_boruta,
#                                         model_name = "lasso")

# predictions_lasso_5, coefs = train_model(train = train, 
#                                         target = target, 
#                                         test = test, 
#                                         best_params = best_params_lasso, 
#                                         n_folds = 10, 
#                                         n_repeats = 3,
#                                         stratify = True,
#                                         selected_features = best_features_null_importances,
#                                         model_name = "lasso")

In [21]:
# predictions_lasso = np.mean([predictions_lasso_1, predictions_lasso_2, predictions_lasso_3, predictions_lasso_4, predictions_lasso_5], axis = 0)

## Submission

In [22]:
# Start from the sample submission file and overwrite its 'target' column
submit = pd.read_csv('../input/sample_submission.csv')
# predictions_lasso_2 is the array returned by train_model in the Lasso cell above
submit["target"] = predictions_lasso_2
# Written to the working directory without the DataFrame index
submit.to_csv("submission.csv", index = False)
submit.head(10)

Unnamed: 0,id,target
0,250,0.754301
1,251,0.719607
2,252,0.647965
3,253,0.705634
4,254,0.602006
5,255,0.536024
6,256,0.569001
7,257,0.371922
8,258,0.793336
9,259,0.418175
