In [1]:
import numpy as np
import pandas as pd
import os
import sklearn
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, RFECV
from boruta import BorutaPy

%matplotlib inline

## Read Data

In [2]:
def read_data():
    """Load the train and test CSV files from the ``../input`` directory.

    Returns:
        tuple: ``(train_orig, test_orig)`` -- the DataFrames read from
        ``../input/train.csv`` and ``../input/test.csv``, in that order.
    """
    print("############# Read Data #############")

    input_paths = ('../input/train.csv', '../input/test.csv')
    # Generator preserves order, so train is read first and test second.
    train_orig, test_orig = (pd.read_csv(csv_path) for csv_path in input_paths)

    return train_orig, test_orig

## Preprocess Data

In [3]:
def preprocess_data(train_orig, test_orig, scale_features = True):
    """Split the raw frames into model inputs and target, optionally standardising.

    Args:
        train_orig: Training DataFrame containing 'id' and 'target' columns
            plus the feature columns. It is not modified.
        test_orig: Test DataFrame containing an 'id' column plus the same
            feature columns as the training frame. It is not modified.
        scale_features: When True, standardise the features with a
            ``StandardScaler`` fitted on the training features only.

    Returns:
        tuple: ``(train, test, target)`` where ``train`` and ``test`` hold only
        feature columns and ``target`` is the training 'target' column.
    """
    print("############# Preprocess Data #############")

    # Copy so that later in-place edits of the target cannot touch train_orig.
    target = train_orig['target'].copy()
    # `columns=` instead of a positional axis: pandas 2.x no longer accepts drop(labels, 1).
    train = train_orig.drop(columns = ['target', 'id'])
    test = test_orig.drop(columns = ['id'])

    if scale_features:
        scaler = StandardScaler()
        # Fit on train only and reuse the same statistics for test, so both
        # frames live in the same feature space. Column names and index are
        # kept so selected feature names stay meaningful downstream.
        train = pd.DataFrame(scaler.fit_transform(train), columns = train.columns, index = train.index)
        test = pd.DataFrame(scaler.transform(test), columns = test.columns, index = test.index)

    return train, test, target

## SMOTE

In [4]:
def smote(train, test):
    """Placeholder for SMOTE oversampling; not implemented yet.

    Both arguments are accepted but ignored, and the function returns ``None``.
    """
    return None

## Feature Engineering

In [5]:
def create_features():
    """Placeholder for feature engineering; not implemented yet.

    Takes no arguments and returns ``None``.
    """
    return None

## Feature Selection

In [6]:
def feature_selector(train, target, best_params = None, num_features = 100, method = "rfe", model_name = "logistic"):
    """Select a subset of feature columns from ``train``.

    Only the combination ``model_name="logistic"`` with ``method="rfe"`` is
    implemented. Every other model ("rforest", "xgb", ...) or method
    ("boruta", "eli5", ...) is a placeholder that returns ``None``.

    Args:
        train: DataFrame of training features.
        target: Series of training labels aligned with ``train``.
        best_params: Optional keyword arguments for ``LogisticRegression``.
            When falsy, ``LogisticRegression(solver="liblinear")`` is used.
        num_features: Number of features RFE should keep.
        method: Feature-selection method name.
        model_name: Name of the estimator that drives the selection.

    Returns:
        The labels of the selected columns (``train.columns`` filtered by the
        RFE support mask), or ``None`` for an unimplemented model or method.
    """
    print("############# Feature Selection #############")

    # Bail out before doing any work for the not-yet-implemented options.
    if model_name != 'logistic':
        return None
    if method != "rfe":
        return None

    # No explicit fit here: RFE clones the estimator and fits the clone itself.
    model = LogisticRegression(**best_params) if best_params else LogisticRegression(solver = "liblinear")

    # n_features_to_select is keyword-only in scikit-learn >= 1.0.
    selector = RFE(model, n_features_to_select = num_features, step = 1)
    selector.fit(train.values, target.values)

    return train.columns[selector.support_]

## GridSearchCV

In [7]:
def grid_search(train, target, selected_features = None, cv = 5, model_name = "logistic"):
    """Run an exhaustive grid search scored by ROC AUC and return the best parameters.

    Args:
        train: DataFrame of training features.
        target: Series of training labels aligned with ``train``.
        selected_features: Column labels to train on; defaults to every
            column of ``train`` when None.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.
        model_name: "logistic" for ``LogisticRegression`` or "svm" for ``SVC``.

    Returns:
        dict: ``best_params_`` of the fitted ``GridSearchCV``. It only holds
        the keys that appear in the parameter grid.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    print("############# Grid Search #############")

    if selected_features is None:
        selected_features = train.columns

    if model_name == "logistic":
        model = LogisticRegression(random_state = 42)
        param_grid = {
            'class_weight' : ['balanced'],
            'penalty' : ['l2', 'l1'],
            'solver': ['liblinear', 'saga'],
            'C' : np.arange(0.1, 0.2, 0.001)
        }
    elif model_name == "svm":
        model = SVC(random_state = 42)
        param_grid = {
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
            'class_weight': ['balanced'],
            'gamma': ['auto'],
            # Probability estimates are required by callers that use predict_proba.
            'probability': [True],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
        }
    else:
        # Without this branch `model`/`param_grid` would be unbound and the
        # failure would surface as an opaque UnboundLocalError further down.
        raise ValueError(
            "Unsupported model_name {!r}; expected 'logistic' or 'svm'".format(model_name)
        )

    grid = GridSearchCV(estimator = model, cv = cv, param_grid = param_grid , scoring = 'roc_auc', verbose = 1, n_jobs = -1)
    grid.fit(train[selected_features], target)

    print("Best Score:" + str(grid.best_score_))
    print("Best Parameters: " + str(grid.best_params_))

    return grid.best_params_

## Train Models

In [8]:
def train_model(train,
                target,
                test,
                best_params,
                selected_features = None,
                n_folds = 11,
                n_repeats = 15,
                model_name = 'logistic'):
    """Fit one model per repeated-stratified-CV fold and average test predictions.

    Args:
        train: DataFrame of training features.
        target: Series of training labels aligned with ``train``.
        test: DataFrame of test features with the same columns as ``train``.
        best_params: Keyword arguments used to construct the estimator. The
            estimator must expose ``predict_proba`` (for "svm" this means
            ``probability=True`` has to be part of ``best_params``).
        selected_features: Column labels to train on; defaults to every
            column of ``train`` when None.
        n_folds: Number of stratified folds per repeat.
        n_repeats: Number of times the K-fold split is repeated.
        model_name: "logistic" for ``LogisticRegression`` or "svm" for ``SVC``.

    Returns:
        numpy.ndarray of shape ``(len(test), 1)`` holding the positive-class
        probability averaged over all fitted fold models, or ``None`` when
        ``model_name`` is not supported.
    """
    # Label the log header after the model actually being trained; the
    # "logistic" header is unchanged from the original output.
    header_labels = {"logistic": "Logistic", "svm": "SVM"}
    print("############# Train Model {} #############".format(header_labels.get(model_name, model_name)))

    # Resolve the estimator class once, before any cross-validation work,
    # instead of re-checking the name inside every fold.
    if model_name == "logistic":
        model_class = LogisticRegression
    elif model_name == "svm":
        model_class = SVC
    else:
        return None

    if selected_features is None:
        selected_features = train.columns

    # The test slice is identical for every fold, so build it once.
    xtest = test[selected_features]
    test_predictions = np.zeros((test.shape[0], 1))

    cv = RepeatedStratifiedKFold(n_splits = n_folds, random_state = 420, n_repeats = n_repeats)

    cv_scores = []
    for fold, (train_idx, valid_idx) in enumerate(cv.split(train, target), start = 1):
        xtrain, xvalid = train.iloc[train_idx][selected_features], train.iloc[valid_idx][selected_features]
        ytrain, yvalid = target.iloc[train_idx], target.iloc[valid_idx]

        model = model_class(**best_params)
        model.fit(xtrain, ytrain)

        valid_preds = model.predict_proba(xvalid)[:, 1]
        scr = roc_auc_score(yvalid.values, valid_preds)
        cv_scores.append(scr)
        print("Fold = {}. AUC = {}.".format(fold, scr))

        test_predictions += model.predict_proba(xtest)[:, 1].reshape(-1, 1)

    # One score per fitted model, so this equals n_folds * n_repeats.
    test_predictions /= len(cv_scores)
    print("Mean Score: {}. Std Dev: {}".format(np.mean(cv_scores), np.std(cv_scores)))

    return test_predictions

In [9]:
# Load the raw train and test CSVs.
train_orig, test_orig = read_data()

# Separate the target and drop the id column; feature scaling is turned off for this run.
train, test, target = preprocess_data(train_orig, test_orig, scale_features = False)

############# Read Data #############
############# Preprocess Data #############


In [10]:
# Keep 20 features, chosen with the default selection method ("rfe") around a logistic regression.
best_features = feature_selector(train, target, num_features = 20, model_name = "logistic")

############# Feature Selection #############


In [11]:
# Tune each model type with a 3-fold grid search (scored by ROC AUC) on the selected features.
best_params_logistic = grid_search(train, target, cv = 3, selected_features = best_features, model_name = 'logistic')
best_params_svm = grid_search(train, target, cv = 3, selected_features = best_features, model_name = 'svm')

############# Grid Search #############
Fitting 3 folds for each of 400 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  84 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:    8.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best Score:0.9277948287910553
Best Parameters: {'C': 0.10500000000000001, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'saga'}
############# Grid Search #############
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best Score:0.9256969951083158
Best Parameters: {'C': 1, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf', 'probability': True}


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    1.7s finished


In [12]:
# Train the tuned logistic regression with 10 folds x 10 repeats and
# collect its fold-averaged test-set predictions.
predictions_logistic = train_model(train = train, 
                                   target = target, 
                                   test = test, 
                                   best_params = best_params_logistic, 
                                   n_folds = 10, 
                                   n_repeats = 10, 
                                   selected_features = best_features,
                                   model_name = "logistic")

############# Train Model Logistic #############
Fold = 1. AUC = 0.9375.
Fold = 2. AUC = 0.9375.
Fold = 3. AUC = 0.9375.
Fold = 4. AUC = 0.9722222222222222.
Fold = 5. AUC = 0.9305555555555556.
Fold = 6. AUC = 0.9236111111111112.
Fold = 7. AUC = 0.8055555555555556.
Fold = 8. AUC = 0.9930555555555555.
Fold = 9. AUC = 0.8958333333333334.
Fold = 10. AUC = 0.9305555555555556.
Fold = 11. AUC = 0.9722222222222222.
Fold = 12. AUC = 0.7638888888888891.
Fold = 13. AUC = 0.9097222222222223.
Fold = 14. AUC = 0.9583333333333334.
Fold = 15. AUC = 0.9722222222222222.
Fold = 16. AUC = 0.9097222222222223.
Fold = 17. AUC = 0.9583333333333333.
Fold = 18. AUC = 0.951388888888889.
Fold = 19. AUC = 0.9166666666666666.
Fold = 20. AUC = 0.9652777777777778.
Fold = 21. AUC = 0.9444444444444445.
Fold = 22. AUC = 0.986111111111111.
Fold = 23. AUC = 0.9722222222222222.
Fold = 24. AUC = 0.7986111111111112.
Fold = 25. AUC = 1.0.
Fold = 26. AUC = 0.9027777777777779.
Fold = 27. AUC = 0.9791666666666666.
Fold = 28. AUC

In [13]:
# Train the tuned SVM with the same 10 folds x 10 repeats setup and
# collect its fold-averaged test-set predictions.
predictions_svm = train_model(train = train, 
                            target = target, 
                            test = test, 
                            best_params = best_params_svm, 
                            n_folds = 10, 
                            n_repeats = 10, 
                            selected_features = best_features,
                            model_name = "svm")

############# Train Model Logistic #############
Fold = 1. AUC = 0.9166666666666667.
Fold = 2. AUC = 0.9722222222222222.
Fold = 3. AUC = 0.9097222222222222.
Fold = 4. AUC = 0.9513888888888888.
Fold = 5. AUC = 0.9444444444444444.
Fold = 6. AUC = 0.9097222222222223.
Fold = 7. AUC = 0.798611111111111.
Fold = 8. AUC = 0.9930555555555555.
Fold = 9. AUC = 0.9236111111111112.
Fold = 10. AUC = 0.9027777777777778.
Fold = 11. AUC = 0.9444444444444444.
Fold = 12. AUC = 0.8055555555555556.
Fold = 13. AUC = 0.9305555555555556.
Fold = 14. AUC = 0.9375.
Fold = 15. AUC = 0.9791666666666666.
Fold = 16. AUC = 0.875.
Fold = 17. AUC = 0.8819444444444444.
Fold = 18. AUC = 0.9722222222222222.
Fold = 19. AUC = 0.9305555555555556.
Fold = 20. AUC = 0.9722222222222222.
Fold = 21. AUC = 0.9513888888888888.
Fold = 22. AUC = 0.986111111111111.
Fold = 23. AUC = 0.9375.
Fold = 24. AUC = 0.7777777777777778.
Fold = 25. AUC = 1.0.
Fold = 26. AUC = 0.9166666666666667.
Fold = 27. AUC = 0.9791666666666666.
Fold = 28. AUC 

In [14]:
# Blend the two models with an unweighted mean of their test-set predictions.
predictions = (predictions_logistic + predictions_svm)/2

## Submission

In [15]:
# Start from the sample submission file and overwrite its target column
# with the blended predictions.
submit = pd.read_csv('../input/sample_submission.csv')
submit["target"] = predictions
# Write the submission without the DataFrame index, then preview the first rows.
submit.to_csv("submission.csv", index = False)
submit.head(10)

Unnamed: 0,id,target
0,250,0.636664
1,251,0.458076
2,252,0.76464
3,253,0.865834
4,254,0.880059
5,255,0.880226
6,256,0.674761
7,257,0.030773
8,258,0.818908
9,259,0.045504
