In [9]:
import numpy as np
import pandas as pd
import missingno as msno
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib import pyplot as plt
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import r2_score, fbeta_score, mean_squared_error, accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import TimeSeriesSplit, train_test_split, KFold, StratifiedKFold
import xgboost
import time

In [10]:
# Load the feature matrix and labels, then hold out a stratified 20% test set.
# The context manager closes the .npz archive once both arrays have been read.
with np.load('X_y.npz') as data:
    X, y = data['X'], data['y']
X_other, X_test, y_other, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [11]:
# Exploratory lookup of the KFold API (its output follows); the model functions
# shown in this notebook use StratifiedKFold instead.
help(KFold)

Help on class KFold in module sklearn.model_selection._split:

class KFold(_UnsupportedGroupCVMixin, _BaseKFold)
 |  KFold(n_splits=5, *, shuffle=False, random_state=None)
 |
 |  K-Fold cross-validator.
 |
 |  Provides train/test indices to split data in train/test sets. Split
 |  dataset into k consecutive folds (without shuffling by default).
 |
 |  Each fold is then used once as a validation while the k - 1 remaining
 |  folds form the training set.
 |
 |  Read more in the :ref:`User Guide <k_fold>`.
 |
 |  For visualisation of cross-validation behaviour and
 |  comparison between common scikit-learn split methods
 |  refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py`
 |
 |  Parameters
 |  ----------
 |  n_splits : int, default=5
 |      Number of folds. Must be at least 2.
 |
 |      .. versionchanged:: 0.22
 |          ``n_splits`` default value changed from 3 to 5.
 |
 |  shuffle : bool, default=False
 |      Whether to shuffle the data before splitting int

In [12]:
def xgb_model(X_other, y_other, X_test, y_test, random_state, verbose=1):
    """Tune an XGBoost classifier with stratified 5-fold CV and score it on the test set.

    Parameters
    ----------
    X_other : pandas.DataFrame
        Features used for cross-validation; rows are selected with ``.iloc``.
    y_other : array-like
        Labels for ``X_other``; flattened to 1-D before use.
    X_test, y_test
        Held-out features and labels, used only for the final evaluation.
    random_state
        Passed to ``StratifiedKFold`` and to the ``XGBClassifier`` constructor.
    verbose : int, default=1
        >=1 prints the test accuracy, >=2 the predictions, >=3 the feature
        importances, >=4 the best CV score and parameters, >=5 grid progress.

    Returns
    -------
    tuple
        ``(test_accuracy, y_test_pred, best_params, feature_importances)`` where
        ``best_params`` is an object array holding every parameter dict tied for
        the best mean validation accuracy.
    """
    n_splits = 5
    early_stopping_rounds = 50
    this_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # flatten the labels to 1-D to avoid an obnoxious sklearn/xgb warning
    y_other = np.asarray(y_other).ravel()
    y_test = np.asarray(y_test).ravel()

    # Materialise the folds once: every parameter set is then scored on exactly the
    # same splits, and the final fit below can reuse one of them explicitly.
    folds = list(this_cv.split(X_other, y_other))

    clf = xgboost.XGBClassifier(n_jobs=-1, random_state=random_state)

    # find the best parameter set
    # NOTE(review): "seed" is pinned to 0 here while the constructor also receives
    # random_state; how xgboost resolves the two is not visible here -- confirm.
    param_grid = {"learning_rate": [0.03],
                  "n_estimators": [100],
                  "seed": [0],
                  "reg_alpha": [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
                  "reg_lambda": [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
                  "missing": [np.nan],
                  #"max_depth": [1,3,10,30,100,],
                  "colsample_bytree": [0.9],
                  "subsample": [0.66]}

    pg = ParameterGrid(param_grid)

    scores = np.zeros((len(pg), n_splits))

    for i, params in enumerate(pg):
        if verbose >= 5:
            print("Param set " + str(i + 1) + " / " + str(len(pg)))
        clf.set_params(**params, early_stopping_rounds=early_stopping_rounds)
        for j, (train_index, val_index) in enumerate(folds):
            X_val = X_other.iloc[val_index, :]
            y_val = y_other[val_index]
            # the validation fold doubles as the early-stopping evaluation set
            clf.fit(X_other.iloc[train_index, :], y_other[train_index],
                    eval_set=[(X_val, y_val)], verbose=False)
            scores[i, j] = accuracy_score(y_val, clf.predict(X_val))
    # mean validation accuracy per parameter set
    scores = np.mean(scores, axis=1)

    best_params = np.array(pg)[scores == np.max(scores)]
    if verbose >= 4:
        # the score printed here is the mean CV validation accuracy
        print('Test set max score and best parameters are:')
        print(np.max(scores))
        print(best_params)

    # test the model on the test set with best parameter set
    # Early stopping needs validation data that is not trained on, so the final
    # model is fitted on the last fold's training part and stopped on its
    # validation part (previously done implicitly through leaked loop variables).
    train_index, val_index = folds[-1]
    clf.set_params(**best_params[0], early_stopping_rounds=early_stopping_rounds)
    clf.fit(X_other.iloc[train_index, :], y_other[train_index],
            eval_set=[(X_other.iloc[val_index, :], y_other[val_index])],
            verbose=False)
    y_test_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    if verbose >= 1:
        print('The accuracy is:', test_accuracy)
    if verbose >= 2:
        print('The predictions are:')
        print(y_test_pred)
    if verbose >= 3:
        print("Feature importances:")
        print(clf.feature_importances_)

    return (test_accuracy, y_test_pred, best_params, clf.feature_importances_)
def l1_model(X_other, y_other, X_test, y_test, random_state, verbose=1):
    """Tune an L1-penalised logistic regression with stratified 5-fold CV.

    Parameters
    ----------
    X_other : pandas.DataFrame
        Features used for cross-validation; rows are selected with ``.iloc``.
    y_other : array-like
        Labels for ``X_other``; flattened to 1-D before use.
    X_test, y_test
        Held-out features and labels, used only for the final evaluation.
    random_state
        Passed to ``StratifiedKFold`` and ``LogisticRegression``.
    verbose : int, default=1
        >=1 prints the test accuracy, >=2 the predictions, >=3 the fitted
        coefficients, >=4 the best CV score and parameters, >=5 grid progress.

    Returns
    -------
    tuple
        ``(test_accuracy, y_test_pred, best_params)`` where ``best_params`` is an
        object array holding every parameter dict tied for the best mean
        validation accuracy.
    """
    n_splits = 5
    this_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # flatten the labels to 1-D to avoid an obnoxious sklearn warning
    y_other = np.asarray(y_other).ravel()
    y_test = np.asarray(y_test).ravel()

    # Materialise the folds once so every parameter set is scored on the same splits.
    folds = list(this_cv.split(X_other, y_other))

    clf = LogisticRegression(solver='saga', penalty='l1', n_jobs=-1, random_state=random_state)

    # find the best parameter set
    param_grid = {
        'C': 1/np.logspace(-2, 2, num=5),  # Regularization parameter
    }

    pg = ParameterGrid(param_grid)

    scores = np.zeros((len(pg), n_splits))

    for i, params in enumerate(pg):
        if verbose >= 5:
            print("Param set " + str(i + 1) + " / " + str(len(pg)))
        clf.set_params(**params)
        for j, (train_index, val_index) in enumerate(folds):
            y_val = y_other[val_index]
            clf.fit(X_other.iloc[train_index, :], y_other[train_index])
            scores[i, j] = accuracy_score(y_val, clf.predict(X_other.iloc[val_index, :]))
    # mean validation accuracy per parameter set
    scores = np.mean(scores, axis=1)

    best_params = np.array(pg)[scores == np.max(scores)]
    if verbose >= 4:
        # the score printed here is the mean CV validation accuracy
        print('Test set max score and best parameters are:')
        print(np.max(scores))
        print(best_params)

    # test the model on the test set with best parameter set
    # Refit on all of X_other; the original reused whatever training fold the CV
    # loop happened to finish on.
    clf.set_params(**best_params[0])
    clf.fit(X_other, y_other)
    y_test_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    if verbose >= 1:
        print('The accuracy is:', test_accuracy)
    if verbose >= 2:
        print('The predictions are:')
        print(y_test_pred)
    if verbose >= 3:
        # LogisticRegression has no feature_importances_ attribute (the original
        # print raised AttributeError); the fitted coefficients are shown instead.
        print("Feature importances:")
        print(clf.coef_)

    return (test_accuracy, y_test_pred, best_params)

def l2_model(X_other, y_other, X_test, y_test, random_state, verbose=1):
    """Tune an L2-penalised logistic regression with stratified 5-fold CV.

    Parameters
    ----------
    X_other : pandas.DataFrame
        Features used for cross-validation; rows are selected with ``.iloc``.
    y_other : array-like
        Labels for ``X_other``; flattened to 1-D before use.
    X_test, y_test
        Held-out features and labels, used only for the final evaluation.
    random_state
        Passed to ``StratifiedKFold`` and ``LogisticRegression``.
    verbose : int, default=1
        >=1 prints the test accuracy, >=2 the predictions, >=3 the fitted
        coefficients, >=4 the best CV score and parameters, >=5 grid progress.

    Returns
    -------
    tuple
        ``(test_accuracy, y_test_pred, best_params)`` where ``best_params`` is an
        object array holding every parameter dict tied for the best mean
        validation accuracy.
    """
    n_splits = 5
    this_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # flatten the labels to 1-D to avoid an obnoxious sklearn warning
    y_other = np.asarray(y_other).ravel()
    y_test = np.asarray(y_test).ravel()

    # Materialise the folds once so every parameter set is scored on the same splits.
    folds = list(this_cv.split(X_other, y_other))

    clf = LogisticRegression(solver='saga', penalty='l2', n_jobs=-1, random_state=random_state)

    # find the best parameter set
    param_grid = {
        'C': 1/np.logspace(-2, 2, num=5),  # Regularization parameter
    }

    pg = ParameterGrid(param_grid)

    scores = np.zeros((len(pg), n_splits))

    for i, params in enumerate(pg):
        if verbose >= 5:
            print("Param set " + str(i + 1) + " / " + str(len(pg)))
        clf.set_params(**params)
        for j, (train_index, val_index) in enumerate(folds):
            y_val = y_other[val_index]
            clf.fit(X_other.iloc[train_index, :], y_other[train_index])
            scores[i, j] = accuracy_score(y_val, clf.predict(X_other.iloc[val_index, :]))
    # mean validation accuracy per parameter set
    scores = np.mean(scores, axis=1)

    best_params = np.array(pg)[scores == np.max(scores)]
    if verbose >= 4:
        # the score printed here is the mean CV validation accuracy
        print('Test set max score and best parameters are:')
        print(np.max(scores))
        print(best_params)

    # test the model on the test set with best parameter set
    # Refit on all of X_other; the original reused whatever training fold the CV
    # loop happened to finish on.
    clf.set_params(**best_params[0])
    clf.fit(X_other, y_other)
    y_test_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    if verbose >= 1:
        print('The accuracy is:', test_accuracy)
    if verbose >= 2:
        print('The predictions are:')
        print(y_test_pred)
    if verbose >= 3:
        # LogisticRegression has no feature_importances_ attribute (the original
        # print raised AttributeError); the fitted coefficients are shown instead.
        print("Feature importances:")
        print(clf.coef_)

    return (test_accuracy, y_test_pred, best_params)

def ela_model(X_other, y_other, X_test, y_test, random_state, verbose=1):
    """Tune an elastic-net logistic regression with stratified 5-fold CV.

    Parameters
    ----------
    X_other : pandas.DataFrame
        Features used for cross-validation; rows are selected with ``.iloc``.
    y_other : array-like
        Labels for ``X_other``; flattened to 1-D before use.
    X_test, y_test
        Held-out features and labels, used only for the final evaluation.
    random_state
        Passed to ``StratifiedKFold`` and ``LogisticRegression``.
    verbose : int, default=1
        >=1 prints the test accuracy, >=2 the predictions, >=3 the fitted
        coefficients, >=4 the best CV score and parameters, >=5 grid progress.

    Returns
    -------
    tuple
        ``(test_accuracy, y_test_pred, best_params)`` where ``best_params`` is an
        object array holding every parameter dict tied for the best mean
        validation accuracy.
    """
    n_splits = 5
    this_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # flatten the labels to 1-D to avoid an obnoxious sklearn warning
    y_other = np.asarray(y_other).ravel()
    y_test = np.asarray(y_test).ravel()

    # Materialise the folds once so every parameter set is scored on the same splits.
    folds = list(this_cv.split(X_other, y_other))

    clf = LogisticRegression(solver='saga', penalty='elasticnet', n_jobs=-1, random_state=random_state)

    # find the best parameter set
    param_grid = {
        'C': 1/np.logspace(-2, 2, num=5),  # Regularization parameter
        'l1_ratio': np.linspace(0, 1, 5),  # mix between the L2 (0) and L1 (1) penalties
    }

    pg = ParameterGrid(param_grid)

    scores = np.zeros((len(pg), n_splits))

    for i, params in enumerate(pg):
        if verbose >= 5:
            print("Param set " + str(i + 1) + " / " + str(len(pg)))
        clf.set_params(**params)
        for j, (train_index, val_index) in enumerate(folds):
            y_val = y_other[val_index]
            clf.fit(X_other.iloc[train_index, :], y_other[train_index])
            scores[i, j] = accuracy_score(y_val, clf.predict(X_other.iloc[val_index, :]))
    # mean validation accuracy per parameter set
    scores = np.mean(scores, axis=1)

    best_params = np.array(pg)[scores == np.max(scores)]
    if verbose >= 4:
        # the score printed here is the mean CV validation accuracy
        print('Test set max score and best parameters are:')
        print(np.max(scores))
        print(best_params)

    # test the model on the test set with best parameter set
    # Refit on all of X_other; the original reused whatever training fold the CV
    # loop happened to finish on.
    clf.set_params(**best_params[0])
    clf.fit(X_other, y_other)
    y_test_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    if verbose >= 1:
        print('The accuracy is:', test_accuracy)
    if verbose >= 2:
        print('The predictions are:')
        print(y_test_pred)
    if verbose >= 3:
        # LogisticRegression has no feature_importances_ attribute (the original
        # print raised AttributeError); the fitted coefficients are shown instead.
        print("Feature importances:")
        print(clf.coef_)

    return (test_accuracy, y_test_pred, best_params)

# random forest, svm, decision tree, l1, l2,
# neural network

def rf_model(X_other, y_other, X_test, y_test, random_state, verbose=1):
    """Tune a random forest classifier with stratified 5-fold CV and score it on the test set.

    Parameters
    ----------
    X_other : pandas.DataFrame
        Features used for cross-validation; rows are selected with ``.iloc``.
    y_other : array-like
        Labels for ``X_other``; flattened to 1-D before use.
    X_test, y_test
        Held-out features and labels, used only for the final evaluation.
    random_state
        Passed to ``StratifiedKFold`` and ``RandomForestClassifier``.
    verbose : int, default=1
        >=1 prints the test accuracy, >=2 the predictions, >=3 the feature
        importances, >=4 the best CV score and parameters, >=5 grid progress.

    Returns
    -------
    tuple
        ``(test_accuracy, y_test_pred, best_params, feature_importances)`` where
        ``best_params`` is an object array holding every parameter dict tied for
        the best mean validation accuracy.
    """
    n_splits = 5
    # stratified folds (the original comment asked: "stratified?")
    this_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # flatten the labels to 1-D to avoid an obnoxious sklearn warning
    y_other = np.asarray(y_other).ravel()
    y_test = np.asarray(y_test).ravel()

    # Materialise the folds once so every parameter set is scored on the same splits.
    folds = list(this_cv.split(X_other, y_other))

    clf = RandomForestClassifier(n_jobs=-1, random_state=random_state)

    # find the best parameter set
    param_grid = {'n_estimators': [100],
                  'max_features': ['log2', 'sqrt'],
                  'max_depth': [2, 4, 8, 16, 32]}

    pg = ParameterGrid(param_grid)

    scores = np.zeros((len(pg), n_splits))

    for i, params in enumerate(pg):
        if verbose >= 5:
            print("Param set " + str(i + 1) + " / " + str(len(pg)))
        clf.set_params(**params)
        for j, (train_index, val_index) in enumerate(folds):
            y_val = y_other[val_index]
            clf.fit(X_other.iloc[train_index, :], y_other[train_index])
            scores[i, j] = accuracy_score(y_val, clf.predict(X_other.iloc[val_index, :]))
    # mean validation accuracy per parameter set
    scores = np.mean(scores, axis=1)

    best_params = np.array(pg)[scores == np.max(scores)]
    if verbose >= 4:
        # the score printed here is the mean CV validation accuracy
        print('Test set max score and best parameters are:')
        print(np.max(scores))
        print(best_params)

    # test the model on the test set with best parameter set
    # Refit on all of X_other; the original reused whatever training fold the CV
    # loop happened to finish on.
    clf.set_params(**best_params[0])
    clf.fit(X_other, y_other)
    y_test_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    if verbose >= 1:
        print('The accuracy is:', test_accuracy)
    if verbose >= 2:
        print('The predictions are:')
        print(y_test_pred)
    if verbose >= 3:
        print("Feature importances:")
        print(clf.feature_importances_)

    return (test_accuracy, y_test_pred, best_params, clf.feature_importances_)

def knn_model(X_other, y_other, X_test, y_test, random_state, verbose=1):
    """Tune a k-nearest-neighbours classifier with stratified 5-fold CV.

    Parameters
    ----------
    X_other : pandas.DataFrame
        Features used for cross-validation; rows are selected with ``.iloc``.
    y_other : array-like
        Labels for ``X_other``; flattened to 1-D before use.
    X_test, y_test
        Held-out features and labels, used only for the final evaluation.
    random_state
        Passed to ``StratifiedKFold`` only; the classifier takes no seed here.
    verbose : int, default=1
        >=1 prints the test accuracy, >=2 the predictions, >=3 a note that no
        feature importances exist, >=4 the best CV score and parameters,
        >=5 grid progress.

    Returns
    -------
    tuple
        ``(test_accuracy, y_test_pred, best_params)`` where ``best_params`` is an
        object array holding every parameter dict tied for the best mean
        validation accuracy.
    """
    n_splits = 5
    this_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # flatten the labels to 1-D to avoid an obnoxious sklearn warning
    y_other = np.asarray(y_other).ravel()
    y_test = np.asarray(y_test).ravel()

    # Materialise the folds once so every parameter set is scored on the same splits.
    folds = list(this_cv.split(X_other, y_other))

    clf = KNeighborsClassifier(n_jobs=-1)

    # find the best parameter set
    param_grid = {'n_neighbors': [2, 4, 8, 16, 32, 64]}

    pg = ParameterGrid(param_grid)

    scores = np.zeros((len(pg), n_splits))

    for i, params in enumerate(pg):
        if verbose >= 5:
            print("Param set " + str(i + 1) + " / " + str(len(pg)))
        clf.set_params(**params)
        for j, (train_index, val_index) in enumerate(folds):
            y_val = y_other[val_index]
            clf.fit(X_other.iloc[train_index, :], y_other[train_index])
            scores[i, j] = accuracy_score(y_val, clf.predict(X_other.iloc[val_index, :]))
    # mean validation accuracy per parameter set
    scores = np.mean(scores, axis=1)

    best_params = np.array(pg)[scores == np.max(scores)]
    if verbose >= 4:
        # the score printed here is the mean CV validation accuracy
        print('Test set max score and best parameters are:')
        print(np.max(scores))
        print(best_params)

    # test the model on the test set with best parameter set
    # Refit on all of X_other; the original reused whatever training fold the CV
    # loop happened to finish on.
    clf.set_params(**best_params[0])
    clf.fit(X_other, y_other)
    y_test_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    if verbose >= 1:
        print('The accuracy is:', test_accuracy)
    if verbose >= 2:
        print('The predictions are:')
        print(y_test_pred)
    if verbose >= 3:
        # KNeighborsClassifier has no feature_importances_ attribute; the original
        # print raised AttributeError at this verbosity level.
        print("Feature importances:")
        print("not available for KNeighborsClassifier")

    return (test_accuracy, y_test_pred, best_params)

def svc_model(X_other, y_other, X_test, y_test, random_state, verbose=1):
    """Tune a support vector classifier with stratified 5-fold CV.

    Parameters
    ----------
    X_other : pandas.DataFrame
        Features used for cross-validation; rows are selected with ``.iloc``.
    y_other : array-like
        Labels for ``X_other``; flattened to 1-D before use.
    X_test, y_test
        Held-out features and labels, used only for the final evaluation.
    random_state
        Passed to ``StratifiedKFold`` and ``SVC``.
    verbose : int, default=1
        >=1 prints the test accuracy, >=2 the predictions, >=3 a note that no
        feature importances exist, >=4 the best CV score and parameters,
        >=5 grid progress.

    Returns
    -------
    tuple
        ``(test_accuracy, y_test_pred, best_params)`` where ``best_params`` is an
        object array holding every parameter dict tied for the best mean
        validation accuracy.
    """
    n_splits = 5
    this_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # flatten the labels to 1-D to avoid an obnoxious sklearn warning
    y_other = np.asarray(y_other).ravel()
    y_test = np.asarray(y_test).ravel()

    # Materialise the folds once so every parameter set is scored on the same splits.
    folds = list(this_cv.split(X_other, y_other))

    clf = SVC(random_state=random_state)

    # find the best parameter set
    param_grid = {
        'C': np.logspace(-2, 2, num=5),      # Regularization parameter
        'gamma': np.logspace(-2, 2, num=5),  # Gamma values for rbf/poly/sigmoid kernels
    }

    pg = ParameterGrid(param_grid)

    scores = np.zeros((len(pg), n_splits))

    for i, params in enumerate(pg):
        if verbose >= 5:
            print("Param set " + str(i + 1) + " / " + str(len(pg)))
        clf.set_params(**params)
        for j, (train_index, val_index) in enumerate(folds):
            y_val = y_other[val_index]
            clf.fit(X_other.iloc[train_index, :], y_other[train_index])
            scores[i, j] = accuracy_score(y_val, clf.predict(X_other.iloc[val_index, :]))
    # mean validation accuracy per parameter set
    scores = np.mean(scores, axis=1)

    best_params = np.array(pg)[scores == np.max(scores)]
    if verbose >= 4:
        # the score printed here is the mean CV validation accuracy
        print('Test set max score and best parameters are:')
        print(np.max(scores))
        print(best_params)

    # test the model on the test set with best parameter set
    # Refit on all of X_other; the original reused whatever training fold the CV
    # loop happened to finish on.
    clf.set_params(**best_params[0])
    clf.fit(X_other, y_other)
    y_test_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    if verbose >= 1:
        print('The accuracy is:', test_accuracy)
    if verbose >= 2:
        print('The predictions are:')
        print(y_test_pred)
    if verbose >= 3:
        # SVC has no feature_importances_ attribute; the original print raised
        # AttributeError at this verbosity level.
        print("Feature importances:")
        print("not available for SVC")

    return (test_accuracy, y_test_pred, best_params)

In [19]:
# Repeat the split / tune / evaluate cycle for several seeds and keep every test
# accuracy, so the spread across seeds can be inspected afterwards.
xgb_test_scores = []
for random_state in range(0, 3):
    # Seed the split with the loop variable so each repetition is reproducible
    # (the original left the split unseeded).
    # NOTE(review): the original passed train_size=0.2, i.e. trained on 20% and
    # tested on 80%; test_size=0.2 matches the earlier split cell -- confirm intent.
    X_other, X_test, y_other, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, stratify=y, random_state=random_state)
    # Fit the encoder on the training labels only, then apply it to the test labels.
    le = LabelEncoder()
    y_other = le.fit_transform(y_other)
    y_test = le.transform(y_test)
    total_acc, y_test_pred, best_params, importance = xgb_model(
        pd.DataFrame(X_other), pd.DataFrame(y_other),
        pd.DataFrame(X_test), pd.DataFrame(y_test),
        random_state, verbose=1)
    xgb_test_scores.append(total_acc)

In [20]:
# Show the label-encoded test labels left over from the last loop iteration.
y_test

array([0, 0, 0, ..., 0, 0, 1])

In [21]:
# One minus the mean of the encoded test labels: the share of class-0 samples,
# i.e. the accuracy of always predicting class 0
# (assumes the encoded labels are only 0 and 1 -- TODO confirm).
1-y_test.sum()/len(y_test)

np.float64(0.6654939487756826)