In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pprint
from sklearn.utils import shuffle
from sklearn.feature_selection import VarianceThreshold

from sklearn.model_selection import train_test_split

from sklearn.model_selection import StratifiedKFold

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score

%matplotlib inline

pp = pprint.PrettyPrinter()

In [26]:
DATA_FOLDER = '/home/xiaozhouzou/datasets/bank_additional_data/bank-additional'
DATA_NAME = 'bank-additional-full.csv'
DATA_PATH = os.path.join(DATA_FOLDER, DATA_NAME)

RECOMBINE = True

NUM_Y0_PART = 10

NUM_BINS_HIST = 100

STD_INCLUDE = 2

REMOVE_NUMERICAL_OUTLIER = False

VARIANCE_THRESHOLD = 0.1

EPSILON = 0.01

TRAIN_TEST_SPLIT = 0.25

RF_PARAMS = {"n_estimators":100, "min_samples_leaf":2}

# part I: Read Data

In [14]:
def load_data(data_path):

    data = pd.read_csv(data_path, sep=";")

    data["y"] = pd.DataFrame(data['y'].map({"no":0, "yes":1}))
    duration = data["duration"]
    data = data.drop(labels=["duration"], axis=1)
    data_features, data_target = data_preparation(data)
    data = pd.concat([data_features, data_target], axis=1)

    if RECOMBINE:

        data_y0 = data.loc[data["y"] == 0]
        data_y1 = data.loc[data["y"] == 1]

        y1_rec_num = data_y1.shape[0]

        recomb_data = []

        for i in range(NUM_Y0_PART):

            y0_data_sample = data_y0.sample(n=y1_rec_num)
            sample_i = pd.concat([y0_data_sample, data_y1], axis=0)
            sample_i = shuffle(sample_i)
            sample_target = sample_i["y"]
            sample_features = sample_i.drop(labels=["y"], axis=1)
            recomb_data.append({"features":sample_features, "target":sample_target})
        
        return recomb_data, data_features, data_target
            
    else:
        
        return {"features": data_features, "target": data_target}

In [4]:
def data_preparation(data):
    
    data_target = data["y"]
    data_features = data.drop(labels=["y"], axis=1)
    
    # get datatype information and histogram of all columns
    column_info = get_column_info(data_features, data_target, vis=False)
    
    # transform categorical data to dummy variables
    categorical_column_map, data_features = categorical_to_dummy(column_info, data_features)
    
    # drop the features valued unknown
    data_features = drop_unknown_valued_features(data_features, categorical_column_map)

    if REMOVE_NUMERICAL_OUTLIER:
        # remove outliers for numerical features
        data_features, data_target = numerical_outlier_removal(data_features, 
                                                               data_target, 
                                                               column_info, 
                                                               vis=False)
    
    return data_features, data_target

In [27]:
if RECOMBINE:
    
    recombine_features = {}
    
    recomb_data, data_features, data_target = load_data(DATA_PATH)

    for ind, batch_data in enumerate(recomb_data):
        
        batch_features = batch_data["features"]
        batch_target = batch_data["target"]
        
        batch_features, selected_features = low_variance_filter(batch_features)

        X_train, X_test, y_train, y_test = train_test_split(batch_features, 
                                                            batch_target, 
                                                            test_size=TRAIN_TEST_SPLIT,
                                                            stratify=batch_target)
        
        model, features_mean, features_var = RandomForest_Predict(X_train, 
                                                                  y_train, 
                                                                  RF_PARAMS)
        
        model_features = data_features.loc[:, selected_features]
        model_features = (model_features - features_mean) / np.sqrt(features_var + 1)
        
        model_pred = model.predict(model_features)
        
        model_f1_score = f1_score(y_true=data_target, y_pred=model_pred)
        
        recombine_features["batch_feature_" + str(ind)] = model.predict(model_features)
    
        print("The " + str(ind) + " model\'s f1 score on whole data is " + 
              str(model_f1_score))
    
    X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(recombine_features), 
                                                        data_target, 
                                                        test_size=TRAIN_TEST_SPLIT,
                                                        stratify=data_target)
    RandomForest_Baseline(X_train, y_train, X_test, y_test, RF_PARAMS)
    
else:
    
    data = load_data(DATA_PATH)
    data_features = data["features"]
    data_target = data["target"]
    
    data_features, selected_features = low_variance_filter(data_features)
    
    X_train, X_test, y_train, y_test = train_test_split(data_features, 
                                                        data_target, 
                                                        test_size=TRAIN_TEST_SPLIT,
                                                        stratify=data_target)
    
#     # get the baseline f1-score for RandomForest
#     RandomForest_Baseline(X_train, y_train, X_test, y_test, RF_PARAMS)

The f1 score for batch data is 0.8747736873868438
The 0 model's f1 score on whole data is 0.3957414269748838
The f1 score for batch data is 0.8831866328509621
The 1 model's f1 score on whole data is 0.3777985587766104
The f1 score for batch data is 0.88477058735615
The 2 model's f1 score on whole data is 0.39013242704793155
The f1 score for batch data is 0.8715167946979968
The 3 model's f1 score on whole data is 0.4548972908603751
The f1 score for batch data is 0.8879655735272296
The 4 model's f1 score on whole data is 0.3837561217190337
The f1 score for batch data is 0.8879606146501566
The 5 model's f1 score on whole data is 0.3885521100489625
The f1 score for batch data is 0.8866040828490538
The 6 model's f1 score on whole data is 0.4150751209574739
The f1 score for batch data is 0.8854182309656103
The 7 model's f1 score on whole data is 0.39328707821858505
The f1 score for batch data is 0.8930817610062892
The 8 model's f1 score on whole data is 0.3820969281743135
The f1 score for ba

# part II: Pre-processing and Exploratory Data Analysis

In [5]:
def get_column_info(data_features, data_target, vis=False, verbose=False):
    
    columns = list(data_features.columns.values)
    column_dtype_dict = {}

    for column_name in columns:
        column_dtype_dict[column_name] = data_features[column_name].dtype
    
    if verbose:
        
        pp.pprint("Column Information:")
        pp.pprint(column_dtype_dict)
    
    if vis:
        
        pp.pprint("Histogram of Columns:")
    
        for column_name, column_dtype in column_dtype_dict.items():
            
            title = "histogram of " + column_name
            
            if column_dtype == "object":
                
                data_features[column_name].value_counts().plot(kind='bar', title=title)
                plt.show()
                
            else: # dtype is int64 or float64
                
                data_features[column_name].plot.hist(title=title, bins=NUM_BINS_HIST)
                plt.show()
                
        data_target.plot.hist(bins=NUM_BINS_HIST)
    
    return column_dtype_dict

## map the categorical values to dummy variables

In [6]:
def categorical_to_dummy(column_dtype_dict, data_features, verbose=False):
    
    categorical_features = [k for (k,v) in column_dtype_dict.items() if v == "object"]
    column_map_dict = {}

    for col in categorical_features:
        
        unique_val_list = list(data_features[col].unique())
        column_map_dict[col] = dict(zip(unique_val_list, list(range(len(unique_val_list)))))
        data_features[col] = data_features[col].map(column_map_dict[col])
    
    if verbose:
        pp.pprint(column_map_dict)

    data_features = pd.get_dummies(data_features, columns=categorical_features)
    
    return column_map_dict, data_features

## drop the features valued unknown

In [7]:
def drop_unknown_valued_features(features, column_map_dict):

    for col, map_dict in column_map_dict.items():
        
        for col_val, val_code in map_dict.items():
            
            if col_val == "unknown":
                
                drop_col = col + "_" + str(val_code)
                features.drop(drop_col, axis=1, inplace=True)

    return features

## outlier removal by empirical mean and standard deviation

In [8]:
def numerical_outlier_removal(data, target, column_dtype_dict, vis=False, verbose=False):

    numerical_features = [k for (k,v) in column_dtype_dict.items() 
                          if v == "int64" or v == "float64"]    

    for col in numerical_features:

        col_mean = data[col].mean()
        col_std = data[col].std()
        col_selected = data[col].between(col_mean - STD_INCLUDE * col_std, 
                                         col_mean + STD_INCLUDE * col_std, 
                                         inclusive=True)
        selected_ratio = col_selected.value_counts()[True] / col_selected.size

        if verbose:
            print(numerical_features)
            print("selected ratio for feature {} is {}".format(col, selected_ratio))

        if selected_ratio > 0.9:
        # The outliers can't be greater than 10%

            data = data.loc[col_selected]
            target = target.loc[col_selected]

            if vis:

                data[col].plot.hist(bins=NUM_BINS_HIST)
                plt.show()

    return data, target

## remove the features with too little variance

In [9]:
def low_variance_filter(features, verbose=False):
    
    feature_series = pd.Series(features.columns)
    
    feature_sel_VT = VarianceThreshold(threshold=VARIANCE_THRESHOLD)
    feature_sel_VT.fit(features)
    VTed_features = feature_sel_VT.transform(features)
    
    selected_feature_indices = feature_sel_VT.get_support(indices=True)
    selected_features = feature_series.loc[selected_feature_indices]

    if verbose:
        pp.pprint(selected_features)
    
    return VTed_features, selected_features

# Model Fit

In [22]:
cv = StratifiedKFold(n_splits=3, shuffle=True)

In [10]:
def zScoreNormalizer(features_train):

    normalizer_zScore = StandardScaler(with_mean=True, with_std=True)
    normalizer_zScore.fit(features_train)
    features_train = normalizer_zScore.transform(features_train)
    
    return features_train, normalizer_zScore.mean_, normalizer_zScore.var_, normalizer_zScore

In [17]:
def RandomForest_Baseline(features_train, target_train, features_test, target_test, 
                          RF_param_dict):
    
    features_train, _, _, normalizer_zScore = zScoreNormalizer(features_train)
    
    baseline_clf = RandomForestClassifier(n_estimators=RF_param_dict["n_estimators"], 
                                          min_samples_leaf=RF_param_dict["min_samples_leaf"])
    baseline_clf.fit(features_train, target_train)
    
    baseline_train_f1 = f1_score(y_true=target_train, 
                                 y_pred=baseline_clf.predict(features_train),)
    print("The baseline f1 score on the training set is {}".format(baseline_train_f1))

    features_test_zNormed = normalizer_zScore.transform(features_test)
    baseline_test_f1 = f1_score(y_true=target_test, 
                                y_pred=baseline_clf.predict(features_test_zNormed),)
    print("The baseline f1 score on the test set is {}".format(baseline_test_f1))

In [18]:
def RandomForest_Predict(features, target, RF_param_dict):

    features, features_mean, features_var, _ = zScoreNormalizer(features)
    
    RF_clf = RandomForestClassifier(n_estimators=RF_param_dict["n_estimators"], 
                                    min_samples_leaf=RF_param_dict["min_samples_leaf"])
    RF_clf.fit(features, target)
    data_f1_score = f1_score(y_true=target, y_pred=RF_clf.predict(features))
    
    print("The f1 score for batch data is " + str(data_f1_score))
    
    return RF_clf, features_mean, features_var