## Data pre-processing

#### 1. SMOTE sampling

In [None]:
def after_smote_df(df, x_num_col, y, seed):
    """
    function: oversample with SMOTE and return the result as one data frame
    df: original df (features and label column together)
    x_num_col: all the numerical x variables (column names fed to SMOTE)
    y: name of the y variable (label column) in df
    seed: random seed for oversampling (passed to SMOTE as random_state)
    return: data frame holding the resampled x_num_col columns followed by
            the resampled label column, which keeps the name given in y
    """
    y_name = y

    # Dropping the label before selecting features means that listing the
    # label in x_num_col raises KeyError instead of leaking it into x.
    x = df.drop(y_name, axis=1)[x_num_col]
    y = df[y_name]

    smote = SMOTE(random_state=seed)
    x_over, y_over = smote.fit_resample(x, y)

    # fit_resample may hand back arrays or pandas objects; rebuild both with
    # explicit names so the result always carries the original column labels.
    x_over = pd.DataFrame(x_over, columns=x.columns)
    # Keep the labels 1-D: Counter over a DataFrame would count column
    # labels rather than class values.
    y_over = pd.Series(y_over, name=y_name)
    df_over = pd.concat([x_over, y_over], axis=1)

    print('Ratio before oversampling: ',Counter(y))
    print('Ratio after oversampling: ',Counter(y_over))

    return df_over

## Visualization

#### * distribution plot for many variables

In [None]:
# Plot, for every selected variable, the distribution of each class on a
# shared axes (Class == 1 in blue, Class == 0 in red).
columns = df.columns[1:29]  # select variables (positions 1..28 of df)

plt.figure(figsize=(15, 80))  # change fig size accordingly
grid = gridspec.GridSpec(14, 2)  # change n_row and n_col accordingly

for n, col in enumerate(columns):
    # One grid cell per variable, filled in row-major order.
    ax = plt.subplot(grid[n])
    sns.distplot(df.loc[df.Class == 1, col], bins=50, color='b', ax=ax)
    sns.distplot(df.loc[df.Class == 0, col], bins=50, color='r', ax=ax)
    ax.set_title(col)

plt.show()

#### * boxplot for many variables

In [None]:
# Side-by-side boxplots of selected variables, split by Class, drawn from
# the oversampled data frame.
columns = ['V17','V14','V12','V10']

plt.figure(figsize=(15, 4))
grid = gridspec.GridSpec(1, 4)  # one row, one panel per variable

for n, col in enumerate(columns):
    ax = plt.subplot(grid[n])
    sns.boxplot(data=df_over, x='Class', y=col, ax=ax)
    ax.set_title(col)

#### * heatmap

In [None]:
def heatmap(df, chart_title):
    """
    function: draw a correlation heatmap for the numeric columns of a df
    df: data frame whose pairwise column correlations are plotted
    chart_title: title shown above the heatmap
    """
    # Select numeric/bool columns explicitly: pandas >= 2.0 no longer drops
    # non-numeric columns inside corr() and raises on them instead.
    corr = df.select_dtypes(include=['number', 'bool']).corr()

    plt.figure(figsize=(10,8))
    sns.heatmap(corr, cmap='coolwarm_r')
    plt.title(chart_title, fontsize=20)

## Model Evaluation

In [None]:
def print_results(headline, true_value, pred):
    """
    Print a headline followed by accuracy, precision, recall and F2 score.
    headline: which model is this result for
    true_value: ground-truth labels (y_test)
    pred: predicted labels (y_pred)
    """
    print(headline)

    # Each score is computed immediately before it is printed, one per line.
    accuracy = accuracy_score(true_value, pred)
    print("accuracy: {}".format(accuracy))

    precision = precision_score(true_value, pred)
    print("precision: {}".format(precision))

    recall = recall_score(true_value, pred)
    print("recall: {}".format(recall))

    # F-beta score with beta=2.
    f2 = fbeta_score(true_value, pred, beta=2)
    print("f2: {}".format(f2))

In [None]:
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function plots the confusion matrix as an annotated image.
    Normalization can be applied by setting `normalize=True`.
    cm: confusion matrix (2-D; rows are labelled as true, columns as predicted)
    classes: list of class names set for each level of y (used as tick labels)
    normalize: if True, divide every row by its row sum before plotting
    title: chart title
    cmap: matplotlib colormap used to colour the cells
    """
    # Accept nested lists as well as arrays; .max()/.shape/.astype need an array.
    cm = np.asarray(cm)

    if normalize:
        # Normalize BEFORE drawing so that cell colours, the colour bar, the
        # cell text and the text-colour threshold all use the same values.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum get white text, the rest black text.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Round normalized rates for readability; raw values are shown as-is.
        label = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')