In [2]:
import os

from imblearn.combine import SMOTETomek
from itertools import cycle
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, average_precision_score, confusion_matrix, precision_recall_curve, PrecisionRecallDisplay
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import label_binarize, LabelEncoder
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn import metrics

In [3]:
OUTPUT_DIR = "./output"
VIZ_DIR = "./Visualization"

### Generate plots

Plot confusion matrix (not used)

In [4]:
def plot_confusion_matrix(key_list, col_name, model_name, X_test, y_test, model, pred):
    confusion_matrix(y_test, pred)
    # accuracy_score(y_test,optyts_knnpred ) #0.8487394957983193

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_title('Confusion Matrix: ' + col_name + ", " + model_name)
    disp = metrics.plot_confusion_matrix(model, X_test, y_test, display_labels=key_list, ax=ax)
    disp.confusion_matrix

    plt.savefig(f"{OUTPUT_DIR}/confusion_matrix_" + col_name + "_" + model_name + ".png")
    plt.clf()

Plot Precison Recall

In [5]:
def plot_precision_recall(key_list, col_name, model_name, X, y, X_test, y_test, model, pred, use_SMOTE = True):
    n_classes = 5
    if model_name.lower() in ["knn", "random forest"]:
        y_score = np.array(model.predict_proba(X_test))[:, :, 1].T
    else: 
        y_score = model.predict_proba(X_test)

    precision = dict()
    recall = dict()
    average_precision = dict()
    for i in range(0, n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], y_score[:, i])
        average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i])

    # A "micro-average": quantifying score on all classes jointly
    precision["micro"], recall["micro"], _ = precision_recall_curve(y_test.ravel(), y_score.ravel())
    average_precision["micro"] = average_precision_score(y_test, y_score, average="micro")

    _, ax = plt.subplots(figsize=(7, 8))

    display = PrecisionRecallDisplay(
        recall=recall["micro"],
        precision=precision["micro"],
        average_precision=average_precision["micro"],
    )
    display.plot(ax=ax, name="Average precision-recall", color="gold")

    colors = cycle([
        mpl.colors.to_hex([141/255,160/255,203/255,.5], keep_alpha=True),
        mpl.colors.to_hex([231/255,138/255,195/255,.5], keep_alpha=True),
        mpl.colors.to_hex([255/255,217/255,47/255,.5], keep_alpha=True),
        mpl.colors.to_hex([252/255,141/255,98/255,.5], keep_alpha=True),
        mpl.colors.to_hex([102/255,194/255,165/255,.5], keep_alpha=True),
    ])

    ax.set_xlim([0.0, 1.0])
    
    class_name_dict = {
        0: "HB",
        1:"ICEQUAKE",
        2:"LP",
        3: "REGIONAL",
        4:"VT", 
    }

    for i, color in zip(range(n_classes), colors):
        display = PrecisionRecallDisplay(
            recall=recall[i],
            precision=precision[i],
            average_precision=average_precision[i],
        )
        class_name = class_name_dict[i]
        display.plot(ax=ax, name=f"{class_name}", color=color)

    # add the legend for the iso-f1 curves
    handles, labels = display.ax_.get_legend_handles_labels()

    # set the legend and the axes
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.legend(handles=handles, labels=labels, loc="best")
    if use_SMOTE:
        ax.set_title(col_name + ", " + model_name + ": Precision-Recall curve (with SMOTE)")
        plt.savefig(f"{OUTPUT_DIR}/precision_recall_" + col_name + "_" + model_name + "_SMOTE.png", facecolor='white', transparent=False)
    else:
        ax.set_title(col_name + ", " + model_name + ": Precision-Recall curve (without SMOTE)")
        plt.savefig(f"{OUTPUT_DIR}/precision_recall_" + col_name + "_" + model_name + ".png", facecolor='white', transparent=False)
    plt.clf()

### Define Results Template

In [6]:
def ResultsTemplate(MicSigV1):
    KeepCols = ['Year', 'Month', 'Type', 'Duration']
    template = MicSigV1.copy().loc[:, KeepCols]
    columns = ['Cleaning', 'KNN', 'SVM', 'RF', 'KMs', 'LR']
    template[columns] = None
    template.head()

    return template

### Models

kNN

In [7]:
def knn(key_list, col_name, X, y, X_train, X_test, y_train, y_test, use_SMOTE):
    # Knn Model
    no_neighbors = np.arange(2, 50, 5)
    accuracy_knn = np.zeros((no_neighbors.shape[0]))
    k = 0
    for knn in no_neighbors:
        knn_model = KNeighborsClassifier(n_neighbors=knn).fit(X_train, y_train)
        yts_knnpred = knn_model.predict(X_test)  # confusion_matrix(y_train, y_pred)
        accuracy_knn[k] = accuracy_score(y_test, yts_knnpred)
        k += 1
    opt_knn = no_neighbors[np.argmax(accuracy_knn)]
    optknn_model = KNeighborsClassifier(n_neighbors=opt_knn).fit(X_train, y_train)
    optyts_knnpred = knn_model.predict(X_test)

    #plot_confusion_matrix(key_list, col_name, "kNN", X_test, y_test, optknn_model, optyts_knnpred)
    plot_precision_recall(key_list, col_name, "kNN", X, y, X_test, y_test, optknn_model, optyts_knnpred, use_SMOTE)

    return knn_model

k Means

In [8]:
def kMeans(X, y, use_SMOTE):
    km_model = KMeans(n_clusters=5, random_state=42).fit_predict(X)
    confusion_matrix(y, km_model)
    accuracy_score(y, km_model)  # 0.2384161752316765

    return km_model

Random Forest

In [9]:
def random_forest(key_list, col_name, X, y, X_train, X_test, y_train, y_test, use_SMOTE):
    rf_model = RandomForestClassifier(n_estimators=1000, random_state=42).fit(X_train, y_train)
    yts_rf = rf_model.predict(X_test)

    #plot_confusion_matrix(key_list, col_name, "Random Forest", X_test, y_test, rf_model, yts_rf)
    plot_precision_recall(key_list, col_name, "Random Forest", X, y, X_test, y_test, rf_model, yts_rf, use_SMOTE)

    return rf_model

SVM

In [10]:
def svm(key_list, col_name, X, y, X_train, X_test, y_train, y_test, use_SMOTE):
    y_train_de_encode = np.argmax(y_train, axis=1)
    svm_model = SVC(gamma='auto', decision_function_shape='ovo', probability=True).fit(X_train, y_train_de_encode)
    yts_svm = svm_model.predict(X_test)

    #plot_confusion_matrix(key_list, col_name, "SVM", X_test, y_test, svm_model, yts_svm)
    plot_precision_recall(key_list=key_list, col_name=col_name, model_name="SVM", X=X, y=y,  X_test=X_test, y_test=y_test, model=svm_model, pred=yts_svm, use_SMOTE=use_SMOTE)

    return svm_model

Logistic Regression

In [11]:
def logistic_regression(key_list, col_name, X, y, X_train, X_test, y_train, y_test, use_SMOTE):
    y_train_decode = np.argmax(y_train, axis=1)
    lr = linear_model.LogisticRegression(multi_class='ovr', max_iter=1000).fit(X_train, y_train_decode)
    yts_lr = lr.predict(X_test)

    y_test_decode = np.argmax(y_test, axis=1)
    accuracy_score(y_test_decode, yts_lr)  # 0.8529411764705882

    #plot_confusion_matrix(key_list, col_name, "Logistic Regression", X_test, y_test, lr, yts_lr)
    plot_precision_recall(key_list, col_name, "Logistic Regression", X, y, X_test, y_test, lr, yts_lr, use_SMOTE)

    return lr

### Generate Models

Runs all models, generates plots

In [12]:
def generate_models(MicSigV1, use_SMOTE = False):
    label_encoder = LabelEncoder()
    MicSigV1['Type'] = MicSigV1['Type'].astype('str')
    MicSigV1['Type_CatNbr'] = label_encoder.fit_transform((MicSigV1['Type'].values))
    Type_Mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
    key_list = [key for key in Type_Mapping.keys()]

    results_template = ResultsTemplate(MicSigV1)
    y = MicSigV1['Type_CatNbr']

    iterations = ['Raw', '2', '4', '6']
    results_list = []
    for curr_iter in iterations:
        # 3 dimensions: the Maximum Threshold Frequency (20-30 Hz), D1 Max. peak in freq.-domain, Entropy
        X = MicSigV1[["Freq_" + curr_iter, "D1_Max_" + curr_iter, "E_" + curr_iter]]

        # Classification Model - Normalized Raw Data
        # Split the data
        X_res, y_res = X, y
        if use_SMOTE:
            smt = SMOTETomek(random_state=42)
            X_res, y_res = smt.fit_resample(X, y)
        Y_enc = label_binarize(y_res, classes=y.unique().tolist())
        X_train, X_test, y_train, y_test = train_test_split(X_res, Y_enc, test_size=0.2, random_state=0)

        knn_model = knn(key_list, curr_iter, X, y, X_train, X_test, y_train, y_test, use_SMOTE)
        svm_model = svm(key_list, curr_iter, X, y, X_train, X_test, y_train, y_test, use_SMOTE)
        rf_model = random_forest(key_list, curr_iter, X, y, X_train, X_test, y_train, y_test, use_SMOTE)
        km_model = kMeans(X, y, use_SMOTE)
        lr = logistic_regression(key_list, curr_iter, X, y, X_train, X_test, y_train, y_test, use_SMOTE)

        results_df = results_template.copy()
        results_df['Cleaning'] = curr_iter
        results_df['KNN'] = knn_model.predict(X)
        results_df['SVM'] = svm_model.predict(X)
        results_df['RF'] = rf_model.predict(X)
        results_df['KMs'] = km_model
        results_df['LR'] = lr.predict(X)
        id_vars = ['Year', 'Month', 'Type', 'Duration', 'Cleaning']

        # Cast data into long form
        results_df = pd.melt(results_df, id_vars=id_vars, var_name='Model', value_name='Prediction')

        # Convert encoded labels to strings
        results_df.Prediction = results_df.Prediction.apply(lambda x: label_encoder.inverse_transform([x])[0])

        results_list.append(results_df)

    Output = pd.concat(results_list)
    Output = Output.reset_index()
    Output = Output.rename(columns={'index': 'EQ'})
    Output["Correct Prediction"] = Output["Type"]== Output["Prediction"]
    if use_SMOTE:
        smote_name = "SMOTE"
    else:
        smote_name = "WITHOUT_SMOTE"
    Output.to_csv(f'{VIZ_DIR}/classification_results_{smote_name}.csv', index=False)

### Run code

In [13]:
input_file = './clean_data.parquet'
MicSigV1 = pd.read_parquet(input_file)


# Cleaning data
ind = np.where(MicSigV1['Type'] == '''['REGIONAL']''')
ind2 = np.where(MicSigV1['Type'] == '''['LP']''')
ind3 = np.where(MicSigV1['Type'] == '''VT ''')
MicSigV1['Type'].loc[ind[0]] = "REGIONAL"
MicSigV1['Type'].loc[ind2[0]] = "LP"
MicSigV1['Type'].loc[ind3[0]] = "VT"

#generate output dirs
if not os.path.exists(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)

if not os.path.exists(VIZ_DIR):
    os.mkdir(VIZ_DIR)

generate_models(MicSigV1, use_SMOTE = False)
generate_models(MicSigV1, use_SMOTE = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MicSigV1['Type'].loc[ind[0]] = "REGIONAL"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MicSigV1['Type'].loc[ind2[0]] = "LP"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MicSigV1['Type'].loc[ind3[0]] = "VT"
  _, ax = plt.subplots(figsize=(7, 8))


<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>

<Figure size 504x576 with 0 Axes>