In [3]:
%%sh

npm run test

Couldn't find program: 'sh'


In [10]:
# https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/
import sys

!{sys.executable} -m pip install numpy pandas scikit-learn sklearn xgboost 



In [11]:
import json
import numpy as np
import pandas as pd

from xgboost import XGBClassifier as xbc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

le = LabelEncoder()

# Defining S3 bucket names
seasons = ['2019', '2020', '2021', '2022']

def load_training_data(season):
    incoming = json.load(open('./frames/{}.json'.format(season)))

    return pd.DataFrame(incoming)


def create_training_info(training_df):
    # Split the
    training_sets = training_df.drop(columns=["label"])
    training_values = training_df["label"]

    # _x => training sets
    # _y => training values
    train_x, test_x, train_y, test_y = train_test_split(
        training_sets,
        training_values,
        test_size=0.2,
        shuffle=True
    )
    train_x, val_x, train_y, val_y = train_test_split(
        train_x,
        train_y,
        test_size=0.2,
        shuffle=True
    )

    print(f"Train_x Shape: {train_x.shape}")

    print(f"Train_y Shape: {train_y.shape}")

    print(f"Val_x Shape: {val_x.shape}")

    print(f"Val_y Shape: {val_y.shape}")

    print(f"Test_x Shape: {test_x.shape}")

    print(f"Test_y Shape: {test_y.shape}")

    return {
        "x": {"training": train_x, "validation": val_x, "testing": test_x},
        "y": {"training": train_y, "validation": val_y, "testing": test_y},
    }

def create_model(info):
    # ---- Model
    model = xbc(objective="binary:logistic")

    model.fit(
        info["x"]["training"].values,
        le.fit_transform(info["y"]["training"]),
        eval_set=[(info["x"]["validation"].values, info["y"]["validation"])],
    )

    print("-- model created --")
    return model

def obtain_eer(y_true, pred_probs):
    """
    Obtains the Equal Error Rate (EER) of a classification model based on its False Acceptance
    Rate (FAR) and False Rejection Rate (FRR) curves generated from a set of predicted probabilities
    of the model.

    The FAR and FRR curves are obtained by varying the decision threshold of the model for the
    positive class from 0 to 1 and recording each resulting perfomance in terms of FAR and FRR. The
    EER will be the point where both curves intersect.

    Parameters
    __________
        y_true (np.ndarray): Array with the true value of the class for each observation.
        pred_probs (np.ndarray): Bidimensional array with the predicted probabilites for each class.

    Returns
    _______
        df_res (pd.DataFrame): DataFrame containing performance results for each decision threshold.
        eer (float): Resulting EER value of the model for the given predictions.

    """
    # Extracting positive probabilities
    pos_probs = pred_probs[:, 1]

    # Obtaining number of total attempts
    n_attempts = len(pos_probs)

    # Defining output objects
    fars, frrs, cut_prob = [], [], []

    # Looping over all possible thresholds
    for threshold in np.linspace(0, 1, 100):
        # Making decision according to threshold
        positives = (pos_probs > threshold).astype(int)

        # Creating a single array with true and predicted values
        df_cases = pd.DataFrame({"true": y_true.values, "pred": positives})

        # Obtaning FAR and FRR
        far = sum((df_cases["true"] == 0) & (df_cases["pred"] == 1)) / n_attempts
        frr = sum((df_cases["true"] == 1) & (df_cases["pred"] == 0)) / n_attempts

        # Appending results to output objects
        fars.append(far)
        frrs.append(frr)
        cut_prob.append(threshold)

    # Creating DF of results
    df_res = pd.DataFrame({"cut_prob": cut_prob, "far": fars, "frr": frrs})
    df_res["diff"] = abs(df_res["far"] - df_res["frr"])

    # Finding crossing point of FAR and FRR (EER)
    df_eer = (
        df_res[df_res["diff"] == df_res["diff"].min()]
        .drop_duplicates(subset="diff", keep="first")
        .reset_index(drop=True)
    )
    eer = (df_eer["far"][0] + df_eer["frr"][0]) / 2

    return df_res, eer

def calc_statistics(info, model):
    predictions = model.predict(info["x"]["testing"].values)
    probability = model.predict_proba(info["x"]["testing"].values)

    acc_test = accuracy_score(info["y"]["testing"], predictions)

    f1_test = f1_score(info["y"]["testing"], predictions)

    # Inbuilt model feature importance

    df_featimp = pd.DataFrame(
        {
            "feature": info["x"]["training"].columns,
            "importance": model.feature_importances_,
        }
    )

    # Sorting by importance
    df_featimp = df_featimp.sort_values(by="importance", ascending=False)

    top_20 = df_featimp.head(20)
    bottom_20 = df_featimp.tail(20)

    # Calculate EER
    df_res, eer = obtain_eer(info["y"]["testing"], probability)

    return {
        "accuracy": acc_test,
        "f1": f1_test,
        "eer": eer,
        "dimensions": {
            "columns": len(info["x"]["training"].columns),
            "training": len(info["x"]["training"].values),
            "validation": len(info["x"]["validation"].values),
            "testing": len(info["x"]["testing"].values),
        },
        "importance": {
            "top": top_20.to_dict(orient="records"),
            "bottom": bottom_20.to_dict(orient="records"),
        },
    }

def run():
    for season in seasons:
        info = create_training_info(load_training_data(season))

        model = create_model(info)

        stats = calc_statistics(info, model)

        print(json.dumps(stats, ensure_ascii=False, indent="\t", skipkeys=True))

if __name__ == "__main__":
    run()

Train_x Shape: (327, 8)
Train_y Shape: (327,)
Val_x Shape: (82, 8)
Val_y Shape: (82,)
Test_x Shape: (103, 8)
Test_y Shape: (103,)


ImportError: sklearn needs to be installed in order to use this module