In [None]:
import itertools
import os
import pickle as pkl
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import numpy.typing as npt
import pandas as pd
import requests
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import uniform
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score
)
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

from microservice import IUMModel
from utility import (
    load_data,
    get_buckets_indices,
    get_preds_thr,
    get_most_optimal_thr,
    get_s_p,
    get_t,
    get_xgb_logreg_f1_scores,
    is_xgb_better,
    BUCKETS_CNT,
    T_ALPHA,
)


In [None]:
# Version tag of the exported feature set; it selects the CSV that is loaded.
FEATURE_VERSION = "v1"
FEATURE_PATH = "/".join(("features", FEATURE_VERSION, "feature.csv"))

# Model input columns. The order matters: the same list is reused to label
# the arrays produced by the preprocessing pipeline (columns=FEATURES).
FEATURES = [
    "number_of_advertisements",
    "number_of_tracks",
    "number_of_skips",
    "number_of_likes",
    "number_of_liked_tracks_listened",
    "number_of_tracks_in_favourite_genre",
    "total_number_of_favourite_genres_listened",
    "average_popularity_in_favourite_genres",
    "total_tracks_duration_ms",
    "number_of_different_artists",
    "average_release_date",
    "average_duration_ms",
    "explicit_tracks_ratio",
    "average_popularity",
    "average_acousticness",
    "average_danceability",
    "average_energy",
    "average_instrumentalness",
    "average_liveness",
    "average_loudness",
    "average_speechiness",
    "average_tempo",
    "average_valence",
    "average_track_name_length",
    "average_daily_cost",
]

# Columns the models predict; every model below is fitted on both at once.
TARGETS = [
    "premium_user_numerical",
    "will_buy_premium_next_month_numerical",
]

# Targets first, then features - the column order of the correlation heatmap.
TARGET_AND_FEATURES = [*TARGETS, *FEATURES]


In [None]:
# Load the pre-computed feature table for the configured FEATURE_VERSION.
data_frame = pd.read_csv(FEATURE_PATH)


In [None]:
# Preview the first rows to sanity-check the loaded columns.
data_frame.head()


In [None]:
# Spearman rank correlation between both targets and every feature.
correlation_matrix = data_frame[TARGET_AND_FEATURES].corr(method="spearman")

# Annotate each cell as a percentage on a fixed [-1, 1] colour scale so the
# figure is comparable between feature versions.
axis_labels = correlation_matrix.columns
heatmap_options = dict(
    xticklabels=axis_labels,
    yticklabels=axis_labels,
    annot=True,
    annot_kws={"fontsize": 7},
    fmt=".0%",
    vmin=-1,
    vmax=1,
)

plt.figure(figsize=(16, 16))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.show()


In [None]:
# Shared preprocessing: fill missing values with SimpleImputer's default
# strategy, then standardise every feature. The step names are part of the
# pipeline's public parameters and must stay as they are.
preprocessing_steps = [
    ("simple_imputer", SimpleImputer()),
    ("standard_scaler", StandardScaler()),
]
pipeline = Pipeline(steps=preprocessing_steps)


In [None]:
# Rows from years strictly before TRAINING_UP_TO are used for model
# development; rows from that year onwards are set aside.
TRAINING_UP_TO = 2023
# Fraction of TRAIN_DATA held out by train_test_split.
TEST_SIZE = 0.33

# Both comparisons are spelled out (rather than negating one mask) so rows
# with a missing year end up in neither subset.
is_training_year = data_frame["year"] < TRAINING_UP_TO
is_holdout_year = data_frame["year"] >= TRAINING_UP_TO
TRAIN_DATA: pd.DataFrame = data_frame[is_training_year]
TEST_DATA: pd.DataFrame = data_frame[is_holdout_year]


TODO: the pipeline is selected based on the test data alone


In [None]:
# Fixed seed so the hold-out split (and every metric derived from it) is
# identical after Restart Kernel -> Run All.
SPLIT_SEED = 1234

X_train_temp: pd.DataFrame
X_test_temp: pd.DataFrame
Y_train: pd.DataFrame
Y_test: pd.DataFrame
X_train_temp, X_test_temp, Y_train, Y_test = train_test_split(
    TRAIN_DATA[FEATURES],
    TRAIN_DATA[TARGETS],
    test_size=TEST_SIZE,
    random_state=SPLIT_SEED,
)

# Fit the imputer and scaler on the training part only, then apply the
# fitted statistics to the held-out part.
train_data = pipeline.fit_transform(X_train_temp)
test_data = pipeline.transform(X_test_temp)

# The pipeline returns bare arrays. Re-attach the column names and the
# original row labels so X_* stays index-aligned with Y_*.
X_train = pd.DataFrame(train_data, columns=FEATURES, index=X_train_temp.index)
X_test = pd.DataFrame(test_data, columns=FEATURES, index=X_test_temp.index)


In [None]:
# Preview the imputed and standardised training features.
X_train.head()


In [None]:
# Preview the training targets (one column per entry in TARGETS).
Y_train.head()


In [None]:
# MODELS: Dict[str, List[Classi]] = dict()


In [None]:
# Number of random hyper-parameter candidates evaluated by the search.
CANDIDATES = 1
# Fixed seed so the sampled candidates are reproducible between runs.
SEARCH_SEED = 1234

# Template estimator: RandomizedSearchCV fits clones of it, so `model`
# itself is never fitted.
model = XGBClassifier()
# TODO: update with own parameters
randomized_search_cv = RandomizedSearchCV(
    estimator=model,
    param_distributions={
        "max_depth": np.arange(3, 30, 1),
        "eta": uniform(0, 0.1),
        "gamma": uniform(0, 1),
        "n_estimators": np.arange(10, 100, 1),
    },
    n_iter=CANDIDATES,
    # The search is fitted on both TARGETS columns (a multilabel target).
    # Plain "f1" uses binary averaging, which scikit-learn rejects for
    # multilabel input; "f1_macro" is the mean of the per-target F1 scores.
    scoring="f1_macro",
    n_jobs=-1,
    verbose=3,
    random_state=SEARCH_SEED,
)


In [None]:
# Fit the search on the preprocessed training split. The split cell defines
# `X_train` / `Y_train`; the lower-case `x_train` / `y_train` names only
# exist after the rolling-evaluation loop further down, so using them here
# fails on a fresh kernel.
randomized_search_cv.fit(X_train, Y_train[TARGETS])


In [None]:
# Score the refitted best estimator on the held-out split, one target at a
# time. `X_test` / `Y_test` come from the train_test_split cell; the
# lower-case names are not defined yet at this point of the notebook.
y_pred = randomized_search_cv.predict(X_test)
for i, target in enumerate(TARGETS):
    matrix_y_true = Y_test[target]
    # Column i of the prediction matrix corresponds to TARGETS[i].
    matrix_y_pred = y_pred[:, i]
    f1_score_value = f1_score(matrix_y_true, matrix_y_pred)
    print(f"F1 score for {target}: {f1_score_value}")
    matrix = confusion_matrix(matrix_y_true, matrix_y_pred)
    ax = sns.heatmap(
        matrix,
        annot=True,
        fmt='g',
        xticklabels=["0", "1"],  # type: ignore
        yticklabels=["0", "1"]  # type: ignore
    )
    # confusion_matrix puts true labels on rows and predictions on columns.
    ax.set(
        title=f"Confusion matrix: {target}",
        xlabel="Predicted",
        ylabel="Actual",
    )
    plt.show()


In [None]:
# Feature importances of the tuned XGBoost model.
# `model` is only the unfitted template handed to RandomizedSearchCV, so the
# fitted attributes have to be read from `best_estimator_` (available because
# the search refits by default). No fitted LogisticRegression exists at this
# point of the notebook, so only the XGBoost panel is drawn instead of
# plotting XGBoost values under a "LogisticRegression" label.
best_xgb = randomized_search_cv.best_estimator_

_, ax = plt.subplots(figsize=(15, 5))
ax.barh(
    y=best_xgb.feature_names_in_,
    width=best_xgb.feature_importances_,
    edgecolor="black",
)
ax.bar_label(ax.containers[0], fmt="%.2f")
ax.set_title("XGBClassifier feature importances")
plt.show()


In [None]:
def get_class_weights(y: pd.Series, weights: List[float]) -> np.ndarray:
    """Translate each class label in ``y`` into its per-sample weight.

    Args:
        y: Series of class labels; each label is used as an index into
            ``weights``.
        weights: Weight per class, positioned by label.

    Returns:
        Array with one weight per element of ``y``, in the same order.
    """
    def weight_of(label):
        return weights[label]

    per_sample = y.map(weight_of)
    return per_sample.to_numpy()


def train_score(
    model_class: Any,
    xgb_params: Optional[Dict[str, Any]] = None,
) -> Tuple[Dict[str, List[float]], Any, pd.DataFrame, pd.DataFrame]:
    """Train and evaluate a model on ten expanding month windows.

    For each ``m`` in 1..10 the model is fitted on rows of the global
    ``data_frame`` with ``month <= m`` and evaluated on rows with
    ``month == m + 1``. One confusion-matrix heatmap per window is drawn on
    a 2x5 grid of axes.

    The module-level ``pipeline`` is refitted in every window, so it is left
    fitted on the last training window when the function returns.

    NOTE(review): the windows filter on ``month`` only and ignore ``year`` -
    confirm this is intended for multi-year data.
    NOTE(review): the targets passed to the metrics are both TARGETS columns;
    whether the default metric averaging accepts that depends on what
    ``get_preds_thr`` returns - verify against ``utility``.

    Args:
        model_class: Estimator class (or factory) exposing ``fit`` and
            ``predict_proba``.
        xgb_params: Keyword arguments forwarded to ``model_class``; ``None``
            means no arguments.

    Returns:
        ``(metrics, model, X_test, y_test)``. ``metrics`` maps
        ``"f1_score"``, ``"precision"`` and ``"recall"`` to one value per
        window; the other three items belong to the last window.
    """
    # Avoid sharing one mutable default dict between calls.
    model_kwargs = {} if xgb_params is None else xgb_params
    _, axs = plt.subplots(2, 5, figsize=(40, 15))
    axs = axs.flatten()
    metrics: Dict[str, List[float]] = {
        "f1_score": [],
        "precision": [],
        "recall": [],
    }
    for month_ in range(1, 11):
        train_data = data_frame.loc[data_frame.month <= month_, :]
        test_data = data_frame.loc[data_frame.month == month_ + 1, :]
        X_train, y_train = train_data[FEATURES], train_data[TARGETS]
        X_test, y_test = test_data[FEATURES], test_data[TARGETS]
        # Preprocessing statistics come from the training window only.
        X_train = pd.DataFrame(
            pipeline.fit_transform(X_train), columns=FEATURES)
        X_test = pd.DataFrame(pipeline.transform(X_test), columns=FEATURES)
        model = model_class(**model_kwargs)
        model.fit(X_train, y_train)
        # The decision threshold is tuned on the training window and then
        # applied unchanged to the test window.
        y_train_proba = model.predict_proba(X_train)
        thr = get_most_optimal_thr(y_train, y_train_proba)
        y_pred_proba = model.predict_proba(X_test)
        y_pred = get_preds_thr(y_pred_proba, thr)
        a = axs[month_ - 1]
        sns.heatmap(confusion_matrix(y_test, y_pred),
                    annot=True, fmt="d", ax=a)
        a.set_title(f"{month_} test")
        metrics["f1_score"].append(f1_score(y_test, y_pred))
        metrics["precision"].append(precision_score(y_test, y_pred))
        metrics["recall"].append(recall_score(y_test, y_pred))
    return metrics, model, X_test, y_test


In [None]:
def create_plot_from_model(
    x_train,
    y_train,
    x_test,
    y_test,
    subplots,
    model_constructor,
    model_params: Optional[Dict[str, Any]] = None,
    threshold: float = 0.2,
):
    """Fit a model on one train/test pair and draw a confusion matrix per target.

    The module-level ``pipeline`` is refitted on ``x_train`` and applied to
    ``x_test``. For every entry of ``TARGETS`` the positive-class probability
    is cut at ``threshold``, F1 / precision / recall are printed, and a
    confusion-matrix heatmap is drawn on ``subplots[i]``.

    Args:
        x_train: Training features with the ``FEATURES`` columns.
        y_train: Training targets, passed to ``model.fit`` as given.
        x_test: Test features with the ``FEATURES`` columns.
        y_test: Test targets; indexed by target name.
        subplots: One matplotlib Axes per entry of ``TARGETS``.
        model_constructor: Callable returning an estimator with ``fit`` and
            ``predict_proba``.
        model_params: Keyword arguments for ``model_constructor``; ``None``
            means no arguments.
        threshold: Probability above which a sample is predicted positive.
            Defaults to the previously hard-coded 0.2.

    Returns:
        List of the Axes the heatmaps were drawn on, in ``TARGETS`` order.
    """
    # Avoid sharing one mutable default dict between calls.
    params = {} if model_params is None else model_params
    data_train = pipeline.fit_transform(x_train)
    data_test = pipeline.transform(x_test)
    x_train = pd.DataFrame(data_train, columns=FEATURES)
    x_test = pd.DataFrame(data_test, columns=FEATURES)
    model = model_constructor(**params)
    model.fit(x_train, y_train)
    # TODO: get_most_optimal_threshold
    y_pred_proba = model.predict_proba(x_test)
    plots = []
    for i, target in enumerate(TARGETS):
        # Column i of the probability matrix corresponds to TARGETS[i].
        y_pred = (y_pred_proba[:, i] > threshold).astype(int)
        matrix_y_true = y_test[target]
        f1_score_value = f1_score(matrix_y_true, y_pred)
        print(f"F1 score for {target}: {f1_score_value}")
        print(f"Precision: {precision_score(matrix_y_true, y_pred)}")
        print(f"Recall: {recall_score(matrix_y_true, y_pred)}")
        matrix = confusion_matrix(matrix_y_true, y_pred)
        drawn_axes = sns.heatmap(
            matrix,
            annot=True,
            fmt='g',
            xticklabels=["0", "1"],
            yticklabels=["0", "1"],
            annot_kws={"fontsize": 40},
            ax=subplots[i]
        )
        # Previously the returned list was never filled.
        plots.append(drawn_axes)
    return plots


In [None]:
# Rolling evaluation: train on everything up to and including (year, month),
# test on the following calendar month, and draw one confusion matrix per
# target for each evaluated period.
MONTHS = 60
# Only every 10th (year, month) step is trained and plotted.
PLOT_EVERY = 10
plots = []
plot_statistics = []
# One figure per target, each a flattened 4 x (MONTHS // 4) grid of axes.
subplots = [
    plt.subplots(4, MONTHS // 4, figsize=(100, 40))[1].flatten()
    for _ in TARGETS
]
# Loop-invariant, so it is looked up once.
best_params = randomized_search_cv.best_params_

# The counter starts at 11 for the first period, so the evaluated periods
# land on axes 20, 30, 40 and 50 of each grid.
periods = itertools.product(range(2019, 2023), range(1, 13))
for axis_index, (year, month) in enumerate(periods, start=11):
    if axis_index % PLOT_EVERY != 0:
        continue

    # Vectorised masks instead of a row-wise DataFrame.apply.
    is_train = (data_frame.year < year) | (
        (data_frame.year == year) & (data_frame.month <= month)
    )
    data_train = data_frame.loc[is_train, :]
    if len(data_train) == 0:
        continue

    # The test period is the next calendar month; only December rolls over
    # into January of the following year. (Previously next-year January was
    # added to the test set for every month.)
    test_year, test_month = (year + 1, 1) if month == 12 else (year, month + 1)
    is_test = (data_frame.year == test_year) & (data_frame.month == test_month)
    data_test = data_frame.loc[is_test, :]
    if len(data_test) == 0:
        # Nothing to evaluate on; transforming zero rows would raise.
        continue

    x_train, y_train = data_train[FEATURES], data_train[TARGETS]
    x_test, y_test = data_test[FEATURES], data_test[TARGETS]

    period_axes = [subplot[axis_index] for subplot in subplots]
    plots.append(create_plot_from_model(
        x_train, y_train, x_test, y_test,
        period_axes, XGBClassifier, best_params))

plt.show()


In [None]:
# TODO: F1 score, Precision, Recall figures


In [None]:
# Side-by-side feature-weight plots: LogisticRegression coefficients (left)
# and XGBClassifier feature importances (right).
# NOTE(review): no cell above this one defines `logreg` or `xgb` as fitted
# models (the only `xgb` in this notebook is the A/B result DataFrame created
# further down), so this cell appears to rely on state from a removed cell -
# confirm where the two models are supposed to come from before running.
_, axs = plt.subplots(1, 2, figsize=(30, 5))
axs = axs.flatten()
idx = 0
for names, values, m_name in zip(
    [logreg.feature_names_in_, xgb.feature_names_in_],
    [logreg.coef_[0], xgb.feature_importances_],
    ["LogisticRegression", "XGBClassifier"],
):
    a = axs[idx]
    a.barh(y=names, width=values, edgecolor="black")
    # Print each bar's value next to it.
    a.bar_label(a.containers[0], fmt="%.2f")
    a.set_title(f"{m_name} feature importances")
    idx += 1
plt.show()


In [None]:
def train(
    model_class,
    model_name: str = "",
    xgb_params: Optional[Dict[str, Any]] = None,
    threshold: float = 0.2,
) -> None:
    """Fit the final model on all of ``TRAIN_DATA`` and pickle it for serving.

    The module-level ``pipeline`` is refitted on ``TRAIN_DATA[FEATURES]`` and
    a new ``model_class(**xgb_params)`` is fitted on the transformed features
    against ``TRAIN_DATA[TARGETS]``. The fitted pipeline, the model and the
    threshold are wrapped in an ``IUMModel`` and written to
    ``models/<model_name>.pkl``.

    Args:
        model_class: Estimator class (or factory) exposing ``fit``.
        model_name: File stem of the pickle. With the default empty string
            the file is named ``models/.pkl``.
        xgb_params: Keyword arguments for ``model_class``; ``None`` means no
            arguments.
        threshold: Decision threshold stored in the ``IUMModel``. Defaults to
            the previously hard-coded 0.2.
    """
    # Avoid sharing one mutable default dict between calls.
    params = {} if xgb_params is None else xgb_params
    X_train, y_train = TRAIN_DATA[FEATURES], TRAIN_DATA[TARGETS]
    X_train = pd.DataFrame(pipeline.fit_transform(X_train), columns=FEATURES)
    model = model_class(**params)
    model.fit(X_train, y_train)
    ium_model = IUMModel(pipeline, model, threshold)

    # Create the output directory on first use instead of failing in open().
    os.makedirs("models", exist_ok=True)
    with open(os.path.join("models", f"{model_name}.pkl"), "wb") as f:
        pkl.dump(ium_model, f)


In [None]:
# Fit the final XGBoost model with the hyper-parameters found by the search
# and persist it as models/xgbclassifier.pkl.
# NOTE(review): the logistic-regression export below is disabled, yet the A/B
# cell further down queries a "logistic_regression" model - confirm that
# model is provided to the microservice some other way.
train(XGBClassifier, "xgbclassifier", randomized_search_cv.best_params_)
# train(LogisticRegression, "logistic_regression")


In [None]:
# A/B experiment input: the month-11 rows, split at random into two halves.
# The subset gets its own name so the global `data_frame` (read by the
# training helpers above) is not silently replaced by a one-month slice.
AB_SPLIT_SEED = 1234

ab_frame = pd.read_csv(FEATURE_PATH).loc[lambda frame: frame.month == 11, :]

# A dedicated seeded generator keeps the group assignment reproducible
# without touching NumPy's global random state.
ab_rng = np.random.default_rng(AB_SPLIT_SEED)
randomized_indices = ab_rng.permutation(ab_frame.index.to_numpy())
half = len(ab_frame) // 2
A = ab_frame.loc[randomized_indices[:half]]
B = ab_frame.loc[randomized_indices[half:]]


In [None]:
def perform_test(model, data, base_url="http://127.0.0.1:8000", timeout=10.0):
    """Request one prediction per row of ``data`` from the model microservice.

    For every row the ``user_id``, ``month`` and ``is_premium`` entries are
    removed, the remaining values are joined with commas and sent as the
    ``features`` query parameter to ``<base_url>/models/<model>``.

    NOTE(review): assumes ``data`` has ``user_id``, ``month`` and
    ``is_premium`` columns and that the service expects exactly the remaining
    columns, in frame order - confirm against the microservice.

    Args:
        model: Model name as exposed by the microservice.
        data: DataFrame with one user record per row.
        base_url: Root URL of the microservice.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        DataFrame with columns ``user_id``, ``guess``, ``ground_truth`` and
        ``model``, one row per input row (empty if ``data`` is empty).

    Raises:
        requests.HTTPError: If the service answers with an error status.
        KeyError: If one of the three dropped columns is missing.
    """
    columns = ["user_id", "guess", "ground_truth", "model"]
    records = []
    # One session reuses the HTTP connection across all rows.
    with requests.Session() as session:
        for _, row in data.iterrows():
            # Keep the id and the label before they are removed from the row.
            user_id = row["user_id"]
            ground_truth = row["is_premium"]
            # Series.drop returns a new object; the result has to be used,
            # otherwise the id, month and label are sent as features.
            feature_values = row.drop(["user_id", "month", "is_premium"]).values
            features = ','.join(map(str, feature_values))
            request = f"{base_url}/models/{model}?features={features}"
            response = session.get(request, timeout=timeout)
            # Fail loudly on HTTP errors instead of on a confusing JSON or
            # KeyError further down.
            response.raise_for_status()
            guess = response.json()
            records.append({
                "user_id": user_id,
                "guess": guess["prediction"],
                "ground_truth": ground_truth,
                "model": model,
            })
    # Build the frame once; concatenating inside the loop is quadratic.
    return pd.DataFrame(records, columns=columns)


In [None]:
# Query the running microservice: group A is scored by the "xgbclassifier"
# model and group B by the "logistic_regression" model.
xgb = perform_test("xgbclassifier", A)
logic = perform_test("logistic_regression", B)


In [None]:
# Distribution of the predictions returned for group A.
xgb.guess.value_counts()


In [None]:
# Distribution of the true labels in group A, for comparison with the guesses.
xgb.ground_truth.value_counts()


In [None]:
# Distribution of the predictions returned for group B.
logic.guess.value_counts()


In [None]:
# Distribution of the true labels in group B, for comparison with the guesses.
logic.ground_truth.value_counts()


In [None]:
# Persist the raw A/B responses. Create the output directory first so a
# fresh checkout does not fail, and pass the documented boolean to leave the
# row index out of the files.
os.makedirs("results", exist_ok=True)
xgb.to_csv(os.path.join("results", "xgb.csv"), index=False)
logic.to_csv(os.path.join("results", "logic.csv"), index=False)


In [None]:
# F1 of the served predictions: group A (XGBoost) first, then group B
# (logistic regression).
for ab_result in (xgb, logic):
    print("f1", f1_score(ab_result.ground_truth, ab_result.guess))


In [None]:
# Offline comparison of XGBClassifier and LogisticRegression built entirely
# from the helpers imported from `utility`.
# Seed NumPy's global RNG so the cell is repeatable (assuming the helpers
# draw from it - TODO confirm).
np.random.seed(1234)

print(f"{BUCKETS_CNT=}, {T_ALPHA=}")
data = load_data()
# presumably assigns the user ids to BUCKETS_CNT buckets; verify in utility
buckets_indices = get_buckets_indices(data.user_id.values)
# F1 scores per model; the helper receives the data and the bucket indices.
xgb_f1_score, logreg_f1_score = get_xgb_logreg_f1_scores(data, buckets_indices)
print(f"{np.mean(xgb_f1_score)=}, {np.mean(logreg_f1_score)=}")
# NOTE(review): s_p and t look like a pooled standard deviation and a
# t statistic judged against T_ALPHA - confirm in utility.
s_p = get_s_p(xgb_f1_score, logreg_f1_score)
t = get_t(xgb_f1_score, logreg_f1_score, s_p)
print(f"{s_p=}, {t=}")
if is_xgb_better(t):
    print("XGBClassifier is better than LogisticRegression")
else:
    print("We can't say that XGBClassifier is better than LogisticRegression")
