In [None]:
# Parameters
# NOTE(review): the absolute, machine-specific paths below look injected by the
# pipeline runner rather than hand-written — confirm before editing them here.

# Dotted "module.ClassName" path of the estimator; it is split and imported
# dynamically further down, and the class name doubles as the mlflow experiment name.
model = "xgboost.XGBClassifier"
# Cut-off applied to the positive-class probability when producing 0/1 predictions.
threshold = 0.5
# Both values are fed into the grid-search parameter grid; a falsy `eval_metric`
# is removed from the grid before the search runs.
eval_metric = "auc"
objective = "binary:logistic"
# Names of the variables above that are collected into `model_params` via globals().
params_names = ["threshold", "objective", "eval_metric"]
# When False, the `mlflow` name is rebound to a mock object instead of tracking runs.
track = False
mlflow_tracking_uri = (
    "file:C:\\Users\\berkayg\\Desktop\\Coding env\\crypto-prediction-project/mlruns"
)
# Outputs of the upstream `data_processing` task; the two CSV paths are read as
# the training and validation data sets.
upstream = {
    "data_processing": {
        "nb": "C:\\Users\\berkayg\\Desktop\\Coding env\\crypto-prediction-project\\products\\notebooks\\process_data.ipynb",
        "data_train": "C:\\Users\\berkayg\\Desktop\\Coding env\\crypto-prediction-project\\products\\data\\processed_train_data.csv",
        "data_validation": "C:\\Users\\berkayg\\Desktop\\Coding env\\crypto-prediction-project\\products\\data\\processed_validation_data.csv",
    }
}
# Output location declared for this notebook (not read anywhere in the visible cells).
product = {
    "nb": "C:\\Users\\berkayg\\Desktop\\Coding env\\crypto-prediction-project\\products\\notebooks\\report-0.ipynb"
}


In [None]:
from imblearn.combine import SMOTETomek

from yellowbrick.classifier import ClassificationReport
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

import time
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler as RUS

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
import numpy as np
from sklearn.dummy import DummyClassifier

import xgboost as xgb
import warnings

import atexit
import importlib
import mlflow
from mlflow.exceptions import MlflowException

# Ignore every warning for the rest of the notebook and set the default figure size.
warnings.simplefilter("ignore")
plt.rcParams.update({"figure.figsize": (11, 7)})

In [None]:
# %%
# Gather the tunable parameters listed in `params_names` from the notebook globals.
model_params = {k: globals()[k] for k in params_names}
print(model_params)

# %%
if track:
    print('tracking with mlflow...')
    mlflow.set_tracking_uri(mlflow_tracking_uri)

    # Make sure the active run is closed when the interpreter exits.
    @atexit.register
    def end_run():
        mlflow.end_run()
else:
    # `Mock` was referenced without being imported anywhere in the notebook, so
    # the non-tracking path failed with a NameError; import it where it is needed.
    from unittest.mock import Mock

    print('tracking skipped...')
    # Every later `mlflow.*` call becomes a harmless call on the mock object.
    mlflow = Mock()

# %%
# Resolve the dotted `model` path ("package.module.ClassName") to the class object.
module, _, class_name = model.rpartition('.')
Class_ = getattr(importlib.import_module(module), class_name)
Class_

# %%
# One experiment per model class: create it, or reuse it when creation fails
# with an MlflowException.
try:
    experiment_id = mlflow.create_experiment(name=class_name)
except MlflowException:
    experiment_id = mlflow.get_experiment_by_name(name=class_name).experiment_id

print(f'experiment id: {experiment_id}')

# %%
run = mlflow.start_run(experiment_id=experiment_id)

# %% tags=["mlflow-run-id"]
print(run.info.run_id)

In [None]:
def read_data(path):
    """Load a CSV file into a DataFrame indexed by its first column.

    The first column is parsed as dates and becomes the index.

    Parameters
    ----------
    path : str or file-like
        Anything ``pandas.read_csv`` accepts as its input.

    Returns
    -------
    pandas.DataFrame
        The file contents with the first column as the index.
    """
    return pd.read_csv(path, index_col=[0], parse_dates=[0])

In [None]:
# Read the processed training and validation sets produced by the upstream task.
processed_paths = upstream["data_processing"]
df_train = read_data(processed_paths["data_train"])
df_validation = read_data(processed_paths["data_validation"])

In [None]:
# Ratio of label-0 rows to label-1 rows in the training data; it is handed to
# the grid search as the single `scale_pos_weight` candidate.
action_counts = df_train.action.value_counts()
class_weight = action_counts[0] / action_counts[1]
print(f"Class Weight: {class_weight}")

In [None]:
# Rank all features with a univariate ANOVA F-test (SelectKBest, k=10) and keep
# the names of the ten highest-scoring ones in `features`.
X = df_train.drop(columns=['action'])
y = df_train['action']

bestfeatures = SelectKBest(score_func=f_classif, k=10)
fit = bestfeatures.fit(X, y)

# One row per feature: its name ("Specs") next to its F-score ("Score"),
# ordered from the most to the least discriminative.
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)
featureScores = (
    pd.concat([dfcolumns, dfscores], axis=1)
    .set_axis(['Specs', 'Score'], axis=1)
    .sort_values("Score", ascending=False)
    .reset_index(drop=True)
)
features = featureScores["Specs"].head(10)

In [None]:
# Feature matrix / target for training and for the separate validation period.
# All columns are used; restricting to the SelectKBest `features` is disabled.
X = df_train.drop(columns=['action'])
y = df_train['action']

X_valid = df_validation.drop(columns=['action'])
y_valid = df_validation['action']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=11,
)
# The split returns selections of `X`; take explicit copies before writing the
# scaled columns back so the assignments below never target a view/slice
# (this previously relied on the SettingWithCopy warning being silenced).
X_train = X_train.copy()
X_test = X_test.copy()

# Standardise every feature except the whitelisted ones. Each scaler is fitted
# on the training split only and re-used for the test and validation sets.
scaler_whitelist = ["Price", "ph", "pl"]
for col in X.columns.drop(scaler_whitelist):
    trans = StandardScaler()
    # `.ravel()` turns the (n, 1) scaler output into the 1-D shape a single
    # DataFrame column expects.
    X_train[col] = trans.fit_transform(X_train[col].values.reshape(-1, 1)).ravel()
    X_test[col] = trans.transform(X_test[col].values.reshape(-1, 1)).ravel()
    X_valid[col] = trans.transform(X_valid[col].values.reshape(-1, 1)).ravel()

# Over-sampler instance; resampling of the training split is not applied here.
smote = SMOTE(random_state=11)

stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)

xgcls = xgb.XGBClassifier(use_label_encoder=False)
param_grid = {
    "reg_alpha": [0.01, 0.5, 2, 0.2],
    "scale_pos_weight": [class_weight],
    "learning_rate": [0.1, 0.2, 0.15],
    "eval_metric": [eval_metric],
    "objective": [objective],
}
# An empty/None `eval_metric` parameter means "use the library default".
if not eval_metric:
    param_grid.pop("eval_metric")

# Chance-level baseline. It is seeded like every other stochastic component in
# this cell so the baseline plots are reproducible across runs.
dummy_cls = DummyClassifier(strategy="stratified", random_state=11)
dummy_cls.fit(X_train, y_train)

grid_search = GridSearchCV(
    estimator=xgcls,
    param_grid=param_grid,
    scoring="f1",
    cv=stratified_kfold,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')


In [None]:
# Independent copy of the winning grid-search parameters for the cells below.
best_params = dict(grid_search.best_params_)
print(f"Best Params: {best_params}")

In [None]:
# Fit one classifier on the training split using the best grid-search parameters.
cls = xgb.XGBClassifier(use_label_encoder=False, **best_params).fit(X_train, y_train)
cls

### Feature Importances

In [None]:
# Gain-based importance of every feature the booster actually split on.
feature_important = cls.get_booster().get_score(importance_type='gain')
keys = list(feature_important)
values = list(feature_important.values())

data = pd.DataFrame({"score": values}, index=keys).sort_values("score", ascending=False)

# Horizontal bar chart of the 40 most important features, largest bar on top.
top_features = data.nlargest(40, columns="score").sort_values("score", ascending=True)
top_features.plot.barh(figsize=(20, 10))
plt.show();

In [None]:
# Create Stratified K-Fold cross validation
def cross_val_model(model, X_test, y_test):
    """Cross-validate ``model`` and print its mean F1, recall, precision and AUC.

    Uses repeated stratified k-fold (5 splits, 5 repeats, fixed seed) on the
    data passed in; despite the parameter names, any feature/target pair works.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_validate``.
    X_test : array-like
        Feature matrix to cross-validate on.
    y_test : array-like
        Target labels matching ``X_test``.

    Returns
    -------
    None
        The four mean scores are printed, not returned.
    """
    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=1)
    scores = cross_validate(
        model,
        X_test,
        y_test,
        scoring=('f1', 'recall', 'precision', 'roc_auc'),
        cv=folds,
    )

    # (printed label, key in the cross_validate result) in display order.
    report_rows = (
        ('f1', 'test_f1'),
        ('recall', 'test_recall'),
        ('precision', 'test_precision'),
        ('auc', 'test_roc_auc'),
    )
    for label, score_key in report_rows:
        print('Mean %s: %.3f' % (label, scores[score_key].mean()))

### Train Set Cross Validation

In [None]:
# Cross-validate the tuned classifier on the training split
cross_val_model(cls, X_train, y_train)

## Test Set Results

In [None]:
# Test split: threshold the positive-class probability, then build the report
# twice — as a dict (kept in `test_set_report` for logging) and as printable text.
y_scores = cls.predict_proba(X_test)[:, 1]
y_pred = np.greater_equal(y_scores, threshold).astype(int)
test_set_report = classification_report(y_test, y_pred, output_dict=True)
test_set_report_text = classification_report(y_test, y_pred)
print(test_set_report_text)

In [None]:
from IPython.display import display, Markdown, Latex
# Markdown heading stating which threshold the following confusion matrix uses.
confusion_matrix_header = f"""#### Confusion Matrix

__Applied Threshold__: {threshold}
"""
Markdown(confusion_matrix_header)

In [None]:
# Recompute the thresholded test predictions so the cell is self-contained.
y_scores = cls.predict_proba(X_test)[:, 1]
y_pred = (y_scores >= threshold).astype(int)

# Confusion matrix of the tuned classifier on the test split.
fig_cmp, ax = plt.subplots()
test_confusion = confusion_matrix(y_test, y_pred)
cmp = ConfusionMatrixDisplay(test_confusion)

cmp.plot(ax=ax)
plt.grid(False)
plt.title(f"Confusion Matrix (Threshold = {threshold})")
plt.show();

In [None]:
# Baseline for comparison: predictions of the dummy classifier on the test split.
y_pred = dummy_cls.predict(X_test)

# Confusion matrix of the dummy baseline.
dummy_fig_cmp, ax = plt.subplots()
dummy_confusion = confusion_matrix(y_test, y_pred)
cmp = ConfusionMatrixDisplay(dummy_confusion)

cmp.plot(ax=ax)
plt.grid(False)
plt.title("Confusion Matrix (DUMMY)")
plt.show();

### ROC Curve

In [None]:
from sklearn.metrics import roc_curve
from matplotlib import pyplot
# Positive-class probabilities on the test split.
yhat = cls.predict_proba(X_test)[:, 1]
# ROC points; `fpr`, `tpr` and `thresholds` are reused later to pick a cut-off.
fpr, tpr, thresholds = roc_curve(y_test, yhat)

fig_roc, ax = plt.subplots()
# Diagonal = chance-level reference, then the model's curve.
plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
plt.plot(fpr, tpr, marker='.', label='XGBClassifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.title(f"ROC Curve (Threshold = {threshold})")
plt.show()

### Validation Set Results

In [None]:
# Validation set: same thresholding as on the test split, report printed as text.
y_scores = cls.predict_proba(X_valid)[:, 1]
y_pred = (y_scores >= threshold).astype(int)
validation_report_text = classification_report(y_valid, y_pred)
print(validation_report_text)

## Sampling Techniques:

### Baseline

In [None]:
# Baseline: no resampling; class imbalance is handled only through the
# `scale_pos_weight` entry contained in `best_params`.
model_weighted = xgb.XGBClassifier(**best_params)
model_weighted.fit(X_train, y_train)

# The former `model_weighted.predict(X_test)` call was removed: its result was
# overwritten immediately by the thresholded probabilities below.
y_scores = model_weighted.predict_proba(X_test)[:, 1]
y_pred = (y_scores >= threshold).astype(int)
print(classification_report(y_test, y_pred))

### Undersampling

In [None]:
# Random under-sampling of the training split.
under = RUS(sampling_strategy=1, random_state=4)
X_train_under, y_train_under = under.fit_resample(X_train, y_train)

# The resampled data is trained without the class-weight parameter.
best_params_under = best_params.copy()
del best_params_under["scale_pos_weight"]
model_weighted = xgb.XGBClassifier(**best_params_under).fit(X_train_under, y_train_under)

# Evaluate on the untouched test split with the notebook threshold.
y_scores = model_weighted.predict_proba(X_test)[:, 1]
y_pred = (y_scores >= threshold).astype(int)
undersampling_report_text = classification_report(y_test, y_pred)
print(undersampling_report_text)

### SMOTE

In [None]:
# SMOTE over-sampling of the minority class in the training split.
over = SMOTE(sampling_strategy="minority", random_state=4)
X_train_smote, y_train_smote = over.fit_resample(X_train, y_train)

# The resampled data is trained without the class-weight parameter.
best_params_under = best_params.copy()
del best_params_under["scale_pos_weight"]
model_weighted = xgb.XGBClassifier(**best_params_under).fit(X_train_smote, y_train_smote)

# Evaluate on the untouched test split with the notebook threshold.
y_scores = model_weighted.predict_proba(X_test)[:, 1]
y_pred = (y_scores >= threshold).astype(int)
smote_report_text = classification_report(y_test, y_pred)
print(smote_report_text)

### Combined

In [None]:
from imblearn.combine import SMOTETomek
# Combined resampling of the training split with SMOTETomek.
bc = SMOTETomek(random_state=4)
X_resampled, y_resampled = bc.fit_resample(X_train, y_train)

# The resampled data is trained without the class-weight parameter.
best_params_under = best_params.copy()
del best_params_under["scale_pos_weight"]
model_weighted = xgb.XGBClassifier(**best_params_under).fit(X_resampled, y_resampled)

# Evaluate on the untouched test split with the notebook threshold.
y_scores = model_weighted.predict_proba(X_test)[:, 1]
y_pred = (y_scores >= threshold).astype(int)
combined_report_text = classification_report(y_test, y_pred)
print(combined_report_text)

In [None]:
# Pick the ROC point that maximises TPR - FPR (Youden's J statistic) and report
# the probability threshold that produced it.
youden_j = tpr - fpr
optimal_idx = youden_j.argmax()
optimal_threshold = thresholds[optimal_idx]
print("Optimal Threshold Value is:", optimal_threshold)

### Validation Scenarios

In [None]:
def create_result_table(test_x, test_y, threshold=0.5, model=None, df_source=None,
                        random_state=None):
    """Build a per-row table of model predictions next to the true labels.

    The returned frame keeps the index and columns of ``test_x`` and adds:

    * ``prediction``       - 1 where the positive-class probability is
      ``>= threshold``, else 0
    * ``probability``      - positive-class probability from ``predict_proba``
    * ``y_true``           - ``test_y`` aligned on the index
    * ``dummy_prediction`` - the ``prediction`` column randomly shuffled
    * ``Close``            - the ``Close`` column of ``df_source`` aligned on
      the index (replaces a ``Close`` column of ``test_x`` if one exists)
    * ``return``           - row-to-row percentage change of ``Close``

    Parameters
    ----------
    test_x : pandas.DataFrame
        Feature matrix to score.
    test_y : pandas.Series
        True labels sharing the index of ``test_x``.
    threshold : float, default 0.5
        Probability cut-off for the positive class.
    model : estimator, optional
        Fitted classifier exposing ``predict_proba``. Defaults to the
        notebook-level ``cls``.
    df_source : pandas.DataFrame, optional
        Frame providing the ``Close`` column. Defaults to the notebook-level
        ``df_validation``.
    random_state : int or numpy Generator/RandomState, optional
        Seed for the shuffle behind ``dummy_prediction``; ``None`` keeps it
        unseeded.

    Returns
    -------
    pandas.DataFrame
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    estimator = cls if model is None else model
    price_source = df_validation if df_source is None else df_source

    # float64 so the comparison against `threshold` is done in double precision
    # even when the estimator returns float32 probabilities.
    probability = np.asarray(estimator.predict_proba(test_x))[:, 1].astype(float)

    # Work on a copy so the caller's feature matrix is never modified.
    df = test_x.copy()
    df["prediction"] = np.where(probability >= threshold, 1, 0)
    df["probability"] = probability
    df["y_true"] = test_y
    # Same number of positives as the model, placed at random rows.
    df["dummy_prediction"] = (
        df["prediction"].sample(frac=1, random_state=random_state).values
    )
    df["Close"] = price_source["Close"]
    df["return"] = df["Close"].pct_change()
    return df

In [None]:
# Prediction table for the validation set at the notebook-level threshold
results = create_result_table(X_valid, y_valid, threshold=threshold)

#### Table

In [None]:
# Preview the first rows of the validation prediction table
results.head()

In [None]:
def calculate_loss(df_source, df, true_prediction=False):
    """Return the mean ``resid`` value of the model's and the dummy's positive calls.

    The two prediction columns of ``df`` are joined column-wise onto
    ``df_source``, incomplete rows are dropped and the index is reset. The
    result is stored in the module-level ``df_resid`` because ``resid`` reads
    that global when it is applied row by row.

    Parameters
    ----------
    df_source : pandas.DataFrame
        Source frame the prediction columns are attached to.
    df : pandas.DataFrame
        Result table containing ``prediction`` and ``dummy_prediction``.
    true_prediction : bool, default False
        When True the model's own mean residual is computed as well;
        otherwise it is reported as NaN.

    Returns
    -------
    tuple
        ``(prediction_residual_mean, dummy_residual_mean)``.
    """
    global df_resid
    flag_columns = df[["prediction", "dummy_prediction"]]
    df_resid = pd.concat([df_source, flag_columns], axis=1).dropna().reset_index(drop=True)

    def _mean_resid(flag_column):
        # Mean residual over the rows where `flag_column` equals 1.
        return df_resid.query(f"{flag_column} == 1").apply(resid, axis=1).mean()

    prediction_residual_mean = _mean_resid("prediction") if true_prediction else np.nan
    dummy_residual_mean = _mean_resid("dummy_prediction")
    return prediction_residual_mean, dummy_residual_mean

#### Graph

In [None]:
hover_data = {"Close": True}

# Chronologically ordered copy of the result table with the index as a column.
plot_frame = results.sort_index().reset_index()

# Price line for the whole validation period.
fig_obj = px.line(plot_frame, x="Date", y="Close", hover_data=hover_data)

# Green markers: every row the model flagged as positive.
extrema = px.scatter(
    plot_frame.query("prediction == 1"), x="Date", y="Close", hover_data=hover_data
)
extrema.update_traces(marker=dict(color='green'))

# Red markers: flagged rows that are also true positives (built but not added
# to the final figure below).
extrema_dummy = px.scatter(
    plot_frame.query("y_true == 1 and prediction==1"), x="Date", y="Close", hover_data=hover_data
)
extrema_dummy.update_traces(marker=dict(color='red'))

fig = fig_obj.data + extrema.data

validation_prediction_plot = go.Figure(fig)
validation_prediction_plot.update_layout(
    title=f"Validation Graph (Threshold = {threshold})"
)

validation_prediction_plot

In [None]:
def resid(x):
    """Distance, in rows, from row ``x`` to the nearest row with ``action == 1``.

    Meant to be used with ``DataFrame.apply(resid, axis=1)`` on the
    module-level ``df_resid`` frame, whose index must be positional
    (0..n-1) so that ``x.name`` is the row position.

    Both directions are searched, the row itself included, and the smaller
    offset wins. The earlier price/Euclidean-distance computations were
    removed: their results were always overwritten by the row offset.

    Parameters
    ----------
    x : pandas.Series
        One row of ``df_resid``; only ``x.name`` is used.

    Returns
    -------
    int or float
        Row offset to the closest ``action == 1`` row, or ``numpy.inf`` when
        the frame contains none.
    """
    action_col = df_resid.columns.get_loc("action")
    idx = x.name

    # Offsets of matching rows when walking backwards (idx, idx-1, ..., 0) and
    # forwards (idx, idx+1, ..., end); offset 0 is the row itself.
    backward_hits = np.flatnonzero(df_resid.iloc[idx::-1, action_col].to_numpy() == 1)
    forward_hits = np.flatnonzero(df_resid.iloc[idx:, action_col].to_numpy() == 1)

    val_straight = forward_hits[0] if forward_hits.size else np.inf
    val_reversed = backward_hits[0] if backward_hits.size else np.inf

    return min(val_straight, val_reversed)

In [None]:
# NOTE(review): `start` is not read anywhere in the visible notebook.
start = time.time()
def simulate_loss(df_source, valid_x, valid_y, n=100, threshold=0.5):
    """Simulate the loss of randomly placed predictions ``n`` times.

    Each round rebuilds the result table (which reshuffles the
    ``dummy_prediction`` column) and records the dummy loss.

    Parameters
    ----------
    df_source : pandas.DataFrame
        Source frame passed on to ``calculate_loss``.
    valid_x : pandas.DataFrame
        Feature matrix to score.
    valid_y : pandas.Series
        True labels for ``valid_x``.
    n : int, default 100
        Number of simulation rounds.
    threshold : float, default 0.5
        Probability cut-off used when building each result table. It must
        match the threshold of the real predictions: the shuffled dummy then
        places the same number of positive calls as the model, which keeps
        the two losses comparable.

    Returns
    -------
    numpy.ndarray
        Array of length ``n`` with one dummy loss per round.
    """
    dummy_losses = np.zeros(n)
    for i in range(n):
        results = create_result_table(valid_x, valid_y, threshold=threshold)
        _, dummy_loss = calculate_loss(df_source, results)
        dummy_losses[i] = dummy_loss
    return dummy_losses
# Use the notebook threshold; the simulation previously always ran at 0.5.
dummy_losses = simulate_loss(df_validation, X_valid, y_valid, n=10, threshold=threshold)

#### Dummy Loss

In [None]:
# `sns.distplot` is deprecated; per the seaborn migration notes the equivalent
# figure is a density-scaled histogram with a KDE overlay.
sns.histplot(dummy_losses, kde=True, stat="density", kde_kws=dict(cut=3))
plt.title("Simulated Dummy Loss Distributions")

#### Dummy Loss Mean


In [None]:
# Average loss over all simulated dummy rounds
dummy_losses.mean()

#### Prediction Loss

In [None]:
# Loss of the model's own positive calls on the validation set; the dummy loss
# returned alongside it is discarded here.
prediction_loss, _ = calculate_loss(df_validation, results, true_prediction=True)
prediction_loss

In [None]:
# Gap between the mean dummy loss and the model loss, relative to the smaller
# of the two.
mean_dummy_loss = dummy_losses.mean()
dummy_pred_loss = (mean_dummy_loss - prediction_loss) / min(mean_dummy_loss, prediction_loss)

In [None]:
# Primary estimator: yellowbrick classification-report heatmap on the test split.
fig_cr, ax = plt.subplots()
visualizer = ClassificationReport(
    cls,
    ax=ax,
    classes=["neutral", "anomalous"],
    support=True,
)
visualizer.fit(X_train, y_train)  # fit the visualizer and the model
visualizer.score(X_test, y_test)  # evaluate on the test data
visualizer.finalize();
report_title = ax.get_title() + f"(Threshold = {threshold})"
ax.set_title(report_title);

In [None]:
# Dummy baseline: the same classification-report heatmap for comparison.
dummu_fig_cr, ax = plt.subplots()
visualizer = ClassificationReport(
    dummy_cls,
    ax=ax,
    classes=["neutral", "anomalous"],
    support=True,
)
visualizer.fit(X_train, y_train)  # fit the visualizer and the model
visualizer.score(X_test, y_test)  # evaluate on the test data
visualizer.finalize();
dummy_report_title = ax.get_title() + "(DUMMY)"
ax.set_title(dummy_report_title);

In [None]:
from yellowbrick.classifier import DiscriminationThreshold
# Discrimination-threshold visualizer for the tuned classifier, drawn on the
# axes created here.
fig_disc, ax = plt.subplots()
visualizer = DiscriminationThreshold(cls, ax=ax)
visualizer.fit(X_train, y_train)  # fit the data to the visualizer
visualizer.finalize();
threshold_title = ax.get_title() + f"(Threshold = {threshold})"
ax.set_title(threshold_title);

In [None]:
# %%
# Estimator parameters plus the decision threshold used throughout the report.
log_params = {**cls.get_params(), "threshold": threshold}
mlflow.log_params(log_params)

# %%
# Per-class test metrics, prefixed with the class label and rounded to 3 decimals.
test_set_report_0 = {f"0_{k}": round(v, 3) for k, v in test_set_report["0.0"].items()}
test_set_report_1 = {f"1_{k}": round(v, 3) for k, v in test_set_report["1.0"].items()}
for class_metrics in (test_set_report_0, test_set_report_1):
    mlflow.log_metrics(class_metrics)
mlflow.log_metric("dummy_pred_loss", dummy_pred_loss)


# Figures created earlier in the notebook, each with its artifact file name.
figure_artifacts = [
    (fig_cmp, 'confusion_matrix.png'),
    (fig_cr, 'classification_report.png'),
    (fig_roc, 'roc.png'),
    (fig_disc, 'discrimination_threshold.png'),
    (validation_prediction_plot, 'validation_results.html'),
    (dummy_fig_cmp, 'dummy_confusion_matrix.png'),
    (dummu_fig_cr, 'dummy_classification_report.png'),
]
for figure, artifact_file in figure_artifacts:
    mlflow.log_figure(figure, artifact_file)
# %%
mlflow.sklearn.log_model(cls, artifact_path="sklearn-model");