In [None]:
# Standard packages imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Opt in to pandas' future behaviour of not silently downcasting results
pd.set_option('future.no_silent_downcasting', True)
# Use matplotlib's 'ggplot' style sheet for every figure in this notebook
plt.style.use('ggplot')
# NOTE(review): this silences *every* warning for the whole session, which can
# also hide useful diagnostics from the libraries used below — confirm intended
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Reading complete shift list from analysis script (semicolon-separated CSV).
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper
# is needed around it.
data_clf = pd.read_csv('./data/source_data/res/complete_shifts_clf.csv', sep=';')

In [None]:
# Quick look at the first rows of the raw shift table before any transformation
data_clf.head()

In [None]:
# Encoding the target labels numerically: control 'C' -> 0, test 'T' -> 1.
# The replacement is applied to the whole frame, not to a single column.
label_codes = {'C': 0, 'T': 1}
data_clf = data_clf.replace(to_replace=label_codes)
data_clf.head()

In [None]:
# Control measurements (group 0) taken at the 30-minute mark.
# 'time' is dropped once it has been used as a filter, and every remaining
# row is turned into a plain Python list so it can be resampled later.
is_control = data_clf['group'] == 0
is_at_30_min = data_clf['time'] == 30
c_list_to_bootstrap = (
    data_clf.loc[is_control & is_at_30_min]
    .drop('time', axis=1)
    .to_numpy()
    .tolist()
)
c_list_to_bootstrap

In [None]:
# Test measurements (group 1) taken at the 30-minute mark, prepared the same
# way as the control list: filter, drop 'time', convert rows to Python lists.
test_rows_at_30_min = data_clf.loc[
    (data_clf['time'] == 30) & (data_clf['group'] == 1)
]
t_list_to_bootstrap = (
    test_rows_at_30_min
    .drop('time', axis=1)
    .to_numpy()
    .tolist()
)
t_list_to_bootstrap

In [None]:
import random

# Number of bootstrap samples drawn (with replacement) for each group
n = 4000

# A dedicated, seeded generator makes the resampling reproducible: without a
# seed every "Restart & Run All" produces a different data set, and therefore
# different splits, models and metrics further down the notebook.
bootstrap_rng = random.Random(42)

# Fail early with a clear message instead of an IndexError from the sampler
if not c_list_to_bootstrap or not t_list_to_bootstrap:
    raise ValueError(
        "Nothing to bootstrap: the control and/or test list is empty."
    )

# Drawing n rows with replacement from the control and from the test list.
# The draws are already independent and random, so the per-group lists need
# no additional shuffling.
c_data = bootstrap_rng.choices(c_list_to_bootstrap, k=n)
t_data = bootstrap_rng.choices(t_list_to_bootstrap, k=n)

# Creating final list: both groups concatenated, then interleaved at random
complete_list = c_data + t_data
bootstrap_rng.shuffle(complete_list)

# Example of the first 3 data points
for i in range(3):
    print(c_data[i])
    print(t_data[i])
    print(complete_list[i])
    print('----------------------\n')

In [None]:
# Assembling the resampled rows into a labelled frame. Each inner list is
# expected to hold [group, frequency_shift, standard_deviation], in that order.
bootstrap_columns = ['group', 'frequency_shift', 'standard_deviation']
bootstrapped = pd.DataFrame(data=complete_list, columns=bootstrap_columns)
bootstrapped

In [None]:
# Checking data distribution: violin plot (left) and box plot (right) of the
# bootstrapped frequency shift, one category per group
fig, axs = plt.subplots(ncols=2, figsize=(10, 5))

# Arguments shared by both seaborn calls
common_kwargs = dict(
    x='group',
    y='frequency_shift',
    data=bootstrapped,
    hue='group',
)
sns.violinplot(inner="quart", ax=axs[0], **common_kwargs)
sns.boxplot(notch=False, ax=axs[1], **common_kwargs)

# Per-axes labels are cleared in favour of the figure-level labels set below
for ax in axs:
    ax.set_xlabel(None)
    ax.set_ylabel(None)

fig.suptitle('Frequency shift by group - Bootstrapped')
fig.supxlabel('Group')
fig.supylabel('Nominal shift f(t) - f(0) [MHz]')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def plot_confusion_matrix(conf_matrix, name, labels=("C", "T")):
    """Draw a confusion matrix as an annotated heat map and show it.

    Parameters
    ----------
    conf_matrix : 2-D array-like of integers
        Matrix of counts to display. Cells are annotated with integer
        formatting (``fmt='d'``). Rows are labelled "True" and columns
        "Predicted".
    name : str
        Model name appended to the figure title.
    labels : sequence of str, optional
        Tick labels used on both axes, in class order. Defaults to
        ``("C", "T")``, so existing two-argument calls are unaffected.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    tick_labels = list(labels)

    # Explicit Axes keeps all drawing on this figure instead of relying on
    # pyplot's "current figure" state
    fig, ax = plt.subplots(figsize=(5, 2))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        ax=ax,
    )

    ax.set_title(f"Confusion matrix - {name}", fontsize=10)
    ax.set_xlabel("Predicted", fontsize=10)
    ax.set_ylabel("True", fontsize=10)
    plt.show()

In [None]:
from sklearn.model_selection import GroupShuffleSplit

X = bootstrapped.iloc[:, 1:]  # Features
y = bootstrapped.iloc[:, 0]   # Labels

# The bootstrapped frame was built by sampling WITH replacement, so it holds
# many exact copies of each original measurement. A plain random row split
# puts copies of the same measurement in both train and test, which leaks the
# test set into training and inflates every score. To avoid that, all rows
# with identical feature values share one group id and each group is kept
# entirely on one side of the split.
duplicate_groups = (
    X.groupby(list(X.columns), sort=False, dropna=False).ngroup().to_numpy()
)

# test_size is the fraction of *groups* (distinct measurements) held out
splitter = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_idx, test_idx = next(splitter.split(X, y, groups=duplicate_groups))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Sanity check: no measurement may appear on both sides of the split
assert not set(duplicate_groups[train_idx]) & set(duplicate_groups[test_idx])

X_train

In [None]:
# Total number of elements in the feature frame (rows x columns)
X.size

## Training classifiers

#### XGBoost Classifier

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import joblib

# Defining the hyper-parameter grid explored by the search
parameters = dict(
    objective=['binary:logistic'],
    max_depth=[3, 4, 5, 6],
    learning_rate=[0.001, 0.01, 0.1, 0.2],
    n_estimators=[50, 100, 200, 300],
)

# Metrics recorded for every candidate; 'accuracy' is the one used for refit
scoring = dict(
    accuracy='accuracy',
    precision=make_scorer(precision_score),
    recall=make_scorer(recall_score),
    f1=make_scorer(f1_score),
)

# Base model whose hyper-parameters are tuned
xgb_clf = xgb.XGBClassifier()

# 5-fold grid search over the grid above, tracking all metrics and refitting
# the best candidate (by accuracy) on the full training split
grid_search = GridSearchCV(
    xgb_clf,
    parameters,
    scoring=scoring,
    refit='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Per-candidate cross-validation results and the winning combination
results = grid_search.cv_results_
best_params = grid_search.best_params_
print(f'Melhores parâmetros: {best_params}')

# Mean cross-validated score of every candidate, one array per metric
for scorer in scoring:
    mean_scores = results[f'mean_test_{scorer}']
    print(f"{scorer} scores:")
    print(mean_scores)

In [None]:
# Saving XGBoost model.
# The object persisted here is the fitted GridSearchCV wrapper (grid_search),
# not only the best estimator it selected.
joblib.dump(grid_search, './data/source_data/res/xgboost_model.joblib')

In [None]:
# Single hand-picked sample used to smoke-test the trained models.
# Values are listed in the same order as the feature columns.
data_to_submit = [
    -102.67478924215538,  # frequency_shift
    23.69349016343144,    # standard_deviation
]

In [None]:
# Testing model: reload the persisted search object from disk and use that
# same object for both the single-sample check and the hold-out predictions,
# so this cell really exercises the saved artefact rather than the in-memory
# grid_search.
# NOTE: joblib.load unpickles the file — only load artefacts you trust.
loaded_model_xgb = joblib.load('./data/source_data/res/xgboost_model.joblib')

# predict expects a 2-D (n_samples, n_features) input, hence the reshape
prediction_xgb = loaded_model_xgb.predict(np.array(data_to_submit).reshape(1, -1))
y_pred_valor = loaded_model_xgb.predict(X_test)

# Label 1 is the test group, reported as "external agent detected"
predicted_label = prediction_xgb.item()
if predicted_label == 1:
    print(f"External agent detected, prediction for XGB is {predicted_label} - positive")
else:
    print(f"External agent not detected, prediction for XGB is {predicted_label} - negative")

In [None]:
# Evaluating error on the hold-out split with a confusion matrix.
# NOTE(review): y_pred_valor is set in the XGBoost test cell but is reassigned
# further down with the logistic-regression predictions; if cells are run out
# of order this matrix would silently describe the wrong model.
conf_matrix = confusion_matrix(y_test,y_pred_valor)
plot_confusion_matrix(conf_matrix, 'XGBoost')

## Other studies
### Logistic Regression and Random Forest

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from scipy.special import expit

# Fit a logistic regression on the training split
clf_lr = LogisticRegression(random_state=0).fit(X_train,y_train)
# w1 is the coefficient of the FIRST feature only; w0 is the intercept
w1 = clf_lr.coef_[0][0]
w0 = clf_lr.intercept_[0]

# Sigmoid (expit) helper curves.
# NOTE(review): X_train has two feature columns, so X_train.min()/max() are
# per-column values and values_x presumably ends up 2-D; w1 (the first
# feature's weight) is then applied to every column, and the second
# coefficient is never used. This looks like a leftover from a single-feature
# version — confirm the intent before relying on values_y / curve_y.
values_x = np.linspace(X_train.min(),X_train.max(),100)
values_y = expit(w1 * values_x + w0)
curve_y = expit(w1 * X_train + w0)

# Verifying prediction probabilities on the hold-out split
# (note: despite its name, y_pred holds probabilities here, not class labels)
y_pred = clf_lr.predict_proba(X_test)
y_pred

In [None]:
# Hard class predictions of the logistic regression on the hold-out split.
# NOTE(review): this reuses the name that earlier held the XGBoost
# predictions, so any cell reading y_pred_valor depends on execution order.
y_pred_valor = clf_lr.predict(X_test)
y_pred_valor

In [None]:
# Persisting the fitted logistic regression model to disk with joblib
joblib.dump(clf_lr, './data/source_data/res/logistic_regression_model.pkl')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score


# Creating classifier and fitting it once on the training split
clf_rf = RandomForestClassifier(n_estimators=100, random_state=42)
clf_rf.fit(X_train, y_train)

# Making predictions on the hold-out split
y_pred = clf_rf.predict(X_test)

# Evaluating metrics
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Evaluating with cross validation.
# cross_val_score fits clones of the estimator, so clf_rf itself is left
# untouched and is still the model fitted on X_train above; rebuilding and
# refitting it afterwards would only repeat the same training.
scores = cross_val_score(clf_rf, X, y, cv=5)
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", scores.mean())

# Saving Random Forest model
joblib.dump(clf_rf, './data/source_data/res/random_forest_model.joblib')

In [None]:
# Loading models back from the files written by the cells above.
# NOTE: joblib.load unpickles the file contents — only load artefacts you trust.
loaded_model_lr = joblib.load('./data/source_data/res/logistic_regression_model.pkl')
loaded_model_rf = joblib.load('./data/source_data/res/random_forest_model.joblib')

In [None]:
# Scoring the hand-picked sample with both reloaded models.
# predict expects a 2-D (n_samples, n_features) input, so the flat sample is
# reshaped into a single row first.
sample_row = np.array(data_to_submit).reshape(1, -1)

# Logistic Regression
prediction_lr = loaded_model_lr.predict(sample_row)

# Random Forest
prediction_rf = loaded_model_rf.predict(sample_row)

In [None]:
# Showing the predicted class from each model: logistic regression first,
# then random forest
for model_prediction in (prediction_lr, prediction_rf):
    print(model_prediction)