# Exploratory Data Analysis

In [None]:
# Extend the import search path so modules one directory up can be imported
import os
import sys

# Resolve the parent of the current working directory to an absolute path
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(parent_dir)

## Data Loading

### Download the data

In [None]:
# Plotting and numerical libraries used by later cells.
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns

# Project-local helper; `src` is presumably resolved through the
# parent-directory sys.path entry added in the first cell.
from src.data.download import download_dataset

# Load the dataset once; later cells read and rebind `df`.
# NOTE(review): `df` is used like a pandas DataFrame below
# (head/isnull/describe) — confirm the helper's return type.
df = download_dataset()

### Brief Inspection

In [None]:
# Peek at the leading rows
preview = df.head()
print(preview)

# Count null entries per column
null_counts = df.isnull().sum()
print("\nMissing values:\n", null_counts)

# Descriptive statistics as reported by describe()
summary_stats = df.describe()
print("\nSummary statistics:\n", summary_stats)


## Data Preprocessing

### Encode categorical features

In [None]:
# Project-local preprocessing helper.
from src.data.preprocessing import encode_categorical

# Rebind `df` to the helper's result so later cells work on the encoded data.
# NOTE(review): the encoding scheme (one-hot, ordinal, ...) is not visible
# here — see src/data/preprocessing.
df = encode_categorical(df)

### Scale feature values

In [None]:
# Project-local preprocessing helper.
from src.data.preprocessing import scale_features

# Rebind `df` to the helper's result so later cells work on the scaled data.
# NOTE(review): which columns are scaled, and with what scaler, is decided
# inside scale_features — confirm whether the "Target" column is left
# untouched.
df = scale_features(df)

### Plot correlation matrix

In [None]:
# Keep only the float and int columns; every other dtype is left out of the
# correlation (nothing is converted here).
numeric_dtypes = [float, int]
df_numeric = df.select_dtypes(include=numeric_dtypes)

# Pairwise correlation between the selected columns
corr_matrix = df_numeric.corr()

# Draw the annotated heatmap on a 12x10 figure and title it via the Axes
plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
heatmap_ax.set_title("Correlation Matrix")
plt.show()

### Drop feature pairs with high correlation

In [None]:
from src.features.feature_engineering import drop_high_corr

# Hand the correlation matrix from the previous cell to the helper and rebind
# `df` to its result.
# NOTE(review): presumably this removes one column of each highly correlated
# pair; the threshold and which member is dropped are defined inside
# drop_high_corr — confirm there.
df = drop_high_corr(df, corr_matrix)
# Show the first rows of the returned frame.
print(df.head())

## Training

### Split the data

In [None]:
from src.models.training import split_data

# "Target" is passed as the second argument — presumably the label column
# name; confirm against split_data.
# NOTE(review): test fraction, shuffling, seeding and stratification are not
# visible here; check split_data before comparing results across runs.
X_train, X_test, y_train, y_test = split_data(df, "Target")

### Create and fit the model

In [None]:
from src.models.model import create_model
from src.models.training import fit_model

# Build the estimator with its defaults and hand it straight to the training
# helper; `model` is whatever fit_model returns.
model = fit_model(create_model(), X_train, y_train)

## Inference
### Make predictions

In [None]:
# Class predictions for the held-out features from the split cell.
y_pred = model.predict(X_test)
# Probability output kept separately; the ROC cell below derives its scores
# from y_proba.
# NOTE(review): assumes predict_proba returns one column per class — confirm
# for the model returned by create_model.
y_proba = model.predict_proba(X_test)

### Compute metrics

In [None]:
# Accuracy of the baseline predictions on the test split
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

In [None]:
# Text report from sklearn's classification_report for the baseline predictions
from sklearn.metrics import classification_report

report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)

### Confusion Matrix

In [None]:
# Confusion matrix of the baseline predictions, drawn as an annotated heatmap
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

# heatmap returns the Axes it drew on; label it directly
cm_ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
cm_ax.set(xlabel="Predicted", ylabel="Actual", title="Confusion Matrix")
plt.show()

### ROC Curve

In [None]:
# ROC-AUC score and ROC curve for the baseline model
from sklearn.metrics import roc_auc_score, roc_curve

# Score each sample by its predicted probability for the positive class.
# The row-wise maximum would instead measure how confident the model is in
# whichever class it picked; that is not a ranking for the positive class and
# produces a wrong AUC and curve.
# NOTE(review): assumes a binary target whose positive class is column 1 of
# predict_proba, as the optimized-model evaluation cell also does — confirm.
y_scores = y_proba[:, 1]

roc_auc = roc_auc_score(y_test, y_scores)
print(f"ROC-AUC Score: {roc_auc:.4f}")

# Plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}")
# Diagonal reference line: the curve of a classifier that ranks at random
plt.plot([0, 1], [0, 1], linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()

## Hyperparameter Tuning

### Grid search

In [None]:
# Candidate values per hyperparameter; the grid search tries every combination
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
)

In [None]:
# Exhaustive search over param_grid, scored by ROC-AUC with 3-fold CV
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Estimator and grid are passed positionally; n_jobs=-1 uses all available
# cores and verbose=1 prints progress.
grid_search = GridSearchCV(
    create_model(),
    param_grid,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split
grid_search.fit(X_train, y_train)

### Best Model

In [None]:
# Report the winning parameter combination and its score from the fitted search.
# best_score_ is on the "roc_auc" scale because that is the scoring passed to
# GridSearchCV above.
print("Best Parameters:", grid_search.best_params_)
print(f"Best ROC-AUC Score: {grid_search.best_score_:.4f}")

### Retrain with these hyperparameters

In [None]:
# Build a fresh model from the parameters selected by the grid search and fit
# it on the training split.
# NOTE(review): assumes create_model accepts these hyperparameters as keyword
# arguments — confirm its signature.
best_params = grid_search.best_params_
xgb_clf_best = create_model(**best_params)
xgb_clf_best.fit(X_train, y_train)

### Evaluate the optimized model

In [None]:
# Score the tuned model on the test split.
# Relies on accuracy_score and roc_auc_score imported in the earlier metric cells.
y_pred_best = xgb_clf_best.predict(X_test)
# Column 1 of predict_proba is used as the score for ROC-AUC
y_proba_best = xgb_clf_best.predict_proba(X_test)[:, 1]

# Compute both metrics first, then report them
accuracy_best = accuracy_score(y_test, y_pred_best)
roc_auc_best = roc_auc_score(y_test, y_proba_best)

print(f"Optimized Accuracy: {accuracy_best:.4f}")
print(f"Optimized ROC-AUC Score: {roc_auc_best:.4f}")