# Exploratory Data Analysis

In [None]:
# Set up the project environment with Poetry before running the rest of the notebook.
# NOTE(review): `%poetry` is not one of IPython's built-in magics, so these lines
# presumably fail with "magic function not found" unless an extension registers it;
# a shell escape (e.g. `!poetry install`) may have been intended — confirm.
# NOTE(review): `poetry init` normally prompts interactively — verify it is wanted here.
%poetry init
%poetry install

In [None]:
# Make the parent of the working directory importable (presumably the repository
# root that holds the `src` package imported by later cells).
import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Guard against duplicates: re-running this cell would otherwise append the same
# entry to sys.path on every execution.
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns

from src.data.download import download_dataset

# Load the dataset through the project's download helper. The result is bound to
# `df` and reused (and reassigned) by the cells below, which call DataFrame-style
# methods on it such as head(), isnull() and select_dtypes().
df = download_dataset()

In [None]:
# Quick look at the data: a preview of the first rows, then the per-column null
# counts and the describe() statistics, each printed under its own heading.
print(df.head())

for heading, table in (
    ("\nMissing values:\n", df.isnull().sum()),
    ("\nSummary statistics:\n", df.describe()),
):
    print(heading, table)


In [None]:
from src.data.preprocessing import encode_categorical, scale_features

# Preprocess the frame in two steps, rebinding `df` each time: categorical
# encoding first, then feature scaling.
# NOTE(review): scaling runs on the whole dataset here, before split_data() is
# called in a later cell — if scale_features fits statistics on its input, test
# rows contribute to them (possible leakage); confirm.
# NOTE(review): it is not visible from here whether the "Target" column is left
# out of encoding/scaling — verify against the helpers.
df = encode_categorical(df)
df = scale_features(df)

# Last expression of the cell, so the notebook renders the preview.
df.head()

In [None]:
# Keep only the numeric columns (nothing is converted; non-numeric columns are
# simply left out of the correlation).
# "number" selects every numeric dtype. The builtin float/int types only match the
# standard-width float and signed-integer dtypes, so narrower or unsigned integer
# columns (e.g. uint8 indicator columns) would be dropped silently.
df_numeric = df.select_dtypes(include="number")

# Pairwise correlation between the numeric columns.
corr_matrix = df_numeric.corr()

# Annotated heatmap of the correlation matrix, two decimals per cell.
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()

In [None]:
from src.features.feature_engineering import drop_high_corr

# Hand the frame and the correlation matrix computed in the previous cell to the
# project helper and rebind `df` to its result.
# NOTE(review): the threshold and whether the "Target" column is protected from
# being dropped are decided inside drop_high_corr — verify there.
df = drop_high_corr(df, corr_matrix)
print(df.head())

In [None]:
from src.models.training import split_data

# Split the frame into train/test features and labels; "Target" is passed as the
# second argument, presumably the name of the label column.
# NOTE(review): split ratio, shuffling and seeding are set inside split_data — confirm there.
X_train, X_test, y_train, y_test = split_data(df, "Target")

In [None]:
from src.models.model import create_model
from src.models.training import fit_model

# Build the model with the project's defaults, then fit it on the training split.
# `model` is rebound to whatever fit_model returns, and that object is used for
# prediction in the next cell.
model = create_model()
model = fit_model(model, X_train, y_train)

In [None]:
# Predictions on the held-out features: `y_pred` feeds the accuracy, classification
# report and confusion-matrix cells; `y_proba` feeds the ROC cell.
y_pred = model.predict(X_test)
# assumes predict_proba returns one column per class — TODO confirm for this model
y_proba = model.predict_proba(X_test)

In [None]:
# Accuracy: compare the test labels with the model's predictions.
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report with four decimals.
print(f"Accuracy: {accuracy:.4f}")

In [None]:
# Classification report: build the text report for the test split, then print it
# under its heading.
from sklearn.metrics import classification_report

report_text = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", report_text)

In [None]:
# Confusion matrix of test labels versus predictions.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Draw the integer counts as an annotated heatmap and label it through the Axes
# object that seaborn returns.
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# ROC-AUC score (binary target assumed: roc_curve only supports two classes).
from sklearn.metrics import roc_auc_score, roc_curve

# ROC analysis needs the score of the positive class, not the confidence of
# whichever class was predicted. predict_proba orders its columns like the
# estimator's classes_, so column 1 belongs to the class with the greater label.
# Taking the row-wise maximum instead ranks confident negatives as highly as
# confident positives, which distorts both the AUC and the curve.
y_scores = np.asarray(y_proba)[:, 1]

roc_auc = roc_auc_score(y_test, y_scores)
print(f"ROC-AUC Score: {roc_auc:.4f}")

# Plot ROC curve, with the diagonal as the chance-level reference.
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}")
plt.plot([0, 1], [0, 1], linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()