# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from math import log2

from sklearn.datasets import load_iris
from sklearn.datasets import make_classification  # To create synthetic classification data

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder  # For converting categorical to numeric


from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC  # Support Vector Machine (SVM) model
from sklearn.tree import DecisionTreeClassifier, plot_tree


from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from scipy.spatial import distance
from imblearn.over_sampling import SMOTE

# Reading files & Saving files

In [None]:
# Template: fill in `path` and `filename`, then keep ONLY the reader that
# matches the file type -- as written, read_csv runs after read_excel and
# overwrites its result.
path = 'D:/...'
filename = '....xlsx/csv'
df = pd.read_excel(path + filename)  # Excel workbooks
df = pd.read_csv(path + filename)  # CSV files

# to upload files into google colab
# (`uploaded` holds whatever files.upload() returns for the chosen files)
from google.colab import files
uploaded = files.upload()


In [None]:
# Step 6: Write the results table to an Excel workbook (row index omitted)
output_file = "prediction_results.xlsx"
results.to_excel(output_file, index=False)

# Step 7: Hand the workbook to the browser as a download (Google Colab only)
from google.colab import files
files.download(output_file)

# Data Exploration

## Dataset information

In [None]:
# Quick structural overview of the DataFrame.
# NOTE: a notebook cell only renders its LAST bare expression, so wrap the
# others in print()/display() (or split the cell) to see every result.
df.info()
df.describe()
df.head()
df.tail()
df.columns

In [None]:
# check value counts: number of rows per distinct value of y
# (a quick way to spot class imbalance)
y.value_counts()

In [None]:
# Manual entropy and info gain demo
def entropy(class_counts):
    """Return the Shannon entropy (in bits) of a class distribution.

    Args:
        class_counts: Iterable of non-negative counts, one per class.
            Zero counts are ignored (they contribute nothing to the entropy).

    Returns:
        The entropy in bits. 0.0 for an empty or all-zero distribution and
        for a pure node (only one non-empty class).
    """
    # Materialise once so generators work too (the input is needed twice).
    counts = [count for count in class_counts if count]
    total = sum(counts)
    if not total:
        # No samples at all: there is no uncertainty to measure.
        return 0.0
    # Negate each term instead of the final sum so that a pure node yields
    # 0.0 rather than the confusing -0.0.
    return sum(-(count / total) * log2(count / total) for count in counts)
 
# Parent node: 10 samples, 6 of one class and 4 of the other
parent_counts = [6, 4]
base_entropy = entropy(parent_counts)
print("\nBase Entropy:", base_entropy)

# Candidate split: two children of 5 samples each.
# Split entropy = size-weighted average of the children's entropies.
children = [[4, 1], [2, 3]]
n_parent = sum(parent_counts)
split_entropy = sum((sum(child) / n_parent) * entropy(child) for child in children)
info_gain = base_entropy - split_entropy
print("Information Gain from split:", info_gain)

## Charts

In [1]:
# Histogram
# How Risk_Score values are distributed (20 bins with a KDE curve on top)
ax = sns.histplot(df['Risk_Score'], bins=20, kde=True)
ax.set_title("Distribution of Risk Score")
ax.set_xlabel("Risk Score")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()

NameError: name 'sns' is not defined

In [None]:
# Boxplot - spread of Transaction_Amount for each High_Value_Flag group
ax = sns.boxplot(x='High_Value_Flag', y='Transaction_Amount', data=df)
ax.set_title("High Value Flag vs. Transaction Amount")
ax.set_xlabel("High Value Transaction (True/False)")
ax.set_ylabel("Transaction Amount")
ax.grid(True)
plt.show()

In [None]:
# barchart - mean Risk_Score per Department, drawn as horizontal bars
risk_by_department = df.groupby("Department")["Risk_Score"]
avg_risk = risk_by_department.mean().reset_index()
ax = sns.barplot(x='Risk_Score', y='Department', data=avg_risk)
ax.set_title("Average Risk Score by Department")
ax.set_xlabel("Average Risk Score")
ax.set_ylabel("Department")
ax.grid(True)
plt.show()

In [None]:
# Scatter plot - Risk_Score against Transaction_Time
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df["Transaction_Time"], df["Risk_Score"], alpha=0.8)
ax.set_title("Scatter Plot of Transaction_Time vs Risk_score")
ax.set_xlabel("Transaction_Time")
ax.set_ylabel("Risk_score")
ax.grid(True)
plt.show()

# Data Cleaning

✅ Use Mean Imputation:
- When the feature is numerical and follows a normal (symmetric) distribution.
- Example: Temperature, height, age (if no outliers).

✅ Use Median Imputation:
- When the feature is numerical but the data is skewed or has outliers.
- Example: Income, house prices, CO2 emission (skewed data).

✅ Use Mode Imputation:
- When the feature is categorical.
- Example: Gender, Color, Type, Brand.


| Data Type | Distribution | Outliers Present? | Use |
|-----------|--------------|-------------------|-----|
| Numerical | Normal        | No                | Mean |
| Numerical | Skewed        | Yes               | Median |
| Categorical | -           | -                 | Mode |

📌 Best Practices:
Impute after splitting into train/test (to avoid data leakage).

If many values are missing (>30%), consider dropping the column or using advanced techniques like KNN imputation, IterativeImputer, or even model-based imputation.

In [None]:
# check skewness
# (printed explicitly: a bare expression in the middle of a cell is not shown)
income_skew = df['Income'].skew()  # > 1 or < -1 = highly skewed
print("Income skewness:", income_skew)

# plot distribution
import seaborn as sns
import matplotlib.pyplot as plt
sns.histplot(df['Income'], kde=True)
plt.show()  # finish this figure so the boxplot is not drawn on the same axes

# check outliers with boxplot
sns.boxplot(x=df['Income'])
plt.show()

In [None]:
# checking missing values: number of null entries in each column
df.isnull().sum()

In [None]:
# Keep an untouched copy so the before/after comparison below is meaningful
df_original = df.copy()

# remove duplicate rows
df = df.drop_duplicates()

# Pick ONE of the two imputations below for a given column.
# Assigning the result back (instead of inplace=True on a column selection)
# is the form pandas guarantees to update the DataFrame.

# Fill missing values in a column with the column mean
df['column_name'] = df['column_name'].fillna(df['column_name'].mean())

# Fill missing values in a column with the column mode.
# mode() returns a Series (there may be ties); take its first value, because
# passing the Series itself would fill by index alignment instead.
mode_values = df['column_name'].mode()
if not mode_values.empty:  # empty when the column holds no non-null values
    df['column_name'] = df['column_name'].fillna(mode_values.iloc[0])

# The cleaned frame is also exposed as df_2 for the comparison printout
df_2 = df

# print original data and cleaned data to compare
print("Original Data:")
print(df_original)
print("\nCleaned Data:")
print(df_2)

In [None]:
# Define a list of features that actually exist in the dataset
features = ['Transaction_Amount', 'Vendor_Risk_Rating',
            'After_Hours', 'High_Value_Flag',
            'Department_Risk_Score', 'Approval_Steps']

# Assign the selected features to X (input variables).
# .copy() makes X an independent frame, so later column assignments on X
# neither write through to df nor trigger pandas' SettingWithCopyWarning.
X = df[features].copy()

# Define the target variable (what we want to predict)
# NOTE(review): 'High_Value_Flag' is also listed in `features`, so a model
# trained on X would see the answer among its inputs (target leakage).
# Either drop it from `features` or point `y` at the real target column --
# confirm which one is intended.
y = df['High_Value_Flag']
 

In [None]:
# Convert boolean features to numeric format.
# assign() returns a new frame instead of writing into X in place, so this
# stays warning-free even when X was created as a slice of df.
X = X.assign(High_Value_Flag=X['High_Value_Flag'].astype(int))

## Outliers
✅ Robust to Outliers
These models are not easily distorted by outliers:
| Model | Why It's Robust |
|-------|------------------|
| **Tree-based models** (e.g. Random Forest, XGBoost) | Split data based on thresholds, not affected by extreme values |
| **Huber Regressor** (from `sklearn`) | Combines least squares and robust loss |
| **RANSAC Regressor** | Actively filters out outliers during training |
| **Isolation Forest** | Specifically built for anomaly detection |
| **Quantile Regression** | Models conditional medians, less sensitive to outliers |


❌ Sensitive to Outliers
These models assume normally distributed, clean data:
| Model | Why It's Sensitive |
|-------|--------------------|
| **Linear Regression** | Uses squared error → outliers dominate loss |
| **Logistic Regression** | Sensitive to extreme values in features |
| **K-Nearest Neighbors (KNN)** | Distance-based — outliers mess up neighborhoods |
| **SVM (with linear kernel)** | Can be distorted unless using robust kernel or outlier detection first |

- what do we do with outliers?

✅ Option 1: Keep them
When: If they are genuine values, not data entry errors.
Use robust models (like Random Forest or Huber Regressor) that are less sensitive to outliers.

❌ Option 2: Remove them
When: Outliers result from data entry errors or are not relevant to your analysis.
Use .drop() after filtering out using IQR or Z-score.

🔁 Option 3: Transform them
Apply log, square root, or box-cox transformations to reduce the effect of outliers.

🧪 Option 4: Cap or impute them
Replace extreme values with thresholds (called Winsorization).
Or replace with the median or a predicted value.

In [None]:
# Detecting outliers using IQR: flag CO2 values lying more than
# 1.5 * IQR below the first quartile or above the third quartile
df = pd.read_csv("cars.csv")

co2 = df['CO2']
Q1, Q3 = co2.quantile(0.25), co2.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Rows whose CO2 falls outside the two fences
is_outlier = (co2 < lower_fence) | (co2 > upper_fence)
outliers = df[is_outlier]
print(outliers)

In [None]:
# apply log, square root or box-cox transformation to reduce effect of outliers
# NOTE: np.log is only defined for positive values -- zeros become -inf and
# negatives NaN -- so check the column's range first (np.log1p handles zeros).
import numpy as np
df['CO2_log'] = np.log(df['CO2'])

In [None]:
# Report IQR-based outliers separately for every numeric column

# Load dataset
df = pd.read_csv("cars.csv")

# Keep only the numeric columns
numeric_cols = df.select_dtypes(include='number')

for col, values in numeric_cols.items():
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    too_low = values < Q1 - 1.5 * IQR
    too_high = values > Q3 + 1.5 * IQR
    outliers = numeric_cols[too_low | too_high]

    print(f"Column: {col}")
    print(f"Number of outliers: {len(outliers)}")
    print(outliers[[col]])
    print("-" * 40)


In [None]:
# Drop every row that is an IQR outlier in at least one numeric column
filtered_df = df.copy()

for col in numeric_cols.columns:
    # Quartiles are recomputed on the rows that survived the previous columns
    column = filtered_df[col]
    Q1, Q3 = column.quantile(0.25), column.quantile(0.75)
    IQR = Q3 - Q1
    # between() is inclusive on both ends, i.e. lower <= value <= upper
    filtered_df = filtered_df[column.between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]

print("Filtered data shape:", filtered_df.shape)

Multivariate Outliers
are data points that may look normal when considered individually (univariate) but become outliers when considering combinations of features.

Example:
- A car with very high weight may seem normal.
- A car with very high volume may also seem normal.
- But a car with very high weight and very low volume could be unusual together — a multivariate outlier.

Common detection methods:
- Mahalanobis Distance (measures how far a point is from the mean in multidimensional space)
- Isolation Forest
- PCA-based anomaly detection

In [None]:
# Mahalanobis Distance Example (simplified for 2D)
import pandas as pd
import numpy as np
from scipy.spatial import distance

df = pd.read_csv("cars.csv")
X = df[['Weight', 'Volume']].dropna()

# Sample mean and inverse covariance of the two features
mean = X.mean().values
cov = np.cov(X.T)
inv_cov = np.linalg.inv(cov)


def _distance_from_mean(row):
    """Mahalanobis distance of one (Weight, Volume) row from the sample mean."""
    return distance.mahalanobis(row, mean, inv_cov)


# The distances are computed from the two feature columns and then stored
# as a third column next to them
X['mahalanobis'] = X.apply(_distance_from_mean, axis=1)
outliers = X[X['mahalanobis'] > 3]  # threshold can vary

print(outliers)

# Train-Test split

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split, stratified on y so both parts keep the class proportions;
# random_state pins the shuffle so the split is reproducible
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Split into training and testing sets
# Stratified splitting needs at least 2 samples in every class, so fall back
# to a plain random split when the rarest class is smaller than that.
if y.value_counts().min() < 2:
    print("\n⚠️ Skipping stratify due to small class size.")
    stratify_param = None
else:
    stratify_param = y

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=stratify_param
)

# Data Integration

In [None]:
# Merge datasets on a similar column
# how='left' keeps every row of df_1 and attaches the matching df_2 columns;
# rows of df_1 without a match get NaN in the df_2 columns
merged_data = pd.merge(df_1, df_2, on='similar_column', how='left')

In [None]:
# Build a reproducible synthetic binary-classification dataset
synthetic_params = {
    "n_samples": 1000,    # total number of rows
    "n_features": 10,     # columns per row
    "n_classes": 2,       # labels 0 and 1
    "random_state": 42,   # seed for reproducibility
}
X, y = make_classification(**synthetic_params)

# Data Transformation
### Encoding categorical data
- Categorical (text) data is encoded to convert it into a numerical format
- most ML algorithms work only with numerical data
- encoding also helps to prevent bias in the model by ensuring that all features are equally weighted

### Data normalization
- is a form of feature scaling that transforms the range of features to a standard scale
- data scaling is required when the dataset has features of varying ranges
- normalized data can enhance model performance and improve the accuracy of a model.
- it helps algorithms that rely on distance metrics, such as KNN & SVM

In [None]:
# Encode categorical variable column using one-hot encoding
df = pd.get_dummies(df, columns=['column_name'], drop_first=True)
# Alternative without "drop_first=True" -- use it INSTEAD of the line above:
# once a column has been encoded it no longer exists, so running both calls
# on the same column raises a KeyError.
# df = pd.get_dummies(df, columns=['column_name'])

# Normalize 'Income' column using Min-Max scaling: (x - min) / (max - min)
income_min = df['Income'].min()
income_range = df['Income'].max() - income_min
df['Income_Normalized'] = (df['Income'] - income_min) / income_range

# Convert EnrollmentDate to datetime
# (separate column from the one-hot encoded one above, which is gone by now)
df['EnrollmentDate'] = pd.to_datetime(df['EnrollmentDate'])

In [None]:
# data scaling
from sklearn.preprocessing import StandardScaler

# Learn mean/std from the training split only, then apply that same
# transformation to both splits so no test-set statistics reach training
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Encode all categorical columns into numeric values
# This is necessary because certain machine learning models require all numerical input.
# Only object/category columns are touched: label-encoding a numeric column
# would replace its real values with rank codes.
categorical_columns = df.select_dtypes(include=['object', 'category']).columns

# One encoder per column, kept so the codes can be mapped back later with
# label_encoders[column].inverse_transform(...)
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

## Imbalanced dataset

A dataset is imbalanced when one class significantly outnumbers another.

Example:
- 95% of customers don’t churn.
- 5% of customers churn.

This can lead to a model that predicts only the majority class, but still shows high accuracy — while failing the real goal.

1. Resampling Methods

| Method | Description |
|--------|-------------|
| **Oversampling** (e.g. SMOTE) | Generate synthetic samples of the minority class |
| **Undersampling** | Remove samples from the majority class |
| **Hybrid** | Combine both |

In [None]:
# oversampling with SMOTE
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

X = df.drop('Target', axis=1)
y = df['Target']

# Split BEFORE resampling: synthetic samples are interpolated from real rows,
# so resampling the full dataset would leak test information into training
# and inflate the evaluation scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Oversample the training split only; the test split keeps its real class balance
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

2. Algorithmic Strategies

| Strategy | Description |
|----------|-------------|
| **Class weights** | Penalize misclassification of minority class more |
| **Custom loss functions** | Modify loss to give more weight to minority class |

In [None]:
# example with logistic regression
# class_weight='balanced' weights each class inversely to its frequency, so
# mistakes on the minority class cost more during training
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(class_weight='balanced')

3. Proper Evaluation Metrics

Use metrics that don't get fooled by accuracy:

- Precision / Recall
- F1-score
- Confusion Matrix
- AUC-ROC / Precision-Recall Curve

✅ Which Models Work Well with Imbalanced Datasets?

| Model | Works Well? | Why? |
|-------|-------------|------|
| **Random Forest** | ✅ | Handles imbalance well; supports class weights |
| **XGBoost / LightGBM** | ✅ | Allows built-in handling of imbalance (`scale_pos_weight`) |
| **Logistic Regression (with class_weight)** | ✅ | Effective with weighting or resampling |
| **SVM** | ⚠️ | Needs tuning or balancing |
| **KNN** | ❌ | Distance-based; highly affected by imbalance |
| **Naive Bayes** | ⚠️ | Can be misled by rare events unless priors are adjusted |

# Data Reduction

In [None]:
# Drop columns that are not useful for analysis
# (the result is stored in df_3; df itself is left unchanged)
df_3 = df.drop(columns=['ID', 'client_name'])

# Modeling

| ML Algorithm               | Classification | Regression | Notes                                                               |
| -------------------------- | -------------- | ---------- | ------------------------------------------------------------------- |
| **Linear Regression**      | ❌              | ✅          | Simple and interpretable; good for linear relationships.            |
| **Logistic Regression**    | ✅              | ❌          | For binary classification; not used for regression.                 |
| **Decision Tree**          | ✅              | ✅          | Easy to interpret; prone to overfitting.                            |
| **Random Forest**          | ✅              | ✅          | Handles non-linearity well; more robust than decision trees.        |
| **Gradient Boosting**      | ✅              | ✅          | Accurate; used in competitions (e.g., XGBoost, LightGBM, CatBoost). |
| **XGBoost**                | ✅              | ✅          | Fast and powerful; great with tabular data.                         |
| **LightGBM**               | ✅              | ✅          | Faster alternative to XGBoost; great with large datasets.           |
| **CatBoost**               | ✅              | ✅          | Handles categorical data well; easy to use.                         |
| **Support Vector Machine** | ✅              | ✅          | Good for small to medium datasets; requires scaling.                |
| **K-Nearest Neighbors**    | ✅              | ✅          | Simple; can struggle with large datasets or many features.          |
| **Naive Bayes**            | ✅              | ❌          | Only for classification; assumes feature independence.              |
| **Neural Networks (MLP)**  | ✅              | ✅          | Flexible; works for both tasks but may need more data/tuning.       |
| **Ridge/Lasso Regression** | ❌              | ✅          | Regularized linear models for regression.                           |
| **ElasticNet**             | ❌              | ✅          | Combines Ridge and Lasso for regression.                            |


## Linear regression

In [None]:
# linregress lives in scipy.stats, which the notebook's import cell does not load
from scipy import stats

# selecting feature and target
x = df['Weight']
y = df['CO2']

# calculate linear regression
slope, intercept, r, p, std_err = stats.linregress(x, y)


def predict(weight):
    """Return the CO2 value the fitted line gives for `weight`."""
    return slope * weight + intercept


# apply to x
mymodel = [predict(weight) for weight in x]

# charts
plt.scatter(x, y)
plt.plot(x, mymodel, color='red')
plt.xlabel('Weight')
plt.ylabel('CO2 Emission')
plt.title('Linear Regression: CO2 vs Weight')
plt.show()

# example prediction
print("Predicted CO2 for a car weighing 2300kg:", predict(2300))

In [None]:
# Train a logistic regression model on the scaled training features
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training chain
model = LogisticRegression().fit(X_train_scaled, y_train)

## Multiple Linear Regression

In [None]:
# LinearRegression is not part of the notebook's import cell
from sklearn.linear_model import LinearRegression

# selecting weight and volume for features and co2 as target
X = df[['Weight', 'Volume']]
y = df['CO2']

# create and train the model
model = LinearRegression()
model.fit(X, y)

# prediction
# Passed as a DataFrame with the training column names: the model was fitted
# on named columns, and a bare list triggers scikit-learn's
# "X does not have valid feature names" warning.
new_car = pd.DataFrame([[2300, 1300]], columns=X.columns)
predictedCO2 = model.predict(new_car)
print("Predicted CO2 (Weight=2300kg, Volume=1300cm3):", predictedCO2[0])

# show coefficients
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)


## Polynomial regression

In [None]:
# define the variables
x = df['Weight']
y = df['CO2']

# fit a degree-2 polynomial by least squares and wrap the coefficients in a
# callable, so model(value) evaluates the fitted curve
coefficients = np.polyfit(x, y, 2)
model = np.poly1d(coefficients)

# 100 evenly spaced points across the observed weight range for a smooth curve
weight_min, weight_max = min(x), max(x)
x_line = np.linspace(weight_min, weight_max, 100)
y_line = model(x_line)

# charts
plt.scatter(x, y)
plt.plot(x_line, y_line, color='green')
plt.xlabel('Weight')
plt.ylabel('CO2 Emission')
plt.title('Polynomial Regression: CO2 vs Weight')
plt.show()

# r-squared score of the fitted curve on the data it was fitted to
fitted_values = model(x)
print("R-squared:", r2_score(y, fitted_values))

In [None]:
# check skewness of the Income column
df['Income'].skew() # > 1 or < -1 = highly skewed

In [None]:
# plot distribution of Income (histogram with a KDE curve overlaid)
import seaborn as sns
sns.histplot(df['Income'], kde=True)

In [None]:
# check outliers with boxplot
# (points drawn beyond the whiskers are the outlier candidates)
sns.boxplot(x=df['Income'])

## Decision Tree

In [None]:
# Step 8: Train a Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

# The fitted tree is kept under its own name as well: `model` is reassigned
# by later sections, while the tree-visualization cell refers to `dt_model`.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
model = dt_model

# Step 9: Evaluate the model using predictions and metrics
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Predict outcomes for the test set
y_pred = dt_model.predict(X_test)

# Confusion matrix (rows = actual class, columns = predicted class)
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Low Risk', 'High Risk'],
            yticklabels=['Low Risk', 'High Risk'])
plt.title("Confusion Matrix - Audit Risk Classification (Decision Tree)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.grid(True)
plt.show()

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Train a decision tree that picks its splits with the entropy criterion
model = DecisionTreeClassifier(criterion='entropy', random_state=42)
model.fit(X_train, y_train)

# Predict the output using the trained model
# NOTE(review): this predicts on X, not on X_test. If X is the full feature
# matrix that X_train was split from, the predictions include training rows,
# so any score computed from them will look better than held-out performance.
y_pred = model.predict(X)

## Random Forests

In [None]:
# Results collected for the model-comparison chart. Create the lists on first
# use so this cell also works when it is the first model cell to run.
if 'model_names' not in globals():
    model_names, accuracy_scores = [], []

# training the model
rf_model = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

# checking accuracy
acc_rf = accuracy_score(y_test, rf_preds)
model_names.append('Random Forest')
accuracy_scores.append(acc_rf)
print("\nRandom Forest Accuracy:", acc_rf)
print(classification_report(y_test, rf_preds))

In [None]:
# Random Forest regression

from sklearn.ensemble import RandomForestRegressor  # For regression

# Seeded like the other estimators in this notebook so that reruns give the
# same forest and therefore the same metrics
model = RandomForestRegressor(random_state=42)  # Instead of RandomForestClassifier
model.fit(X_train, y_train)

# Predict outcomes for the test set
y_pred = model.predict(X_test)

## Extra Trees

In [None]:
# Results collected for the model-comparison chart. Create the lists on first
# use so this cell also works when no other model cell has run yet.
if 'model_names' not in globals():
    model_names, accuracy_scores = [], []

# training the model
et_model = ExtraTreesClassifier(n_estimators=100, criterion='entropy', random_state=42)
et_model.fit(X_train, y_train)
et_preds = et_model.predict(X_test)

# checking accuracy
acc_et = accuracy_score(y_test, et_preds)
model_names.append('Extra Trees')
accuracy_scores.append(acc_et)
print("\nExtra Trees Accuracy:", acc_et)
print(classification_report(y_test, et_preds))

## SVM - Support Vector Machine 

In [None]:
# 4. Train Support Vector Machine Classifier
# NOTE(review): the model is fitted on X_train as-is. SVMs are sensitive to
# feature scale, so consider the scaled splits (X_train_scaled / X_test_scaled)
# if they are available -- confirm which one is intended.
svm_model = SVC(probability=True, random_state=42)  # probability=True is required for predict_proba
svm_model.fit(X_train, y_train)                     # Train on training data
svm_preds = svm_model.predict(X_test)               # Predict on test data
svm_accuracy = accuracy_score(y_test, svm_preds)    # Calculate accuracy
svm_risk_scores = svm_model.predict_proba(X_test)[:, 1]  # Second probability column, used as the risk score

# Model Evaluation

Interpreting **Regression Model** Performance Metrics
- r2_score - the proportion of variance in the dependent variable that's predictable from independent variables

    - Interpretation:
        - 1.0 = perfect prediction
        - 0.9999... = nearly perfect (99.999% of variance explained)
        - 0.8-0.9 is generally considered excellent for real-world applications
    - Caution: Values this high often indicate:
        - Possible data leakage (test data influencing training)
        - Overly simple dataset (maybe the problem is trivial)
        - Target variable might be included in features

- MAE - mean absolute error - average absolute difference between predictions and actual values
    
    - Interpretation:
        - On average, your predictions are off by about 868.45 units (if MAE = 868.45)
        - Example: If predicting house prices where average price is $500,000, this is excellent (~0.17% error)
        - If predicting temperatures where average is 50°F, this would be poor

- MSE - Mean Squared Error - average of squared errors
    - Interpretation:
        - More sensitive to large errors than MAE
        - Hard to interpret alone due to squaring
        - Primarily useful for comparing models (lower is better)

- RMSE - Root Mean Squared Error - square root of MSE
    - Interpretation:
        - In the same units as your original data
        - Your typical prediction error is about 6.57 units (if RMSE = 6.57)
        - More interpretable than MSE
        - RMSE is always ≥ MAE; a large gap between them suggests a few large errors in your dataset. A moderate gap is normal for real-world data with outliers

In [None]:
# Regression performance summary
# Compute every metric once, then report them
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # back in the units of the target

print("R² Score:", r2)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)

# Visualization: predictions against true values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predictions")
# Dashed diagonal = perfect prediction (predicted == true)
lowest, highest = min(y_test), max(y_test)
plt.plot([lowest, highest], [lowest, highest], 'r--')
plt.show()

In [None]:
# comparing model performance
# acc_rf is set by the Random Forest cell, svm_accuracy by the SVM cell
print(f"Random Forest Accuracy: {acc_rf:.2f}")  # Display Random Forest accuracy
print(f"SVM Accuracy: {svm_accuracy:.2f}")      # Display SVM accuracy

models = ['Random Forest', 'SVM']       # Model names for x-axis
accuracies = [acc_rf, svm_accuracy]     # Accuracy values

# Build the chart once, save it, then display it
plt.figure(figsize=(8, 5))              # Set figure size
plt.bar(models, accuracies)             # Create bar chart
plt.title('Model Accuracy Comparison')  # Title of the plot
plt.ylabel('Accuracy')                  # Y-axis label
plt.ylim(0, 1)                          # Limit y-axis from 0 to 1
plt.grid(axis='y')                      # Show grid lines on y-axis

# Save before show(): once the figure has been shown it may be closed, and
# saving afterwards can produce an empty image
plt.savefig("accuracy_plot.png")
plt.show()

In [None]:
# Evaluate the classifier on the scaled test split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Predict outcomes for the test set
y_pred = model.predict(X_test_scaled)

# Confusion matrix (rows = actual class, columns = predicted class)
conf_matrix = confusion_matrix(y_test, y_pred)
risk_labels = ['Low Risk', 'High Risk']
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                 xticklabels=risk_labels, yticklabels=risk_labels)
ax.set_title("Confusion Matrix - Audit Risk Classification")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.grid(True)
plt.show()

# Per-class precision, recall and F1
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# One new observation, described with the same feature names (and in the
# same order) as the columns used for training
sample = {
    'Transaction_Amount': 1500,
    'Vendor_Risk_Rating': 1,
    'After_Hours': 0,            # 1 = yes, 0 = no
    'High_Value_Flag': 0,        # 1 = high value
    'Department_Risk_Score': 2,
    'Approval_Steps': 3,
}
new_input = pd.DataFrame([sample], columns=list(sample))

# Scale the input using the previously fitted scaler
new_input_scaled = scaler.transform(new_input)

# Predict risk class (0 = Low Risk, 1 = High Risk) and the class probabilities
predicted_class = model.predict(new_input_scaled)
predicted_prob = model.predict_proba(new_input_scaled)

# Output results
risk_label = "High Risk" if predicted_class[0] else "Low Risk"
print("Predicted Risk Class:", risk_label)
print("Prediction Probabilities [Low Risk, High Risk]:", predicted_prob[0])

In [None]:
# Bar chart comparing the accuracy of every model recorded so far
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(model_names, accuracy_scores, color='skyblue')
ax.set_title('Model Accuracy Comparison')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1)

# Write each score just above its bar
for position, score in enumerate(accuracy_scores):
    ax.text(position, score + 0.02, f"{score:.2f}", ha='center', fontsize=12)

ax.grid(axis='y')
plt.show()

In [None]:
# ------------------------------------------
# Visualize the Decision Tree
# ------------------------------------------
# NOTE(review): `dt_model` is not assigned in any cell visible in this
# notebook; presumably it is the fitted DecisionTreeClassifier -- confirm, or
# pass the variable that actually holds the fitted tree.
from sklearn.tree import plot_tree
 
plt.figure(figsize=(20, 10))  # Adjust size as needed
plot_tree(
    dt_model,
    feature_names=features,  # must list the training columns in training order
    class_names=[str(cls) for cls in np.unique(y)],  # sorted unique labels of y, as strings
    filled=True,
    rounded=True,
    fontsize=10
)
plt.title("Decision Tree Visualization")
plt.show()

In [None]:
# Score the fitted model on the held-out test split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Draw the tree, labelling splits and leaves with the dataset's own
# feature and target names
plt.figure(figsize=(12, 8))
tree_labels = {
    'feature_names': data.feature_names,
    'class_names': data.target_names,
}
plot_tree(model, filled=True, **tree_labels)
plt.title("Decision Tree Visualization")
plt.show()

In [None]:
# Show precision, recall and F1-score per class, with readable class names
outcome_labels = ['No', 'Yes']
print("\nClassification Report:")
print(classification_report(y, y_pred, target_names=outcome_labels))

# Notes
What are these:
- Gradient Boosted Trees - gradient boosting is a methodology applied on top of another machine learning algorithm and involves 2 types of models:
    1. weak machine learning model, typically a decision tree
    2. strong machine learning model, composed of multiple weak models.
    - At each step, a new weak model is trained to predict the error of the current strong model (the pseudo-response). Error = difference between the prediction and the regression label. The weak model, which approximates this error, is then added to the strong model with a negative sign to reduce the error of the strong model.
    - The operation repeats until a stopping criterion is met (maximum number of iterations) or if the strong model begins to overfit as measured on a separate validation dataset.
    - shrinkage in gradient boosting is similar to learning rate in neural networks. It controls how fast the strong model is learning and helps to limit overfitting. The smaller the shrinkage the more it reduces overfitting.
- Neural Networks
    - mimics human brain, specifically neurons to identify phenomena, weigh options and make conclusions. 
    - consists of layers of nodes/artificial neurons: an input layer, 1 or more hidden layers and an output layer.
    - each node connects to others and has its own associated weight and threshold. If the output of any individual node is above the specific threshold value, the node will be activated, sending data to the next layer of the network.
    - each individual node is like its own linear regression model, composed of input data, weights, bias/threshold and an output.
    - steps: 
        1. Input layer is determined
        2. Weights are assigned. Weights determine the importance of any variable
        3. All inputs are multiplied by their respective weights and then summed.
        4. Output is passed through an activation function. If the output exceeds a given threshold, it activates the node and passes the data to the next layer; if it doesn't, no data is passed to the next layer.
        5. The output of the current node becomes the input of the next node. This is called a feedforward network.
    - MLP - multi-layer perceptrons
    - CNN - convolutional neural networks - usually utilized for image recognition, pattern recognition and computer vision. Uses linear algebra and matrix multiplication.
    - RNN - recurrent neural networks - identified by their feedback loops. Leveraged using time-series data to make predictions.
- Clustering Algorithms - learns the clusters on train data.

Things to learn:
- containerisation - Docker, Kubernetes
- CI/CD platforms - CircleCI
- Version control - Git
- Quality written code - unit test, linting, formatting, strict typing
- Cloud platform - AWS, GCP, Azure
- Workflow orchestration - Airflow, Argo, Metaflow
- Experiment tracking tools - MLflow, Weights & Biases

<br>

- 5 things interested in:
    - video game AI
    - weather prediction
    - Process automation
    - loan defaulters prediction
    - 

<br>

Before running an experiment, write down:
1. Hypothesis
2. What is expected to happen
3. Why expect it to happen

# Basel Accords
- **Regulatory capital** - minimum capital required by financial governance for banks to hold. It acts as a buffer when there's financial stress.
- **Economic capital** - estimation by banks on how much capital they should be holding as opposed to what regulators prescribe. Economic capital is managed internally and is fitted to the bank's specific risk profile and operational strategy.

<br>

## Basel 1
- First attempt to harmonize the method of bank regulation. Proposed in 1988
- Goal: ensure maintenance of adequate capital by banks against RWA (Risk Weighted Asset - bank's assets or off balance sheet exposures, weighted according to risk), and to promote stability in the international banking system.
- Features:
    - Minimum 8% of RWA
    - risk weight of 0% for cash and 100% for corporate loans
- Weaknesses:
    - simple methodology
    - It did not take into account the differences in risk profiles for different types of loans and assets
    - Banks could practice regulatory arbitrage - exploit the system through portfolio substitution toward assets that require less regulatory capital but pose higher risks.

<br>

## Basel 2
- Introduced in 2004, to address the shortcomings of Basel 1.
- Basel 2 is based on 3 pillars:
    1. Minimum capital requirements - expanded on Basel 1 with more sophisticated risk weighting systems
    2. Supervisory Review - Highlights the importance of sound regulatory oversight to ensure banks have adequate systems to manage risks
    3. Market discipline - enhance transparency by forcing banks to disclose their risk exposures and capital adequacy positions
- Key improvement - Internal Ratings Based (IRB) approach - enabled banks to use internal risk assessment models to calculate capital requirement.

<br>

## Basel 3
- Was developed by the Basel Committee in 2010 after the 2008 global financial crisis. The regulations were more stringent to prepare the financial sector against future crises.
- Features:
    - Higher capital standards - minimum requirement for Tier 1 capital has been increased
    - Leverage ratio - introduced the usage of a non-risk-based leverage ratio as a backstop to the risk-weighted capital requirements to ensure the banks do not become over-leveraged. Compares a company's debt to other financial metrics like assets, equity or earnings.
    - Liquidity requirements:
        - Liquidity coverage ratio (LCR) - ensure banks are sufficiently supplied with high-quality liquid assets to withstand a 30-day liquidity stress situation.
        - Net stable funding ratio (NSFR) - to stimulate more stable funding with longer-term maturity.
    - Countercyclical buffer - banks hold extra capital for buffer which they can then use during financial stress.
- It increased the risk management of banks but the complexity of the framework depends on the bank internal models.

<br>

## Basel 4
- Ongoing process and reforms to fine tune the Basel regime.
- Main highlights of Basel 4:
    - Capital floors - minimum capital floors to limit the extent banks can reduce capital requirements using their internal models
    - Standardized approach enhancements - emphasize changes in the standardized approach to the calculation of RWA on issues such as credit and operational risk.
    - Increased disclosure - a provision to increase disclosure requirements, thereby enabling banks to give more elaborative information about their RWA and methodologies.



# IFRS9
- Issued by International Accounting Standards Board
- Introduces expected loss model to early recognize potential losses by anticipating credit risk deterioration before actual default takes place
- 3 main categories:
    1. Amortized cost - Held-to-collect financial assets represent assets held for collecting contractual cash flows. Amortized cost spreads the cost of an asset or liability over its financial lifespan, accounting for the time value of money.
    2. Fair value through other comprehensive income (FVOCI) - Held-for-collection-and-sale financial assets, whose cash flows represent payments of principal and interest. Unrealized gains and losses are recognized in other comprehensive income.
    3. Fair Value through Profit or Loss (FVPL) - financial assets that do not fulfill criteria of amortized cost or FVOCI are measured at fair value through profit or loss. Includes derivatives.
- Impairment - Expected Credit Loss (ECL) model
    - The ECL model is supposed to capture the credit losses much earlier and divided into 3 stages:
        1. 12-month ECL - expected losses over the next 12 months; applies to assets for which there has not been a significant increase in credit risk since initial recognition. *not much risk increase*
        2. Lifetime ECL - expected losses over the entire remaining life of the asset, significant increase in credit risk since initial recognition but the asset is not yet credit-impaired. *a lot of risk since initial*
        3. Credit-Impaired - lifetime ECL is also recognized on such assets. Interest income is calculated on the net carrying amount (gross carrying amount - loss allowance). This stage is for assets that are already in default or showing signs of default.

# Basel Accords vs IFRS 9
| Basel Accords | IFRS 9 |
| --- | --- | 
| ensure capital **buffers** to absorb expected and unexpected losses | **identifying expected credit loss** for financial reporting purposes |
| cover expected and unexpected losses | focus on expected credit losses ECL | 
| delay in payment of 90-180 days | stage 1, 2, 3 under ECL model | 
| capital buffers designed to absorb losses | recognize loss in advance before any actual default | 

- Probability of Default (PD) - Probability that each borrower will default in predefined time horizon.
- Loss Given Default (LGD) - a fraction of the total exposure lost in case of a default. It measures the severity of the loss and therefore, the amount of capital to be held against the defaulters
- Exposure at Default (EAD) - overall amount a bank may be exposed if there is a default. Includes outstanding loan amount, any increase in exposure such as undrawn credit lines up to the point of the default. Important to quantify total possible loss a bank may have to face.