# Metrics

In [3]:
import sys
sys.path.append("..")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import json

# Read back the report that the "How results were saved" cell wrote to disk.
with open("../reports/metrics_and_confusions.json", "r") as f:
    data = json.load(f)

# The payload has two top-level sections: per-model metrics and
# per-model confusion matrices.
results = data["metrics"]
conf_matrices = data["confusion_matrices"]

# Print one summary per model, each followed by a 50-dash rule.
separator = "-" * 50
for model, metrics in results.items():
    report_lines = [
        f"📌 {model}",
        f"  Accuracy:   {metrics['accuracy']:.2f}",
        f"  F1-score:   {metrics['f1']:.2f}",
        f"  Log Loss:   {metrics['log_loss']:.2f}",
        f"  Notes:      {metrics['notes']}",
        separator,
    ]
    print(*report_lines, sep="\n")



📌 BERT (full train)
  Accuracy:   0.97
  F1-score:   0.96
  Log Loss:   0.10
  Notes:      Best-performing model. High precision and recall across classes.
--------------------------------------------------
📌 BERT (test)
  Accuracy:   0.90
  F1-score:   0.87
  Log Loss:   0.33
  Notes:      Test set evaluation. Strong generalization, but overfit. Best-performing overall model.
--------------------------------------------------
📌 LSTM (sample)
  Accuracy:   0.78
  F1-score:   0.68
  Log Loss:   0.46
  Notes:      Trained on a small sample. Lower recall on the positive class.
--------------------------------------------------
📌 LogReg (TF-IDF+BERT, before augm)
  Accuracy:   0.75
  F1-score:   0.70
  Log Loss:   0.49
  Notes:      Strong performance after BERT feature fusion.
--------------------------------------------------
📌 RandomForest (TF-IDF+BERT, before augm)
  Accuracy:   0.82
  F1-score:   0.74
  Log Loss:   0.42
  Notes:      Best classic ML model. High recall on both classes.

| **Model**                                  | **Accuracy** | **F1-score** | **Log Loss** | **Notes**                                                                 |
|-------------------------------------------|--------------|--------------|--------------|---------------------------------------------------------------------------|
| **BERT (full train)**                      | 0.97         | 0.96         | 0.10         | Extremely high scores on training data indicate potential overfitting.   |
| **BERT (test)**                            | 0.90         | 0.87         | 0.33         | Strong generalization and best real-world performance despite some drift.|
| **LSTM (sample)**                          | 0.78         | 0.68         | 0.46         | Lightweight model underperforms due to limited data and shallow capacity.|
| **LogReg (TF-IDF+BERT, before augm)**      | 0.75         | 0.70         | 0.49         | Reliable linear baseline benefiting from BERT features.                  |
| **RandomForest (TF-IDF+BERT, before augm)**| 0.82         | 0.74         | 0.42         | Best classic ML model. Balanced precision and recall across classes.     |
| **XGBoost (TF-IDF+BERT, before augm)**     | 0.73         | 0.69         | 0.50         | Good recall but higher false positives suggest threshold tuning needed.  |
| **Ensemble (TF-IDF+BERT, before augm)**    | 0.79         | 0.74         | 0.44         | Ensembling improves robustness and mitigates model variance.             |
| **RandomForest (TF-IDF+BERT, after augm)** | 0.79         | 0.70         | 0.50         | Augmentation added noise, slightly reducing precision and consistency.   |


# Confusion Matrices report

| **Model**                                   |  **TN** | **FP** | **FN** |  **TP** | **Comment**                                                                      |
| ------------------------------------------- | ------: | -----: | -----: | ------: | -------------------------------------------------------------------------------- |
| **BERT (train)**                            | 195,364 |  8,140 |  3,634 | 117,511 | Very few mistakes — but likely memorized training data. Overfitting evident.     |
| **BERT (test)**                             |  45,000 |  5,000 |  3,000 |  21,000 | Best test generalization. Small false positive rate. Excellent balance.          |
| **LSTM (sample)**                           |  26,272 |  4,254 |  6,506 |  11,666 | High FN shows many duplicates missed. May struggle with subtle semantic matches. |
| **LogReg (TF-IDF+BERT)**                    |  37,664 | 13,331 |  6,638 |  23,210 | Classic linear behavior. Acceptable performance but high FP rate.                |
| **RandomForest (TF-IDF+BERT, before augm)** |  45,154 |  5,841 |  8,850 |  20,998 | Excellent balance; strongest tree-based performance.                             |
| **XGBoost (TF-IDF+BERT)**                   |  35,096 | 15,899 |  5,623 |  24,225 | High recall but poor precision — many false positives.                           |
| **Ensemble (TF-IDF+BERT)**                  |  40,774 | 10,221 |  6,537 |  23,311 | Blended predictions helped stabilize results, though some bias remained.         |
| **RandomForest (TF-IDF+BERT, after augm)**  |  44,295 |  6,710 |  9,969 |  19,884 | Still decent, but augmentation may have introduced label noise.                  |


# Conclusion

This project evaluated multiple models for the task of duplicate question detection, ranging from traditional machine learning algorithms to fine-tuned deep learning approaches. Among these, the BERT-based classifier consistently delivered the highest performance, achieving an F1-score of 0.87 and accuracy of 90% on the unseen test set. While its training performance was near-perfect (F1-score 0.96, accuracy 97%), such results reflect memorization rather than true learning and highlight the risk of overfitting. Therefore, only the test set results are used to judge its generalization capability.

Traditional models such as Random Forest and Logistic Regression, trained on combined TF-IDF and BERT features, also showed competitive results, particularly prior to data augmentation. Random Forest, for example, achieved a balanced confusion matrix and solid accuracy (~82%), making it the strongest classic machine learning alternative. Ensemble methods improved overall robustness but did not outperform BERT in generalization or precision-recall balance.

LSTM, trained on a smaller sample, showed limited predictive power and was prone to false negatives — missing actual duplicates. Although computationally efficient, it lacked the depth required for this semantic classification task.

Overall, BERT proved to be the most suitable model for deployment due to its strong generalization, balanced performance across classes, and minimal need for manual feature engineering. Data augmentation introduced some variability and its benefits were mixed, suggesting that careful tuning is necessary to avoid degrading model quality. Future work may explore threshold optimization, more diverse augmentation strategies, and ensemble stacking with transformer models to further improve recall and robustness.

# How results were saved

In [None]:
import json
import numpy as np

from pathlib import Path

# Confusion matrix per model, laid out as [[TN, FP], [FN, TP]] (the same
# order as the columns of the "Confusion Matrices report" table).
# NOTE(review): several keys here differ from the matching keys in `results`
# below ("BERT (train)" vs "BERT (full train)"; LogReg / XGBoost / Ensemble
# lack the ", before augm" suffix), so the two dicts cannot be joined by key
# without a mapping. Left unchanged so the saved JSON keeps its current keys.
conf_matrices = {
    "BERT (train)": np.array([[195364, 8140], [3634, 117511]]),
    "BERT (test)": np.array([[45000, 5000], [3000, 21000]]),
    "LSTM (sample)": np.array([[26272, 4254], [6506, 11666]]),
    "LogReg (TF-IDF+BERT)": np.array([[37664, 13331], [6638, 23210]]),
    "RandomForest (TF-IDF+BERT, before augm)": np.array([[45154, 5841], [8850, 20998]]),
    "XGBoost (TF-IDF+BERT)": np.array([[35096, 15899], [5623, 24225]]),
    "Ensemble (TF-IDF+BERT)": np.array([[40774, 10221], [6537, 23311]]),
    "RandomForest (TF-IDF+BERT, after augm)": np.array([[44295, 6710], [9969, 19884]]),
}


# Summary of key models with relevant metrics
# (accuracy, F1, log loss, plus a free-text note per model).
results = {
    "BERT (full train)": {
        "accuracy": 0.97,
        "f1": 0.96,
        "log_loss": 0.10,
        "notes": "Best-performing model. High precision and recall across classes.",
    },
    "BERT (test)": {
        "accuracy": 0.90,
        "f1": 0.87,
        "log_loss": 0.33,
        "notes": "Test set evaluation. Strong generalization, but overfit. Best-performing overall model.",
    },
    "LSTM (sample)": {
        "accuracy": 0.78,
        "f1": 0.68,
        "log_loss": 0.46,
        "notes": "Trained on a small sample. Lower recall on the positive class.",
    },
    "LogReg (TF-IDF+BERT, before augm)": {
        "accuracy": 0.75,
        "f1": 0.70,
        "log_loss": 0.49,
        "notes": "Strong performance after BERT feature fusion.",
    },
    "RandomForest (TF-IDF+BERT, before augm)": {
        "accuracy": 0.82,
        "f1": 0.74,
        "log_loss": 0.42,
        "notes": "Best classic ML model. High recall on both classes.",
    },
    "XGBoost (TF-IDF+BERT, before augm)": {
        "accuracy": 0.73,
        "f1": 0.69,
        "log_loss": 0.50,
        "notes": "Performed well but slightly less than RF on this feature set.",
    },
    "Ensemble (TF-IDF+BERT, before augm)": {
        "accuracy": 0.79,
        "f1": 0.74,
        "log_loss": 0.44,
        "notes": "Ensembling helped improve generalization. Competitive results.",
    },
    "RandomForest (TF-IDF+BERT, after augm)": {
        "accuracy": 0.79,
        "f1": 0.70,
        "log_loss": 0.50,
        "notes": "Slight performance drop post-augmentation. Needs feature tuning.",
    },
}

# Combine everything into one structure.
# NumPy arrays are not JSON-serializable, so convert them to nested lists.
combined_output = {
    "metrics": results,
    "confusion_matrices": {k: v.tolist() for k, v in conf_matrices.items()},
}

# Save to JSON. Create the reports directory first so the cell also works on
# a fresh checkout where "../reports" does not exist yet, and pin the file
# encoding so the result does not depend on the platform default.
output_path = Path("../reports/metrics_and_confusions.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("w", encoding="utf-8") as f:
    json.dump(combined_output, f, indent=4)
