In [2]:
import numpy as np
import pandas as pd
import os

# Base path for data files: the *parent* of the current working directory
# (os.path.dirname strips the last path component of os.getcwd()).
current_dir = os.path.dirname(os.getcwd())


## Loading the RQ1 results workbook (Excel)

In [None]:
# Read the 'Probabilities' sheet of the RQ1 workbook and keep only the columns
# whose header does not start with 'Unnamed'.
rq1_rf_xlsx_file = os.path.join(current_dir, 'analysis/rq1_gridsearch_anovaf_fs-k1200.xlsx')
ml_pd = (
    pd.read_excel(rq1_rf_xlsx_file, sheet_name='Probabilities', engine='openpyxl')
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
)

### RQ1 - RF threshold analysis

In [5]:
def calculateFscore(true_positives, false_positives, false_negatives):
    """Print and return precision, recall and F1 for the given counts.

    Parameters
    ----------
    true_positives, false_positives, false_negatives : int
        Confusion-matrix counts (plain or NumPy integers).

    Returns
    -------
    tuple
        ``(precision, recall, f1_score)``. A metric whose denominator is zero
        is reported as 0.0 instead of raising a division error.
    """
    predicted_positives = true_positives + false_positives
    actual_positives = true_positives + false_negatives

    # Precision is undefined when nothing was predicted positive; report 0.
    precision = true_positives / predicted_positives if predicted_positives > 0 else 0.0

    # Recall is undefined when there are no actual positives; report 0.
    recall = true_positives / actual_positives if actual_positives > 0 else 0.0

    # F1 is the harmonic mean of precision and recall; 0 when both are zero.
    metric_sum = precision + recall
    f1_score = 2 * (precision * recall) / metric_sum if metric_sum > 0 else 0.0

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1_score:.4f}")

    # Return the metrics so callers can compare thresholds programmatically
    # instead of reading them off the printed output.
    return precision, recall, f1_score


def computeScoresByThreshold(df, threshold = 0.5):
    """Print confusion-matrix counts and precision/recall/F1 at a cut-off.

    A row is predicted positive when its 'RFOREST_proba' value is >= threshold;
    the ground-truth label is read from the 'Was Selected?' column.

    Note: the predictions are stored in df['Predicted'] in place, so the
    caller's frame gains (or has overwritten) that column on every call.
    """
    df['Predicted'] = (df['RFOREST_proba'] >= threshold).astype(int)

    # Boolean masks for each side of the confusion matrix.
    predicted_pos = df['Predicted'] == 1
    predicted_neg = df['Predicted'] == 0
    actual_pos = df['Was Selected?'] == 1
    actual_neg = df['Was Selected?'] == 0

    true_positives = (predicted_pos & actual_pos).sum()
    false_positives = (predicted_pos & actual_neg).sum()
    true_negatives = (predicted_neg & actual_neg).sum()
    false_negatives = (predicted_neg & actual_pos).sum()

    print(f"True Negatives: {true_negatives}")
    print(f"True Positives: {true_positives}")
    print(f"False Negatives: {false_negatives}")
    print(f"False Positives: {false_positives}")

    # Precision / recall / F1 are derived (and printed) from the counts above.
    calculateFscore(true_positives, false_positives, false_negatives)


In [6]:
# Evaluate the classifier at a 0.5 probability threshold.
computeScoresByThreshold(ml_pd, threshold=0.5)

True Negatives: 430
True Positives: 24
False Negatives: 14
False Positives: 83
Precision: 0.2243
Recall: 0.6316
F1 Score: 0.3310


In [7]:
# Evaluate the classifier at a 0.60 probability threshold.
computeScoresByThreshold(ml_pd, threshold=0.60)

True Negatives: 468
True Positives: 20
False Negatives: 18
False Positives: 45
Precision: 0.3077
Recall: 0.5263
F1 Score: 0.3883


In [8]:
# Evaluate the classifier at a 0.65 probability threshold.
computeScoresByThreshold(ml_pd, threshold=0.65)

True Negatives: 485
True Positives: 16
False Negatives: 22
False Positives: 28
Precision: 0.3636
Recall: 0.4211
F1 Score: 0.3902


In [9]:
# Evaluate the classifier at a 0.70 probability threshold.
computeScoresByThreshold(ml_pd, threshold=0.70)

True Negatives: 498
True Positives: 14
False Negatives: 24
False Positives: 15
Precision: 0.4828
Recall: 0.3684
F1 Score: 0.4179
