In [None]:
import pandas as pd
import os

# Resolve the PARENT of the current working directory (os.path.dirname strips the last
# path component of os.getcwd()). Despite its name, `current_dir` is therefore one level
# above the directory the kernel was started in.
current_dir = os.path.dirname(os.getcwd())


## Loading the Excel workbook

In [12]:
# Read the 'Probabilities' sheet of the RQ2 grid-search workbook and keep only the
# columns whose header does not match the regex '^Unnamed'.
rq2_svm_xlsx_file = os.path.join(
    current_dir,
    'analysis/rq2-gridsearch-pearson-fs-recall-macro-1200k.xlsx',
)
ml_pd = (
    pd.read_excel(rq2_svm_xlsx_file, sheet_name='Probabilities', engine='openpyxl')
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
)

### RQ2 - SVM threshold analysis

In [13]:
def calculateFscore(true_positives, false_positives, false_negatives):
    """Print and return precision, recall and F1 score for the given confusion counts.

    Each ratio falls back to 0.0 when its denominator is not positive, so the
    function never divides by zero.

    Args:
        true_positives: Count of positive samples predicted as positive.
        false_positives: Count of negative samples predicted as positive.
        false_negatives: Count of positive samples predicted as negative.

    Returns:
        A tuple ``(precision, recall, f1_score)``. The same three values are also
        printed with four decimal places.
    """
    predicted_positives = true_positives + false_positives
    actual_positives = true_positives + false_negatives

    # Precision: share of predicted positives that are really positive.
    if predicted_positives > 0:
        precision = true_positives / predicted_positives
    else:
        precision = 0.0  # TP and FP are both zero: nothing was predicted positive

    # Recall: share of real positives that were found.
    if actual_positives > 0:
        recall = true_positives / actual_positives
    else:
        recall = 0.0  # TP and FN are both zero: there were no positives to find

    # F1: harmonic mean of precision and recall.
    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0  # precision and recall are both zero

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1_score:.4f}")

    # Return the metrics as well, so callers can store or compare them across
    # thresholds instead of reading them off the printed output.
    return precision, recall, f1_score


def computeScoresByThreshold(df, threshold = 0.5):
    """Binarise ``df['SVM_proba']`` at ``threshold`` and print the resulting scores.

    A row is predicted positive (1) when its ``SVM_proba`` is greater than or equal
    to ``threshold``, otherwise negative (0). The prediction is written to
    ``df['Predicted']`` in place, overwriting any existing column of that name.
    Predictions are compared against ``df['Was Selected?']``; rows whose label is
    neither 0 nor 1 fall into none of the four confusion counts.

    Args:
        df: DataFrame with the columns ``SVM_proba`` and ``Was Selected?``.
        threshold: Probability cut-off for a positive prediction (default 0.5).
    """
    df['Predicted'] = (df['SVM_proba'] >= threshold).astype(int)

    # Boolean masks for each side of the confusion matrix.
    predicted_pos = df['Predicted'] == 1
    predicted_neg = df['Predicted'] == 0
    actual_pos = df['Was Selected?'] == 1
    actual_neg = df['Was Selected?'] == 0

    # Confusion-matrix cells: count rows where both masks hold.
    true_positives = (predicted_pos & actual_pos).sum()
    false_positives = (predicted_pos & actual_neg).sum()
    true_negatives = (predicted_neg & actual_neg).sum()
    false_negatives = (predicted_neg & actual_pos).sum()

    print(f"True Negatives: {true_negatives}")
    print(f"True Positives: {true_positives}")
    print(f"False Negatives: {false_negatives}")
    print(f"False Positives: {false_positives}")

    # Precision / recall / F1 are computed and printed by the shared helper.
    calculateFscore(true_positives, false_positives, false_negatives)


In [14]:
# Baseline run at a 0.5 cut-off (also the function's default threshold).
computeScoresByThreshold(ml_pd, threshold=0.5)

True Negatives: 183
True Positives: 38
False Negatives: 0
False Positives: 330
Precision: 0.1033
Recall: 1.0000
F1 Score: 0.1872


In [15]:
# Same evaluation with the cut-off raised to 0.75.
computeScoresByThreshold(ml_pd, threshold=0.75)

True Negatives: 265
True Positives: 37
False Negatives: 1
False Positives: 248
Precision: 0.1298
Recall: 0.9737
F1 Score: 0.2291


In [16]:
# Same evaluation with the cut-off raised to 0.80.
computeScoresByThreshold(ml_pd, threshold=0.80)

True Negatives: 283
True Positives: 36
False Negatives: 2
False Positives: 230
Precision: 0.1353
Recall: 0.9474
F1 Score: 0.2368


In [17]:
# Same evaluation with the cut-off raised to 0.85.
computeScoresByThreshold(ml_pd, threshold=0.85)

True Negatives: 307
True Positives: 34
False Negatives: 4
False Positives: 206
Precision: 0.1417
Recall: 0.8947
F1 Score: 0.2446
