# Setup 


## Imports

In [1]:
import pandas as pd
import numpy as np
import torch
import topics_classifier_utils as tcu 

# Load pre-trained model (no API call - loads from local disk)
import zipfile
import os

# Resolve where the trained model lives, extracting the zip archive only when needed.
# Lookup order: current directory -> 'trained_model/' -> 'trained_model.zip'.
model_dir = 'trained_model'
model_zip = 'trained_model.zip'

if os.path.exists('config.json') and os.path.exists('mlb.pkl'):
    # Model files are already in the current directory
    model_path = '.'
    print("Using model files from current directory")
elif os.path.exists(os.path.join(model_dir, 'config.json')):
    # Model was already extracted (a file inside the directory implies the directory exists)
    model_path = model_dir
    print("Using model files from 'trained_model' directory")
else:
    # Nothing usable on disk yet: fall back to the archive, failing early with a
    # descriptive message (and before creating an empty directory) if it is absent.
    if not os.path.exists(model_zip):
        raise FileNotFoundError(
            "No model files found: expected 'config.json' and 'mlb.pkl' in the current "
            "directory, 'trained_model/config.json', or a 'trained_model.zip' archive."
        )
    print("Extracting model from zip file...")
    os.makedirs(model_dir, exist_ok=True)  # exist_ok makes a prior existence check unnecessary
    with zipfile.ZipFile(model_zip, 'r') as zip_ref:
        zip_ref.extractall(model_dir)
    model_path = model_dir
    print("Model extracted successfully")

# Load the model, tokenizer and label binarizer from the resolved local path.
# NOTE(review): the previous comment stated that loading uses local_files_only=True
# (no HuggingFace API calls); that happens inside tcu and is not visible here - confirm.
model, tokenizer, mlb = tcu.load_trained_model(model_path)
print("Model loaded successfully!")

Using model files from 'trained_model' directory
Model loaded successfully!


In [36]:
# Re-import the helper module so that edits to topics_classifier_utils.py are picked up
# without restarting the kernel. The bare name `reload` imported here is also used by
# later cells, so this import must run before them.
from importlib import reload
reload(tcu)

<module 'topics_classifier_utils' from '/scratch/midway3/aesteva/Speech-Multi-Classifier/topics_classifier_utils.py'>

In [37]:
# Read the speeches workbook into df_main (first sheet, pandas defaults).
df_main = pd.read_excel(io="speeches_with_names.xlsx")

## Cleaning Dataset

In [38]:
# Keep only the identifier, label and text columns used by the rest of the notebook.
cols_to_keep = ['new_speech_id', 'topic', 'speech', 'ID']
df_main = df_main[cols_to_keep]

# A speech counts as labeled when its topic is neither missing nor the empty string.
topic_values = df_main['topic']
is_labeled = topic_values.notna() & (topic_values != '')

# Labeled speeches (used for evaluation / threshold tuning) vs. unlabeled candidates.
df_test = df_main[is_labeled]
df_candidates = df_main[~is_labeled]

# Row/column counts of the full, labeled and unlabeled frames.
df_main.shape, df_test.shape, df_candidates.shape

((28058, 4), (4284, 4), (23774, 4))

# Training on the Labeled Set (`df_test`)

In [39]:
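# Train the BERT model on df_test (the labeled speeches).
# The call below is commented out: this notebook uses the pre-trained model loaded
# from disk in the Setup section instead. Uncomment it to retrain.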

# model, tokenizer, mlb, train_loader, val_loader = tcu.train_bert_multilabel(
#     df=df_test,
#     model_name='emanjavacas/MacBERTh',  # MacBERTh for historical/old English
#     text_col='speech',
#     label_col='topic',
#     batch_size=4,  # Start small for CPU
#     epochs=3,
#     max_length=256  # Reduced to save memory
# )

## Create Embedding Cache (Optimization)

Create embeddings once and reuse them for faster predictions and threshold testing.

In [40]:
# Build the embedding cache for every speech in df_main, keyed by 'new_speech_id'.
# df_test and df_candidates are both slices of df_main, so this single cache serves
# prediction on the unlabeled speeches and threshold testing on the labeled ones.
# NOTE(review): the recorded output reports 27785 cached embeddings while df_main has
# 28058 rows - check for duplicate ids or a stale 'embedding_cache.pkl'.
embedding_cache = tcu.create_embedding_cache(
    model=model,
    tokenizer=tokenizer,
    df_texts=df_main,
    text_col='speech',
    id_col='new_speech_id',  # ID column to use as cache key
    max_length=256,
    chunk_size=200,
    batch_size=128,  # Adjust based on your GPU memory
    cache_path='embedding_cache.pkl',  # cache file on disk, reused across runs
    show_progress_bar=True,
)


>>> Loading Embedding Cache from embedding_cache.pkl
✓ Loaded 27785 embeddings from cache



# Predict on Unlabeled Data (Using Cached Embeddings)

Now use the cached embeddings for fast predictions.

In [41]:
# Score the unlabeled speeches from the embeddings already held in the cache
# (nothing is re-embedded here); the result carries one 'prob_<topic>' column per class.
df_candidates_with_predictions = tcu.predict_from_embedding_cache(
    df=df_candidates,
    id_col='new_speech_id',
    embedding_cache=embedding_cache,
    model=model,
    mlb=mlb,
    batch_size=128,  # larger batches are fine because embeddings are pre-computed
)

# Names of the per-topic probability columns in the prediction frame.
candidate_prob_cols = df_candidates_with_predictions.columns[
    df_candidates_with_predictions.columns.str.startswith('prob_')
].tolist()


Predicting from 23774 cached embeddings...


Predicting probabilities: 100%|██████████| 186/186 [00:00<00:00, 4054.93it/s]


In [42]:
# Preview the first five candidate rows together with their topic probabilities.
df_candidates_with_predictions.head(n=5)

Unnamed: 0,new_speech_id,topic,speech,ID,prob_Other,prob_Politics: Domestic,prob_Politics: Foreign,prob_Private Matters/Ceremonial,prob_Private Matters/Cerenomial,prob_Public Finance,prob_Religious Issues,prob_Trials
0,70001,,"Trusty, and Well-beloved, We greet you well: I...",4599.0,0.023824,0.868598,0.184695,0.037516,0.004877,0.203659,0.577442,0.012328
1,70002,,"If the general Distraction and Confusion, whic...",4599.0,0.016579,0.939967,0.085532,0.034915,0.002706,0.128325,0.226138,0.020901
2,70003,,"Dread Sovereign, Your faithful Subjects the Co...",4479.0,0.034297,0.945988,0.13978,0.080254,0.004392,0.147302,0.146564,0.022458
3,70004,,"Sir, the House has taken very great Offence at...",2421.0,0.007718,0.966232,0.054771,0.023878,0.002126,0.214133,0.027931,0.131877
4,70005,,"'He, the King, had too ample a Manifestation ...",4599.0,0.016338,0.971618,0.080788,0.022719,0.00263,0.256028,0.29259,0.016291


In [43]:
# Persist the candidate frame to Excel. At this point it holds the prob_* columns but
# no thresholded topic predictions yet; the final cell of the notebook writes to the
# same path again once those have been added.
candidates_xlsx = 'candidates_with_predictions.xlsx'
df_candidates_with_predictions.to_excel(candidates_xlsx, index=False)
print(f"Saved predictions (topics and probabilities) to '{candidates_xlsx}'")

Saved predictions (topics and probabilities) to 'candidates_with_predictions.xlsx'


## Find Optimal Threshold

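Use the labeled set (`df_test`) to find the decision threshold that best fits your needs: a global threshold that minimizes false negatives (FN), a global threshold that minimizes false positives (FP), or a separate F1-optimal threshold per class.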

In [44]:
# Score the labeled speeches from the same embedding cache so that thresholds can be
# tuned against their known topics.
df_test_with_predictions = tcu.predict_from_embedding_cache(
    df=df_test,
    id_col='new_speech_id',
    embedding_cache=embedding_cache,
    model=model,
    mlb=mlb,
    batch_size=128,  # larger batches are fine because embeddings are pre-computed
)

# Preview the first five labeled rows with their topic probabilities.
df_test_with_predictions.head(n=5)

Predicting from 4284 cached embeddings...


Predicting probabilities: 100%|██████████| 34/34 [00:00<00:00, 3897.41it/s]


Unnamed: 0,new_speech_id,topic,speech,ID,prob_Other,prob_Politics: Domestic,prob_Politics: Foreign,prob_Private Matters/Ceremonial,prob_Private Matters/Cerenomial,prob_Public Finance,prob_Religious Issues,prob_Trials
0,1,Politics: Domestic,My Lords and gentlemen; I hope that you do rem...,10001.0,0.011698,0.933067,0.022387,0.056214,0.002836,0.050063,0.224099,0.124481
1,2,Politics: Domestic,the king's main reason of calling the parliame...,10013.0,0.015721,0.761333,0.404397,0.02144,0.004048,0.580476,0.108299,0.011705
2,3,"Politics: Domestic, Religious Issues, Public F...","""That his majesty had amply accepted the Speak...",10013.0,0.023805,0.960059,0.128712,0.03761,0.003028,0.147314,0.132154,0.01386
3,4,Politics: Domestic,"""That the late dictates between the late King ...",1321.0,0.009132,0.984181,0.024302,0.021698,0.001397,0.103176,0.071109,0.043408
4,5,Politics: Domestic,there might be no Committees for Grievances or...,316.0,0.013387,0.960703,0.039627,0.016508,0.002295,0.530751,0.205464,0.01242


In [45]:
# Probability matrix: one column per 'prob_<topic>' column, in DataFrame column order.
test_prob_cols = [col for col in df_test_with_predictions.columns if col.startswith('prob_')]
y_probs = df_test_with_predictions[test_prob_cols].values

# Binarize the true labels from the 'topic' column of the labeled frame.
# NOTE(review): y_true's column order comes from the binarizer fitted here (mlb_val),
# whereas y_probs follows the prob_* column order. The recorded output lists the same
# 8 topics in the same order, but nothing enforces that - confirm before trusting metrics.
# NOTE(review): the topic list contains both 'Private Matters/Ceremonial' and
# 'Private Matters/Cerenomial'; the second looks like a misspelling that splits one class
# in two. Fixing it would require relabeling and retraining, so it is only flagged here.
val_texts, y_true, mlb_val, labels = tcu.prepare_multilabel_data(
    df=df_test_with_predictions,
    text_col='speech',
    label_col='topic'
)

# Fail fast if the label matrix and the probability matrix do not line up; a mismatch
# would otherwise surface later as a confusing error or as silently wrong metrics.
assert np.shape(y_true) == y_probs.shape, (
    f"Label matrix shape {np.shape(y_true)} does not match "
    f"probability matrix shape {y_probs.shape}"
)

Total samples: 4284
Number of unique topics: 8
Topics: ['Other', 'Politics: Domestic', 'Politics: Foreign', 'Private Matters/Ceremonial', 'Private Matters/Cerenomial', 'Public Finance', 'Religious Issues', 'Trials']


In [46]:
# Search decision thresholds on the labeled set under every strategy in one call.
# Keys of the returned dict used below:
# - 'minimize_fn':  global threshold that minimizes false negatives (favors recall)
# - 'minimize_fp':  global threshold that minimizes false positives (favors precision)
# - 'per_class_f1': one threshold per class, each chosen for F1
reload(tcu)  # pick up any edits to the helper module first
results = tcu.find_optimal_thresholds_multilabel(y_true, y_probs, mlb)

# Pull out the threshold(s) selected by each strategy.
fn_threshold = results['minimize_fn']['global_threshold']
fp_threshold = results['minimize_fp']['global_threshold']
pc_thresholds = results['per_class_f1']['per_class_thresholds']

# Compare macro-F1 across the strategies, in the order listed above.
for strategy_name in ('minimize_fn', 'minimize_fp', 'per_class_f1'):
    print(results[strategy_name]['f1_macro'])


=== Computing Optimal Thresholds (All Strategies) ===

1. Minimize FN (Cost-Sensitive Approach)...
   Threshold: 0.1667 | F1 Macro: 0.5210 | F1 Micro: 0.7868

2. Minimize FP (Precision-Optimized)...
   Threshold: 0.4183 | F1 Macro: 0.4789 | F1 Micro: 0.8112

3. Per-Class F1 Optimization (Balanced)...
   Avg Threshold: 0.2258 | F1 Macro: 0.5747 | F1 Micro: 0.8156

Summary of All Strategies:
  Minimize FN (Recall):     0.1667 (F1 Macro: 0.5210)
  Minimize FP (Precision):  0.4183 (F1 Macro: 0.4789)
  Per-Class F1 (Balanced):  0.2258 avg (F1 Macro: 0.5747)

0.5209828157178662
0.47894301485868096
0.5747347454931437


In [47]:

# Pick up any edits to the helper module before using it.
reload(tcu)

# Add one predicted-topic column per thresholding strategy. Judging by the rename keys
# below, make_predictions names the new column after the threshold it was given
# ('predicted_topic_<value>' for a global threshold, 'predicted_topic_per_class' for
# the per-class thresholds).
df_test_with_predictions = tcu.make_predictions(df_test_with_predictions, fn_threshold)
df_test_with_predictions = tcu.make_predictions(df_test_with_predictions, fp_threshold)
df_test_with_predictions = tcu.make_predictions(df_test_with_predictions, pc_thresholds)

# Give the generated columns stable, strategy-based names in a single rename.
# errors='raise' turns a name mismatch (e.g. a differently formatted threshold) into an
# immediate KeyError instead of a silent no-op that only fails in the evaluation cells.
df_test_with_predictions = df_test_with_predictions.rename(
    columns={
        f'predicted_topic_{fn_threshold}': 'predicted_topic_fn_threshold',
        f'predicted_topic_{fp_threshold}': 'predicted_topic_fp_threshold',
        'predicted_topic_per_class': 'predicted_topic_pc_threshold',
    },
    errors='raise',
)

# Preview the labeled rows with all three sets of predicted topics.
df_test_with_predictions.head()

Unnamed: 0,new_speech_id,topic,speech,ID,prob_Other,prob_Politics: Domestic,prob_Politics: Foreign,prob_Private Matters/Ceremonial,prob_Private Matters/Cerenomial,prob_Public Finance,prob_Religious Issues,prob_Trials,parsed_topics,predicted_topic_fn_threshold,predicted_topic_fp_threshold,predicted_topic_pc_threshold
0,1,Politics: Domestic,My Lords and gentlemen; I hope that you do rem...,10001.0,0.011698,0.933067,0.022387,0.056214,0.002836,0.050063,0.224099,0.124481,[Politics: Domestic],"Politics: Domestic, Religious Issues",Politics: Domestic,"Politics: Domestic, Religious Issues"
1,2,Politics: Domestic,the king's main reason of calling the parliame...,10013.0,0.015721,0.761333,0.404397,0.02144,0.004048,0.580476,0.108299,0.011705,[Politics: Domestic],"Politics: Domestic, Politics: Foreign, Public ...","Politics: Domestic, Public Finance","Politics: Domestic, Politics: Foreign, Public ..."
2,3,"Politics: Domestic, Religious Issues, Public F...","""That his majesty had amply accepted the Speak...",10013.0,0.023805,0.960059,0.128712,0.03761,0.003028,0.147314,0.132154,0.01386,"[Politics: Domestic, Religious Issues, Public ...",Politics: Domestic,Politics: Domestic,Politics: Domestic
3,4,Politics: Domestic,"""That the late dictates between the late King ...",1321.0,0.009132,0.984181,0.024302,0.021698,0.001397,0.103176,0.071109,0.043408,[Politics: Domestic],Politics: Domestic,Politics: Domestic,Politics: Domestic
4,5,Politics: Domestic,there might be no Committees for Grievances or...,316.0,0.013387,0.960703,0.039627,0.016508,0.002295,0.530751,0.205464,0.01242,[Politics: Domestic],"Politics: Domestic, Public Finance, Religious ...","Politics: Domestic, Public Finance","Politics: Domestic, Public Finance"


In [48]:
# Evaluate the predictions made with the FN-minimizing global threshold against the
# true 'topic' labels.
results_fn = tcu.evaluate_trained_model(
    df_test_with_predictions,
    'topic',
    "predicted_topic_fn_threshold",
    labels,
    mlb,
)


=== Evaluating Predictions ===
Total samples: 4284
Number of classes: 8

F1 Scores:
  Macro F1: 0.5210
  Micro F1: 0.7868

Confusion Matrix per Class:

Class: Other
  TP (True Positives):     2  |  FP (False Positives):    8
  FN (False Negatives):   46  |  TN (True Negatives):  4228
  Precision: 0.200  |  Recall: 0.042  |  F1: 0.069

Class: Politics: Domestic
  TP (True Positives):  3003  |  FP (False Positives):  774
  FN (False Negatives):   19  |  TN (True Negatives):   488
  Precision: 0.795  |  Recall: 0.994  |  F1: 0.883

Class: Politics: Foreign
  TP (True Positives):   480  |  FP (False Positives):  401
  FN (False Negatives):  166  |  TN (True Negatives):  3237
  Precision: 0.545  |  Recall: 0.743  |  F1: 0.629

Class: Private Matters/Ceremonial
  TP (True Positives):    45  |  FP (False Positives):   33
  FN (False Negatives):  127  |  TN (True Negatives):  4079
  Precision: 0.577  |  Recall: 0.262  |  F1: 0.360

Class: Private Matters/Cerenomial
  TP (True Positives):     

In [49]:
# Evaluate the predictions made with the FP-minimizing global threshold against the
# true 'topic' labels. Stored under its own name so it does not overwrite the
# FN-threshold evaluation held in `results_fn`.
results_fp = tcu.evaluate_trained_model(
    df_test_with_predictions,
    'topic',
    "predicted_topic_fp_threshold",
    labels,
    mlb,
)


=== Evaluating Predictions ===
Total samples: 4284
Number of classes: 8

F1 Scores:
  Macro F1: 0.4789
  Micro F1: 0.8112

Confusion Matrix per Class:

Class: Other
  TP (True Positives):     0  |  FP (False Positives):    0
  FN (False Negatives):   48  |  TN (True Negatives):  4236
  Precision: 0.000  |  Recall: 0.000  |  F1: 0.000

Class: Politics: Domestic
  TP (True Positives):  2919  |  FP (False Positives):  420
  FN (False Negatives):  103  |  TN (True Negatives):   842
  Precision: 0.874  |  Recall: 0.966  |  F1: 0.918

Class: Politics: Foreign
  TP (True Positives):   319  |  FP (False Positives):   88
  FN (False Negatives):  327  |  TN (True Negatives):  3550
  Precision: 0.784  |  Recall: 0.494  |  F1: 0.606

Class: Private Matters/Ceremonial
  TP (True Positives):     5  |  FP (False Positives):    1
  FN (False Negatives):  167  |  TN (True Negatives):  4111
  Precision: 0.833  |  Recall: 0.029  |  F1: 0.056

Class: Private Matters/Cerenomial
  TP (True Positives):     

In [50]:
# Evaluate the predictions made with the per-class thresholds against the true 'topic'
# labels. Stored under its own name so it does not overwrite the FN-threshold
# evaluation held in `results_fn`.
results_pc = tcu.evaluate_trained_model(
    df_test_with_predictions,
    'topic',
    "predicted_topic_pc_threshold",
    labels,
    mlb,
)


=== Evaluating Predictions ===
Total samples: 4284
Number of classes: 8

F1 Scores:
  Macro F1: 0.5747
  Micro F1: 0.8156

Confusion Matrix per Class:

Class: Other
  TP (True Positives):     5  |  FP (False Positives):   23
  FN (False Negatives):   43  |  TN (True Negatives):  4213
  Precision: 0.179  |  Recall: 0.104  |  F1: 0.132

Class: Politics: Domestic
  TP (True Positives):  2899  |  FP (False Positives):  392
  FN (False Negatives):  123  |  TN (True Negatives):   870
  Precision: 0.881  |  Recall: 0.959  |  F1: 0.918

Class: Politics: Foreign
  TP (True Positives):   421  |  FP (False Positives):  210
  FN (False Negatives):  225  |  TN (True Negatives):  3428
  Precision: 0.667  |  Recall: 0.652  |  F1: 0.659

Class: Private Matters/Ceremonial
  TP (True Positives):    69  |  FP (False Positives):   94
  FN (False Negatives):  103  |  TN (True Negatives):  4018
  Precision: 0.423  |  Recall: 0.401  |  F1: 0.412

Class: Private Matters/Cerenomial
  TP (True Positives):     

In [51]:
# Threshold used for the final candidate predictions.
# Change according to your own preferences!
best_threshold = fn_threshold

# Turn the candidate probabilities into predicted topics using the chosen threshold.
df_candidates_with_predictions = tcu.make_predictions(
    df_candidates_with_predictions,
    best_threshold,
)

# Write the completed frame to Excel; this is the same path as the earlier
# probabilities-only save, so that file is replaced.
final_xlsx = 'candidates_with_predictions.xlsx'
df_candidates_with_predictions.to_excel(final_xlsx, index=False)
print(f"Saved predictions (topics and probabilities) to '{final_xlsx}'")


Saved predictions (topics and probabilities) to 'candidates_with_predictions.xlsx'
