# Setup 


## Imports

In [1]:
import pandas as pd
import numpy as np
import torch
import topics_classifier_utils as tcu 

In [2]:
# Import dataset

df_main = pd.read_excel("C:\\Users\\agust\\Downloads\\speeches_with_names.xlsx")
df_main.head()


Unnamed: 0,new_speech_id,speaker,debate_title,sec_number,topic,speech,year,source,debate,volume,page,debate title,title of speech,date (d-m-y),mastername,ID
0,1,The King,The King's Speech on opening the Session.,1.0,Politics: Domestic,My Lords and gentlemen; I hope that you do rem...,1625,Cobbet,,,,,,18-06-1625,Charles I,10001.0
1,2,"Lord Keeper, Williams",The Lord Keeper's Speech.,2.0,Politics: Domestic,the king's main reason of calling the parliame...,1625,Cobbet,,,,,,18-06-1625,John Williams,10013.0
2,3,The Lord Keeper,The Lord Keeper's Answer.,4.0,"Politics: Domestic, Religious Issues, Public F...","""That his majesty had amply accepted the Speak...",1625,Cobbet,,,,,,20-06-1625,John Williams,10013.0
3,4,Sir Benjamin Rudyard,Motion for a good Harmony between King and Par...,6.0,Politics: Domestic,"""That the late dictates between the late King ...",1625,Cobbet,,,,,,22-06-1625,benjamin rudyard,1321.0
4,5,Sir Edward Coke,Motion for a good Harmony between King and Par...,6.0,Politics: Domestic,there might be no Committees for Grievances or...,1625,Cobbet,,,,,,22-06-1625,edward coke,316.0


## Cleaning Dataset

In [3]:
cols_to_keep = ['new_speech_id', 'topic', 'speech', 'ID']

df_main = df_main[cols_to_keep]

df_test = df_main[df_main['topic'].notna() & (df_main['topic'] != '')]

df_candidates = df_main[df_main['topic'].isna() | (df_main['topic'] == '')]

df_main.shape, df_test.shape, df_candidates.shape

((28058, 4), (4284, 4), (23774, 4))

# Training on Test Set

In [None]:
# Train BERT model on df_test


model, tokenizer, mlb, train_loader, val_loader = tcu.train_bert_multilabel(
    df=df_test,
    model_name='emanjavacas/MacBERTh',  # MacBERTh for historical/old English
    text_col='speech',
    label_col='topic',
    batch_size=4,  # Start small for CPU
    epochs=3,
    max_length=256  # Reduced to save memory
)

Using device: cpu

=== Preparing Data ===
Total samples: 4284
Number of unique topics: 8
Topics: ['Other', 'Politics: Domestic', 'Politics: Foreign', 'Private Matters/Ceremonial', 'Private Matters/Cerenomial', 'Public Finance', 'Religious Issues', 'Trials']
Train samples: 3427, Val samples: 857

=== Loading Model: emanjavacas/MacBERTh ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emanjavacas/MacBERTh and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Training ===


Epoch 1/3 [Train]:   1%|          | 10/857 [00:53<1:16:03,  5.39s/it, loss=0.524]


KeyboardInterrupt: 

In [None]:
# Evaluate model on train and validation sets

results = tcu.evaluate_trained_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    mlb=mlb,
    threshold=0.5 #Change this after studying predictions
)

# Predict on Unlabeled Data

In [None]:
# Predict topic probabilities for df_candidates
# This adds probability columns directly to the DataFrame
df_candidates_with_predictions = tcu.predict_topic_probabilities(
    df=df_candidates,
    model=model,
    tokenizer=tokenizer,
    mlb=mlb,
    text_col='speech',
    max_length=256,
    chunk_size=200  # Split speeches into chunks of ~200 tokens
)

# Extract predicted topics based on threshold
prob_cols = [col for col in df_candidates_with_predictions.columns if col.startswith('prob_')]
threshold = 0.5

# Apply threshold to get binary predictions, then convert to topic names
predictions_binary = (df_candidates_with_predictions[prob_cols].values > threshold).astype(int)
predictions = mlb.inverse_transform(predictions_binary)
df_candidates_with_predictions['predicted_topics'] = [', '.join(pred) if pred else '' for pred in predictions]

# View results
print(f"\nAdded {len(prob_cols)} probability columns and 'predicted_topics' column")
print(f"Topics: {list(mlb.classes_)}")
display(df_candidates_with_predictions[['new_speech_id', 'predicted_topics'] + prob_cols[:3]].head())

In [None]:
df_candidates_with_predictions.head()

In [None]:
# Save results to Excel
df_candidates_with_predictions.to_excel('candidates_with_predictions.xlsx', index=False)
print("Saved predictions (topics and probabilities) to 'candidates_with_predictions.xlsx'")