#### Run on David's team server

##### Automatic procedural-speech classifier, as detailed in Jurafsky's "140 years" paper, but using a BERT classifier

In [1]:
!pip install transformers
!pip install torch



In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from collections import Counter
import random

from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch

In [3]:
# Load the speeches CSV into a DataFrame using pandas' Python parser engine.
# NOTE(review): the reason for engine='python' is not recorded here — confirm it is still required.
crec_new = pd.read_csv("crec2023_2024.csv", engine='python')

In [4]:
# Pull the "speech" column out as a plain list and coerce every entry to str
# (non-string entries, e.g. missing values, become their string representation).
speeches = [str(entry) for entry in crec_new["speech"].to_list()]

In [5]:
# Step 1: Exclude speeches shorter than 3 words or 16 characters (20 cause punctuation)
def filter_short_speeches(speeches):
    return [s for s in speeches if (len(s.split()) >= 3 or len(s) >= 20)] 

# Step 2: Identify procedural speeches
# Defaults: 5 repeats instead of 20 because the dataset is smaller; 210 characters
# instead of 200 because preprocessing does not remove punctuation.
def identify_procedural_speeches(speeches, min_repeats=5, max_length=210):
    """Return the distinct short speeches that are repeated often enough.

    Args:
        speeches: Iterable of speech strings.
        min_repeats: Minimum number of exact occurrences required.
        max_length: Maximum length in characters for a speech to be counted.

    Returns:
        list[str]: Each qualifying speech once, in order of first appearance.
    """
    occurrences = Counter()
    for text in speeches:
        # Only speeches within the length limit take part in the count.
        if len(text) <= max_length:
            occurrences[text] += 1
    return [text for text, n_seen in occurrences.items() if n_seen >= min_repeats]

In [6]:
# Preprocess and filter
# Rebinds `speeches` to the filtered list, so every later cell sees only the kept speeches.
speeches = filter_short_speeches(speeches)

# Identify procedural speeches
# Uses the function's default thresholds; the result holds each procedural speech once.
procedural_speeches = identify_procedural_speeches(speeches)
print(f"N° of procedural speeches = {len(procedural_speeches)}")

N° of procedural speeches = 910


In [7]:
# Non-procedural (negative) examples
# Long speeches (more than 1000 characters and more than 20 words) are cut into
# consecutive 200-character chunks; the last chunk of a speech may be shorter.
non_procedural_length=1000
chunk_size = 200
non_procedural = [s for s in speeches if len(s) > non_procedural_length and len(s.split()) > 20]
chunks = []
for speech in non_procedural:
    chunks.extend([speech[i:i+chunk_size] for i in range(0, len(speech), chunk_size)])

# Draw 5% of the chunks without replacement. A dedicated, seeded generator is
# used so the negative set is the same on every run (the previous unseeded
# np.random.choice produced a different sample each time).
negative_rng = np.random.default_rng(42)
negative_cases = negative_rng.choice(chunks, size=int(0.05 * len(chunks)), replace=False)

In [8]:
def split_list_randomly(lst, seed, ratio = 0.75):
    """Shuffle a sequence reproducibly and split it into two parts.

    The input is NOT modified: a copy is shuffled with a private
    `random.Random(seed)` instance, so neither the caller's sequence nor the
    global `random` state is touched. The resulting order is the same one that
    `random.seed(seed); random.shuffle(lst)` would produce.

    Args:
        lst: A list or 1-D NumPy array (anything with `.copy()`, `len` and slicing).
        seed: Seed for the shuffle.
        ratio: Fraction of the elements placed in the first part.

    Returns:
        tuple: (first_part, second_part), where first_part holds
        `int(len(lst) * ratio)` elements and second_part holds the rest.
    """
    shuffled = lst.copy()  # keep the caller's data (and its container type) intact
    random.Random(seed).shuffle(shuffled)
    split_point = int(len(shuffled) * ratio)
    return shuffled[:split_point], shuffled[split_point:]

# Perform the splits
# Both calls use the same seed and the function's default ratio (0.75 train / 0.25 test).
seed_value = 42
procedural_speeches_train, procedural_speeches_test = split_list_randomly(procedural_speeches, seed=seed_value)
negative_cases_train, negative_cases_test = split_list_randomly(negative_cases, seed=seed_value)

In [9]:
# Step 3: Prepare training data
def prepare_training_data(negative_cases_train, procedural_speeches_train,
                          positive_fraction=1.0, seed=None):
    """Assemble the labelled training set (1 = procedural, 0 = negative).

    Args:
        negative_cases_train: Sequence of negative (non-procedural) texts.
        procedural_speeches_train: Sequence of positive (procedural) texts.
        positive_fraction: Fraction of the procedural speeches to sample
            (without replacement). The default 1.0 uses all of them; values
            above 1 make the underlying `choice` call raise ValueError.
        seed: Optional seed for the positive sampling. When None, NumPy's
            global random state is used, as before.

    Returns:
        tuple[list, list]: (X_train, y_train) with the sampled positives first,
        followed by all negatives, and the matching 1/0 labels.
    """
    sampler = np.random if seed is None else np.random.default_rng(seed)
    n_positive = int(positive_fraction * len(procedural_speeches_train))
    procedural_training = sampler.choice(procedural_speeches_train,
                                         size=n_positive,
                                         replace=False)

    # Combine training data: positives first, then negatives
    X_train = list(procedural_training) + list(negative_cases_train)
    y_train = [1] * len(procedural_training) + [0] * len(negative_cases_train)
    return X_train, y_train

In [10]:
# Prepare training data
# Labels: 1 = procedural speech, 0 = negative (non-procedural chunk).
X_train, y_train = prepare_training_data(negative_cases_train, procedural_speeches_train)

# Prepare test data
# Same layout as the training set: all procedural test speeches first, then the negatives.
X_test = list(procedural_speeches_test) + list(negative_cases_test)
y_test = [1] * len(procedural_speeches_test) + [0] * len(negative_cases_test)

In [11]:
# Custom Dataset class for BERT
class SpeechDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Each item is a dict with 'input_ids' and 'attention_mask' (the tokenizer
    output with its leading batch dimension removed) and 'label' (a long
    tensor holding the label at the same index).
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        # Pad/truncate to exactly max_length so items can be stacked into batches.
        encoded = self.tokenizer(
            sample_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        # The tokenizer returns tensors with a leading batch dimension of 1; drop it.
        item = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        item['label'] = torch.tensor(sample_label, dtype=torch.long)
        return item

# Step 5: Train BERT classifier
def train_bert_classifier(X_train, y_train, max_length=128, epochs=3, batch_size=16):
    """Fine-tune `bert-base-uncased` as a two-class sequence classifier.

    Args:
        X_train: Training texts.
        y_train: Integer labels aligned with `X_train`.
        max_length: Token length every example is padded/truncated to.
        epochs: Number of passes over the training data.
        batch_size: Examples per optimisation step.

    Returns:
        tuple: (tokenizer, model) — the tokenizer and the fine-tuned model
        (left on the GPU when CUDA is available, otherwise on the CPU).
    """
    checkpoint = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

    loader = DataLoader(
        SpeechDataset(X_train, y_train, tokenizer, max_length),
        batch_size=batch_size,
        shuffle=True,
    )
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()

    for _ in range(epochs):
        for batch in loader:
            optimizer.zero_grad()
            # Passing `labels` makes the model return the loss alongside the logits.
            result = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['label'].to(device),
            )
            result.loss.backward()
            optimizer.step()

    return tokenizer, model

In [12]:
# Fine-tune with the function defaults (max_length=128, epochs=3, batch_size=16).
tokenizer, model = train_bert_classifier(X_train, y_train)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Step 6: Evaluate BERT classifier
def evaluate_bert_classifier(tokenizer, model, X_test, y_test, max_length=128, batch_size=16):
    """Run the model over the test set and print accuracy, F1, precision and recall.

    Args:
        tokenizer: Tokenizer used to encode the texts.
        model: Sequence classifier to evaluate; it is moved to the available
            device and switched to eval mode.
        X_test: Test texts.
        y_test: Integer labels aligned with `X_test`.
        max_length: Token length every example is padded/truncated to.
        batch_size: Examples per forward pass.

    Returns:
        None. The four metrics are printed with four decimals.
    """
    loader = DataLoader(
        SpeechDataset(X_test, y_test, tokenizer, max_length),
        batch_size=batch_size,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    predicted, expected = [], []
    with torch.no_grad():
        for batch in loader:
            logits = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits
            # The predicted class is the index of the largest logit.
            predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
            expected.extend(batch['label'].to(device).cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    f1 = f1_score(expected, predicted)
    precision = precision_score(expected, predicted)
    recall = recall_score(expected, predicted)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

In [14]:
# Evaluate on the held-out test split; the function prints the metrics itself.
evaluate_bert_classifier(tokenizer, model, X_test, y_test)

# Metrics noted by hand — presumably from an earlier run, since they differ
# from the output printed below (NOTE(review): confirm or remove):
#Accuracy: 0.9980
#F1 Score: 0.9740
#Precision: 0.9574
#Recall: 0.9912

Accuracy: 0.9985
F1 Score: 0.9803
Precision: 0.9782
Recall: 0.9825


In [15]:
def flag_procedural_speeches(df, column_name, tokenizer, model, max_length=128, batch_size=16,
                             max_chars=420):
    """
    Classify speeches in a DataFrame column as procedural or not, skipping BERT for specific cases,
    and add a 'procedural_flag' column.

    Rules applied, in order:
      1. Very short speeches (at most 3 words or at most 20 characters,
         including empty/missing ones) are flagged 1 without running the model.
      2. Remaining speeches of at most `max_chars` characters get the BERT
         prediction (1 = procedural, 0 = not).
      3. Everything longer than `max_chars` is flagged 0 without running the model.

    Note: `df` is modified in place (the speech column is normalised to strings
    and 'procedural_flag' is added/overwritten); the same object is returned.

    Args:
        df (pd.DataFrame): DataFrame containing speeches.
        column_name (str): Column name where speeches are stored.
        tokenizer (BertTokenizer): Tokenizer for BERT model.
        model (BertForSequenceClassification): Trained BERT model.
        max_length (int): Maximum sequence length for BERT input.
        batch_size (int): Batch size for inference.
        max_chars (int): Longest speech, in characters, that is sent to BERT.

    Returns:
        pd.DataFrame: DataFrame with an added 'procedural_flag' column.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Handle missing values, then convert to strings. The fill must come first:
    # astype(str) turns NaN into the literal text "nan", after which fillna("")
    # would have nothing left to replace.
    df[column_name] = df[column_name].fillna("").astype(str)

    # Step 1: Flag very short speeches as procedural
    def is_very_short(speech):
        return len(speech.split()) <= 3 or len(speech) <= 20

    rule_flagged = df[column_name].map(is_very_short).astype(bool)

    # Step 2: Collect the speeches BERT has to see. Procedural speeches repeat
    # verbatim many times, so each distinct text is classified only once
    # (dict.fromkeys de-duplicates while keeping first-seen order).
    candidates = df.loc[~rule_flagged, column_name]
    short_speeches = list(dict.fromkeys(s for s in candidates if len(s) <= max_chars))

    # Step 3: Prepare data for BERT (labels are dummies; only the inputs are used)
    dataset = SpeechDataset(short_speeches, [0] * len(short_speeches), tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size)

    # Step 4: Classify remaining speeches with BERT
    procedural_flags = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            procedural_flags.extend(preds.cpu().tolist())

    # Step 5: Map BERT classifications back to speeches
    flag_mapping = dict(zip(short_speeches, procedural_flags))

    # Step 6: Combine both sources. Texts never sent to BERT (too long) default
    # to 0; rows flagged by the short-speech rule are forced to 1.
    bert_flagged = df[column_name].map(lambda speech: flag_mapping.get(speech, 0)).astype(int)
    df['procedural_flag'] = bert_flagged.where(~rule_flagged, 1).astype(int)

    return df

In [16]:
# Flag every speech in the corpus. The function writes into the DataFrame it is
# given and returns that same object, so `crec_procflag` and `crec_new` refer to
# the same DataFrame after this call.
crec_procflag = flag_procedural_speeches(crec_new, "speech", tokenizer, model, max_length=128, batch_size=16)

In [17]:
# Size of the corpus and how much of it carries procedural_flag == 1
all_speeches = len(crec_procflag)
print("All crec:", all_speeches)
flagged_proc = int((crec_procflag["procedural_flag"] == 1).sum())
print("Flagged crec:", flagged_proc)

# Flagged share as a percentage, rounded to two decimals
print("%flagged:", round(flagged_proc*100/all_speeches,2))

All crec: 98187
Flagged crec: 54079
%flagged: 55.08


In [18]:
# Persist the flagged corpus (without the DataFrame index) for later sessions.
crec_procflag.to_csv("crec_procflag.csv", index=False)

In [19]:
# Save the fine-tuned model and its tokenizer side by side in ./trained_bert.
model.save_pretrained("./trained_bert")
tokenizer.save_pretrained("./trained_bert")

('./trained_bert/tokenizer_config.json',
 './trained_bert/special_tokens_map.json',
 './trained_bert/vocab.txt',
 './trained_bert/added_tokens.json')

In [4]:
# Reload the flagged corpus from the CSV written earlier in this notebook.
crec_see = pd.read_csv("crec_procflag.csv")

In [None]:
# Preview only the first rows instead of rendering the whole DataFrame.
crec_new.head()

In [21]:
# List the columns of the reloaded frame (to check that 'procedural_flag' is present).
crec_see.columns

Index(['speech_id', 'speech', 'chamber', 'is_extension', 'date', 'speaker',
       'speaker_bioguide', 'vol', 'num', 'congress_num', 'pages', 'doc_title',
       'title', 'procedural_flag'],
      dtype='object')

In [5]:
# Discard rows whose speech is missing
crec_see = crec_see.dropna(subset=['speech'])

# Select the very short speeches: at most 3 words or at most 20 characters
word_counts = crec_see['speech'].str.split().str.len()
char_counts = crec_see['speech'].str.len()
filtered_speeches = crec_see[(word_counts <= 3) | (char_counts <= 20)]

# Share of those rows carrying procedural_flag == 1
proportion_procedural = filtered_speeches["procedural_flag"].eq(1).mean()

print("Proportion of speeches meeting criteria with procedural_flag == 1:", proportion_procedural)

Proportion of speeches meeting criteria with procedural_flag == 1: 1.0


In [35]:
# Among speeches of at most 420 characters, count how many carry each flag value
within_limit = crec_procflag["speech"].str.len() <= 420

non_flagged = int(((crec_procflag["procedural_flag"] == 0) & within_limit).sum())
print("non_flagged", non_flagged)

flagged = int(((crec_procflag["procedural_flag"] == 1) & within_limit).sum())
print("flagged", flagged)

non_flagged 5616
flagged 54079


In [36]:
# Draw 20 flagged speeches (at most 420 characters long) at random for a manual look
is_flagged = crec_procflag["procedural_flag"] == 1
is_within_limit = crec_procflag["speech"].str.len() <= 420
filtered_speeches = crec_procflag[is_flagged & is_within_limit]["speech"]

random_speeches = random.sample(filtered_speeches.tolist(), 20)

# Print every sampled speech in full, each followed by a separator line
separator = "\n" + "-"*80 + "\n"
for sampled in random_speeches:
    print()
    print(sampled)
    print(separator)


The objection is heard.

--------------------------------------------------------------------------------


Mr. Speaker, I continue to reserve the balance of my time.

--------------------------------------------------------------------------------


The gentleman from Pennsylvania is recognized for 1 hour.

--------------------------------------------------------------------------------


Under the previous order, the question is, Shall the joint resolution (S.J. Res. 11) pass, the objections of the President to the contrary notwithstanding? The yeas and nays are required under the Constitution. The clerk will call the roll.

--------------------------------------------------------------------------------


Is there a sufficient second? There appears to be a sufficient second. The clerk will call the roll.

--------------------------------------------------------------------------------


Pursuant to clause 8 of rule XX, the Chair will postpone further proceedings on motions to suspe