In [8]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans
import numpy as np

# Read the data file line by line, keeping only lines that contain
# something other than whitespace (line endings are left in place)
with open('problem3_data.txt', 'r') as file:
    passages = [line for line in file if line.strip()]

# Load the pretrained uncased BERT tokenizer and encoder; the encoder is
# asked to return the hidden states so a specific layer can be read later
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained(
    "bert-base-uncased",
    output_hidden_states=True,
)

# Put the model into evaluation mode before extracting embeddings
model.eval()

# Function to get the embedding of a target word ("charge" by default)
# from one hidden layer (layer 7 by default)
def get_charge_embedding(passage, *, target_token='charge', layer=7, verbose=True):
    """Return the hidden-state vector of the first `target_token` in `passage`.

    The passage is tokenized with the module-level `tokenizer`, run through
    the module-level `model`, and the vector at the first position whose
    token string equals `target_token` is taken from `hidden_states[layer]`.

    Args:
        passage: Text to embed.
        target_token: Token string to look for. The comparison is an exact,
            case-sensitive match against the tokenizer's output tokens, so a
            word that the tokenizer splits into sub-word pieces (or an
            inflected form such as a plural) is NOT matched.
        layer: Non-negative index into the model's `hidden_states` tuple.
            NOTE(review): index 0 is presumably the embedding output rather
            than a transformer layer -- confirm against the model docs.
        verbose: When true, print the token list and the matched positions
            (the original debugging output).

    Returns:
        A 1-D tensor: the selected layer's vector for the first match.

    Raises:
        ValueError: If `target_token` does not occur among the tokens, or if
            the model output has no hidden state at index `layer`.
    """
    # Tokenize the passage. truncation=True means tokens beyond the
    # tokenizer's maximum length are dropped, so a target that appears only
    # late in a very long passage ends up reported as "not found" below.
    inputs = tokenizer(passage, return_tensors='pt', truncation=True, padding=True)
    tokenized_text = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    if verbose:
        print("tokenized text:", tokenized_text)

    # Positions of every token that is exactly the target string
    charge_positions = [
        i for i, token in enumerate(tokenized_text) if token == target_token
    ]

    if verbose:
        print("Charge positions:", charge_positions)

    # Bail out before the (expensive) forward pass if there is nothing to extract
    if not charge_positions:
        raise ValueError(f"'{target_token}' not found in passage: {passage}")

    # Forward pass without gradient tracking; only the activations are needed
    with torch.no_grad():
        hidden_states = model(**inputs).hidden_states

    # Ensure hidden_states is present and actually contains the requested index
    if hidden_states is None or len(hidden_states) <= layer:
        raise ValueError("Unexpected hidden_states structure")

    # [layer] -> [batch item 0] -> [first matching token position]
    return hidden_states[layer][0][charge_positions[0]]

# Get embeddings for all passages.
# Passages whose embedding cannot be extracted are reported and skipped. The
# passages that DID succeed are recorded alongside their embeddings so the two
# stay aligned index-for-index.
charge_embeddings = []
embedded_passages = []
for passage in passages:
    try:
        embedding = get_charge_embedding(passage)
    except ValueError as e:
        print(f"Error processing passage: {e}")
    else:
        charge_embeddings.append(embedding)
        embedded_passages.append(passage)

if not charge_embeddings:
    raise ValueError("No embeddings were extracted.")

# Drop the skipped passages from `passages` so that passages[i] corresponds to
# row i of the embedding matrix. Without this, any code that pairs passages
# with per-row results by position would be shifted after the first failure.
passages = embedded_passages

# Convert embeddings to a numpy array (one row per embedded passage)
charge_embeddings_np = torch.stack(charge_embeddings).numpy()




tokenized text: ['[CLS]', 'natalie', 'allen', ':', 'hello', 'and', 'thanks', 'for', 'being', 'with', 'us', 'today', '.', 'president', 'clinton', "'", 's', 'pledge', 'to', 'lift', 'the', 'ban', 'on', 'gay', '##s', 'in', 'the', 'military', 'has', 'set', 'off', 'an', 'explosive', 'charge', 'in', 'washington', '.', 'the', 'pentagon', "'", 's', 'top', 'brass', 'is', 'opposed', 'to', 'the', 'clinton', 'campaign', 'promise', ',', 'and', 'said', 'so', 'at', 'a', 'white', 'house', 'meeting', 'yesterday', '.', '[SEP]']
Charge positions: [33]
tokenized text: ['[CLS]', 'smith', ':', 'and', 'so', 'ok', ',', 'you', 'know', ',', 'here', "'", 's', 'the', 'other', 'thing', 'is', 'there', "'", 's', '-', '-', 'she', 'looks', 'great', '!', 'lark', '##ey', ':', 'yes', '.', 'smith', ':', 'there', "'", 's', 'judy', 'and', 'so', 'what', 'did', 'this', 'do', 'to', 'her', 'charge', 'account', '?', 'lark', '##ey', ':', 'nothing', ',', 'she', '-', '-', 'we', 'found', 'most', 'of', 'this', 'in', 'her', 'own', 'clo

In [9]:
# Perform k-means clustering
n_clusters = 5
random_seed = 42
# n_init is passed explicitly: leaving it unset makes scikit-learn emit a
# FutureWarning about the default changing, and the result would then depend
# on the installed version. 10 is the default value named in that warning.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_seed)
clusters = kmeans.fit_predict(charge_embeddings_np)

# Labels are paired with passages purely by position, which is only valid when
# every passage contributed exactly one row to charge_embeddings_np. Say so
# loudly instead of silently truncating.
if len(clusters) != len(passages):
    print(
        f"Warning: {len(passages)} passages but {len(clusters)} cluster labels; "
        "the pairing below may be misaligned."
    )

# Print the cluster assignments (zip stops at the shorter of the two)
for passage, cluster in zip(passages, clusters):
    print(f"Passage: {passage.strip()}\nCluster: {cluster}\n")


  super()._check_params_vs_input(X, default_n_init=10)


Passage: NATALIE ALLEN : Hello and thanks for being with us today . President Clinton 's pledge to lift the ban on gays in the military has set off an explosive  	 charge 	  in Washington . The Pentagon 's top brass is opposed to the Clinton campaign promise , and said so at a White House meeting yesterday .
Cluster: 3

Passage: Smith : And so OK , you know , here 's the other thing is there 's -- she looks great ! Larkey : Yes . Smith : There 's Judy and so what did this do to her  	 charge 	  account ? Larkey : Nothing , she -- we found most of this in her own closet . We 've -- even the smaller handbag that was slimming and carried right where she 's thin , on her hips so I want women to get rid of those big handbags and use those smaller ones .
Cluster: 3

Passage: In a moment , we 'll look at what the press is up against in the Gulf . Commercial break KOPPEL : You 've probably heard the  	 charge 	  in one form or another . It goes something like this . " The U.S. did n't lose the