In [1]:
###
###You mask the word "charge" and predict replacements using BERT.
###Convert these replacements into binary vectors based on unique words.
###Use KMeans clustering to group similar passages into clusters based on the replacement words
###
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
from sklearn.cluster import KMeans
import numpy as np
from google.colab import files

uploaded = files.upload()

# Step 1: Load BERT Model and Tokenizer
checkpoint = 'bert-base-uncased'
model = AutoModelForMaskedLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Step 2: Function to get top 20 replacements for "[MASK]" from the file on Canvas 
## determine words that the model rates most highly  as replacements of <mask>:
def get_top_replacements(text, top_n=20):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)  # Ensure truncation for long texts
    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
    token_logits = model(**inputs).logits
    mask_token_logits = token_logits[0, mask_token_index, :]
    top_tokens = torch.topk(mask_token_logits, top_n, dim=1).indices[0].tolist()
    return [tokenizer.decode([token]) for token in top_tokens]

# Step 3: Load passages from file and mask the word "charge"
file_path = 'problem3_data.txt'
with open(file_path, 'r') as file:
    passages = file.readlines()

masked_passages = [p.replace('charge', '[MASK]') for p in passages]

# Step 4: Get top 20 replacements for "charge" in each passage
all_replacements = []
for passage in masked_passages:
    replacements = get_top_replacements(passage)
    if replacements:  # Only append if replacements were found
        all_replacements.append(replacements)

# Step 5: Vectorize the substitutes for clustering
# Create a set of all unique substitutes across all passages
unique_substitutes = list(set([word for sublist in all_replacements for word in sublist]))

# Function to convert each passage's substitutes into a binary vector
def get_binary_vector(replacements, unique_substitutes):
    return [1 if word in replacements else 0 for word in unique_substitutes]

# Create the binary matrix (one row per passage, one column per unique substitute)
binary_vectors = [get_binary_vector(replacements, unique_substitutes) for replacements in all_replacements]

# Step 6: Cluster the passages using K-Means
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(binary_vectors)

# Step 7: Output the clusters and the respective passages
clustered_passages = {}
for i, label in enumerate(kmeans.labels_):
    if label not in clustered_passages:
        clustered_passages[label] = []
    clustered_passages[label].append(passages[i])

print(clustered_passages)




ModuleNotFoundError: No module named 'google'