In [1]:
import os
import glob
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel
import numpy as np
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [4]:
# Define the root directory where all the subdirectories are located
root_dir = '/home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/untarred'
dir_pattern = os.path.join(root_dir, '*_P')

# Find all matching directories.
# glob returns its matches in an arbitrary, filesystem-dependent order, so the list is
# sorted: everything built from it later (texts, embeddings, cluster assignments) then
# comes out in the same order on every run.
dirs = sorted(d for d in glob.glob(dir_pattern) if os.path.isdir(d))

# Check how many directories were found
print(f'Found {len(dirs)} directories matching the pattern [num]_P.')

Found 275 directories matching the pattern [num]_P.


In [5]:
# Collect every transcript utterance (`texts`) together with the path of the CSV file it
# came from (`file_sources`). The two lists are index-aligned.
texts = []
file_sources = []

# Regular expression to extract the number from the directory name ("<num>_P")
pattern = r'(\d+)_P$'

for dir_path in dirs:
    dir_name = os.path.basename(dir_path)

    # Use regex to extract the [num] part; skip directories that do not fit the scheme
    match = re.match(pattern, dir_name)
    if not match:
        print(f"Directory name '{dir_name}' does not match the expected pattern.")
        continue

    # Construct the transcript file name: <dir>/<num>_Transcript.csv
    num = match.group(1)
    transcript_filename = f"{num}_Transcript.csv"
    transcript_file_path = os.path.join(dir_path, transcript_filename)

    # Check if the transcript file exists
    if not os.path.exists(transcript_file_path):
        print(f"Transcript file not found: {transcript_file_path}")
        continue

    # Read the CSV file
    df = pd.read_csv(transcript_file_path)
    if 'Text' not in df.columns:
        print(f"'Text' column not found in {transcript_file_path}")
        continue

    # pandas reads empty CSV cells as NaN (a float). A NaN in `texts` would make the
    # tokenizer reject the batch and would break string slicing later on, so missing
    # rows are dropped and the remaining values are coerced to str.
    texts_in_file = df['Text'].dropna().astype(str).tolist()
    texts.extend(texts_in_file)
    # Keep track of the source file, one entry per kept utterance
    file_sources.extend([transcript_file_path] * len(texts_in_file))

# Invariant relied on by anything that maps an utterance back to its transcript file.
assert len(texts) == len(file_sources)


In [6]:
# Fetch the pretrained 'roberta-base' tokenizer.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Fetch the matching encoder and switch it to evaluation mode in one step
# (eval() hands back the module it was called on).
model = RobertaModel.from_pretrained('roberta-base').eval()
# Bare last expression so the notebook renders the model's structure.
model

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [7]:
# Encode every entry of `texts` with the RoBERTa model and collect one fixed-size vector
# per text in `embeddings` (a 2-D NumPy array, one row per text, in the order of `texts`).
batch_size = 32

# Check if GPU is available and move the model there
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Evaluation mode turns dropout off; calling it here as well keeps this cell from
# depending on an earlier cell having done so. The call is idempotent.
model.eval()

# Per-batch results are gathered under their own name so that `embeddings` only ever
# refers to the final array, never to a half-filled list.
embedding_batches = []

for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    # Pad to the longest text in the batch and cut anything beyond 512 tokens.
    inputs = tokenizer(
        batch_texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt'
    ).to(device)
    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)
    # Take the hidden state at position 0 as the text embedding. For RoBERTa that
    # position holds the sequence-start token <s> (the counterpart of BERT's [CLS]).
    batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    embedding_batches.append(batch_embeddings)

# Concatenate all embeddings
if embedding_batches:
    embeddings = np.vstack(embedding_batches)
else:
    # np.vstack raises ValueError on an empty list; with no texts, produce an empty
    # array that still has the model's hidden width as its second dimension.
    embeddings = np.empty((0, model.config.hidden_size), dtype=np.float32)


In [13]:
from sklearn.cluster import KMeans

# Group the embeddings into a fixed number of clusters; random_state pins the seed.
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(embeddings)

# Cluster id assigned to each embedding row
labels = kmeans.labels_

# Preview every cluster: at most three member texts, each cut to its first 100 characters
for cluster in range(num_clusters):
    print(f"\nCluster {cluster}:")
    cluster_indices = np.flatnonzero(labels == cluster)
    for idx in cluster_indices[:3]:
        print(f"- {texts[idx][:100]}...")


Cluster 0:
-  be real with myself that's that's one thing that took me 20 years to kind of get to just to kind of...
-  my family my my role as a wife my role as a mother I think that ultimately that's where my main ful...
-  I don't know that's a tough one. What am I most proud of I don't know honestly...

Cluster 1:
- thank you...
-  yes I am...
-  I'm good thank you...

Cluster 2:
-  the weather I like that I can get to the water if I want to go to the hills or hike or just any ele...
-  recently I went to Honduras again that's where my family is from so sore like a it's a dual fault I...
-  experience probably Hawaii in Lahaina and just having enough food I'm a foodie so having the pig ro...

Cluster 3:
-  originally from Los Angeles and my parents are from Central America...
-  you travel a lot when I can I have family in Central America and when I can my husband and I we try...
-  the freedom of not having to have my daily routine and not be guilty about not doing something I sa