# Text Embeddings with a time dependence

In [1]:
import os
import glob
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel
import numpy as np
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the compute device once: the GPU when PyTorch can see one, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [3]:
# Define the root directory where all the subdirectories are located.
# Each participant has its own sub-directory named "<num>_P".
root_dir = '/home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/untarred'
# Directory that receives the generated files; created if it does not exist yet.
output_dir = '/home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/text_2d'
os.makedirs(output_dir, exist_ok=True)
dir_pattern = os.path.join(root_dir, '*_P')

# Find all matching directories.
# glob gives no ordering guarantee (the order depends on the file system), so
# the list is sorted to make the processing order identical on every run.
dirs = sorted(d for d in glob.glob(dir_pattern) if os.path.isdir(d))

# Check how many directories were found
print(f'Found {len(dirs)} directories matching the pattern [num]_P.')

Found 275 directories matching the pattern [num]_P.


In [4]:
# Collect every utterance of every transcript into one flat list.
# The two lists stay aligned: texts[i] was read from file_sources[i].
texts = []
file_sources = []

# Regular expression to extract the number from the directory name
pattern = r'(\d+)_P$'

for dir_path in dirs:
    dir_name = os.path.basename(dir_path)

    # Use regex to extract the [num] part
    match = re.match(pattern, dir_name)
    if not match:
        print(f"Directory name '{dir_name}' does not match the expected pattern.")
        continue

    num = match.group(1)
    # Construct the transcript file name
    transcript_filename = f"{num}_Transcript.csv"
    transcript_file_path = os.path.join(dir_path, transcript_filename)

    # Check if the transcript file exists
    if not os.path.exists(transcript_file_path):
        print(f"Transcript file not found: {transcript_file_path}")
        continue

    # Read the CSV file
    df = pd.read_csv(transcript_file_path)
    if 'Text' not in df.columns:
        print(f"'Text' column not found in {transcript_file_path}")
        continue

    # pandas reads empty cells as float NaN; drop them so the list holds
    # only real transcript entries.
    texts_in_file = df['Text'].dropna().tolist()
    texts.extend(texts_in_file)
    # Keep track of the source file (one entry per kept text)
    file_sources.extend([transcript_file_path] * len(texts_in_file))


In [5]:
# Load a single transcript and preview its first rows to inspect the column layout.
example_transcript_path = "/home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/untarred/300_P/300_Transcript.csv"
transcript_example = pd.read_csv(example_transcript_path)
transcript_example.head()

Unnamed: 0,Start_Time,End_Time,Text,Confidence
0,14.3,15.1,so I'm going to,0.93421
1,20.3,21.1,interview in Spanish,0.60847
2,23.9,24.3,okay,0.690606
3,62.1,62.7,good,0.951897
4,68.8,69.8,Atlanta Georgia,0.987629


## RoBERTa

In [6]:
# Fetch the pretrained RoBERTa checkpoint: its tokenizer and its encoder weights.
checkpoint_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint_name)
model = RobertaModel.from_pretrained(checkpoint_name)

# Place the encoder on the selected device, then switch it to evaluation mode
# so that dropout is inactive while embeddings are extracted.
model = model.to(device)
model.eval()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [7]:
# Iterate through each directory and process transcripts.
# For every participant directory "<num>_P" this encodes each non-empty entry
# of the transcript's 'Text' column with the model and saves one matrix of
# shape (hidden_size, num_texts) to "<output_dir>/<num>_TEXT.npy".
# Problems with a single transcript are reported and skipped so that one bad
# file does not abort the whole run.
dir_name_pattern = re.compile(r'(\d+)_P$')  # compiled once, reused for every directory

for dir_path in dirs:
    dir_name = os.path.basename(dir_path)

    # Extract [num] using regex
    match = dir_name_pattern.match(dir_name)
    if match is None:
        print(f"Directory name '{dir_name}' does not match the expected pattern. Skipping.\n")
        continue

    num = match.group(1)
    transcript_filename = f"{num}_Transcript.csv"
    transcript_file_path = os.path.join(dir_path, transcript_filename)

    if not os.path.exists(transcript_file_path):
        print(f"Transcript file not found: {transcript_file_path}. Skipping.\n")
        continue

    try:
        # Read the transcript CSV file
        df = pd.read_csv(transcript_file_path)

        if 'Text' not in df.columns:
            print(f"'Text' column not found in {transcript_file_path}. Skipping.\n")
            continue

        # Remove NaN entries and force every remaining value to str: the
        # tokenizer only accepts strings, and pandas may parse a column as
        # numeric when all of its cells look like numbers.
        # A dedicated name is used so this loop does not overwrite the
        # notebook-level `texts` list built in an earlier cell.
        # NOTE(review): assumes the CSV rows are already in chronological
        # order, so column j of the saved matrix is time step j -- confirm.
        transcript_texts = df['Text'].dropna().astype(str).tolist()

        if not transcript_texts:
            print(f"No texts found in {transcript_file_path}. Skipping.\n")
            continue

        print(f'Processing Transcript {num}: {len(transcript_texts)} texts found.')

        # Tokenize the texts (one padded batch per transcript)
        inputs = tokenizer(
            transcript_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )

        # Move inputs to device
        inputs = {key: val.to(device) for key, val in inputs.items()}

        # Inference only: no gradients are needed
        with torch.no_grad():
            outputs = model(**inputs)

        # Embedding of the first token of every text (RoBERTa's <s>, the
        # counterpart of BERT's [CLS]).  Shape: (num_texts, hidden_size)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]

        # Keep the per-text embeddings as a sequence instead of averaging them.
        # Transposing makes the second dimension the time step:
        # (num_texts, hidden_size) -> (hidden_size, num_texts)
        time_series_embeddings = cls_embeddings.transpose(0, 1)

        # Convert to CPU and NumPy
        time_series_embeddings = time_series_embeddings.cpu().numpy()

        # Save the matrix; each column is the embedding at one time step
        output_filename = f"{num}_TEXT.npy"
        output_file_path = os.path.join(output_dir, output_filename)
        np.save(output_file_path, time_series_embeddings)

        # The matrix is a sequence, not an aggregate, so the message says so
        print(f'Time-series embedding saved to {output_file_path}\n')

    except Exception as e:
        print(f"Error processing {transcript_file_path}: {e}\n")


Processing Transcript 449: 54 texts found.
Aggregated embedding saved to /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/text_2d/449_TEXT.npy

Processing Transcript 622: 101 texts found.
Aggregated embedding saved to /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/text_2d/622_TEXT.npy

Processing Transcript 318: 48 texts found.
Aggregated embedding saved to /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/text_2d/318_TEXT.npy

Processing Transcript 348: 85 texts found.
Aggregated embedding saved to /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/text_2d/348_TEXT.npy

Processing Transcript 479: 106 texts found.
Aggregated embedding saved to /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/text_2d/479_TEXT.npy

Processing Transcript 713: 111 texts found.
Aggregated embedding saved to /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/text_2d/713_TEXT.npy

Processing Transcript 350: 83 texts found.
Aggregated embedding saved to /home/hice1/mbibars3/scr

In [10]:
# Example: load one saved embedding file and check its shape.
# The file is looked up in `output_dir`, the directory the embeddings are saved to.
num = '600'  # Replace with an actual [num] from your data
embedding_file = os.path.join(output_dir, f'{num}_TEXT.npy')

if os.path.exists(embedding_file):
    embeddings = np.load(embedding_file)
    print(f"Loaded embeddings from {embedding_file}")
    # Embeddings are stored as a 2-D matrix: (hidden_size, number_of_texts),
    # i.e. (768, number_of_texts) for roberta-base.
    print(f"Shape of embeddings: {embeddings.shape}")
    assert embeddings.ndim == 2, f"expected a 2-D (hidden_size, number_of_texts) array, got shape {embeddings.shape}"
else:
    print(f"Embedding file {embedding_file} does not exist.")


Loaded embeddings from /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/text_2d/600_TEXT.npy
Shape of embeddings: (768, 70)
