In [1]:
import os
import glob
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel
import numpy as np
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [20]:
# Define the root directory where all the subdirectories are located
root_dir = '/home/hice1/asubramanian91/scratch/e-daic/data/e-daic/untarred/'
output_dir = '/home/hice1/asubramanian91/scratch/e-daic/data/e-daic/untarred/TEXT_embeddings'
os.makedirs(output_dir, exist_ok=True)
dir_pattern = os.path.join(root_dir, '*_P')

# Find all matching directories
dirs = [d for d in glob.glob(dir_pattern) if os.path.isdir(d)]

# Check how many directories were found
print(f'Found {len(dirs)} directories matching the pattern [num]_P.')

Found 275 directories matching the pattern [num]_P.


In [21]:
texts = []
file_sources = []

# Regular expression to extract the number from the directory name
pattern = r'(\d+)_P$'

for dir_path in dirs:
    dir_name = os.path.basename(dir_path)
    
    # Use regex to extract the [num] part
    match = re.match(pattern, dir_name)
    if match:
        num = match.group(1)
        # Construct the transcript file name
        transcript_filename = f"{num}_Transcript.csv"
        transcript_file_path = os.path.join(dir_path, transcript_filename)
        
        # Check if the transcript file exists
        if os.path.exists(transcript_file_path):
            # Read the CSV file
            df = pd.read_csv(transcript_file_path)
            if 'Text' in df.columns:
                texts_in_file = df['Text'].tolist()
                texts.extend(texts_in_file)
                # Keep track of the source file
                file_sources.extend([transcript_file_path] * len(texts_in_file))
            else:
                print(f"'Text' column not found in {transcript_file_path}")
        else:
            print(f"Transcript file not found: {transcript_file_path}")
    else:
        print(f"Directory name '{dir_name}' does not match the expected pattern.")


In [22]:
# Load the pretrained RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
model.to(device)
model.eval()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [23]:
# Iterate through each directory and process transcripts
for dir_path in dirs:
    dir_name = os.path.basename(dir_path)
    
    # Extract [num] using regex
    match = re.match(r'(\d+)_P$', dir_name)
    if match:
        num = match.group(1)
        transcript_filename = f"{num}_Transcript.csv"
        transcript_file_path = os.path.join(dir_path, transcript_filename)
        
        if os.path.exists(transcript_file_path):
            try:
                # Read the transcript CSV file
                df = pd.read_csv(transcript_file_path)
                
                if 'Text' in df.columns:
                    texts = df['Text'].dropna().tolist()  # Remove NaN entries
                    
                    if not texts:
                        print(f"No texts found in {transcript_file_path}. Skipping.\n")
                        continue
                    
                    print(f'Processing Transcript {num}: {len(texts)} texts found.')
                    
                    # Tokenize the texts
                    inputs = tokenizer(
                        texts,
                        padding=True,
                        truncation=True,
                        max_length=512,
                        return_tensors='pt'
                    )
                    
                    # Move inputs to device
                    inputs = {key: val.to(device) for key, val in inputs.items()}
                    
                    with torch.no_grad():
                        outputs = model(**inputs)
                    
                    # Extract [CLS] token embeddings
                    cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # Shape: (len(texts), 768)
                    
                    # Flatten the embeddings to 1D
                    flattened_embeddings = cls_embeddings.flatten()  # Shape: (len(texts) * 768,)
                    
                    # Define the output file path
                    output_filename = f"{num}_TEXT.npy"
                    output_file_path = os.path.join(output_dir, output_filename)
                    
                    # Save the flattened embeddings
                    np.save(output_file_path, flattened_embeddings)
                    
                    print(f'Flattened embeddings saved to {output_file_path}\n')
                    
                else:
                    print(f"'Text' column not found in {transcript_file_path}. Skipping.\n")
                    
            except Exception as e:
                print(f"Error processing {transcript_file_path}: {e}\n")
        else:
            print(f"Transcript file not found: {transcript_file_path}. Skipping.\n")
    else:
        print(f"Directory name '{dir_name}' does not match the expected pattern. Skipping.\n")

Processing Transcript 449: 54 texts found.
Flattened embeddings saved to /home/hice1/asubramanian91/scratch/e-daic/data/e-daic/untarred/TEXT_embeddings/449_TEXT.npy

Processing Transcript 622: 101 texts found.
Flattened embeddings saved to /home/hice1/asubramanian91/scratch/e-daic/data/e-daic/untarred/TEXT_embeddings/622_TEXT.npy

Processing Transcript 318: 48 texts found.
Flattened embeddings saved to /home/hice1/asubramanian91/scratch/e-daic/data/e-daic/untarred/TEXT_embeddings/318_TEXT.npy

Processing Transcript 348: 85 texts found.
Flattened embeddings saved to /home/hice1/asubramanian91/scratch/e-daic/data/e-daic/untarred/TEXT_embeddings/348_TEXT.npy

Processing Transcript 479: 106 texts found.
Flattened embeddings saved to /home/hice1/asubramanian91/scratch/e-daic/data/e-daic/untarred/TEXT_embeddings/479_TEXT.npy

Processing Transcript 713: 111 texts found.
Flattened embeddings saved to /home/hice1/asubramanian91/scratch/e-daic/data/e-daic/untarred/TEXT_embeddings/713_TEXT.npy

P

In [26]:
import numpy as np

# Example: Load and check the shape of a flattened embedding file
num = '449'  # Replace with an actual [num] from your data
embedding_file = f'/home/hice1/asubramanian91/scratch/e-daic/data/e-daic/untarred/{num}_TEXT.npy'  # Update the path

if os.path.exists(embedding_file):
    embeddings = np.load(embedding_file)
    print(f"Loaded embeddings from {embedding_file}")
    print(f"Shape of embeddings: {embeddings.shape}")  # Should be (number_of_texts * 768,)
else:
    print(f"Embedding file {embedding_file} does not exist.")


Loaded embeddings from /home/hice1/asubramanian91/scratch/e-daic/data/e-daic/untarred/449_TEXT.npy
Shape of embeddings: (41472,)
