In [1]:
import torch
from transformers import BertModel
import pandas as pd
import numpy as np

def _rows_to_padded_tensor(rows, width, fill_value):
    """Right-pad every row to `width` with `fill_value` and stack into an int64 tensor."""
    padded = [row + [fill_value] * (width - len(row)) for row in rows]
    return torch.tensor(padded, dtype=torch.long)


def process_bert_sentences(sentences_df, model=None, pad_token_id=0):
    """
    Run pre-tokenized sentences through BERT and return the model outputs.

    Rows may have different lengths: shorter rows are right-padded to the
    longest row (tokens with ``pad_token_id``, segments with 0) and the padded
    positions are excluded through the attention mask.

    Args:
        sentences_df (pd.DataFrame): DataFrame containing 'Indexed_Tokens' and
            'Segments_IDs' columns. Each cell is a sequence of integer ids, and
            the two sequences in a row must have the same length.
        model (optional): Object that is callable with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` keyword arguments and
            that provides ``eval()`` and ``to(device)``. When None (default),
            'bert-base-uncased' is loaded with hidden states enabled. Passing
            a model avoids reloading the weights on every call. The model is
            switched to evaluation mode and moved to the selected device.
        pad_token_id (int): Token id treated as padding, both when padding
            short rows and when building the attention mask. Defaults to 0.

    Returns:
        The value returned by ``model(...)``. With the default model this
        object exposes ``last_hidden_state`` and ``hidden_states``.

    Raises:
        ValueError: If the DataFrame has no rows, if every token sequence is
            empty, or if a row's token and segment sequences differ in length.
    """
    # Materialise each cell as a plain list so tuples / arrays are accepted too
    token_rows = [list(row) for row in sentences_df['Indexed_Tokens']]
    segment_rows = [list(row) for row in sentences_df['Segments_IDs']]

    if not token_rows:
        raise ValueError("sentences_df contains no rows to process")

    for position, (tokens, segments) in enumerate(zip(token_rows, segment_rows)):
        if len(tokens) != len(segments):
            raise ValueError(
                f"Row {position}: 'Indexed_Tokens' has {len(tokens)} ids but "
                f"'Segments_IDs' has {len(segments)}"
            )

    max_len = max(len(tokens) for tokens in token_rows)
    if max_len == 0:
        raise ValueError("Every sequence in 'Indexed_Tokens' is empty")

    # Build rectangular int64 tensors directly; going through np.array() breaks
    # on ragged rows and leaves the integer width up to NumPy's default.
    tokens_tensor = _rows_to_padded_tensor(token_rows, max_len, pad_token_id)
    segments_tensor = _rows_to_padded_tensor(segment_rows, max_len, 0)

    # Create attention mask (1 for real tokens, 0 for padding)
    attention_mask = (tokens_tensor != pad_token_id).long()

    # Print shapes for debugging
    print(f"Tokens tensor shape: {tokens_tensor.shape}")
    print(f"Segments tensor shape: {segments_tensor.shape}")
    print(f"Attention mask shape: {attention_mask.shape}")

    # Load the default model only when the caller did not supply one
    if model is None:
        model = BertModel.from_pretrained('bert-base-uncased',
                                          output_hidden_states=True,
                                          return_dict=True)

    # Evaluation mode
    model.eval()

    # Move to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    tokens_tensor = tokens_tensor.to(device)
    segments_tensor = segments_tensor.to(device)
    attention_mask = attention_mask.to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(
            input_ids=tokens_tensor,
            token_type_ids=segments_tensor,
            attention_mask=attention_mask
        )

    return outputs

# Helper function to print tensor info
def print_tensor_info(tensor, name):
    """
    Print a labelled summary of a tensor to stdout.

    Emits a blank line, then ``name`` followed by a colon, then one line each
    for the tensor's ``shape``, ``dtype`` and ``device`` attributes.

    Args:
        tensor: Object exposing ``shape``, ``dtype`` and ``device`` attributes.
        name (str): Heading printed above the attribute lines.
    """
    print(f"\n{name}:")
    # Attributes are looked up one at a time, in display order
    for label, attribute in (("Shape", "shape"), ("Type", "dtype"), ("Device", "device")):
        print(f"{label}: {getattr(tensor, attribute)}")

# Example usage with detailed debugging
def run_example():
    """
    Run the BERT pipeline on two hard-coded sample sentences.

    Builds a two-row DataFrame of equal-length, zero-padded token ids with
    all-zero segment ids, passes it to ``process_bert_sentences`` and prints
    details of the returned hidden states.

    Returns:
        The outputs object returned by ``process_bert_sentences``.

    Raises:
        Exception: Any error from processing or reporting is re-raised after
            the error message and the input shapes have been printed.
    """
    # Both rows are padded with trailing zeros to six ids
    token_rows = [
        [101, 2054, 2003, 102, 0, 0],
        [101, 2040, 2001, 102, 0, 0],
    ]
    segment_rows = [
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
    ]
    sample_df = pd.DataFrame({
        'Indexed_Tokens': token_rows,
        'Segments_IDs': segment_rows,
    })

    try:
        result = process_bert_sentences(sample_df)

        # Report on the final layer and on the first entry of hidden_states
        print("\nOutput information:")
        print_tensor_info(result.last_hidden_state, "Last hidden state")
        print(f"\nNumber of hidden state layers: {len(result.hidden_states)}")
        print_tensor_info(result.hidden_states[0], "First hidden state layer")
    except Exception as exc:
        # Dump the input dimensions to help diagnose the failure, then propagate
        print(f"\nError occurred: {exc!s}")
        print("\nInput data shape:")
        print(f"Number of rows in DataFrame: {len(sample_df)}")
        for column in ('Indexed_Tokens', 'Segments_IDs'):
            print(f"Sample {column} shape: {np.array(sample_df[column].iloc[0]).shape}")
        raise
    else:
        return result

# Entry point: run the demo and bind its return value to the module-level
# name `outputs`.
if __name__ == "__main__":
    outputs = run_example()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Tokens tensor shape: torch.Size([2, 6])
Segments tensor shape: torch.Size([2, 6])
Attention mask shape: torch.Size([2, 6])

Output information:

Last hidden state:
Shape: torch.Size([2, 6, 768])
Type: torch.float32
Device: cpu

Number of hidden state layers: 13

First hidden state layer:
Shape: torch.Size([2, 6, 768])
Type: torch.float32
Device: cpu
