In [None]:
import pandas as pd

# Load the filtered complaints, rename the narrative column to
# 'cleaned_narrative', and drop the 'narrative_length' column, all in one
# chained expression so `df` is only ever bound to the finished frame.
df = (
    pd.read_csv('../data/filtered_complaints.csv')
    .rename(columns={'Consumer complaint narrative': 'cleaned_narrative'})
    .drop(columns=['narrative_length'])
)

# Persist the cleaned frame (without the index) for the chunking step.
df.to_csv('../data/cleaned_complaints.csv', index=False)

In [28]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Reload the cleaned complaints from disk.
df = pd.read_csv('../data/cleaned_complaints.csv')

# The splitter below measures length with `len`, i.e. in CHARACTERS, not
# words (a 399-character chunk holds roughly 70 words). The limits are
# therefore named in characters.
CHUNK_SIZE_CHARS = 400
CHUNK_OVERLAP_CHARS = 50
# Backward-compatible aliases for any cell still using the old, misleading
# "*_WORDS" names. The values have always been applied as character counts.
CHUNK_SIZE_WORDS = CHUNK_SIZE_CHARS
CHUNK_OVERLAP_WORDS = CHUNK_OVERLAP_CHARS
# NOTE(review): this threshold is defined but not applied anywhere in this
# cell - confirm whether short chunks were meant to be filtered out.
SHORT_THRESHOLD_WORDS = 10

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE_CHARS,
    chunk_overlap=CHUNK_OVERLAP_CHARS,
    length_function=len,  # character count
    is_separator_regex=False,
)

# Empty narratives come back from the CSV as NaN (a float), which the
# splitter cannot process, so only rows with actual text are chunked.
narratives = df.dropna(subset=['cleaned_narrative'])

all_chunks_data = []

# Iterate over the two needed columns directly; this avoids building a
# Series object per row the way `iterrows` does.
for complaint_id, narrative in zip(narratives['Complaint ID'],
                                   narratives['cleaned_narrative']):
    # `split_text` yields the same chunk strings that `create_documents`
    # wraps in Document objects.
    for i, chunk_text in enumerate(text_splitter.split_text(str(narrative))):
        all_chunks_data.append({
            'original_narrative_id': complaint_id,
            'chunk_text': chunk_text,
            'chunk_word_count': len(chunk_text.split()),
            'chunk_char_count': len(chunk_text),
            'chunk_index': i
        })

# Explicit columns keep the schema stable even if no chunks were produced.
df_chunks_lc = pd.DataFrame(
    all_chunks_data,
    columns=[
        'original_narrative_id',
        'chunk_text',
        'chunk_word_count',
        'chunk_char_count',
        'chunk_index',
    ],
)
print(f"Total number of chunks created using RecursiveCharacterTextSplitter: {len(df_chunks_lc)}")
print("Sample of chunks (first 5 rows) from LangChain splitter:\n")
print(df_chunks_lc.head())



Total number of chunks created using RecursiveCharacterTextSplitter: 1328485
Sample of chunks (first 5 rows) from LangChain splitter:

   original_narrative_id                                         chunk_text  \
0               14069121  A XXXX XXXX card was opened under my name by a...   
1               14069121  agencies and this fraud is now impacting my cr...   
2               14061897  I made the mistake of using my wellsfargo debi...   
3               14061897  I went into the branch and was told they could...   
4               14061897  I waited a few days and got a letter stating m...   

   chunk_word_count  chunk_char_count  chunk_index  
0                73               399            0  
1                26               132            1  
2                20               116            0  
3                62               296            1  
4                27               137            2  


In [35]:
# Write the chunk table to disk (index omitted); the embedding cell reloads
# it from this path.
chunks_csv_path = "../data/chunks.csv"
df_chunks_lc.to_csv(chunks_csv_path, index=False)

In [3]:
from sentence_transformers import SentenceTransformer

# --- 2. Load the Embedding Model ---
print("\nLoading Sentence Transformer model 'all-MiniLM-L6-v2'...")
try:
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please ensure you have an active internet connection to download the model.")
    print("If the issue persists, you might need to install 'torch' separately: pip install torch")
    # Re-raise so the notebook stops here. Swallowing the error would leave
    # `embedding_model` undefined and make the embedding cell fail later with
    # an unrelated-looking NameError.
    raise
else:
    print("Model loaded successfully!")


  from .autonotebook import tqdm as notebook_tqdm



Loading Sentence Transformer model 'all-MiniLM-L6-v2'...
Model loaded successfully!


In [None]:
import pandas as pd

# Reload the chunk table written by the chunking step.
df_chunks_lc = pd.read_csv("../data/chunks.csv")

# --- 3. Generate Embeddings for Chunks ---
print("\nGenerating embeddings for chunks...")
# This process can take some time depending on the number of chunks and your CPU/GPU.
texts = df_chunks_lc['chunk_text'].tolist()
embeddings = embedding_model.encode(texts, batch_size=256, show_progress_bar=True)
# One Python list of floats per row, so the CSV written below contains
# bracketed, comma-separated vectors.
# NOTE(review): the recorded output stops right after the progress bar and the
# next cell ran in a fresh kernel - possibly the kernel ran out of memory
# here. Consider storing the raw array separately (e.g. .npy) - TODO confirm.
df_chunks_lc['embedding'] = embeddings.tolist()
print("Embeddings generated!")

# --- 4. Display Results ---
print(f"\nDataFrame after embedding generation. Number of chunks: {len(df_chunks_lc)}")
print("Sample of DataFrame with 'embedding' column (first 2 rows):\n")
print(df_chunks_lc.head(2))

df_chunks_lc.to_csv("../data/embedded_chunks.csv", index=False)

# Verify the size of a single embedding (should be 384 for all-MiniLM-L6-v2).
# The column stores plain lists, which have no `.shape` attribute, so the
# dimensionality is reported via `len` instead.
if not df_chunks_lc.empty:
    sample_embedding = df_chunks_lc['embedding'].iloc[0]
    print(f"\nShape of a single embedding: ({len(sample_embedding)},)") # Should output (384,)
    print(f"Data type of embedding: {type(sample_embedding)}")



Generating embeddings for chunks...


Batches: 100%|██████████| 5190/5190 [1:01:35<00:00,  1.40it/s]


In [1]:
# Preview the first two chunk rows. `df_chunks_lc` must already exist in the
# current kernel session; on a fresh kernel this raises NameError.
preview_rows = df_chunks_lc.head(2)
print(preview_rows)


NameError: name 'df_chunks_lc' is not defined