# 🧾 Business Documents — RAG Data Preparation & Text Embedding Pipeline
This notebook analyzes and processes unstructured text data (emails, reports, notes) to build retrieval-augmented generation (RAG) context datasets.

In [None]:
# STEP 1: Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
import faiss
import torch
sns.set(style='whitegrid')
# NLTK resources needed by the text preprocessing step (word_tokenize + stopwords).
# NLTK >= 3.8.2 loads the Punkt tokenizer from the 'punkt_tab' resource rather than
# the pickled 'punkt' package, so both are fetched to keep word_tokenize working
# across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


In [None]:
# STEP 2: Load dataset
# Read the raw document table; `created_date` is parsed as a datetime column
# at load time.
df = pd.read_csv(
    'business_documents.csv',
    parse_dates=['created_date'],
)
print('Shape:', df.shape)
# Preview the first rows as the cell output.
df.head()


In [None]:
# STEP 3: EDA on metadata
# Tally each categorical metadata column once; the same tallies feed the printed
# summary and the bar ordering of the charts below.
doc_type_counts = df['doc_type'].value_counts()
print('Document types:\n', doc_type_counts)
department_counts = df['related_department'].value_counts()
print('\nDepartments:\n', department_counts)

# One count plot per column, with bars in the order given by value_counts().
for column, counts, title in (
    ('doc_type', doc_type_counts, 'Documents by Type'),
    ('related_department', department_counts, 'Documents by Department'),
):
    plt.figure(figsize=(8, 4))
    sns.countplot(x=column, data=df, order=counts.index)
    plt.title(title)
    plt.xticks(rotation=45)
    plt.show()


In [None]:
# STEP 4: Text length analysis
# Word count per document (whitespace-delimited tokens). Missing content is
# treated as an empty string so it counts as 0 words; converting NaN with str()
# would yield the literal 'nan' and count it as one word.
df['content_length'] = (
    df['content']
    .fillna('')
    .astype(str)
    .str.split()
    .str.len()
    .astype(int)
)

# Histogram (with KDE overlay) of the per-document word counts.
plt.figure(figsize=(8,4))
sns.histplot(df['content_length'], bins=30, kde=True, color='skyblue')
plt.title('Distribution of Document Length (words)')
plt.xlabel('Word count')
plt.show()


In [None]:
# STEP 5: Text preprocessing
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one document into a space-joined string of lowercase tokens.

    Steps: replace every run of non-letter characters with a single space,
    lowercase, tokenize with NLTK, then drop English stop words and tokens of
    two characters or fewer.

    Args:
        text: Raw document content. Non-string values are converted with
            ``str()``; ``None``/NaN/NA are treated as empty content.

    Returns:
        The cleaned text as a single string (``''`` for missing content).
    """
    # Missing values would otherwise be stringified to 'nan'/'None' and leak
    # into the cleaned text as a real token.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''
    # Substitute a space (rather than nothing) for punctuation, digits and
    # newlines so adjacent words are not fused together
    # (e.g. "revenue,profit" -> "revenue profit", not "revenueprofit").
    text = re.sub(r'[^a-zA-Z]+', ' ', str(text))
    tokens = word_tokenize(text.lower())
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    return ' '.join(tokens)

df['clean_content'] = df['content'].apply(clean_text)
# Side-by-side preview of raw vs. cleaned text.
df[['content','clean_content']].head()


In [None]:
# STEP 6: Embedding generation
# Encode every cleaned document with the MiniLM sentence-transformer model.
model = SentenceTransformer('all-MiniLM-L6-v2')
clean_docs = df['clean_content'].tolist()
embeddings = model.encode(clean_docs, show_progress_bar=True)
# Cast to a float32 NumPy matrix before handing it to FAISS.
embeddings = np.array(embeddings).astype('float32')

# Flat L2 index sized to the embedding width. Vectors are added in DataFrame
# row order, so position i in the index corresponds to row i of `df`.
d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embeddings)
print('FAISS index built:', index.ntotal, 'vectors')


In [None]:
# STEP 7: Simple RAG-style retrieval simulation
def semantic_search(query, top_k=5):
    """Return metadata for the indexed documents nearest to `query`.

    The query is embedded with the module-level `model` and searched against
    the module-level FAISS `index`; hits are mapped back to rows of `df` by
    position.

    Args:
        query: Free-text search string.
        top_k: Maximum number of documents to return. Values larger than the
            number of indexed vectors are clamped; values <= 0 yield an empty
            result.

    Returns:
        DataFrame with columns doc_id, title, doc_type, related_department and
        summary, in the order FAISS returned the hits.
    """
    columns = ['doc_id','title','doc_type','related_department','summary']
    # Never ask FAISS for more neighbours than exist: it pads missing results
    # with label -1, which df.iloc would silently resolve to the LAST row.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return df.iloc[[]][columns]
    # NOTE(review): the index holds embeddings of `clean_content` (stop words
    # and punctuation stripped) while the query is embedded as typed; consider
    # running the query through `clean_text` if retrieval quality suffers.
    query_emb = model.encode([query]).astype('float32')
    _distances, neighbors = index.search(query_emb, k)
    hits = neighbors[0]
    # Defensive: drop any remaining -1 padding labels.
    hits = hits[hits >= 0]
    return df.iloc[hits][columns]

# Example
query = 'quarterly revenue performance meeting summary'
results = semantic_search(query, top_k=3)
results


In [None]:
# STEP 8: Save vector index and processed data
# Write the FAISS index first, then the DataFrame (row index omitted) that its
# vector positions refer to.
faiss.write_index(index, 'business_docs_index.faiss')
df.to_csv(path_or_buf='business_documents_processed.csv', index=False)
print('Saved processed text and FAISS index.')


## ✅ Next Steps
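- Integrate `business_docs_index.faiss` into your RAG backend (LangChain, LlamaIndex, or Haystack), keeping `business_documents_processed.csv` alongside it: vector positions in the index correspond to row positions in that file.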
- Use the `summary` and `keywords` fields as retrieval metadata to improve filtering.
- Fine-tune retrieval scoring with hybrid search (BM25 + embeddings).
