# FIT-FLIX Knowledge Base Data Exploration

This notebook explores and analyzes the fitness knowledge base that backs the FIT-FLIX retrieval-augmented generation (RAG) system.

## Contents
1. Data Loading and Overview
2. Document Statistics
3. Content Analysis
4. Category Distribution
5. Text Processing Insights
6. Visualization

In [None]:
# Import required libraries
import sys
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re

# Add src to path
sys.path.append('../src')

from src.config import Config
from src.utils.document_loader import DocumentLoader
from src.utils.text_splitter import TextSplitter

# Set up plotting
plt.style.use('default')
sns.set_palette("husl")
%matplotlib inline

In [None]:
# Build the shared configuration, then the loader and splitter that are
# both constructed from it.
config = Config()
document_loader, text_splitter = DocumentLoader(config), TextSplitter(config)

# Echo the settings that drive document loading and chunking.
settings_report = [
    f"Knowledge base directory: {config.knowledge_base_dir}",
    f"Chunk size: {config.chunk_size}",
    f"Chunk overlap: {config.chunk_overlap}",
]
print("\n".join(settings_report))

In [None]:
# Pull every document out of the knowledge base and report how many came back.
documents = document_loader.load_all_documents()
loaded_total = len(documents)
print(f"Loaded {loaded_total} documents")

# Ask the loader for its own summary of the loaded documents and list
# each statistic on its own line.
stats = document_loader.get_document_stats(documents)
print("\nDocument Statistics:")
for stat_name, stat_value in stats.items():
    print(f"{stat_name}: {stat_value}")

In [None]:
# Create DataFrame for analysis: one row per loaded document.
# Each document is read as a mapping with a 'metadata' mapping and a 'content' string.
# Columns are declared up front so `df` keeps the same schema even when no
# documents were loaded; without this pandas builds a column-less frame and
# later lookups such as df['content_length'] raise KeyError.
doc_columns = [
    'source',
    'category',
    'file_type',
    'content_length',
    'word_count',
    'line_count',
    'content',
]

doc_data = []
for doc in documents:
    metadata = doc['metadata']
    content = doc['content']

    doc_data.append({
        # Missing metadata fields fall back to 'unknown' rather than failing.
        'source': metadata.get('source', 'unknown'),
        'category': metadata.get('category', 'unknown'),
        'file_type': metadata.get('file_type', 'unknown'),
        'content_length': len(content),          # characters
        'word_count': len(content.split()),      # whitespace-separated tokens
        # splitlines() does not count a phantom extra line after a trailing
        # newline and reports 0 (not 1) for empty content.
        'line_count': len(content.splitlines()),
        'content': content
    })

df = pd.DataFrame(doc_data, columns=doc_columns)
print("Document DataFrame:")
print(df.head())

In [None]:
# Four-panel overview of the loaded documents:
# character-length histogram, word-count histogram, category share pie,
# and per-category box plot of character length.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
(ax_chars, ax_words), (ax_pie, ax_box) = axes

# Top-left: how long documents are, in characters
ax_chars.hist(df['content_length'], bins=20, alpha=0.7, color='skyblue')
ax_chars.set_xlabel('Content Length (characters)')
ax_chars.set_ylabel('Frequency')
ax_chars.set_title('Distribution of Document Lengths')

# Top-right: how long documents are, in words
ax_words.hist(df['word_count'], bins=20, alpha=0.7, color='lightgreen')
ax_words.set_xlabel('Word Count')
ax_words.set_ylabel('Frequency')
ax_words.set_title('Distribution of Word Counts')

# Bottom-left: share of documents per category
category_counts = df['category'].value_counts()
ax_pie.pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%')
ax_pie.set_title('Distribution by Category')

# Bottom-right: spread of document length inside each category;
# tick labels are rotated so category names do not overlap
sns.boxplot(data=df, x='category', y='content_length', ax=ax_box)
plt.setp(ax_box.get_xticklabels(), rotation=45)
ax_box.set_title('Content Length by Category')

fig.tight_layout()
plt.show()

In [None]:
# Analyze text splitting: split every document into chunks and print the
# statistics the splitter reports about the result.
chunked_documents = text_splitter.split_documents(documents)
split_stats = text_splitter.get_splitting_stats(chunked_documents)

print("Text Splitting Statistics:")
for key, value in split_stats.items():
    print(f"{key}: {value}")

# Create DataFrame for chunks: one row per chunk.
# Columns are declared up front so `chunk_df` keeps the same schema even when
# splitting produced no chunks; without this pandas builds a column-less frame
# and later lookups such as chunk_df['content_length'] raise KeyError.
chunk_columns = [
    'source',
    'category',
    'chunk_id',
    'total_chunks',
    'content_length',
    'word_count',
]

chunk_data = []
for chunk in chunked_documents:
    metadata = chunk['metadata']
    content = chunk['content']

    chunk_data.append({
        # Missing metadata fields fall back to neutral defaults rather than failing.
        'source': metadata.get('source', 'unknown'),
        'category': metadata.get('category', 'unknown'),
        'chunk_id': metadata.get('chunk_id', 0),
        'total_chunks': metadata.get('total_chunks', 1),
        'content_length': len(content),       # characters
        'word_count': len(content.split())    # whitespace-separated tokens
    })

chunk_df = pd.DataFrame(chunk_data, columns=chunk_columns)
print(f"\nCreated {len(chunk_df)} chunks from {len(df)} documents")

In [None]:
# Chunk analysis: chunk-length histogram against the configured target size,
# and a histogram of how many chunks each source document produced.
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.hist(chunk_df['content_length'], bins=30, alpha=0.7, color='coral')
# Dashed reference line at the configured chunk size
plt.axvline(config.chunk_size, color='red', linestyle='--', label=f'Target Size ({config.chunk_size})')
plt.xlabel('Chunk Length (characters)')
plt.ylabel('Frequency')
plt.title('Distribution of Chunk Lengths')
plt.legend()

plt.subplot(1, 2, 2)
# Count the chunk rows per source directly. Deriving the count from
# max(chunk_id) + 1 is only correct when chunk_id is present, 0-based and
# contiguous per source; a missing chunk_id defaults to 0 when chunk_df is
# built, which would report 1 chunk for every document.
# NOTE(review): documents sharing the same 'source' value are counted together.
chunks_per_doc = chunk_df.groupby('source').size()
plt.hist(chunks_per_doc, bins=15, alpha=0.7, color='gold')
plt.xlabel('Number of Chunks per Document')
plt.ylabel('Frequency')
plt.title('Chunks per Document Distribution')

plt.tight_layout()
plt.show()

In [None]:
# Content analysis - most common words across the whole knowledge base.
# All document text is joined, lower-cased and tokenised on word boundaries.
all_text = ' '.join(df['content'])
word_pattern = re.compile(r'\b\w+\b')
words = word_pattern.findall(all_text.lower())
word_freq = Counter(words)

# Common English function words that would otherwise dominate the ranking
stop_words = {
    'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
    'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had',
    'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might',
    'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she',
    'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them', 'my', 'your', 'his',
    'her', 'its', 'our', 'their',
}

# Keep only words longer than two characters that are not stop words,
# preserving first-seen order so ties rank the same way as before.
filtered_words = {}
for word, count in word_freq.items():
    if len(word) <= 2 or word in stop_words:
        continue
    filtered_words[word] = count
top_words = dict(Counter(filtered_words).most_common(20))

# Bar chart of the 20 most frequent remaining words
plt.figure(figsize=(12, 6))
plt.bar(list(top_words), list(top_words.values()), color='lightblue')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top 20 Most Common Words in Knowledge Base')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Text listing of the first ten entries of the ranking
print("Top 10 words:")
for word, count in tuple(top_words.items())[:10]:
    print(f"{word}: {count}")

In [None]:
# Summary statistics: headline numbers for the documents and their chunks,
# followed by a per-category breakdown.
doc_lengths = df['content_length']
total_words = df['word_count'].sum()
total_characters = doc_lengths.sum()
mean_doc_length = doc_lengths.mean()
mean_chunk_length = chunk_df['content_length'].mean()
# Mean chunk length expressed as a percentage of the configured chunk size
chunk_efficiency = (mean_chunk_length / config.chunk_size) * 100

summary_lines = [
    "=== FIT-FLIX Knowledge Base Summary ===",
    f"Total Documents: {len(documents)}",
    f"Total Chunks: {len(chunked_documents)}",
    f"Categories: {list(df['category'].unique())}",
    f"Total Words: {total_words:,}",
    f"Total Characters: {total_characters:,}",
    f"Average Document Length: {mean_doc_length:.0f} characters",
    f"Average Chunk Length: {mean_chunk_length:.0f} characters",
    f"Chunk Size Efficiency: {chunk_efficiency:.1f}%",
]
print("\n".join(summary_lines))

print("\n=== Category Breakdown ===")
# Per category: document count, total and mean character length, total words
per_category = df.groupby('category')
category_stats = per_category.agg({
    'content_length': ['count', 'sum', 'mean'],
    'word_count': 'sum'
})
category_stats = category_stats.round(2)
print(category_stats)