# Quote Data Collection and Processing Pipeline

This notebook implements the data collection and processing pipeline according to specs/data_collection_processing_v2.md

## Setup and Dependencies

In [23]:
import pandas as pd
import requests
from datasets import load_dataset
import difflib
import re
import os
from typing import List, Dict, Tuple
import numpy as np
from pathlib import Path

In [None]:
# Configuration
KAGGLE_CSV_PATH = "../data/inspirational_quotes_kaggle.csv"
HUGGINGFACE_DATASET = "asuender/motivational-quotes"
GITHUB_GIST_URL = "https://gist.githubusercontent.com/JakubPetriska/060958fd744ca34f099e947cd080b540/raw/963b5a9355f04741239407320ac973a6096cd7b6/quotes.csv"
OUTPUT_PATH = "../data/processed/unified_quotes_dataset.csv"

# Create output directory
Path("../data/processed").mkdir(parents=True, exist_ok=True)

## Phase 1: Data Ingestion & Standardization

In [25]:
def load_kaggle_data() -> pd.DataFrame:
    """Load Kaggle inspirational quotes dataset"""
    df = pd.read_csv(KAGGLE_CSV_PATH)
    
    # Standardize column names and add metadata
    df_standard = pd.DataFrame({
        'quote_text': df['quote'],
        'author': df['author'],
        'source_dataset': 'kaggle',
        'instruction_prompt': '',
        'existing_categories': df['category']
    })
    
    print(f"Loaded {len(df_standard)} quotes from Kaggle dataset")
    return df_standard

def load_huggingface_data() -> pd.DataFrame:
    """Load HuggingFace motivational quotes dataset"""
    dataset = load_dataset(HUGGINGFACE_DATASET, "quotes_extended", split="train")
    df = dataset.to_pandas()
    
    # Standardize column names and add metadata
    df_standard = pd.DataFrame({
        'quote_text': df['quote'],
        'author': df['author'],
        'source_dataset': 'huggingface',
        'instruction_prompt': df['prompt'],
        'existing_categories': ''
    })
    
    print(f"Loaded {len(df_standard)} quotes from HuggingFace dataset")
    return df_standard

def load_github_data() -> pd.DataFrame:
    """Load GitHub gist quotes dataset"""
    response = requests.get(GITHUB_GIST_URL)
    
    # Save to temporary file and read with pandas
    with open("temp_github_quotes.csv", "w", encoding="utf-8") as f:
        f.write(response.text)
    
    df = pd.read_csv("temp_github_quotes.csv")
    os.remove("temp_github_quotes.csv")
    
    # Standardize column names and add metadata
    df_standard = pd.DataFrame({
        'quote_text': df['Quote'],
        'author': df['Author'],
        'source_dataset': 'github',
        'instruction_prompt': '',
        'existing_categories': ''
    })
    
    print(f"Loaded {len(df_standard)} quotes from GitHub gist")
    return df_standard

In [26]:
# Load all data sources
kaggle_df = load_kaggle_data()
huggingface_df = load_huggingface_data()
github_df = load_github_data()

# Combine all sources
combined_df = pd.concat([kaggle_df, huggingface_df, github_df], ignore_index=True)
print(f"\nTotal quotes before processing: {len(combined_df)}")
print(f"Source distribution:")
print(combined_df['source_dataset'].value_counts())

Loaded 29296 quotes from Kaggle dataset
Loaded 4262 quotes from HuggingFace dataset
Loaded 1664 quotes from GitHub gist

Total quotes before processing: 35222
Source distribution:
source_dataset
kaggle         29296
huggingface     4262
github          1664
Name: count, dtype: int64


## Phase 2: Data Cleaning & Filtering

In [27]:
def standardize_quotes(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize quote text formatting"""
    df = df.copy()
    
    # Remove surrounding quotation marks
    df['quote_text'] = df['quote_text'].str.strip('"\'“”‘’')
    
    # Normalize whitespace
    df['quote_text'] = df['quote_text'].str.replace(r'\s+', ' ', regex=True)
    df['quote_text'] = df['quote_text'].str.strip()
    
    # Ensure proper sentence capitalization
    df['quote_text'] = df['quote_text'].apply(lambda x: x[0].upper() + x[1:] if len(x) > 0 else x)
    
    return df

def filter_by_length(df: pd.DataFrame, min_length: int = 5, max_length: int = 150) -> pd.DataFrame:
    """Filter quotes by character length"""
    initial_count = len(df)
    
    # Filter by length
    df_filtered = df[
        (df['quote_text'].str.len() >= min_length) & 
        (df['quote_text'].str.len() <= max_length)
    ].copy()
    
    removed_count = initial_count - len(df_filtered)
    print(f"Removed {removed_count} quotes due to length constraints")
    
    return df_filtered

def normalize_authors(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize author names"""
    df = df.copy()
    
    # Handle missing authors
    df['author'] = df['author'].fillna('Unknown')
    df['author'] = df['author'].replace('', 'Unknown')
    
    # Clean author names
    df['author'] = df['author'].str.strip()
    
    return df

In [28]:
# Apply cleaning functions
print("Applying data cleaning...")
cleaned_df = standardize_quotes(combined_df)
cleaned_df = filter_by_length(cleaned_df)
cleaned_df = normalize_authors(cleaned_df)

print(f"Quotes after cleaning: {len(cleaned_df)}")

Applying data cleaning...
Removed 850 quotes due to length constraints
Quotes after cleaning: 34372


## Phase 3: Duplicate Detection & Removal

In [29]:
def calculate_quality_score(row: pd.Series) -> int:
    """Calculate quality score for duplicate resolution"""
    score = 0
    
    # Known author > Unknown author
    if row['author'] != 'Unknown':
        score += 4
    
    # Has categories > no categories
    if pd.notna(row['existing_categories']) and row['existing_categories'] != '':
        score += 2
    
    # Has instruction prompt > no prompt
    if pd.notna(row['instruction_prompt']) and row['instruction_prompt'] != '':
        score += 2
    
    # Source priority: HuggingFace > Kaggle > GitHub
    if row['source_dataset'] == 'huggingface':
        score += 3
    elif row['source_dataset'] == 'kaggle':
        score += 2
    else:  # github
        score += 1
    
    return score

def normalize_for_comparison(text: str) -> str:
    """Normalize text for similarity comparison"""
    # Convert to lowercase and remove punctuation
    normalized = re.sub(r'[^a-z0-9\s]', '', text.lower())
    # Remove extra whitespace
    normalized = re.sub(r'\s+', ' ', normalized).strip()
    return normalized

def find_near_duplicates_fast(df: pd.DataFrame, threshold: float = 0.9) -> pd.DataFrame:
    """Fast near-duplicate detection using optimized algorithms"""
    import time
    start_time = time.time()
    
    print(f"Finding near-duplicates with {threshold} similarity threshold...")
    
    # Add quality scores
    df = df.copy()
    df['quality_score'] = df.apply(calculate_quality_score, axis=1)
    
    # Step 1: Remove exact duplicates efficiently
    initial_count = len(df)
    df = df.sort_values('quality_score', ascending=False)
    df = df.drop_duplicates(subset=['quote_text'], keep='first')
    exact_duplicates_removed = initial_count - len(df)
    print(f"Removed {exact_duplicates_removed} exact duplicates")
    
    # Step 2: Fast near-duplicate detection using length-based grouping
    df['normalized_text'] = df['quote_text'].apply(normalize_for_comparison)
    df['text_length'] = df['normalized_text'].str.len()
    
    # Group by similar lengths (±5 characters) to reduce comparisons
    length_groups = {}
    for idx, row in df.iterrows():
        length = row['text_length']
        length_key = length // 5  # Group by 5-character buckets
        if length_key not in length_groups:
            length_groups[length_key] = []
        length_groups[length_key].append(idx)
    
    to_remove = set()
    total_comparisons = 0
    
    print(f"Processing {len(length_groups)} length groups...")
    
    for group_key, indices in length_groups.items():
        if len(indices) < 2:
            continue  # Skip groups with only one quote
            
        # Sort group by quality score (highest first)
        group_df = df.loc[indices].sort_values('quality_score', ascending=False)
        group_indices = list(group_df.index)
        
        # Compare within group using optimized approach
        for i, idx1 in enumerate(group_indices):
            if idx1 in to_remove:
                continue
                
            row1 = df.loc[idx1]
            
            # Only compare with next 20 quotes maximum to limit complexity
            max_comparisons = min(20, len(group_indices) - i - 1)
            
            for j in range(1, max_comparisons + 1):
                if i + j >= len(group_indices):
                    break
                    
                idx2 = group_indices[i + j]
                if idx2 in to_remove:
                    continue
                    
                row2 = df.loc[idx2]
                
                # Quick length check (must be within 10 characters)
                if abs(row1['text_length'] - row2['text_length']) > 10:
                    continue
                
                # Quick first-word check for early termination
                words1 = row1['normalized_text'].split()
                words2 = row2['normalized_text'].split()
                if len(words1) > 0 and len(words2) > 0 and words1[0] != words2[0]:
                    if len(words1[0]) > 3 and len(words2[0]) > 3:
                        continue  # Skip if first significant words are different
                
                # Full similarity check
                similarity = difflib.SequenceMatcher(
                    None, row1['normalized_text'], row2['normalized_text']
                ).ratio()
                
                total_comparisons += 1
                
                if similarity >= threshold:
                    # Always remove the lower quality one (idx2, since sorted by quality)
                    to_remove.add(idx2)
    
    # Remove near-duplicates
    df_deduplicated = df.drop(index=list(to_remove))
    near_duplicates_removed = len(to_remove)
    
    elapsed_time = time.time() - start_time
    
    print(f"Removed {near_duplicates_removed} near-duplicates")
    print(f"Total duplicates removed: {exact_duplicates_removed + near_duplicates_removed}")
    print(f"Total comparisons made: {total_comparisons:,}")
    print(f"Processing time: {elapsed_time:.1f} seconds")
    
    # Clean up temporary columns and reset index
    df_deduplicated = df_deduplicated.drop(columns=['quality_score', 'normalized_text', 'text_length'])
    df_deduplicated = df_deduplicated.reset_index(drop=True)
    
    return df_deduplicated

In [30]:
# Remove duplicates using fast optimized approach
deduplicated_df = find_near_duplicates_fast(cleaned_df)
print(f"Quotes after deduplication: {len(deduplicated_df)}")

Finding near-duplicates with 0.9 similarity threshold...
Removed 552 exact duplicates
Processing 31 length groups...
Removed 123 near-duplicates
Total duplicates removed: 675
Total comparisons made: 482,482
Processing time: 59.2 seconds
Quotes after deduplication: 33697


## Phase 4: Quality Assurance & Export

In [31]:
# No instruction prompt generation per v2 spec - use deduplicated data directly
final_df = deduplicated_df.copy()
print(f"Final dataset size: {len(final_df)}")

Final dataset size: 33697


## Phase 5: Quality Assurance & Export

## Quality Assurance & Export Functions

In [32]:
def validate_dataset(df: pd.DataFrame) -> bool:
    """Validate final dataset quality"""
    print("\n=== Dataset Validation ===")
    
    # Check required columns
    required_columns = ['quote_text', 'author', 'source_dataset', 'instruction_prompt', 'existing_categories']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"❌ Missing columns: {missing_columns}")
        return False
    print("✅ All required columns present")
    
    # Check for empty quote_text
    empty_quotes = df['quote_text'].isna().sum() + (df['quote_text'] == '').sum()
    if empty_quotes > 0:
        print(f"❌ {empty_quotes} empty quotes found")
        return False
    print("✅ No empty quotes")
    
    # Check length constraints
    quote_lengths = df['quote_text'].str.len()
    invalid_lengths = ((quote_lengths < 5) | (quote_lengths > 150)).sum()
    if invalid_lengths > 0:
        print(f"❌ {invalid_lengths} quotes with invalid length")
        return False
    print("✅ All quotes within length constraints (5-150 characters)")
    
    # Check instruction prompts (OK to be empty per v2 spec)
    empty_prompts = df['instruction_prompt'].isna().sum() + (df['instruction_prompt'] == '').sum()
    print(f"ℹ️  {empty_prompts} entries without instruction prompts (acceptable per v2 spec)")
    
    print("\n✅ Dataset validation passed!")
    return True

def generate_quality_metrics(df: pd.DataFrame):
    """Generate and display quality metrics"""
    print("\n=== Quality Metrics ===")
    
    # Source distribution
    print("\nSource Distribution:")
    source_dist = df['source_dataset'].value_counts()
    for source, count in source_dist.items():
        percentage = (count / len(df)) * 100
        print(f"  {source}: {count:,} ({percentage:.1f}%)")
    
    # Quote length distribution
    print("\nQuote Length Statistics:")
    lengths = df['quote_text'].str.len()
    print(f"  Mean: {lengths.mean():.1f} characters")
    print(f"  Median: {lengths.median():.1f} characters")
    print(f"  Min: {lengths.min()} characters")
    print(f"  Max: {lengths.max()} characters")
    
    # Author coverage
    print("\nAuthor Statistics:")
    unique_authors = df['author'].nunique()
    unknown_authors = (df['author'] == 'Unknown').sum()
    print(f"  Unique authors: {unique_authors:,}")
    print(f"  Unknown authors: {unknown_authors:,} ({(unknown_authors/len(df)*100):.1f}%)")
    
    # Top authors
    print("\nTop 10 Authors:")
    top_authors = df['author'].value_counts().head(10)
    for author, count in top_authors.items():
        print(f"  {author}: {count} quotes")
    
    # Instruction prompt analysis
    print("\nInstruction Prompt Analysis:")
    with_prompts = df[df['instruction_prompt'] != '']
    if len(with_prompts) > 0:
        prompt_dist = with_prompts['instruction_prompt'].value_counts().head(5)
        for prompt, count in prompt_dist.items():
            print(f"  '{prompt}': {count} quotes")
    else:
        print("  No instruction prompts present")
    
    # Categories analysis
    print("\nCategory Analysis:")
    with_categories = df[df['existing_categories'] != '']
    if len(with_categories) > 0:
        print(f"  Entries with categories: {len(with_categories):,}")
        category_sample = with_categories['existing_categories'].value_counts().head(5)
        for category, count in category_sample.items():
            print(f"  '{category}': {count} quotes")
    else:
        print("  No categories present")
    
    return {
        'total_records': len(df),
        'source_distribution': source_dist.to_dict(),
        'length_stats': {
            'mean': lengths.mean(),
            'median': lengths.median(),
            'min': lengths.min(),
            'max': lengths.max()
        },
        'unique_authors': unique_authors,
        'unknown_authors': unknown_authors
    }

In [None]:
# Validate dataset using proper function
is_valid = validate_dataset(final_df)

if is_valid:
    # Generate quality metrics using proper function
    metrics = generate_quality_metrics(final_df)
    
    # Export to CSV
    final_df.to_csv(OUTPUT_PATH, index=False)
    print(f"\n✅ Dataset exported to: {OUTPUT_PATH}")
    print(f"📊 Final dataset contains {len(final_df):,} high-quality quotes")
    
    # Save summary report
    summary_path = "../data/processed/processing_summary.txt"
    with open(summary_path, "w") as f:
        f.write("Quote Processing Summary\n")
        f.write("======================\n\n")
        f.write(f"Total quotes processed: {len(final_df):,}\n")
        f.write(f"Source distribution:\n")
        for source, count in metrics['source_distribution'].items():
            f.write(f"  {source}: {count:,}\n")
        f.write(f"\nUnique authors: {metrics['unique_authors']:,}\n")
        f.write(f"Unknown authors: {metrics['unknown_authors']:,}\n")
        f.write(f"\nQuote length range: {metrics['length_stats']['min']}-{metrics['length_stats']['max']} characters\n")
        f.write(f"Average length: {metrics['length_stats']['mean']:.1f} characters\n")
    
    print(f"📄 Summary report saved to: {summary_path}")
    
else:
    print("❌ Dataset validation failed. Please fix issues before export.")

## Sample Data Inspection

In [34]:
# Display sample data
print("\n=== Sample Quotes ===")
sample_quotes = final_df.sample(5, random_state=42)
for idx, row in sample_quotes.iterrows():
    print(f"\nQuote: \"{row['quote_text']}\"")
    print(f"Author: {row['author']}")
    print(f"Source: {row['source_dataset']}")
    if row['instruction_prompt']:
        print(f"Prompt: {row['instruction_prompt']}")
    if row['existing_categories']:
        print(f"Categories: {row['existing_categories']}")
    print("-" * 50)


=== Sample Quotes ===

Quote: "By changing how you perceive things and how you act upon those perceptions, you will change your life."
Author: Chris Prentiss, Be Who You Want, Have What You Want: Change Your Thinking, Change Your Life
Source: kaggle
Categories: change, chris-prentiss, inspiration, inspirational, life, non-12-step, passages-malibu, passages-ventura, perception, philosophy, quotes
--------------------------------------------------

Quote: "There's no time for hatred, only questions. Where is love? Where is happiness? What is life? Where is peace?"
Author: Jeff Buckley
Source: kaggle
Categories: advice, inspirational, jeff-buckley, life, love, no-hatred, peace
--------------------------------------------------

Quote: "A purpose directed disciplined action always bring success."
Author: Debasish Mridha
Source: kaggle
Categories: action, directed, disciplined, education, happiness, hope, inspirational, intelligence, knowledge, life, love, philosophy, purpose, success, tru