### Load the quotes dataset from Kaggle

In [None]:
import kagglehub
import shutil
import os

# Fetch the newest dataset release; the returned location is treated as a cache folder
path = kagglehub.dataset_download("manann/quotes-500k")

# Make sure the project-level data folder is present before moving anything
data_dir = "../data"
os.makedirs(data_dir, exist_ok=True)

# Relocate every regular file out of the cache; sub-directories are skipped
for file in os.listdir(path):
    cached_item = os.path.join(path, file)
    target_item = os.path.join(data_dir, file)
    if not os.path.isfile(cached_item):
        continue
    shutil.move(cached_item, target_item)  # relocate rather than duplicate
    print(f"Moved {file} to {data_dir}")

# Best-effort cleanup: os.rmdir only succeeds when the directory is empty
try:
    os.rmdir(path)
    print(f"Removed cache directory: {path}")
except OSError:
    print(f"Could not remove cache directory (may not be empty): {path}")

print(f"All files moved to: {data_dir}")

In [None]:
# Check CSV structure first
import pandas as pd

# Load only the first ten rows: enough to inspect the schema without
# reading the whole file into memory.
df_sample = pd.read_csv('../data/quotes.csv', nrows=10)
column_names = df_sample.columns.tolist()
print("CSV Columns:", column_names)
print("\nSample rows:")
print(df_sample.head())

### Filter for inspirational quotes

In [None]:
def filter_inspirational_quotes(csv_path, output_path=None, *, min_length=10, max_length=150):
    """Load a quotes CSV and keep short, unique quotes tagged 'inspirational'.

    The CSV must provide 'quote', 'author' and 'category' columns. A row is
    kept when it has both a quote and an author, its category contains
    'inspirational' (case-insensitive substring match), and the quote length
    lies within ``[min_length, max_length]`` characters (inclusive). Of
    several rows with identical quote text, only the first is kept.

    Args:
        csv_path: Path or file-like object accepted by ``pd.read_csv``.
        output_path: If truthy, the filtered frame is written there as CSV
            (without the index column).
        min_length: Minimum quote length in characters (inclusive).
        max_length: Maximum quote length in characters (inclusive).

    Returns:
        The filtered ``pandas.DataFrame``.
    """
    df = pd.read_csv(csv_path)

    # Discard incomplete rows *before* de-duplicating. Otherwise an
    # author-less copy of a quote could be the one drop_duplicates keeps,
    # and removing it afterwards would lose the quote entirely even though
    # a complete copy existed.
    df = df.dropna(subset=['quote', 'author'])

    # Rows whose category mentions 'inspirational' (missing category -> no match)
    is_inspirational = df['category'].str.contains('inspirational', case=False, na=False)

    # Length filter (defaults 10-150 characters, suited to aphorisms)
    has_valid_length = df['quote'].str.len().between(min_length, max_length)

    # Keep the first occurrence of each distinct quote text
    inspirational_df = df[is_inspirational & has_valid_length].drop_duplicates(subset=['quote'])

    print(f"Found {len(inspirational_df)} inspirational quotes")
    if not inspirational_df.empty:
        # min()/max() of an empty selection would only print "nan-nan"
        quote_lengths = inspirational_df['quote'].str.len()
        print(f"Length range: {quote_lengths.min()}-{quote_lengths.max()} characters")

    if output_path:
        inspirational_df.to_csv(output_path, index=False)

    return inspirational_df

# Run the filter over the full dataset and persist the result next to it
inspirational_quotes = filter_inspirational_quotes(
    '../data/quotes.csv',
    '../data/inspirational_quotes_kaggle.csv',
)

# Preview the first five rows as "'quote' - author" lines
print("\nSample inspirational quotes:")
preview = inspirational_quotes.head(5)
for _, row in preview.iterrows():
    print(f"'{row['quote']}' - {row['author']}")