In [None]:
# notebooks/exploratory_analysis.ipynb (save as Jupyter notebook)

# Animal Shelter Insights - Exploratory Data Analysis
# ================================================

# Cell 1: Setup and Data Loading
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Set style for better plots
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*'; older
# releases only ship the un-suffixed name. Pick whichever one is installed so
# the notebook does not fail on the very first cell, and fall back to the
# built-in default style if neither is present.
_SEABORN_STYLE = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    'default',
)
plt.style.use(_SEABORN_STYLE)
# Must run after plt.style.use(): the style sheet also sets the colour cycle.
sns.set_palette("husl")

# Load data
def _count_photos(raw_photos):
    """Return the number of entries encoded in one ``photos`` cell.

    Missing values and the literal ``'[]'`` count as zero. So do cells that
    cannot be parsed as JSON, or whose parsed value has no length (for
    example JSON ``null``), instead of aborting the whole load.
    """
    if pd.isna(raw_photos) or raw_photos == '[]':
        return 0
    try:
        return len(json.loads(raw_photos))
    except (TypeError, ValueError):
        # json.JSONDecodeError subclasses ValueError; TypeError covers cells
        # that are not str/bytes and parsed values that do not support len().
        return 0


def load_shelter_data(db_path="data/shelter_data.db"):
    """Load the ``animals`` table and derive the columns used by the analysis.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        A DataFrame with every column of ``animals`` plus:
        ``photo_count`` (entries in the JSON ``photos`` column),
        ``description_length`` (characters in ``description``, 0 if missing)
        and ``has_description`` (``description_length > 0``).
        ``published_at`` and ``scraped_at`` are converted with
        ``pd.to_datetime``.
    """
    conn = sqlite3.connect(db_path)
    try:
        animals = pd.read_sql_query("SELECT * FROM animals", conn)
    finally:
        # Release the connection even when the query fails (e.g. missing table).
        conn.close()

    # Parse photo counts
    animals['photo_count'] = animals['photos'].apply(_count_photos)

    # Parse description length
    animals['description_length'] = animals['description'].fillna('').str.len()
    animals['has_description'] = animals['description_length'] > 0

    # Convert dates
    for date_column in ('published_at', 'scraped_at'):
        animals[date_column] = pd.to_datetime(animals[date_column])

    return animals

# Load the dataset once from the default database path; the cells below all
# read this module-level `df`.
df = load_shelter_data()
print(f"Loaded {len(df)} animals for analysis")

# Cell 2: Basic Overview Visualizations
def _plot_category_counts(values, panel, colour, title, x_label):
    """Draw a bar chart of the value counts of *values* on *panel*.

    The panel gets the given title and x-axis label, a 'Count' y-axis label
    and x tick labels rotated by 45 degrees.
    """
    values.value_counts().plot(kind='bar', ax=panel, color=colour)
    panel.set_title(title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Count')
    panel.tick_params(axis='x', rotation=45)


# One 2x2 figure with a count chart per categorical column.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Animal Shelter Dataset Overview', fontsize=16, fontweight='bold')

# Top row: species and age
_plot_category_counts(df['species'], axes[0, 0], 'skyblue', 'Species Distribution', 'Species')
_plot_category_counts(df['age'], axes[0, 1], 'lightcoral', 'Age Distribution', 'Age Category')

# Bottom row: size (mainly for dogs, so only rows that have one) and gender
size_data = df[df['size'].notna()]
_plot_category_counts(size_data['size'], axes[1, 0], 'lightgreen', 'Size Distribution', 'Size')
_plot_category_counts(df['gender'], axes[1, 1], 'plum', 'Gender Distribution', 'Gender')

plt.tight_layout()
plt.show()

# Cell 3: Photo and Content Analysis
def _label_panel(panel, title, x_label, y_label, rotate_ticks=False):
    """Set title and axis labels on *panel*; optionally tilt x tick labels 45 degrees."""
    panel.set_title(title)
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)
    if rotate_ticks:
        panel.tick_params(axis='x', rotation=45)


fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Content Quality Analysis', fontsize=16, fontweight='bold')
(photo_panel, length_panel), (species_photo_panel, species_desc_panel) = axes

# Number of animals per photo count, ordered by photo count rather than frequency
photo_counts = df['photo_count'].value_counts().sort_index()
photo_counts.plot(kind='bar', ax=photo_panel, color='orange')
_label_panel(photo_panel, 'Photo Count Distribution', 'Number of Photos', 'Number of Animals')

# Histogram of description lengths
df['description_length'].hist(bins=30, ax=length_panel, color='teal', alpha=0.7)
_label_panel(
    length_panel,
    'Description Length Distribution',
    'Description Length (characters)',
    'Number of Animals',
)

# Per-species aggregates share one grouping
by_species = df.groupby('species')

# Mean photo count per species
photo_by_species = by_species['photo_count'].mean()
photo_by_species.plot(kind='bar', ax=species_photo_panel, color='gold')
_label_panel(
    species_photo_panel,
    'Average Photos by Species',
    'Species',
    'Average Photo Count',
    rotate_ticks=True,
)

# Share of animals per species that have a non-empty description, in percent
desc_by_species = by_species['has_description'].mean() * 100
desc_by_species.plot(kind='bar', ax=species_desc_panel, color='coral')
_label_panel(
    species_desc_panel,
    'Description Completeness by Species (%)',
    'Species',
    'Percentage with Description',
    rotate_ticks=True,
)

plt.tight_layout()
plt.show()

# Cell 4: Breed Analysis
def _print_breed_ranking(animals):
    """Print the ten most frequent ``breed_primary`` values in *animals*.

    Each line shows the breed, its count and that count as a percentage of
    all rows in *animals*. Returns the ranking Series so it can be reused.
    """
    ranking = animals['breed_primary'].value_counts().head(10)
    group_size = len(animals)
    for breed, count in ranking.items():
        percentage = (count / group_size) * 100
        print(f"{breed}: {count} ({percentage:.1f}%)")
    return ranking


print("TOP DOG BREEDS")
print("="*30)
dog_data = df[df['species'] == 'Dog']
if not dog_data.empty:
    top_dog_breeds = _print_breed_ranking(dog_data)

print("\nTOP CAT BREEDS")
print("="*30)
cat_data = df[df['species'] == 'Cat']
if not cat_data.empty:
    top_cat_breeds = _print_breed_ranking(cat_data)

# Visualize breed diversity: dogs on the left panel, cats on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
dog_axis, cat_axis = axes

if not dog_data.empty:
    top_dog_breeds.plot(kind='barh', ax=dog_axis, color='steelblue')
    dog_axis.set_title('Top 10 Dog Breeds')
    dog_axis.set_xlabel('Count')

if not cat_data.empty:
    top_cat_breeds.plot(kind='barh', ax=cat_axis, color='darkseagreen')
    cat_axis.set_title('Top 10 Cat Breeds')
    cat_axis.set_xlabel('Count')

plt.tight_layout()
plt.show()

# Cell 5: Geographic Analysis
def _print_location_shares(location_counts, total_animals):
    """Print one indented line per location: name, count and percentage of *total_animals*."""
    for place, count in location_counts.items():
        percentage = (count / total_animals) * 100
        print(f"  {place}: {count} ({percentage:.1f}%)")


print("GEOGRAPHIC DISTRIBUTION")
print("="*40)

# State distribution
print("By State:")
state_counts = df['state'].value_counts()
_print_location_shares(state_counts, len(df))

# City distribution (ten most frequent cities only)
print(f"\nTop 10 Cities:")
city_counts = df['city'].value_counts().head(10)
_print_location_shares(city_counts, len(df))

# Visualize geographic distribution, reusing the counts printed above.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
state_axis, city_axis = axes

# State distribution
state_counts.plot(kind='bar', ax=state_axis, color='lightcoral')
state_axis.set_title('Animals by State')
state_axis.set_xlabel('State')
state_axis.set_ylabel('Count')
state_axis.tick_params(axis='x', rotation=45)

# Top cities
city_counts.plot(kind='barh', ax=city_axis, color='lightblue')
city_axis.set_title('Top 10 Cities')
city_axis.set_xlabel('Count')

plt.tight_layout()
plt.show()

# Cell 6: Behavioral and Health Characteristics
behavioral_cols = ['good_with_children', 'good_with_dogs', 'good_with_cats']
health_cols = ['spayed_neutered', 'house_trained', 'shots_current', 'special_needs']


def _summarize_flags(frame, columns):
    """Summarise flag columns as ``(label, percentage, true_count, total_with_data)`` tuples.

    Columns that are absent from *frame* or hold no non-null values are
    skipped. ``percentage`` is the column sum divided by the number of
    non-null rows, times 100. Counts are returned as plain ints because a
    numeric column that contains NULLs typically sums to a float, which
    would otherwise print as e.g. ``12.0/20``.
    """
    summary = []
    for col in columns:
        if col not in frame.columns:
            continue
        total_with_data = int(frame[col].notna().sum())
        if total_with_data == 0:
            continue
        true_count = int(frame[col].sum())
        percentage = (true_count / total_with_data) * 100
        summary.append((col.replace('_', ' ').title(), percentage, true_count, total_with_data))
    return summary


def _print_flag_summary(summary):
    """Print one indented line per summarised flag: label, percentage and raw counts."""
    for label, percentage, true_count, total_with_data in summary:
        print(f"  {label}: {percentage:.1f}% ({true_count}/{total_with_data})")


def _plot_flag_summary(panel, summary, color, title):
    """Draw the percentages from *summary* as a labelled bar chart on *panel*."""
    labels = [row[0] for row in summary]
    percentages = [row[1] for row in summary]
    panel.bar(labels, percentages, color=color)
    panel.set_title(title)
    panel.set_ylabel('Percentage')
    panel.tick_params(axis='x', rotation=45)


print("HEALTH & BEHAVIORAL CHARACTERISTICS")
print("="*50)

# Compute each group once and reuse it for both the printout and the chart.
behavioral_summary = _summarize_flags(df, behavioral_cols)
health_summary = _summarize_flags(df, health_cols)

print("Behavioral Traits (% True):")
_print_flag_summary(behavioral_summary)

print("\nHealth/Care Status (% True):")
_print_flag_summary(health_summary)

# Columns that exist and contain at least one non-null value.
available_behavioral = [col for col in behavioral_cols if col in df.columns and df[col].notna().sum() > 0]
available_health = [col for col in health_cols if col in df.columns and df[col].notna().sum() > 0]

# Visualize the traits; skip the figure entirely when neither group has data.
if behavioral_summary or health_summary:
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    if behavioral_summary:
        _plot_flag_summary(axes[0], behavioral_summary, 'skyblue', 'Behavioral Traits (% Positive)')

    if health_summary:
        _plot_flag_summary(axes[1], health_summary, 'lightgreen', 'Health/Care Status (% Positive)')

    plt.tight_layout()
    plt.show()

# Cell 7: Initial Pattern Discovery
def _report_content_by_group(frame, column):
    """Print mean photo count and mean description length for each non-null value of *column*.

    Groups are reported in the order their values first appear in *frame*.
    """
    for group in frame[column].unique():
        if pd.isna(group):
            continue
        members = frame[frame[column] == group]
        avg_photos = members['photo_count'].mean()
        avg_desc_len = members['description_length'].mean()
        print(f"  {group}: {avg_photos:.1f} photos, {avg_desc_len:.0f} char descriptions")


print("INITIAL PATTERNS & INSIGHTS")
print("="*40)

# Photo count vs content richness correlation
if {'photo_count', 'description_length'}.issubset(df.columns):
    photo_desc_corr = df['photo_count'].corr(df['description_length'])
    print(f"Correlation between photo count and description length: {photo_desc_corr:.3f}")

# Average characteristics by species
print(f"\nAverage Content Quality by Species:")
_report_content_by_group(df, 'species')

# Size vs content (for dogs)
if not dog_data.empty and 'size' in dog_data.columns:
    print(f"\nDog Content Quality by Size:")
    _report_content_by_group(dog_data, 'size')

# Cell 8: Data Readiness Assessment
# Thresholds for the readiness criteria, named so they can be tuned in one place.
MIN_COVERAGE_PERCENT = 60  # minimum % of animals with photos / with a description
MIN_SAMPLE_SIZE = 300      # minimum number of animals
MIN_UNIQUE_CITIES = 5      # minimum number of distinct cities
MIN_CRITERIA_MET = 3       # criteria that must pass for the dataset to count as ready


def _percent_of(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when *whole* is zero."""
    return (part / whole) * 100 if whole else 0.0


print("DATA READINESS FOR ANALYSIS")
print("="*40)

analysis_readiness = {}
total_animals = len(df)

# Photo analysis readiness
# Counts are cast to plain ints so an empty dataset reports 0.0% instead of nan%.
animals_with_photos = int((df['photo_count'] > 0).sum())
photo_percentage = _percent_of(animals_with_photos, total_animals)
analysis_readiness['Photo Analysis'] = bool(photo_percentage >= MIN_COVERAGE_PERCENT)

print(f"Photo Analysis:")
print(f"  Animals with photos: {animals_with_photos}/{total_animals} ({photo_percentage:.1f}%)")
print(f"  Ready for analysis: {'Yes' if analysis_readiness['Photo Analysis'] else 'No'}")

# Description analysis readiness
animals_with_descriptions = int(df['has_description'].sum())
desc_percentage = _percent_of(animals_with_descriptions, total_animals)
analysis_readiness['Description Analysis'] = bool(desc_percentage >= MIN_COVERAGE_PERCENT)

print(f"\nDescription Analysis:")
print(f"  Animals with descriptions: {animals_with_descriptions}/{total_animals} ({desc_percentage:.1f}%)")
print(f"  Ready for analysis: {'Yes' if analysis_readiness['Description Analysis'] else 'No'}")

# Sample size assessment
analysis_readiness['Statistical Power'] = total_animals >= MIN_SAMPLE_SIZE

print(f"\nSample Size:")
print(f"  Current sample: {total_animals} animals")
print(f"  Adequate for analysis: {'Yes' if analysis_readiness['Statistical Power'] else 'No'}")

# Geographic diversity (distinct non-null city values)
unique_locations = df['city'].nunique()
analysis_readiness['Geographic Diversity'] = unique_locations >= MIN_UNIQUE_CITIES

print(f"\nGeographic Diversity:")
print(f"  Unique cities: {unique_locations}")
print(f"  Sufficient diversity: {'Yes' if analysis_readiness['Geographic Diversity'] else 'No'}")

# Overall readiness
ready_count = sum(analysis_readiness.values())
total_checks = len(analysis_readiness)

print(f"\nOverall Readiness: {ready_count}/{total_checks} criteria met")
if ready_count >= MIN_CRITERIA_MET:
    print("Dataset is ready for initial analysis!")
else:
    print("Consider expanding dataset before deep analysis")

# Cell 9: Next Steps Recommendations
print("\nRECOMMENDED NEXT STEPS")
print("="*40)

# Advice to show for each readiness criterion that failed, in display order.
remediation_by_check = (
    ('Statistical Power', "Expand sample size to 500-1000 animals"),
    ('Photo Analysis', "Focus data collection on animals with multiple photos"),
    ('Description Analysis', "Prioritize animals with detailed descriptions"),
    ('Geographic Diversity', "Expand geographic coverage to multiple cities/states"),
)

next_steps = [
    advice
    for check, advice in remediation_by_check
    if not analysis_readiness[check]
]

# Every criterion passed: suggest the follow-up analyses instead.
if not next_steps:
    next_steps = [
        "Begin photo quality analysis using computer vision",
        "Start NLP analysis of descriptions",
        "Develop adoption time prediction models",
        "Create shelter optimization recommendations"
    ]

for i, step in enumerate(next_steps, start=1):
    print(f"{i}. {step}")

print(f"\nAnalysis foundation is established! Time to dive deeper into the data.")