In [5]:
# JSON Lines Content Type Sampler
# This notebook collects samples of each unique content_type from a JSON Lines file

import json
import os
from pathlib import Path
from collections import defaultdict

# Configuration
# Edit these four values before running the cell; they are passed straight to
# collect_content_type_samples() further down.

# Path of the JSON Lines file to scan (one JSON document per line).
INPUT_FILE = r"G:\Shared drives\Labs\Constellate\Constellate Sunset\JSTOR Text Mining\Examples\jstor_metadata_2025-05-28.jsonl"
# Directory that receives one pretty-printed .json file per collected sample
# (created if it does not exist).
OUTPUT_DIR = r"G:\Shared drives\Labs\Constellate\Constellate Sunset\JSTOR Text Mining\Examples\content_type_samples"
# Maximum number of samples to keep for each distinct content_type.
SAMPLES_PER_TYPE = 10
# Minimum number of source lines between two samples of the same content_type,
# so the samples are spread through the file rather than taken consecutively.
MIN_LINE_GAP = 500

def collect_content_type_samples(input_file, output_dir, samples_per_type=3, min_line_gap=100):
    """
    Collect samples of each unique content_type from a JSON Lines file.

    The file is streamed line by line. The first record seen for a
    content_type is always sampled; further records of that type are sampled
    only once at least ``min_line_gap`` lines have passed since the previous
    sample of the same type, until ``samples_per_type`` samples are held.
    The whole file is always read so that the per-type record counts are
    complete. Each sample is then written to its own pretty-printed JSON file
    and a summary is printed.

    Records without a ``content_type`` key are counted under ``'unknown'``.
    A ``content_type`` that is not a string (null, number, list, ...) is keyed
    by its JSON text (e.g. ``'null'``). Blank lines, lines that are not valid
    JSON, and JSON values that are not objects are skipped.

    Args:
        input_file (str): Path to the input JSON Lines file
        output_dir (str): Directory to save the sample JSON files
            (created if missing)
        samples_per_type (int): Number of samples to collect per content_type
        min_line_gap (int): Minimum number of lines between samples of the
            same content_type

    Returns:
        ``False`` if the input file is missing or cannot be read. Otherwise a
        tuple ``(True, content_type_samples, content_type_counts)`` where
        ``content_type_samples`` maps each content type to a list of
        ``{'line_num': int, 'data': dict}`` samples and
        ``content_type_counts`` maps each content type to its record count.

    Raises:
        OSError: If the output directory or a sample file cannot be written.
    """

    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Per-content_type bookkeeping
    content_type_samples = defaultdict(list)
    content_type_counts = defaultdict(int)
    content_type_last_sample_line = defaultdict(int)  # Line of the latest sample per type

    total_lines = 0
    valid_lines = 0

    print("🔍 Scanning file for content types...")
    print(f"📏 Ensuring minimum {min_line_gap} line gap between samples of same type")

    try:
        # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading BOM,
        # which would otherwise make the first line fail to parse.
        with open(input_file, 'r', encoding='utf-8-sig') as f:
            for line_num, line in enumerate(f, 1):
                total_lines += 1

                # Skip empty lines
                line = line.strip()
                if not line:
                    continue

                # Only the parse itself is guarded, so unrelated errors are
                # not misreported as invalid JSON.
                try:
                    json_data = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"⚠ Warning: Line {line_num} is not valid JSON - {e}")
                    continue
                valid_lines += 1

                # A JSON array/string/number has no content_type; skip it
                # instead of letting .get() raise and abort the whole scan.
                if not isinstance(json_data, dict):
                    print(f"⚠ Warning: Line {line_num} is not a JSON object - skipped")
                    continue

                content_type = json_data.get('content_type', 'unknown')
                if not isinstance(content_type, str):
                    # Key non-string values by their JSON text so they are
                    # hashable, sort alongside the string types, and can be
                    # turned into a filename below.
                    content_type = json.dumps(content_type, ensure_ascii=False)
                content_type_counts[content_type] += 1

                # Check if we need more samples for this type
                samples_for_type = content_type_samples[content_type]
                current_samples = len(samples_for_type)
                if current_samples < samples_per_type:
                    # Check if enough lines have passed since last sample of this type
                    line_gap = line_num - content_type_last_sample_line[content_type]

                    if line_gap >= min_line_gap or current_samples == 0:
                        samples_for_type.append({
                            'line_num': line_num,
                            'data': json_data
                        })
                        content_type_last_sample_line[content_type] = line_num

                        print(f"📝 Collected sample {len(samples_for_type)}/{samples_per_type} for '{content_type}' (line {line_num}, gap: {line_gap})")

                # Periodic progress report (the scan never stops early,
                # because the counts need the whole file)
                if line_num % 10000 == 0:
                    incomplete_types = sum(1 for samples in content_type_samples.values() if len(samples) < samples_per_type)
                    print(f"⏳ Processed {line_num:,} lines... Found {len(content_type_counts)} content types, {incomplete_types} still need more samples")

    except FileNotFoundError:
        print(f"❌ Error: Input file '{input_file}' not found.")
        return False
    except Exception as e:
        # Deliberate best-effort boundary: any other read/decode failure is
        # reported and signalled to the caller with False.
        print(f"❌ Error reading file: {e}")
        return False

    # Save samples to individual files
    print("\n💾 Saving samples to files...")
    total_saved = 0

    for content_type, samples in content_type_samples.items():
        # Create safe filename from content_type (once per type)
        safe_content_type = "".join(c if c.isalnum() or c in "._-" else "_" for c in content_type)
        for i, sample in enumerate(samples, 1):
            output_filename = f"{safe_content_type}_sample_{i:02d}_line_{sample['line_num']:06d}.json"
            output_path = os.path.join(output_dir, output_filename)

            # Write to individual JSON file with pretty formatting
            with open(output_path, 'w', encoding='utf-8') as out_f:
                json.dump(sample['data'], out_f, indent=2, ensure_ascii=False)

            total_saved += 1

    # Print summary
    print("\n📊 SUMMARY")
    print("=" * 60)
    print(f"Total lines processed: {total_lines:,}")
    print(f"Valid JSON lines: {valid_lines:,}")
    print(f"Unique content types found: {len(content_type_counts)}")
    print(f"Sample files saved: {total_saved}")
    print("\nContent type distribution:")

    # Sort by count descending
    sorted_types = sorted(content_type_counts.items(), key=lambda x: x[1], reverse=True)
    for content_type, count in sorted_types:
        samples_collected = len(content_type_samples[content_type])
        print(f"  • {content_type}: {count:,} records ({samples_collected} samples collected)")

    return True, content_type_samples, content_type_counts

# Execute the content type sampling: echo the configuration, run the sampler,
# then list the files in the output directory and preview one record per type.
print("JSON Lines Content Type Sampler")
print("=" * 50)
print(f"Input file: {INPUT_FILE}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Samples per content type: {SAMPLES_PER_TYPE}")
print(f"Minimum line gap between samples: {MIN_LINE_GAP}")
print()

# Run the sampling
result = collect_content_type_samples(INPUT_FILE, OUTPUT_DIR, SAMPLES_PER_TYPE, MIN_LINE_GAP)

# The sampler returns False on failure and a (True, samples, counts) tuple on
# success, so both the truthiness and the first element are checked.
if result and result[0]:
    success, content_type_samples, content_type_counts = result

    # List the created files
    # NOTE(review): this lists every *.json in OUTPUT_DIR, so files left over
    # from earlier runs are shown as well.
    output_path = Path(OUTPUT_DIR)
    if output_path.exists():
        created_files = sorted(output_path.glob("*.json"))
        print(f"\n📁 Created sample files ({len(created_files)}):")

        # Group files by content type for better display
        files_by_type = defaultdict(list)
        for file_path in created_files:
            # Filenames look like "<type>_sample_<NN>_line_<NNNNNN>.json".
            # Split on the LAST "_sample_" so a type whose own name contains
            # "_sample_" is still grouped under its full name.
            content_type = file_path.name.rsplit('_sample_', 1)[0]
            files_by_type[content_type].append(file_path)

        for content_type, files in sorted(files_by_type.items()):
            print(f"\n  {content_type}:")
            for file_path in files:
                file_size = file_path.stat().st_size
                print(f"    - {file_path.name} ({file_size:,} bytes)")

    # Display sample content from each content type
    print("\n" + "="*70)
    print("SAMPLE CONTENT - One example from each content type:")
    print("="*70)

    # Fields printed in the preview, when present in the record
    key_fields = ['item_id', 'title', 'content_type', 'content_subtype', 'c5_data_type', 'published_date']

    for content_type, samples in sorted(content_type_samples.items()):
        if not samples:
            continue

        print(f"\n🏷️  CONTENT TYPE: {content_type}")
        print("-" * 50)

        # Show the first sample
        first_sample = samples[0]
        sample_data = first_sample['data']

        # Show key fields for quick understanding
        for field in key_fields:
            if field in sample_data:
                value = sample_data[field]
                # Truncate long strings so the preview stays on one line
                if isinstance(value, str) and len(value) > 100:
                    value = value[:100] + "..."
                print(f"  {field}: {value}")

        print(f"  [From line {first_sample['line_num']} of source file]")

    print(f"\n🎉 Complete! Collected samples of {len(content_type_samples)} different content types.")
    print(f"Check the '{OUTPUT_DIR}' directory for all sample files.")

JSON Lines Content Type Sampler
Input file: G:\Shared drives\Labs\Constellate\Constellate Sunset\JSTOR Text Mining\Examples\jstor_metadata_2025-05-28.jsonl
Output directory: G:\Shared drives\Labs\Constellate\Constellate Sunset\JSTOR Text Mining\Examples\content_type_samples
Samples per content type: 10
Minimum line gap between samples: 500

🔍 Scanning file for content types...
📏 Ensuring minimum 500 line gap between samples of same type
📝 Collected sample 1/10 for 'contributed_content' (line 1, gap: 1)
📝 Collected sample 1/10 for 'research_report' (line 2, gap: 2)
📝 Collected sample 1/10 for 'multi_part_research_report_part' (line 5, gap: 5)
📝 Collected sample 1/10 for 'book' (line 39, gap: 39)
📝 Collected sample 1/10 for 'book_part' (line 40, gap: 40)
📝 Collected sample 1/10 for 'article' (line 215, gap: 215)
📝 Collected sample 2/10 for 'contributed_content' (line 501, gap: 500)
📝 Collected sample 2/10 for 'research_report' (line 518, gap: 516)
📝 Collected sample 1/10 for 'multi_part_