In [1]:
# JSONL Title Regex Search
# This notebook searches through a JSONL file for items where the title matches a regular expression

import json
import re
import os
from pathlib import Path
from datetime import datetime

# Configuration
# Both values are machine-specific absolute Windows paths (raw strings so the
# backslashes are not treated as escapes); edit them before running elsewhere.
# INPUT_FILE: the JSON Lines file handed to search_titles_regex as `input_file`.
INPUT_FILE = r"G:\Shared drives\Labs\Constellate\Constellate Sunset\JSTOR Text Mining\Examples\jstor_metadata_2025-05-28.jsonl"
# OUTPUT_DIR: the directory handed to search_titles_regex as `output_dir`.
OUTPUT_DIR = r"G:\Shared drives\Labs\Constellate\Constellate Sunset\JSTOR Text Mining\Examples\regex_search_results"

def _match_title_in_line(line, line_num, compiled_regex, stats):
    """
    Parse one non-empty JSONL line and test its title against the pattern.

    Updates the counters in ``stats`` in place. Returns the match record
    (dict with 'item_id', 'title', 'line_num', 'content_type') when the
    title matches, otherwise None.
    """
    try:
        json_data = json.loads(line)
    except json.JSONDecodeError as e:
        print(f"‚ö† Warning: Line {line_num} is not valid JSON - {e}")
        return None
    stats['valid_json_lines'] += 1

    # json.loads can legally return a list, number, string, ... which has no
    # .get(); skip those instead of letting AttributeError abort the search.
    if not isinstance(json_data, dict):
        stats['non_object_records'] += 1
        return None

    title = json_data.get('title')
    if title is None:
        return None
    stats['items_with_titles'] += 1

    # Pattern.search() raises TypeError on anything that is not a str.
    if not isinstance(title, str):
        stats['non_string_titles'] += 1
        return None

    if not compiled_regex.search(title):
        return None

    return {
        'item_id': json_data.get('item_id', f'unknown_line_{line_num}'),
        'title': title,
        'line_num': line_num,
        'content_type': json_data.get('content_type', 'unknown'),
    }


def search_titles_regex(input_file, output_dir, regex_pattern, case_sensitive=False):
    """
    Search through JSONL file for titles matching a regular expression.

    Every non-empty line of ``input_file`` is parsed as JSON. Records that are
    JSON objects with a string ``title`` are tested with ``re.search``. The
    ``item_id`` of each match (or ``unknown_line_<n>`` when the record has no
    ``item_id``) is written, one per line, to
    ``matching_item_ids_<YYYYmmdd_HHMMSS>.txt`` inside ``output_dir``, and a
    summary is printed.

    Lines that are not valid JSON are reported and skipped. Records that are
    not JSON objects, or whose ``title`` is not a string, are skipped and
    counted in the summary rather than aborting the search.

    Args:
        input_file (str): Path to the input JSON Lines file
        output_dir (str): Directory to save results (created if missing)
        regex_pattern (str): Regular expression pattern to search for
        case_sensitive (bool): Whether search should be case sensitive

    Returns:
        ``False`` if the pattern is invalid, the input file cannot be read,
        or the results cannot be written. Otherwise the tuple
        ``(True, matching_items, output_path)``, where ``matching_items`` is a
        list of dicts with keys 'item_id', 'title', 'line_num' and
        'content_type', in file order.
    """

    # Validate the pattern first so a bad regex has no filesystem side effects
    flags = 0 if case_sensitive else re.IGNORECASE
    try:
        compiled_regex = re.compile(regex_pattern, flags)
    except re.error as e:
        print(f"‚ùå Error: Invalid regular expression '{regex_pattern}' - {e}")
        return False

    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Initialize tracking variables
    stats = {
        'total_lines': 0,
        'valid_json_lines': 0,
        'items_with_titles': 0,
        'non_object_records': 0,
        'non_string_titles': 0,
    }
    matching_items = []

    print(f"üîç Searching for pattern: '{regex_pattern}'")
    print(f"üìÅ Case sensitive: {case_sensitive}")
    print(f"üìÑ Input file: {input_file}")
    print("=" * 70)

    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            for line_num, raw_line in enumerate(f, 1):
                stats['total_lines'] += 1

                # Blank lines are counted but not parsed
                line = raw_line.strip()
                if line:
                    match = _match_title_in_line(line, line_num, compiled_regex, stats)
                    if match is not None:
                        matching_items.append(match)

                        # Progress indicator for matches
                        if len(matching_items) % 100 == 0:
                            print(f"‚úÖ Found {len(matching_items)} matches so far...")

                # Progress indicator for processing; checked for every line so
                # a blank or malformed 50,000th line does not suppress it
                if line_num % 50000 == 0:
                    print(f"‚è≥ Processed {line_num:,} lines... {len(matching_items)} matches found")

    except FileNotFoundError:
        print(f"‚ùå Error: Input file '{input_file}' not found.")
        return False
    except Exception as e:
        # Deliberately broad: this is the boundary of an interactive tool and
        # any read/decode failure should be reported, not raised.
        print(f"‚ùå Error reading file: {e}")
        return False

    # Generate timestamp for output files
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save item_ids to text file
    output_filename = f"matching_item_ids_{timestamp}.txt"
    output_path = os.path.join(output_dir, output_filename)

    try:
        with open(output_path, 'w', encoding='utf-8') as out_f:
            out_f.writelines(f"{item['item_id']}\n" for item in matching_items)

        print(f"\nüíæ Saved {len(matching_items)} item IDs to: {output_filename}")

    except Exception as e:
        print(f"‚ùå Error saving results: {e}")
        return False

    # Print comprehensive summary
    print("\n" + "=" * 70)
    print("üìä SEARCH RESULTS SUMMARY")
    print("=" * 70)
    print(f"Regular expression: '{regex_pattern}'")
    print(f"Case sensitive: {case_sensitive}")
    print(f"Total lines processed: {stats['total_lines']:,}")
    print(f"Valid JSON lines: {stats['valid_json_lines']:,}")
    print(f"Items with non-null titles: {stats['items_with_titles']:,}")
    print(f"Items matching regex: {len(matching_items):,}")
    if stats['non_object_records']:
        print(f"Records skipped (not a JSON object): {stats['non_object_records']:,}")
    if stats['non_string_titles']:
        print(f"Records skipped (title is not a string): {stats['non_string_titles']:,}")

    if stats['items_with_titles'] > 0:
        match_percentage = (len(matching_items) / stats['items_with_titles']) * 100
        print(f"Match percentage: {match_percentage:.2f}% of items with titles")

    # Show content type breakdown of matches
    if matching_items:
        content_type_counts = {}
        for item in matching_items:
            # Key by the string form: a list/object content_type is unhashable
            # and would otherwise crash here, after results were already saved.
            content_type = str(item['content_type'])
            content_type_counts[content_type] = content_type_counts.get(content_type, 0) + 1

        print("\nüìã Matches by content type:")
        sorted_types = sorted(content_type_counts.items(), key=lambda x: x[1], reverse=True)
        for content_type, count in sorted_types:
            print(f"  ‚Ä¢ {content_type}: {count:,} matches")

    # Show sample matches (the first few, in file order)
    sample_matches = matching_items[:10]
    if sample_matches:
        print(f"\nüîç Sample matches (first {len(sample_matches)}):")
        print("-" * 70)
        for i, match in enumerate(sample_matches, 1):
            title_preview = match['title'][:80] + "..." if len(match['title']) > 80 else match['title']
            print(f"{i:2d}. Line {match['line_num']:6d} | {match['item_id']}")
            print(f"     Title: {title_preview}")
            print()

    print(f"\nüéâ Search complete! Results saved to: {output_path}")

    return True, matching_items, output_path

# Interactive driver: show a short usage banner, prompt for the pattern and
# case-sensitivity, then run the search and report the outcome.
print("\n".join([
    "JSONL Title Regex Search Tool",
    "=" * 50,
    "This tool searches for titles matching a regular expression pattern.",
    "Examples:",
    "  - 'COVID.*vaccine' (COVID followed by vaccine)",
    "  - '^The ' (titles starting with 'The ')",
    "  - '\\d{4}' (contains a 4-digit number)",
    "  - 'climate.*change|global.*warming' (climate change OR global warming)",
    "",
]))

# Prompt for the pattern; surrounding whitespace is discarded
regex_pattern = input("Enter your regular expression pattern: ").strip()

if regex_pattern:
    # Anything other than an explicit yes-like answer means case-insensitive
    case_sensitive_input = input("Case sensitive search? (y/n, default=n): ").strip().lower()
    case_sensitive = case_sensitive_input in ('y', 'yes', 'true')

    print(f"\nüöÄ Starting search with pattern: '{regex_pattern}'")

    # search_titles_regex returns False on failure or a (True, items, path) tuple
    result = search_titles_regex(INPUT_FILE, OUTPUT_DIR, regex_pattern, case_sensitive)

    if not (result and result[0]):
        print("‚ùå Search failed or no results found.")
    else:
        success, matching_items, output_file = result
        print(f"\n‚ú® Successfully found {len(matching_items)} matching items!")
        print(f"üìÑ Item IDs saved to: {output_file}")
else:
    print("‚ùå No pattern entered. Exiting.")

JSONL Title Regex Search Tool
This tool searches for titles matching a regular expression pattern.
Examples:
  - 'COVID.*vaccine' (COVID followed by vaccine)
  - '^The ' (titles starting with 'The ')
  - '\d{4}' (contains a 4-digit number)
  - 'climate.*change|global.*warming' (climate change OR global warming)



Enter your regular expression pattern:  foster care
Case sensitive search? (y/n, default=n):  n



üöÄ Starting search with pattern: 'foster care'
üîç Searching for pattern: 'foster care'
üìÅ Case sensitive: False
üìÑ Input file: G:\Shared drives\Labs\Constellate\Constellate Sunset\JSTOR Text Mining\Examples\jstor_metadata_2025-05-28.jsonl
‚è≥ Processed 50,000 lines... 4 matches found
‚è≥ Processed 100,000 lines... 5 matches found
‚è≥ Processed 150,000 lines... 5 matches found
‚è≥ Processed 200,000 lines... 5 matches found
‚è≥ Processed 250,000 lines... 5 matches found
‚è≥ Processed 300,000 lines... 5 matches found
‚è≥ Processed 350,000 lines... 5 matches found
‚è≥ Processed 400,000 lines... 5 matches found
‚è≥ Processed 450,000 lines... 5 matches found
‚è≥ Processed 500,000 lines... 5 matches found
‚è≥ Processed 550,000 lines... 6 matches found
‚è≥ Processed 600,000 lines... 6 matches found
‚è≥ Processed 650,000 lines... 6 matches found
‚è≥ Processed 700,000 lines... 8 matches found
‚è≥ Processed 750,000 lines... 8 matches found
‚è≥ Processed 800,000 lines... 8 matches found
‚