In [5]:
import requests
import json
from typing import List, Dict, Any, Set, Tuple
from collections import defaultdict
import google.generativeai as genai
import urllib3
from urllib.parse import urlencode
import time
from difflib import SequenceMatcher
import tempfile
import os
import re
# Update parameters
# Main calls Analyzer(strategies, content_store, BATCH_DOC_EVAL_V1, SING_DOC_EVAL_V1,SING_DOC_SUMMARY_V1)
class TobaccoDocAnalyzer:
    def __init__(self):
        self.base_url = "https://solr.idl.ucsf.edu/solr/ltdl3/select"
        self.ocr_base = "https://download.industrydocuments.ucsf.edu/"
        genai.configure(api_key='AIzaSyDl7OsvN8gB8v33BkcIzmSuflwia7YOrQk')
        self.model = genai.GenerativeModel('gemini-1.5-flash')
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        # Document tracking
        self.document_frequencies = defaultdict(int)
        self.document_store = {}  # Single source of truth for all document data
        self.title_hash = set()  # Store normalized titles for fast duplicate checking

    def _normalize_title(self, title: str) -> str:
        """Create a normalized version of the title for comparison"""
        if not title:
            return ""
        # Remove punctuation, convert to lowercase, and remove extra whitespace
        normalized = ''.join(c.lower() for c in title if c.isalnum() or c.isspace())
        return ' '.join(normalized.split())

    
    def _add_to_document_store(self, doc: Dict[str, Any]) -> bool:
        """Add document to store if it's not a duplicate. Return True if added."""
        doc_id = doc['id']
        title = doc.get('ti', '')
        
        # Skip if exact ID already exists
        if doc_id in self.document_store:
            return False
            
        # Get normalized title
        norm_title = self._normalize_title(title)
        if not norm_title:  # If no title, just add the doc
            self.document_store[doc_id] = {
                'metadata': doc,
                'analysis': None,
                'ocr_text': None
            }
            return True
            
        # Check for duplicate title
        if norm_title in self.title_hash:
            print(f"Skipping document {doc_id} due to seen title")
            return False
            
        # Add document and its normalized title
        self.title_hash.add(norm_title)
        self.document_store[doc_id] = {
            'metadata': doc,
            'analysis': None,
            'ocr_text': None
        }
        return True
    def build_url(self, params):
        """Build URL manually to match exact format from website"""
        base = self.base_url + "?q=*:*"
        for fq in params.get('fq', []):
            base += f"&fq={requests.utils.quote(fq)}"
        for key, value in params.items():
            if key != 'fq':
                base += f"&{key}={requests.utils.quote(str(value))}"
        return base

    def is_public_doc(self, doc):
        """Check if document is public and unrestricted"""
        availability = doc.get('availability', [])
        return "public" in availability or "no restrictions" in availability

    def get_ocr_text(self, doc_id: str) -> str:
        """Gets OCR text for a document"""
        path_segment = '/'.join(list(doc_id[:4].lower()))
        url = f"{self.ocr_base}{path_segment}/{doc_id.lower()}/{doc_id.lower()}.ocr"
        try:
            response = requests.get(url, verify=False, timeout=10)
            if response.status_code == 200:
                return response.text
            return ""
        except Exception as e:
            print(f"Error getting OCR text for {doc_id}: {e}")
            return ""

    def generate_search_strategies(self, user_query: str) -> List[Dict[str, Any]]:
        """Generate multiple search strategies based on the user query"""
        prompt = f"""
        Given this research question about tobacco documents: "{user_query}"
        Generate 3 different search strategies. Each strategy should contain 2-4 key terms that would help find relevant documents.
        Return your response in this exact JSON format with no additional text:
        {{
            "strategies": [
                {{
                    "search_terms": "term1 term2",
                    "filters": {{}},
                    "rationale": "why this might work"
                }}
            ]
        }}
        """
        
        try:
            response = self.model.generate_content(prompt)
            response_text = response.text.strip()
            
            # Try to find JSON in the response
            import re
            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
            if json_match:
                strategies = json.loads(json_match.group())
                print("\nGenerated search strategies:")
                for idx, strat in enumerate(strategies['strategies'], 1):
                    print(f"\nStrategy {idx}:")
                    print(f"Terms: {strat['search_terms']}")
                    print(f"Filters: {strat['filters']}")
                return strategies['strategies']
            else:
                return self._fallback_search_strategy(user_query)
                
        except Exception as e:
            print(f"Error generating search strategies: {e}")
            return self._fallback_search_strategy(user_query)

    def _fallback_search_strategy(self, query: str) -> List[Dict[str, Any]]:
        """Generate basic search strategies from query terms"""
        print("\nFalling back to query text parsing")
        query_terms = query.lower()
        # Extract any quoted phrases or word combinations
        terms = re.findall(r'"([^"]*)"|\b\w+\s+\w+\b|\b\w+\b', query_terms)
        
        strategies = []
        if len(terms) >= 2:
            strategies.append({
                "search_terms": f"{terms[0]} AND {terms[1]}",
                "filters": {},
                "rationale": "Primary terms combination"
            })
        
        if len(terms) >= 3:
            strategies.append({
                "search_terms": f"{terms[1]} AND {terms[2]}",
                "filters": {},
                "rationale": "Secondary terms combination"
            })
        
        all_terms = " AND ".join(terms[:4])
        strategies.append({
            "search_terms": all_terms,
            "filters": {},
            "rationale": "Combined key terms"
        })
        
        return strategies

    def execute_search(self, strategy: Dict[str, Any], max_results: int = 50) -> List[Dict[str, Any]]:
        """Execute a single search strategy"""
        params = {
            'q': strategy['search_terms'],
            'fq': ['availability:public'],
            'wt': 'json',
            'rows': str(max_results),
            'sort': 'score desc',
            'fl': 'id,au,ti,bn,dd,dt,availability,pg,attach,access,artifact'
        }
        
        # Add strategy filters
        for field, value in strategy['filters'].items():
            params['fq'].append(f'{field}:{value}')

        try:
            response = requests.get(self.base_url, params=params, verify=False)
            if response.status_code == 200:
                return [doc for doc in response.json()['response']['docs'] 
                       if self.is_public_doc(doc)]
        except Exception as e:
            print(f"Error executing search: {e}")
        return []

    def analyze_topic(self, user_query: str, max_iterations: int = 3) -> Dict[str, Any]:
        """Main analysis pipeline"""
        print(f"\nStarting analysis for: {user_query}")
        
        # Reset tracking for new analysis
        self.document_frequencies.clear()
        self.document_store.clear()
        self.title_hash.clear()  # Clear title hash
        
        # Phase 1: Initial Search
        strategies = self.generate_search_strategies(user_query)
        
        for strategy in strategies:
            print(f"\nExecuting strategy: {strategy['search_terms']}")
            docs = self.execute_search(strategy)
            if docs:
                new_docs = []
                for doc in docs:
                    doc_id = doc['id']
                    self.document_frequencies[doc_id] += 1
                    if self._add_to_document_store(doc):  # Use new method
                        new_docs.append(doc)
                
                # Process only new documents in batches
                if new_docs:
                    for i in range(0, len(new_docs), 5):
                        batch = new_docs[i:i+5]
                        self._process_document_batch(batch, user_query)
            time.sleep(2)  # Rate limiting
        
        # Phase 2: Expansion
        iteration = 0
        while iteration < max_iterations:
            print(f"\nExpansion iteration {iteration + 1}")
            
            # Sort documents by weighted score
            sorted_docs = self._get_sorted_documents()[:15]
            
            new_docs_count = 0
            for doc_id, _ in sorted_docs:
                before_docs, target, after_docs = self.find_surrounding_docs(doc_id)
                
                # Process new documents found
                new_docs = []
                for doc in before_docs + after_docs:
                    doc_id = doc['id']
                    self.document_frequencies[doc_id] += 1
                    if self._add_to_document_store(doc):  # Use new method
                        new_docs.append(doc)
                        new_docs_count += 1
                
                # Analyze new documents
                if new_docs:
                    self._process_document_batch(new_docs, user_query)
                time.sleep(1)
            
            print(f"Found {new_docs_count} new documents")
            print(f"Total unique documents: {len(self.document_store)}")
            
            # Check for saturation
            if new_docs_count < 3:
                print("Reached saturation")
                break
            
            iteration += 1
        
        # Prepare final results
        final_results = self._prepare_final_results()
        
        # Get top docs for final analysis
        top_docs = [self.document_store[doc_id]['metadata'] 
                for doc_id in list(final_results.keys())[:10]]
        
        final_analysis = self.analyze_with_gemini(top_docs, user_query)
        
        return {
            'raw_results': final_results,
            'final_analysis': final_analysis if top_docs else "No documents found",
            'document_count': len(final_results),
            'total_documents_seen': len(self.document_store),
            'frequency_statistics': dict(self.document_frequencies)
        }

    def _process_document_batch(self, docs: List[Dict[str, Any]], query_context: str):
        """Process a batch of new documents"""
        # Get OCR text for batch
        for doc in docs:
            doc_id = doc['id']
            if not self.document_store[doc_id]['ocr_text']:
                self.document_store[doc_id]['ocr_text'] = self.get_ocr_text(doc_id)
        
        # Analyze batch
        analysis = self.analyze_document_batch(docs, query_context)
        
        # Store analysis results
        for doc_id, doc_analysis in analysis.items():
            if doc_id in self.document_store:
                self.document_store[doc_id]['analysis'] = doc_analysis

    def analyze_document_batch(self, docs: List[Dict[str, Any]], query_context: str) -> Dict[str, Any]:
        """Analyze documents in batches with fallback to individual processing for problematic batches"""
        if not docs:
            return {}
        
        BATCH_SIZE = 5
        batch_results = {}
        
        # Define the JSON example template separately
        EXAMPLE_JSON = (
            '{\n'
            '    "doc_id1": {\n'
            '        "score": 7,\n'
            '        "entities": {\n'
            '            "people": ["name1", "name2"],\n'
            '            "projects": ["project1"],\n'
            '            "products": ["product1"],\n'
            '            "terms": ["term1"],\n'
            '            "dates": ["date1"]\n'
            '        }\n'
            '    }\n'
            '}'
        )
        
        # First try batch processing
        for i in range(0, len(docs), BATCH_SIZE):
            batch = docs[i:i+BATCH_SIZE]
            doc_texts = []
            batch_ids = []
            
            for doc in batch:
                doc_id = doc['id']
                title = doc.get('ti', 'No title')
                batch_ids.append(doc_id)
                doc_texts.append(
                    f"Document ID: {doc_id}\n"
                    f"Title: {title}\n"
                    f"Content:\n{self.document_store[doc_id]['ocr_text'][:3000]}..."
                )

            prompt = (
                f"Research context: {query_context}\n\n"
                "Analyze these tobacco industry documents. For each document, provide:\n"
                "1. A relevance score (0-10)\n"
                "2. Key entities found (people, projects, products, terms, dates)\n\n"
                f"Return only a JSON object like this example:\n{EXAMPLE_JSON}\n\n"
                "Documents to analyze:\n\n"
                f"{chr(10) + '---' + chr(10)}".join(doc_texts)
            )
            
            try:
                response = self.model.generate_content(prompt)
                response_text = response.text.strip()
                
                # Try to find JSON in the response
                json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
                if json_match:
                    analysis = json.loads(json_match.group())
                    print(f"\nScores for batch:")
                    for doc_id, details in analysis.items():
                        print(f"Document {doc_id}: Score {details['score']}/10")
                    batch_results.update(analysis)
                    continue  # Move to next batch if successful
                
            except ValueError as e:
                if "reciting from copyrighted material" in str(e):
                    print(f"\n⚠️ Copyright detection in batch {i//BATCH_SIZE + 1} - processing documents individually")
                    # Process problematic batch one by one
                    for doc in batch:
                        doc_id = doc['id']
                        try:
                            individual_result = self._analyze_single_document(doc, query_context)
                            batch_results.update(individual_result)
                        except Exception as doc_error:
                            print(f"Error with document {doc_id}: {doc_error}")
                            batch_results.update(self._create_basic_analysis([doc]))
                
            except Exception as e:
                print(f"\nError in batch {i//BATCH_SIZE + 1}: {e}")
                # Fallback to basic analysis for the entire batch
                batch_results.update(self._create_basic_analysis(batch))
            
            time.sleep(0.5)  # Rate limiting between batches
        
        return batch_results

    def _analyze_single_document(self, doc: Dict[str, Any], query_context: str) -> Dict[str, Any]:
        """Analyze a single document when batch processing fails"""
        doc_id = doc['id']
        title = doc.get('ti', 'No title')
        ocr_text = self.document_store[doc_id]['ocr_text'][:3000] if self.document_store[doc_id]['ocr_text'] else ''
        
        prompt = f"""
        Research context: {query_context}

        Analyze this tobacco industry document. Instead of directly quoting, summarize:
        1. A relevance score (0-10)
        2. Key entities found (people, projects, products, terms, dates)

        Return only a JSON object with NO direct quotes:
        {{
            "{doc_id}": {{
                "score": 7,
                "entities": {{
                    "people": ["name1", "name2"],
                    "projects": ["project1"],
                    "products": ["product1"],
                    "terms": ["term1"],
                    "dates": ["date1"]
                }}
            }}
        }}

        Document to analyze:
        Document ID: {doc_id}
        Title: {title}
        Content summary: {ocr_text}
        """
        
        try:
            response = self.model.generate_content(prompt)
            response_text = response.text.strip()
            
            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
            if json_match:
                analysis = json.loads(json_match.group())
                print(f"Document {doc_id}: Score {analysis[doc_id]['score']}/10")
                return analysis
                
        except ValueError as e:
            if "reciting from copyrighted material" in str(e):
                print(f"⚠️ Copyright detection for {doc_id} - using metadata only")
                return self._create_basic_analysis([doc])
        except Exception as e:
            print(f"Error analyzing {doc_id}: {e}")
        
        return self._create_basic_analysis([doc])

    def _create_basic_analysis(self, docs: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Create basic document analysis from metadata"""
        analysis = {}
        for doc in docs:
            doc_id = doc['id']
            
            # Handle 'au' as either a list or string
            authors = []
            if 'au' in doc:
                if isinstance(doc['au'], list):
                    authors = doc['au']
                elif isinstance(doc['au'], str):
                    authors = [a.strip() for a in doc['au'].split(';') if a.strip()]
            
            date = doc.get('dd', '')
            title = doc.get('ti', '')
            title_terms = [t.lower() for t in title.split() if len(t) > 3] if isinstance(title, str) else []
            
            analysis[doc_id] = {
                "score": 5,  # Neutral score
                "entities": {
                    "people": authors,
                    "projects": [],
                    "products": [],
                    "terms": title_terms,
                    "dates": [date] if date else []
                }
            }
        return analysis

    def find_surrounding_docs(self, doc_id: str) -> Tuple[List[Dict[str, Any]], Dict[str, Any], List[Dict[str, Any]]]:
        """Find documents before and after the target document"""
        target_response = requests.get(self.base_url, params={
            'q': f'id:{doc_id}',
            'wt': 'json',
            'fl': 'id,au,ti,bn,dd,dt,availability,pg,attach,access,artifact'
        }, verify=False)
        
        if target_response.status_code != 200:
            return [], None, []
            
        response_docs = target_response.json()['response']['docs']
        if not response_docs:
            print(f"Document {doc_id} not found")
            return [], None, []
            
        target_doc = response_docs[0]
        if not self.is_public_doc(target_doc):
            return [], None, []
            
        bates_num = target_doc.get('bn')
        if not bates_num:
            print(f"No Bates number for document {doc_id}")
            return [], None, []
        
        before_params = {
            'fq': [
                '(collectioncode:"pm" OR collectioncode:"msa")',
                'published:true',
                f'bn:[* TO "{bates_num}"}}'
            ],
            'fl': 'id,au,ti,bn,dd,dt,availability,pg,attach,access,artifact',
            'wt': 'json',
            'rows': '3',
            'sort': 'bn_sort desc'
        }
        
        after_params = {
            'fq': [
                '(collectioncode:"pm" OR collectioncode:"msa")',
                'published:true',
                f'bn:{{"{bates_num}" TO *]'
            ],
            'fl': 'id,au,ti,bn,dd,dt,availability,pg,attach,access,artifact',
            'wt': 'json',
            'rows': '3',
            'sort': 'bn_sort asc'
        }
        
        before_url = self.build_url(before_params)
        after_url = self.build_url(after_params)
        
        try:
            before_docs = requests.get(before_url, verify=False, timeout=10)
            after_docs = requests.get(after_url, verify=False, timeout=10)
            
            before_results = []
            after_results = []
            
            if before_docs.status_code == 200:
                before_results = [doc for doc in before_docs.json()['response']['docs'] 
                                if self.is_public_doc(doc)]
            
            if after_docs.status_code == 200:
                after_results = [doc for doc in after_docs.json()['response']['docs'] 
                               if self.is_public_doc(doc)]

            return before_results, target_doc, after_results
            
        except Exception as e:
            print(f"Error finding surrounding documents for {doc_id}: {e}")
            return [], None, []

    def _get_sorted_documents(self) -> List[Tuple[str, float]]:
        """Get documents sorted by weighted score"""
        weighted_scores = []
        for doc_id, doc_data in self.document_store.items():
            if doc_data['analysis']:  # Only include analyzed documents
                base_score = doc_data['analysis']['score']
                frequency_bonus = min(self.document_frequencies[doc_id] * 0.5, 2.0)
                weighted_scores.append((doc_id, base_score + frequency_bonus))
        
        return sorted(weighted_scores, key=lambda x: x[1], reverse=True)

    def _prepare_final_results(self) -> Dict[str, Any]:
        """Prepare final results with all relevant information"""
        sorted_docs = self._get_sorted_documents()
        final_results = {}
        
        for doc_id, weighted_score in sorted_docs:
            doc_data = self.document_store[doc_id]
            if doc_data['analysis']:  # Only include analyzed documents
                final_results[doc_id] = {
                    **doc_data['analysis'],
                    'frequency': self.document_frequencies[doc_id],
                    'weighted_score': weighted_score
                }
        
        return final_results

    def analyze_with_gemini(self, docs: List[Dict[Any, Any]], user_query: str, custom_prompt: str = None) -> str:
        """Generate final analysis using Gemini with simpler, more direct approach"""
        try:
            # Create a single temporary directory for full document texts
            with tempfile.TemporaryDirectory() as temp_dir:
                # Store all document texts in single file with clear separators
                analysis_file = os.path.join(temp_dir, "documents.txt")
                with open(analysis_file, 'w', encoding='utf-8') as f:
                    for doc in docs:
                        doc_id = doc.get('id')
                        title = doc.get('ti', 'No title')
                        date = doc.get('dd', 'No date')
                        authors = ', '.join(doc.get('au', [])) if isinstance(doc.get('au'), list) else doc.get('au', 'Unknown')
                        doc_type = doc.get('dt', 'Unknown type')
                        ocr_text = self.document_store[doc_id]['ocr_text'] if doc_id in self.document_store else self.get_ocr_text(doc_id)
                        
                        f.write(f"""
    === DOCUMENT START ===
    ID: {doc_id}
    Title: {title}
    Date: {date}
    Authors: {authors}
    Type: {doc_type}

    Content:
    {ocr_text}

    === DOCUMENT END ===

    """)
                
                # Read all content
                with open(analysis_file, 'r', encoding='utf-8') as f:
                    all_docs_text = f.read()

                # Create single comprehensive prompt
                prompt = f"""Research Question: {user_query}

    Analyze these tobacco industry documents. Create a detailed summary for each document in this exact format:

    [document_id]: Summary should include:
    - Document type and year
    - Key findings or main points
    - Notable people, organizations, or projects
    - Specific data, numbers, or research findings
    - Marketing strategies or business decisions
    - Public health implications

    Example of desired summary format:
    [ysvj0228]: This 2008 American Journal of Public Health article examines how tobacco manufacturers manipulated menthol levels in cigarettes to target adolescents and young adults. The authors analyzed internal tobacco industry documents, conducted lab tests on various menthol cigarette brands, and reviewed data from the National Survey on Drug Use and Health. Their findings indicate that lower menthol levels, particularly in brands like Newport and Marlboro Milds, were more appealing to younger smokers because they masked the harshness of cigarettes, facilitating smoking initiation and nicotine addiction. Higher menthol levels were targeted toward long-term smokers. The study also reveals a significant increase in magazine advertising expenditures for menthol brands, despite a decline in overall cigarette sales.

    Documents to analyze:

    {all_docs_text}

    Return ONLY document summaries in the specified format, with one blank line between each summary. Group related documents together if they share common themes or topics."""

                try:
                    response = self.model.generate_content(
                        prompt,
                        generation_config=genai.GenerationConfig(
                            candidate_count=1,
                            max_output_tokens=8192,
                            temperature=0.2,
                            top_k=20,
                            top_p=0.8
                        )
                    )
                    return response.text.strip()
                    
                except ValueError as e:
                    if "reciting from copyrighted material" in str(e):
                        # Return metadata-only summaries
                        return "\n\n".join([
                            f"[{doc.get('id', 'Unknown ID')}]: [Content restricted - Metadata only] "
                            f"This {doc.get('dd', 'undated')} document titled '{doc.get('ti', 'No title')}' "
                            f"is a {doc.get('dt', 'document')} authored by "
                            f"{', '.join(doc.get('au', ['Unknown'])) if isinstance(doc.get('au'), list) else doc.get('au', 'Unknown')}."
                            for doc in docs
                        ])
                except Exception as e:
                    print(f"Error in document analysis: {e}")
                    return self._create_basic_metadata_summaries(docs)
                    
        except Exception as e:
            print(f"Error in overall analysis: {e}")
            return self._create_basic_metadata_summaries(docs)

    def _create_basic_metadata_summaries(self, docs: List[Dict[str, Any]]) -> str:
        """Create basic summaries from metadata when analysis fails"""
        return "\n\n".join([
            f"[{doc.get('id', 'Unknown ID')}]: Basic metadata - "
            f"This {doc.get('dd', 'undated')} document titled '{doc.get('ti', 'No title')}' "
            f"is a {doc.get('dt', 'document')} authored by "
            f"{', '.join(doc.get('au', ['Unknown'])) if isinstance(doc.get('au'), list) else doc.get('au', 'Unknown')}."
            for doc in docs
        ])

In [3]:
analyzer = TobaccoDocAnalyzer()
query = input("Enter your research question about tobacco documents: ")
results = analyzer.analyze_topic(query)

print("\nFinal Analysis:")
print(results['final_analysis'])
print(f"\nTotal documents analyzed: {results['document_count']}")
print(f"Total documents seen: {results['total_documents_seen']}")    


In [26]:
import requests

base_url = "https://solr.idl.ucsf.edu/solr/ltdl3/select"

params = {
    'q': 'menthol',  
    'fq': [  
        'availability:public',
        'dd:[19900101 TO 19991231]',
        'dt:memo',
        'collection:"Philip Morris"'
    ],
    'wt': 'json',
    'rows': 10     
}

response = requests.get(base_url, params=params, verify=False)
results = response.json()
print(f"Found {results['response']['numFound']} total documents")

# Look at documents including their type
if results['response']['numFound'] > 0:
    for doc in results['response']['docs'][:3]:
        print("\nDocument:")
        print(f"Title: {doc.get('ti', 'No title')}")
        print(f"Date: {doc.get('dd', 'No date')}")
        print(f"Type: {doc.get('dt', 'No type')}")
        print(f"Collection: {doc.get('collection', 'No collection')}")

Found 19638 total documents

Document:
Title: No title
Date: 1999 August 18
Type: ['memo']
Collection: ['Philip Morris Records', 'Master Settlement Agreement']

Document:
Title: SPECIAL REPORT TO THE FTC
Date: 1993 July 06
Type: ['memo']
Collection: ['Philip Morris Records', 'Master Settlement Agreement']

Document:
Title: FTC SPECIAL REPORT FOR 19960000
Date: 1997 April 22
Type: ['memo']
Collection: ['Philip Morris Records', 'Master Settlement Agreement']


In [6]:
# Initialize analyzer
analyzer = TobaccoDocAnalyzer()

# Build search parameters for empty titles
params = {
    'q': 'ti:"tobacco',  # Search for empty title field
    'fq': ['availability:public'],  # Only public documents
    'wt': 'json',
    'rows': '20',  # Get 20 results
    'fl': 'id,au,ti,bn,dd,dt'  # Fields to return
}

# Execute search
response = requests.get(analyzer.base_url, params=params, verify=False)
docs = response.json()['response']['docs']

# Display results
print(f"Found {len(docs)} documents with empty titles\n")
for doc in docs:
    print(f"ID: {doc['id']}")
    print(f"Date: {doc.get('dd', 'No date')}")
    print(f"Type: {doc.get('dt', 'No type')}")
    print(f"Authors: {doc.get('au', 'No authors')}")
    print(f"Bates: {doc.get('bn', 'No bates number')}")
    print("-" * 50)

Found 20 documents with empty titles

ID: qpbh0182
Date: 1997 October 29
Type: ['deposition', 'trial transcript']
Authors: ['BLANK,MS', 'BRECKENRIDGE,JAE', 'GADDES,A', 'MAISTROS,JD', 'NUNLEY,LD III', 'SHUB,J', 'SWAIN,JW', 'WEBB,K']
Bates: 2062814125-2062814326
--------------------------------------------------
ID: frdx0055
Date: 1997 October 09
Type: ['deposition', 'trial transcript']
Authors: ['SUPREME COURT OF THE STATE OF NEW YORK COUNTY OF NEW YORK']
Bates: 94669244-94669494B
--------------------------------------------------
ID: plcm0166
Date: 1997 October 24
Type: ['pleading']
Authors: No authors
Bates: 2062817661-2062817669
--------------------------------------------------
ID: tmmh0055
Date: 1997 October 23
Type: ['deposition', 'trial transcript']
Authors: ['COUNTY OF NEW YORK', 'SUPREME COURT OF THE STATE OF NEW YORK']
Bates: 94669755-94670002
--------------------------------------------------
ID: ftwx0228
Date: 2004 October 07
Type: ['abstract']
Authors: ['AEGEAN AGRICULTURAL