<a href="https://colab.research.google.com/github/arame/BoE/blob/main/ConvertDocuments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
install_flag = True
if install_flag:
  !pip install PyPDF2
  !pip install python-docx
  !pip install transformers
  !pip install torch
  !pip install openpyxl

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting lxml>=3.1.0 (from python-docx)
  Downloading lxml-5.4.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.5 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lxml-5.4.0-cp311-cp311-manylinux_2_28_x86_64.whl (4.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m75.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lxml, python-docx
Successfully installed lxml-5.4.

In [None]:

import json, os, re, sys, time
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import pandas as pd
import torch
import openpyxl
from google.colab import drive

# Optional imports with graceful degradation:
# each name is bound to None when its package is missing, so the notebook still
# loads and the corresponding reader can raise a clear ImportError on first use.
try:
    import PyPDF2
except ImportError:
    PyPDF2 = None  # checked by the PDF reader before it touches the library

try:
    import docx
except ImportError:
    docx = None  # checked by the DOCX reader before it touches the library

# Mount Google Drive (safe to call multiple times)
# The default base paths used further down live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:


class TranscriptProcessor:
    """
    Main class for processing earnings call transcripts using Phi-4.

    This class handles the entire pipeline from file reading to structured output,
    including text cleaning, Q&A extraction, and LLM-based parsing.

    Typical use: construct an instance (which loads the model) and call
    ``process_transcript`` with the path of a PDF or DOCX transcript.
    """

    # Class constants - adjusted for Phi-4
    # Chunk sizes are character counts (they bound the text handed to each prompt), not tokens.
    DEFAULT_CHUNK_SIZE = 4000  # Smaller chunks for local processing
    QA_CHUNK_SIZE = 5000  # Characters of Q&A text per prompt
    MAX_NEW_TOKENS = 2000  # Passed as max_new_tokens on every generation call
    TEMPERATURE = 0.1  # Sampling temperature used for generation
    MODEL_NAME = "microsoft/Phi-4"  # Phi-4 model

    def __init__(self, ticker: str, year: int, quarter: int, base_path: str ='/content/drive/MyDrive/', device: Optional[str] = None):
        """
        Initialize the TranscriptProcessor with Phi-4.

        Args:
            ticker: company ticker code
            year: year
            quarter: number of the quarter between 1 and 4
            device: Device to run model on ('cuda', 'cpu', or None for auto)
        """
        self.ticker = ticker.upper()
        self.year = year
        self.quarter = quarter
        output_file = f"{ticker}_{year}_Q{quarter}"
        self.output_presentation_file = f"{output_file}_presentation.txt"
        self.output_qa_file = f"{output_file}_qa_data.xlsx"

        base_dir = Path(base_path)
        if not base_dir.exists():
            base_dir.mkdir(parents=True)
        output_dir = Path(os.path.join(base_path, "output"))
        if not output_dir.exists():
            output_dir.mkdir(parents=True)
        self.output_dir = output_dir
        self.output_presentation_path = os.path.join(self.output_dir, self.output_presentation_file)
        self.output_qa_path = os.path.join(self.output_dir, self.output_qa_file)

        # delete and create log directory
        if os.path.exists("logs"):
            for file in os.listdir("logs"):
                os.remove(os.path.join("logs", file))
        log_dir = Path(os.path.join(base_path, "logs"))
        if not log_dir.exists():
            log_dir.mkdir(parents=True)
        self.log_dir = log_dir

        # Setup device
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        print(f"🚀 Initializing Phi-4 on {self.device}...")
        self._initialize_model()

    def _initialize_model(self):
        """Initialize the Phi-4 model and tokenizer.

        Populates ``self.tokenizer``, ``self.model`` and ``self.pipe``.

        Raises:
            Exception: Whatever the tokenizer, model or pipeline constructors
                raise is reported and re-raised unchanged.
        """
        on_cuda = self.device == "cuda"
        dtype = torch.float16 if on_cuda else torch.float32

        try:
            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.MODEL_NAME,
                trust_remote_code=True
            )

            # Load model with appropriate settings
            model_kwargs = {
                "trust_remote_code": True,
                "torch_dtype": dtype,
                "device_map": "auto" if on_cuda else None,
            }

            self.model = AutoModelForCausalLM.from_pretrained(
                self.MODEL_NAME,
                **model_kwargs
            )

            if self.device == "cpu":
                self.model = self.model.to(self.device)

            # Create text generation pipeline
            pipeline_kwargs = {
                "model": self.model,
                "tokenizer": self.tokenizer,
                "torch_dtype": dtype,
            }
            if not on_cuda:
                pipeline_kwargs["device"] = -1
            # On CUDA the model was already placed by device_map="auto"; passing an
            # explicit `device` for such a model is rejected by the pipeline, so it
            # is deliberately omitted there.
            self.pipe = pipeline("text-generation", **pipeline_kwargs)

            print(f"✅ Phi-4 model loaded successfully on {self.device}")

        except Exception as e:
            print(f"❌ Error loading Phi-4 model: {e}")
            print("💡 Make sure you have sufficient memory and the transformers library installed")
            raise

    def _generate_response(self, prompt: str) -> str:
        """
        Generate response using Phi-4 model.

        Args:
            prompt: Input prompt for the model

        Returns:
            Generated response text
        """
        try:
            # Format prompt for Phi-4
            formatted_prompt = f"<|user|>\n{prompt}<|end|>\n<|assistant|>\n"

            # Generate response
            result = self.pipe(
                formatted_prompt,
                max_new_tokens=self.MAX_NEW_TOKENS,
                temperature=self.TEMPERATURE,
                do_sample=True if self.TEMPERATURE > 0 else False,
                pad_token_id=self.tokenizer.eos_token_id,
                return_full_text=False  # Only return the generated part
            )

            response = result[0]['generated_text'].strip()
            return response

        except Exception as e:
            print(f"❌ Error generating response: {e}")
            return ""

    def _fix_pdf_encoding(self, text: str) -> str:
        """
        Clean and fix common PDF encoding issues.

        PDFs often contain malformed Unicode characters that need to be replaced
        with their proper equivalents for better text processing.

        Args:
            text (str): Raw text extracted from PDF

        Returns:
            str: Cleaned text with fixed encoding issues
        """
        # Common PDF encoding fixes
        encoding_fixes = {
            'â€"': '—',     # Em dash
            'â€™': "'",     # Right single quotation mark
            'â€œ': '"',     # Left double quotation mark
            'â€': '"',      # Right double quotation mark
            'â€¢': '•',     # Bullet point
            'â€¦': '…',     # Horizontal ellipsis
            'â€˜': "'",     # Left single quotation mark
            'Â': '',        # Non-breaking space artifact
            'â€‹': '',      # Zero-width space
            'ï¿½': '',      # Replacement character
            'â€Š': ' ',     # Hair space
            'â€‰': ' ',     # Thin space
            'â€ˆ': ' ',     # Punctuation space
            'â€‡': ' ',     # Figure space
        }

        # Apply encoding fixes
        for malformed, correct in encoding_fixes.items():
            text = text.replace(malformed, correct)

        # Remove any remaining non-ASCII characters and normalize whitespace
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)
        return text.strip()

    def _read_pdf_file(self, file_path: str) -> str:
        """
        Read a PDF and return the text of all its pages.

        Pages whose extracted text is empty are skipped; the remaining page
        texts are joined with newlines and passed through ``_fix_pdf_encoding``.

        Args:
            file_path (str): Path to the PDF file

        Returns:
            str: Extracted and cleaned text content

        Raises:
            ImportError: If PyPDF2 is not installed
            Exception: If PDF reading fails
        """
        if not PyPDF2:
            message = (
                "PyPDF2 is required to process PDF files. "
                "Install it with: pip install PyPDF2"
            )
            raise ImportError(message)

        try:
            with open(file_path, 'rb') as pdf_handle:
                reader = PyPDF2.PdfReader(pdf_handle)
                # Extract lazily, keeping only pages that produced some text
                extracted = (page.extract_text() for page in reader.pages)
                raw_text = '\n'.join(page_text for page_text in extracted if page_text)

            return self._fix_pdf_encoding(raw_text)

        except Exception as e:
            raise Exception(f"Failed to read PDF file '{file_path}': {str(e)}")

    def _read_docx_file(self, file_path: str) -> str:
        """
        Read a DOCX document and return its paragraph text.

        Paragraphs that are empty or whitespace-only are skipped; the rest are
        joined with newlines.

        Args:
            file_path (str): Path to the DOCX file

        Returns:
            str: Extracted text content

        Raises:
            ImportError: If python-docx is not installed
            Exception: If DOCX reading fails
        """
        if not docx:
            message = (
                "python-docx is required to process DOCX files. "
                "Install it with: pip install python-docx"
            )
            raise ImportError(message)

        try:
            document = docx.Document(file_path)
            # Keep the original paragraph text, but only for non-blank paragraphs
            kept = [para.text for para in document.paragraphs if para.text.strip()]
            return '\n'.join(kept)

        except Exception as e:
            raise Exception(f"Failed to read DOCX file '{file_path}': {str(e)}")

    def read_file(self, file_path: str) -> str:
        """
        Read and extract text from supported file formats.

        Supports PDF and DOCX files with automatic format detection based on
        file extension.

        Args:
            file_path (str): Path to the input file

        Returns:
            str: Extracted text content

        Raises:
            FileNotFoundError: If the specified file doesn't exist
            Exception: If file format is unsupported or reading fails
        """
        path = Path(file_path)

        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        file_extension = path.suffix.lower()

        if file_extension == '.pdf':
            return self._read_pdf_file(file_path)

        if file_extension in ['.docx', '.doc']:
            return self._read_docx_file(file_path)

        raise Exception(
            f"Unsupported file format: {file_extension}. "
            "Supported formats: .pdf, .docx, .doc"
        )

    def _find_qa_section_start(self, text: str) -> int:
        """
        Locate the start of the Q&A section in the transcript.

        Uses multiple patterns to identify where the Q&A section begins,
        including specific analyst name patterns.

        Args:
            text (str): Full transcript text

        Returns:
            int: Character position where Q&A section starts, or -1 if not found
        """
        qa_patterns = [
            # Traditional Q&A markers
            r'QUESTION AND ANSWER SECTION',
            r'QUESTIONS? AND ANSWERS?',
            r'Q&A SECTION',
            r'Q&A',
            r'(?i)can\s*we\s*go\s*to\s*Q&?A',  # Common transition phrase

            # Look for first analyst question pattern (NAME, COMPANY:)
            r'(?m)^[A-Z][A-Z\s]+,\s+[A-Z][A-Z\s&]+:\s*(?:Good morning|Hi|Thank you)',

            # Operator introducing questions
            r'(?i)OPERATOR:.*(?:question|Q:)',

            # Simple patterns as fallback
            r'(?i)Q:',
            r'(?m)^[A-Z\s]+:\s*(?:Good morning|Hi|Thank you).*(?:question|ask)',
        ]

        # Try each pattern and return the earliest match
        earliest_match = len(text)  # Start with end of text
        found_match = False

        for pattern in qa_patterns:
            match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
            if match and match.start() < earliest_match:
                earliest_match = match.start()
                found_match = True

        return earliest_match if found_match else -1

    def _create_presentation_cleaning_prompt(self, chunk: str) -> str:
        """
        Create a prompt for cleaning presentation content using Phi-4.

        Args:
            chunk (str): Text chunk to be cleaned

        Returns:
            str: Formatted prompt for the model
        """
        return f"""You are helping to clean an earnings call transcript. Remove only the conference operator's technical instructions while keeping all executive and analyst remarks.

REMOVE these types of operator lines:
- "Your line is open"
- "We'll now take our next question"
- "Please hold while we connect your call"
- "Thank you for joining today's call"
- Technical connection instructions

KEEP everything else exactly as written, including:
- All executive presentations and remarks
- All analyst questions and comments
- Company names, financial figures, and business content

Return only the cleaned text with no additional commentary.

Text to clean:
{chunk}"""

    def _clean_presentation_with_llm(self, text: str) -> str:
        """
        Clean presentation text using Phi-4 processing in chunks.

        Processes the presentation text in manageable chunks to remove operator
        instructions and other non-content elements while preserving the actual
        presentation content.

        Args:
            text (str): Raw presentation text

        Returns:
            str: Cleaned presentation text
        """
        # Split text into manageable chunks
        chunks = [
            text[i:i + self.DEFAULT_CHUNK_SIZE]
            for i in range(0, len(text), self.DEFAULT_CHUNK_SIZE)
        ]

        cleaned_chunks = []

        for idx, chunk in enumerate(chunks, 1):
            prompt = self._create_presentation_cleaning_prompt(chunk)

            try:
                cleaned_text = self._generate_response(prompt)
                cleaned_chunks.append(cleaned_text)

                # Save debug files
                debug_file = f"cleaned_presentation_prompt_chunk_{idx}.txt"
                self._save_debug_file(debug_file, prompt)
                debug_file = f"cleaned_presentation_response_chunk_{idx}.txt"
                self._save_debug_file(debug_file, cleaned_text)

                print(f"✅ Presentation chunk {idx}/{len(chunks)} cleaned successfully")

            except Exception as e:
                print(f"❌ Error cleaning presentation chunk {idx}: {e}")
                # Continue with other chunks even if one fails
                continue

        return "\n\n".join(cleaned_chunks)

    def _create_qa_extraction_prompt(self, chunk: str) -> str:
        """
        Create a prompt for extracting Q&A data using Phi-4.

        Args:
            chunk (str): Q&A text chunk to be processed

        Returns:
            str: Formatted prompt for the model
        """
        return f"""Extract questions and answers from this earnings call transcript. Remove operator instructions but keep all analyst questions and executive answers.

Return ONLY a valid JSON array in this exact format:
[
{{"question_number": 1, "type": "question", "speaker_name": "John Smith", "speaker_details": "Analyst, Goldman Sachs", "text": "My question is about..."}},
{{"question_number": 1, "type": "answer", "speaker_name": "Jane Doe", "speaker_details": "CEO", "text": "Thanks for your question..."}}
]

Rules:
- Remove operator lines like "Your line is open", "Next question comes from"
- Keep exact wording of questions and answers
- Number questions sequentially starting from 1
- Include follow-up questions as separate entries
- Use "Analyst" or "Executive" if names unclear
- Return ONLY the JSON array, no other text

Transcript:
{chunk}"""

    def _process_qa_chunk(self, chunk: str, chunk_idx: int) -> List[Dict]:
        """
        Process a single Q&A chunk using Phi-4.

        Args:
            chunk (str): Q&A text chunk
            chunk_idx (int): Index of the current chunk

        Returns:
            List[Dict]: Extracted Q&A entries for this chunk
        """
        prompt = self._create_qa_extraction_prompt(chunk)

        # Save prompt for debugging
        debug_file = f"qa_prompt_chunk_{chunk_idx}.txt"
        self._save_debug_file(debug_file, prompt)

        try:
            raw_response = self._generate_response(prompt)

            # Save raw response for debugging
            debug_file = f"qa_response_chunk_{chunk_idx}.txt"
            self._save_debug_file(debug_file, raw_response)

            if not raw_response:
                print(f"⚠️  Chunk {chunk_idx} returned empty response")
                return []

            # Extract JSON from response - be more flexible with Phi-4 output
            json_start = raw_response.find("[")
            json_end = raw_response.rfind("]") + 1

            if json_start == -1 or json_end == 0:
                print(f"⚠️  Chunk {chunk_idx}: No JSON array found in response")
                return []

            json_str = raw_response[json_start:json_end]

            try:
                chunk_data = json.loads(json_str)
                if isinstance(chunk_data, list):
                    print(f"✅ Chunk {chunk_idx} processed: {len(chunk_data)} entries extracted")
                    return chunk_data
                else:
                    print(f"⚠️  Chunk {chunk_idx}: JSON structure is not a list")
                    return []

            except json.JSONDecodeError as json_err:
                print(f"❌ Chunk {chunk_idx} JSON parsing failed: {json_err}")
                debug_file = f"error_chunk_{chunk_idx}.log"
                self._save_debug_file(
                    debug_file,
                    f"JSON parsing error: {json_err}\n\nRaw content:\n{raw_response}"
                )
                return []

        except Exception as e:
            print(f"❌ Chunk {chunk_idx} processing failed: {e}")
            debug_file = f"error_chunk_{chunk_idx}.log"
            self._save_debug_file(
                debug_file,
                f"Processing error: {e}"
            )
            return []

    def _extract_qa_with_llm(self, qa_text: str) -> List[Dict]:
        """
        Extract Q&A data from text using Phi-4 processing.

        Processes Q&A section in chunks and combines results into a structured
        format suitable for analysis.

        Args:
            qa_text (str): Q&A section text

        Returns:
            List[Dict]: Structured Q&A data with questions and answers
        """
        if not qa_text or len(qa_text.strip()) < 100:
            print("⚠️  Q&A section too short or empty, skipping LLM extraction")
            return []

        # Split Q&A into chunks
        qa_chunks = [
            qa_text[i:i + self.QA_CHUNK_SIZE]
            for i in range(0, len(qa_text), self.QA_CHUNK_SIZE)
        ]

        print(f"📊 Processing {len(qa_chunks)} Q&A chunks")

        all_qa_data = []

        for idx, chunk in enumerate(qa_chunks, 1):
            print(f"🔄 Processing Q&A chunk {idx}/{len(qa_chunks)}")
            chunk_data = self._process_qa_chunk(chunk, idx)
            all_qa_data.extend(chunk_data)

        # Renumber questions sequentially
        return self._renumber_questions(all_qa_data)

    def _renumber_questions(self, qa_data: List[Dict]) -> List[Dict]:
        """
        Renumber questions sequentially across all chunks.

        Since chunks are processed independently, question numbers may not be
        sequential. This method ensures proper numbering.

        Args:
            qa_data (List[Dict]): Raw Q&A data with potentially non-sequential numbering

        Returns:
            List[Dict]: Q&A data with properly sequential question numbers
        """
        renumbered_data = []
        current_question_number = 1

        for entry in qa_data:
            if entry.get('type') == 'question':
                # Start of new Q&A pair
                question_number_for_pair = current_question_number
                current_question_number += 1

            # Update the question number for both questions and answers
            entry['question_number'] = question_number_for_pair
            renumbered_data.append(entry)

        return renumbered_data

    def _save_debug_file(self, debug_file: str, content: str) -> None:
        """
        Save debug content to file with error handling.

        Args:
            debug_file (str): file name to save
            content (str): Content to save
        """
        debug_path = os.path.join(self.log_dir, debug_file)
        try:
            with open(debug_path, 'w', encoding='utf-8') as f:
                f.write(content)
            print(f"✅ Saved: {debug_path}")
        except Exception as e:
            print(f"⚠️  Failed to save debug file {debug_path}: {e}")

    def extract_content(self, text: str) -> Tuple[str, List[Dict]]:
        """
        Extract and process both presentation and Q&A content from transcript.

        This is the main processing method that coordinates the extraction of
        presentation content and Q&A data using Phi-4 processing.

        Args:
            text (str): Full transcript text

        Returns:
            Tuple[str, List[Dict]]: Cleaned presentation text and structured Q&A data
        """
        print("🔍 Locating Q&A section...")
        qa_start_position = self._find_qa_section_start(text)

        if qa_start_position != -1:
            presentation_text = text[:qa_start_position]
            qa_text = text[qa_start_position:]
            print(f"✅ Found Q&A section at position {qa_start_position}")
        else:
            presentation_text = text
            qa_text = ""
            print("⚠️  Q&A section not found, treating entire text as presentation")

        print("🧹 Cleaning presentation content...")
        cleaned_presentation = self._clean_presentation_with_llm(presentation_text)

        print("❓ Extracting Q&A data...")
        qa_data = self._extract_qa_with_llm(qa_text)

        print(f"✅ Extraction complete: {len(qa_data)} Q&A entries processed")
        return cleaned_presentation, qa_data

    def _save_outputs(self, presentation: str, qa_data: List[Dict]) -> None:
        """
        Save processed content to output files.

        Args:
            presentation (str): Cleaned presentation text
            qa_data (List[Dict]): Structured Q&A data
        """

        # Save presentation text
        try:
            with open(self.output_presentation_path, 'w', encoding='utf-8') as f:
                f.write(presentation)
            print(f"✅ Saved: {self.output_presentation_path}")
        except Exception as e:
            print(f"❌ Failed to save {self.output_presentation_path}: {e}")

        # Save Q&A data as DataFrame and export to multiple formats
        if qa_data:
            df = pd.DataFrame(qa_data)

            # Save as Excel
            try:
                df.to_excel(self.output_qa_path, index=False)
                print(f"✅ Saved: {self.output_qa_path}")
            except Exception as e:
                print(f"❌ Failed to save {self.output_qa_path}: {e}")
        else:
            print("⚠️  No Q&A data to save")

    def process_transcript(self, file_path: str) -> Tuple[str, pd.DataFrame]:
        """
        Complete transcript processing pipeline.

        This is the main public method that orchestrates the entire processing
        workflow from file reading to output generation.

        Args:
            file_path (str): Path to the transcript file (PDF or DOCX)

        Returns:
            Tuple[str, pd.DataFrame]: Processed presentation text and Q&A DataFrame

        Raises:
            Exception: If any step in the processing pipeline fails
        """
        print(f"📖 Reading transcript from: {file_path}")

        # Read and extract text from file
        raw_text = self.read_file(file_path)
        print(f"📄 Extracted {len(raw_text):,} characters from transcript")

        # Process content using Phi-4
        presentation, qa_data = self.extract_content(raw_text)

        # Save outputs
        self._save_outputs(presentation, qa_data)

        # Return processed data
        qa_dataframe = pd.DataFrame(qa_data) if qa_data else pd.DataFrame()

        print("🎉 Processing completed successfully!")
        print(f"📊 Results: {len(presentation.split())} words in presentation, {len(qa_data)} Q&A entries")

        return presentation, qa_dataframe



In [None]:
def get_file(file_name: str, base_path: str = '/content/drive/MyDrive/BoE') -> Optional[str]:
    """
    Resolve a transcript file name against a base directory.

    Args:
        file_name: Name of the file inside ``base_path``
        base_path: Directory that is expected to contain the file

    Returns:
        The joined path when it refers to an existing file, otherwise ``None``.
        In the ``None`` case a message and, where possible, the directory
        listing are printed to help locate the right file.
    """
    # Construct full file path
    pdf_path = os.path.join(base_path, file_name)

    # Only an existing regular file counts; a directory is not a transcript
    if os.path.isfile(pdf_path):
        return pdf_path

    print(f"Error: File not found at {pdf_path}")
    print("Available files in directory:")
    try:
        print(os.listdir(base_path))
    except FileNotFoundError:
        print(f"Directory {base_path} not found")
    except OSError as e:
        # e.g. base_path is not a directory, or is not readable
        print(f"Could not list {base_path}: {e}")
    return None

In [None]:
def main(transcript_key: str = "ING") -> None:
    """
    Main function to run the transcript processor with Phi-4.

    Looks up one of the known transcripts, processes it and prints a summary.
    Any failure is reported and terminates with exit status 1.

    Args:
        transcript_key: Which known transcript to process: "JPM", "HSBC" or
            "ING" (the default, matching the previous hard-coded choice).
    """
    # Known transcripts: key -> (file name, ticker, year, quarter)
    transcripts = {
        "JPM": ("JPM_1q25-earnings-transcript.pdf", "JPM", 2025, 1),
        "HSBC": (
            "250429-1q-2025-earnings-release-investors-and-analysts-call-transcript.pdf",
            "HSBC",
            2025,
            1,
        ),
        "ING": ("ING_Transcript_Analyst_Call_3Q2023.pdf", "ING", 2023, 3),
    }

    start_time = time.time()
    try:
        if transcript_key not in transcripts:
            raise ValueError(
                f"Unknown transcript '{transcript_key}'. "
                f"Expected one of: {', '.join(transcripts)}"
            )
        file_name, ticker, year, quarter = transcripts[transcript_key]
        file_path = get_file(file_name)

        # Verify file exists (get_file returns None when it cannot find the file)
        if file_path is None or not os.path.exists(file_path):
            raise FileNotFoundError(f"Transcript file not found: {file_path or file_name}")

        # Initialize processor and run
        processor = TranscriptProcessor(ticker, year, quarter)
        presentation, qa_dataframe = processor.process_transcript(file_path)

        # Display summary statistics
        print("\n" + "="*60)
        print("📈 PROCESSING SUMMARY")
        print("="*60)
        print(f"Input file: {file_path}")
        print(f"Output directory: {processor.output_dir}")
        print(f"Presentation words: {len(presentation.split()):,}")
        print(f"Q&A entries: {len(qa_dataframe):,}")
        if not qa_dataframe.empty:
            questions = len(qa_dataframe[qa_dataframe['type'] == 'question'])
            answers = len(qa_dataframe[qa_dataframe['type'] == 'answer'])
            print(f"Questions: {questions}, Answers: {answers}")

        elapsed_time = time.time() - start_time
        print(f"Processing time: {elapsed_time:.2f} seconds")
        print("="*60)

    except Exception as e:
        print(f"❌ Error in main execution: {e}")
        sys.exit(1)

In [None]:

# Run the pipeline, printing a wall-clock timestamp before and after.
print("🚀 Starting transcript processing with Phi-4 at " + time.strftime('%Y-%m-%d %H:%M:%S'))
main()
print("🏁 Completed at " + time.strftime('%Y-%m-%d %H:%M:%S'))


🚀 Starting transcript processing with Phi-4 at 2025-06-20 10:44:58
🚀 Initializing Phi-4 on cpu...


tokenizer_config.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/917k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.25M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/802 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.77G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.77G [00:00<?, ?B/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Device set to use cpu


✅ Phi-4 model loaded successfully on cpu
📖 Reading transcript from: /content/drive/MyDrive/BoE/ING_Transcript_Analyst_Call_3Q2023.pdf
📄 Extracted 54,674 characters from transcript
🔍 Locating Q&A section...
✅ Found Q&A section at position 18339
🧹 Cleaning presentation content...
✅ Saved: /content/drive/MyDrive/logs/cleaned_presentation_prompt_chunk_1.txt
✅ Saved: /content/drive/MyDrive/logs/cleaned_presentation_response_chunk_1.txt
✅ Presentation chunk 1/5 cleaned successfully
✅ Saved: /content/drive/MyDrive/logs/cleaned_presentation_prompt_chunk_2.txt
✅ Saved: /content/drive/MyDrive/logs/cleaned_presentation_response_chunk_2.txt
✅ Presentation chunk 2/5 cleaned successfully
✅ Saved: /content/drive/MyDrive/logs/cleaned_presentation_prompt_chunk_3.txt
✅ Saved: /content/drive/MyDrive/logs/cleaned_presentation_response_chunk_3.txt
✅ Presentation chunk 3/5 cleaned successfully
✅ Saved: /content/drive/MyDrive/logs/cleaned_presentation_prompt_chunk_4.txt
✅ Saved: /content/drive/MyDrive/logs/c