## Amanda Cesario's notebook for DS Final Project
### Question 5: How do IonQ's risk factors change over time?
### Group 3 members: Cole Barrett, Caterina Grossi, Connor Steward
This notebook documents my process for converting a PDF to a TXT file, cleaning the text file, and then extracting the Item 1A Risk Factors section, which is the starting point for my analysis using the OpenAI API (in the notebook "OpenAI - Final Project").

In [1]:
import nltk
import pandas as pd
from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize
import re

In [2]:
# !pip install pdfminer.six

In [3]:
from pdfminer.high_level import extract_text

def pdf_to_txt(pdf_path, txt_path):
    """Convert a PDF into a UTF-8 plain-text file.

    Args:
        pdf_path: Location of the PDF to read; handed to ``extract_text``.
        txt_path: Location of the text file to write (opened in "w" mode,
            so an existing file is overwritten).

    Returns:
        None. The extracted text is only written to ``txt_path``.
    """
    # Extraction happens before the output file is opened, so a failure
    # while reading the PDF never leaves a truncated text file behind.
    extracted = extract_text(pdf_path)

    with open(txt_path, "w", encoding="utf-8") as out_file:
        out_file.write(extracted)

if __name__ == "__main__":
    # Source filing (PDF) and the plain-text file it is converted into.
    pdf_file, txt_file = "IONQ 2024 10K.pdf", "IONQ 2024 10K.txt"
    pdf_to_txt(pdf_path=pdf_file, txt_path=txt_file)
    print(f"Converted {pdf_file} to {txt_file}")

Converted IONQ 2024 10K.pdf to IONQ 2024 10K.txt


In [4]:
def clean_text_file(input_path, output_path):
    """Read a text file, pass its contents through ``clean_text``, and save the result.

    Args:
        input_path: UTF-8 text file to read.
        output_path: UTF-8 text file to write (overwritten if it exists).

    Returns:
        None. A confirmation line is printed once the file has been written.
    """
    with open(input_path, 'r', encoding='utf-8') as source:
        contents = source.read()

    # NOTE(review): clean_text is not defined in any cell visible in this
    # notebook -- presumably it lives in a cell that was removed or run
    # elsewhere; confirm it exists before a Restart & Run All.
    # Cleaning runs before the output file is opened so that a failure here
    # does not leave an empty output file.
    result = clean_text(contents)

    with open(output_path, 'w', encoding='utf-8') as sink:
        sink.write(result)
    print(f"Cleaned text saved to {output_path}")

if __name__ == "__main__":
    # The original converted text file goes in; the cleaned text file comes out.
    input_txt, output_txt = "IONQ 2024 10K.txt", "IONQ 2024 10K_clean.txt"
    clean_text_file(input_path=input_txt, output_path=output_txt)

Cleaned text saved to IONQ 2024 10K_clean.txt


In [7]:
# Compiled once at definition time instead of on every call.
# - The gap between "Item 1A" and "Risk Factors" is short and lazy, so under
#   DOTALL the heading can no longer stretch to a much later "Risk Factors"
#   mention elsewhere in the document.
# - (?!\d) stops "Item 2" from matching the prefix of "Item 20", "Item 21", ...
# - The end marker is a lookahead, so it is never part of the captured text.
_ITEM_1A_SECTION_RE = re.compile(
    r'(Item\s*1A\s*[:.-]?.{0,20}?Risk\s*Factors.*?)(?=Item\s*2(?!\d))',
    re.DOTALL | re.IGNORECASE,
)


def extract_item_1A_section(text):
    """Extract the "Item 1A ... Risk Factors" section of a filing's text.

    The section runs from an "Item 1A ... Risk Factors" heading up to (but not
    including) the next "Item 2" marker. Matching is case-insensitive and
    spans multiple lines.

    The heading can occur more than once (for example in a table of contents
    as well as in the body). Every heading occurrence that is followed by an
    "Item 2" marker is treated as a candidate, and the longest candidate is
    returned, on the assumption that the real section is longer than a
    contents entry. When there is only one candidate it is returned as-is.

    Args:
        text: Full document text. ``None`` or an empty string yields ``None``.

    Returns:
        The section text with surrounding whitespace stripped, or ``None`` if
        no heading followed by an "Item 2" marker is found.
    """
    if not text:
        return None

    candidates = [
        found.group(1).strip()
        for found in _ITEM_1A_SECTION_RE.finditer(text)
    ]
    if not candidates:
        return None

    # max() keeps the first of equally long candidates, i.e. the earliest one.
    return max(candidates, key=len)

def extract_and_save_section(input_path, output_path):
    """Load a text file, pull out its Item 1A section, and save that section.

    Args:
        input_path: UTF-8 text file containing the full document.
        output_path: UTF-8 text file that receives the extracted section.
            It is only created/overwritten when a section was found.

    Returns:
        None. A status line is printed in both the success and failure cases.
    """
    with open(input_path, 'r', encoding='utf-8') as source:
        document = source.read()

    risk_section = extract_item_1A_section(document)

    # Guard clause: nothing usable was extracted, so report it and leave
    # without touching the output path.
    if not risk_section:
        print("The section 'Item 1A: Risk Factors' could not be found.")
        return

    with open(output_path, 'w', encoding='utf-8') as sink:
        sink.write(risk_section)
    print(f"Extracted section saved to {output_path}")

if __name__ == "__main__":
    # Cleaned text file from earlier goes in; the extracted section is saved
    # to the second file.
    input_txt, output_txt = "IONQ 2024 10K_clean.txt", "IONQ 2024 10K_Item1A.txt"
    extract_and_save_section(input_path=input_txt, output_path=output_txt)

Extracted section saved to IONQ 2024 10K_Item1A.txt


In [8]:
# Repeat the extraction for every additional year of 10-K filings, changing
# the input and output file names each time.
# NOTE(review): "IONQ_2023_Cleaned.txt" is not written by any cell visible in
# this notebook -- presumably it was produced elsewhere; confirm its origin.
if __name__ == "__main__":
    input_txt, output_txt = "IONQ_2023_Cleaned.txt", "IONQ 2023 10K_Item1A.txt"
    extract_and_save_section(input_path=input_txt, output_path=output_txt)

Extracted section saved to IONQ 2023 10K_Item1A.txt


In [9]:
if __name__ == "__main__":
    # 2022 filing: cleaned text in, extracted Item 1A section out.
    # NOTE(review): the cleaned input is not produced by any visible cell -- confirm its origin.
    input_txt, output_txt = "IONQ_2022_Cleaned.txt", "IONQ 2022 10K_Item1A.txt"
    extract_and_save_section(input_path=input_txt, output_path=output_txt)

Extracted section saved to IONQ 2022 10K_Item1A.txt


In [10]:
if __name__ == "__main__":
    # 2021 filing: cleaned text in, extracted Item 1A section out.
    # NOTE(review): the cleaned input is not produced by any visible cell -- confirm its origin.
    input_txt, output_txt = "IONQ_2021_Cleaned.txt", "IONQ 2021 10K_Item1A.txt"
    extract_and_save_section(input_path=input_txt, output_path=output_txt)

Extracted section saved to IONQ 2021 10K_Item1A.txt
