In [2]:
!pip install nest_asyncio -U -q

In [None]:
# Code adapted from https://github.com/sec-edgar/sec-edgar?tab=readme-ov-file
# Install the `secedgar` package from that repository before running this code

# downloading filings for oil and gas companies

In [1]:
import nest_asyncio
import logging
from secedgar import filings, FilingType
from secedgar.exceptions import EDGARQueryError, NoFilingsError
from time import sleep

# Let asyncio event loops be nested (nest_asyncio patch)
nest_asyncio.apply()

# Log at INFO level to a file and to the console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("filings_download.log"),
        logging.StreamHandler(),
    ],
)

# Maximum number of download attempts per ticker and filing type
max_retries = 3

# Tickers of the oil and gas companies to download.
# The list is still named utility_tickers because the download loop refers to it by that name.
utility_tickers = [
    "XOM",  # Exxon Mobil Corporation
    "CVX",  # Chevron Corporation
    "COP",  # ConocoPhillips
    "OXY",  # Occidental Petroleum Corporation
    "MPC",  # Marathon Petroleum
]

# Function to download filings with retry logic and logging
def download_filings(ticker, filing_type, base_path, user_agent):
    """Download every filing of one type for one ticker, retrying on failure.

    Makes up to ``max_retries`` attempts (module-level setting), waits
    5 seconds between attempts and logs the outcome of each one.

    Args:
        ticker: Ticker symbol; passed to ``filings`` as a one-element ``cik_lookup`` list.
        filing_type: ``FilingType`` member; its ``.name`` appears in the log messages.
        base_path: Directory handed to ``save()``.
        user_agent: User-agent string forwarded to ``filings``.

    Returns:
        bool: True as soon as one attempt succeeds, False when every attempt
        failed. (The function used to return None in both cases; callers that
        ignore the result are unaffected.)
    """
    description = f"{filing_type.name} filings for ticker {ticker}"

    for attempt in range(1, max_retries + 1):
        logging.info(f"Attempting to download {description} (Attempt {attempt})")
        try:
            my_filings = filings(cik_lookup=[ticker],
                                 filing_type=filing_type,
                                 user_agent=user_agent)
            my_filings.save(base_path)
        except (EDGARQueryError, NoFilingsError) as e:
            logging.error(f"Error downloading {description}: {e}")
        except Exception:
            # logging.exception keeps the traceback, without which an
            # unexpected failure cannot be diagnosed from the log file.
            logging.exception(f"Unexpected error downloading {description}")
        else:
            logging.info(f"Successfully downloaded {description}")
            return True

        # Only reached after a failed attempt; no pause after the last one.
        if attempt < max_retries:
            logging.info("Retrying in 5 seconds...")
            sleep(5)

    logging.error(f"Failed to download {description} after {max_retries} attempts")
    return False

# Destination directory for the downloads and the User-Agent passed to download_filings.
# NOTE(review): user_agent is a single blank here — confirm the intended value before re-running.
base_path = r'C:\Users\avani\Desktop\Thesis\oilandgas'
user_agent = " "

# For every ticker fetch the 10-Q, 10-K and 8-K filings in that order,
# pausing one second between consecutive filing types of the same ticker.
for ticker in utility_tickers:
    for position, form in enumerate((FilingType.FILING_10Q,
                                     FilingType.FILING_10K,
                                     FilingType.FILING_8K)):
        if position:
            sleep(1)
        download_filings(ticker, form, base_path, user_agent)

print("Filings download process completed.")


2024-07-02 12:09:41,657 - INFO - Attempting to download FILING_10Q filings for ticker XOM (Attempt 1)
  return BeautifulSoup(self.get_response(path, params, **kwargs).text,
310it [00:31,  9.92it/s]                                                                                               
2024-07-02 12:10:15,487 - INFO - Successfully downloaded FILING_10Q filings for ticker XOM
2024-07-02 12:10:16,491 - INFO - Attempting to download FILING_10K filings for ticker XOM (Attempt 1)
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:06<00:00,  8.98it/s]
2024-07-02 12:10:24,267 - INFO - Successfully downloaded FILING_10K filings for ticker XOM
2024-07-02 12:10:25,268 - INFO - Attempting to download FILING_8K filings for ticker XOM (Attempt 1)
1140it [01:55,  9.90it/s]                                                                                              
2024-07-02 12:12:27,446 - INFO - Successfully downloaded FILING_8K filings for tic

Filings download process completed.


# Downloading filings for utility companies

In [2]:
import nest_asyncio
import logging
from secedgar import filings, FilingType
from secedgar.exceptions import EDGARQueryError, NoFilingsError
from time import sleep

# Let asyncio event loops be nested (nest_asyncio patch)
nest_asyncio.apply()

# Log at INFO level to a file and to the console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("filings_download.log"),
        logging.StreamHandler(),
    ],
)

# Maximum number of download attempts per ticker and filing type
max_retries = 3

# Tickers of the utility companies to download
utility_tickers = [
    "NEE",  # NextEra Energy, Inc.
    "DUK",  # Duke Energy Corporation
    "EXC",  # Exelon Corporation
    "D",    # Dominion
    "ED",   # Consolidated Edison, Inc.
]

# Function to download filings with retry logic and logging
def download_filings(ticker, filing_type, base_path, user_agent):
    """Download every filing of one type for one ticker, retrying on failure.

    Makes up to ``max_retries`` attempts (module-level setting), waits
    5 seconds between attempts and logs the outcome of each one.

    Args:
        ticker: Ticker symbol; passed to ``filings`` as a one-element ``cik_lookup`` list.
        filing_type: ``FilingType`` member; its ``.name`` appears in the log messages.
        base_path: Directory handed to ``save()``.
        user_agent: User-agent string forwarded to ``filings``.

    Returns:
        bool: True as soon as one attempt succeeds, False when every attempt
        failed. (The function used to return None in both cases; callers that
        ignore the result are unaffected.)
    """
    description = f"{filing_type.name} filings for ticker {ticker}"

    for attempt in range(1, max_retries + 1):
        logging.info(f"Attempting to download {description} (Attempt {attempt})")
        try:
            my_filings = filings(cik_lookup=[ticker],
                                 filing_type=filing_type,
                                 user_agent=user_agent)
            my_filings.save(base_path)
        except (EDGARQueryError, NoFilingsError) as e:
            logging.error(f"Error downloading {description}: {e}")
        except Exception:
            # logging.exception keeps the traceback, without which an
            # unexpected failure cannot be diagnosed from the log file.
            logging.exception(f"Unexpected error downloading {description}")
        else:
            logging.info(f"Successfully downloaded {description}")
            return True

        # Only reached after a failed attempt; no pause after the last one.
        if attempt < max_retries:
            logging.info("Retrying in 5 seconds...")
            sleep(5)

    logging.error(f"Failed to download {description} after {max_retries} attempts")
    return False

# Destination directory for the downloads and the User-Agent passed to download_filings.
# NOTE(review): user_agent is empty here — confirm the intended value before re-running.
base_path = r'C:\Users\avani\Desktop\Thesis\utilities'
user_agent = ""

# For every ticker fetch the 10-Q, 10-K and 8-K filings in that order,
# pausing one second between consecutive filing types of the same ticker.
for ticker in utility_tickers:
    for position, form in enumerate((FilingType.FILING_10Q,
                                     FilingType.FILING_10K,
                                     FilingType.FILING_8K)):
        if position:
            sleep(1)
        download_filings(ticker, form, base_path, user_agent)

print("Filings download process completed.")


2024-07-02 12:23:18,692 - INFO - Attempting to download FILING_10Q filings for ticker NEE (Attempt 1)
310it [00:36,  8.60it/s]                                                                                               
2024-07-02 12:23:56,950 - INFO - Successfully downloaded FILING_10Q filings for ticker NEE
2024-07-02 12:23:57,953 - INFO - Attempting to download FILING_10K filings for ticker NEE (Attempt 1)
70it [00:09,  7.72it/s]                                                                                                
2024-07-02 12:24:08,216 - INFO - Successfully downloaded FILING_10K filings for ticker NEE
2024-07-02 12:24:09,218 - INFO - Attempting to download FILING_8K filings for ticker NEE (Attempt 1)
1830it [03:04,  9.92it/s]                                                                                              
2024-07-02 12:27:25,305 - INFO - Successfully downloaded FILING_8K filings for ticker NEE
2024-07-02 12:27:25,305 - INFO - Attempting to download FILING_

Filings download process completed.


## downloading filings for mining companies

In [2]:
import nest_asyncio
import logging
from secedgar import filings, FilingType
from secedgar.exceptions import EDGARQueryError, NoFilingsError
from time import sleep

# Let asyncio event loops be nested (nest_asyncio patch)
nest_asyncio.apply()

# Log at INFO level to a file and to the console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("filings_download.log"),
        logging.StreamHandler(),
    ],
)

# Maximum number of download attempts per ticker and filing type
max_retries = 3

# Tickers of the mining companies to download.
# The list is still named utility_tickers because the download loop refers to it by that name.
utility_tickers = [
    "FCX",   # Freeport-McMoRan Inc
    "NEM",   # Newmont Corporation
    "SCCO",  # Southern Copper Corporation
    "AA",    # Alcoa Corporation
    "HL",    # Hecla Mining Company
]

# Function to download filings with retry logic and logging
def download_filings(ticker, filing_type, base_path, user_agent):
    """Download every filing of one type for one ticker, retrying on failure.

    Makes up to ``max_retries`` attempts (module-level setting), waits
    5 seconds between attempts and logs the outcome of each one.

    Args:
        ticker: Ticker symbol; passed to ``filings`` as a one-element ``cik_lookup`` list.
        filing_type: ``FilingType`` member; its ``.name`` appears in the log messages.
        base_path: Directory handed to ``save()``.
        user_agent: User-agent string forwarded to ``filings``.

    Returns:
        bool: True as soon as one attempt succeeds, False when every attempt
        failed. (The function used to return None in both cases; callers that
        ignore the result are unaffected.)
    """
    description = f"{filing_type.name} filings for ticker {ticker}"

    for attempt in range(1, max_retries + 1):
        logging.info(f"Attempting to download {description} (Attempt {attempt})")
        try:
            my_filings = filings(cik_lookup=[ticker],
                                 filing_type=filing_type,
                                 user_agent=user_agent)
            my_filings.save(base_path)
        except (EDGARQueryError, NoFilingsError) as e:
            logging.error(f"Error downloading {description}: {e}")
        except Exception:
            # logging.exception keeps the traceback, without which an
            # unexpected failure cannot be diagnosed from the log file.
            logging.exception(f"Unexpected error downloading {description}")
        else:
            logging.info(f"Successfully downloaded {description}")
            return True

        # Only reached after a failed attempt; no pause after the last one.
        if attempt < max_retries:
            logging.info("Retrying in 5 seconds...")
            sleep(5)

    logging.error(f"Failed to download {description} after {max_retries} attempts")
    return False

# Destination directory for the downloads and the User-Agent passed to download_filings.
# NOTE(review): user_agent is empty here — confirm the intended value before re-running.
base_path = r'C:\Users\avani\Desktop\Thesis\mining'
user_agent = ""

# For every ticker fetch the 10-Q, 10-K and 8-K filings in that order,
# pausing one second between consecutive filing types of the same ticker.
for ticker in utility_tickers:
    for position, form in enumerate((FilingType.FILING_10Q,
                                     FilingType.FILING_10K,
                                     FilingType.FILING_8K)):
        if position:
            sleep(1)
        download_filings(ticker, form, base_path, user_agent)

print("Filings download process completed.")


2024-06-19 22:51:30,307 - INFO - Attempting to download FILING_10Q filings for ticker FCX (Attempt 1)
310it [00:45,  6.85it/s]                                                                                               
2024-06-19 22:52:26,996 - INFO - Successfully downloaded FILING_10Q filings for ticker FCX
2024-06-19 22:52:27,998 - INFO - Attempting to download FILING_10K filings for ticker FCX (Attempt 1)
50it [00:17,  2.92it/s]                                                                                                
2024-06-19 22:52:49,294 - INFO - Successfully downloaded FILING_10K filings for ticker FCX
2024-06-19 22:52:50,299 - INFO - Attempting to download FILING_8K filings for ticker FCX (Attempt 1)
1610it [02:50,  9.42it/s]                                                                                              
2024-06-19 22:56:33,678 - INFO - Successfully downloaded FILING_8K filings for ticker FCX
2024-06-19 22:56:33,681 - INFO - Attempting to download FILING_

Filings download process completed.


# Cleaning without removing punctuation

In [1]:
import nest_asyncio
import logging
from secedgar import filings, FilingType
from secedgar.exceptions import EDGARQueryError, NoFilingsError
from time import sleep
import os
import re
import sqlite3
from bs4 import BeautifulSoup

# Let asyncio event loops be nested (nest_asyncio patch)
nest_asyncio.apply()

# Log at INFO level to a file and to the console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("filings_download.log"),
        logging.StreamHandler(),
    ],
)

# Maximum number of download attempts per ticker and filing type
max_retries = 3

# Tickers of the mining companies to download.
# The list is still named utility_tickers because the download loop refers to it by that name.
utility_tickers = [
    "FCX",   # Freeport-McMoRan Inc
    "NEM",   # Newmont Corporation
    "SCCO",  # Southern Copper Corporation
    "AA",    # Alcoa Corporation
    "HL",    # Hecla Mining Company
]

# Function to download filings with retry logic and logging
def download_filings(ticker, filing_type, base_path, user_agent):
    """Download every filing of one type for one ticker, retrying on failure.

    Makes up to ``max_retries`` attempts (module-level setting), waits
    5 seconds between attempts and logs the outcome of each one.

    Args:
        ticker: Ticker symbol; passed to ``filings`` as a one-element ``cik_lookup`` list.
        filing_type: ``FilingType`` member; its ``.name`` appears in the log messages.
        base_path: Directory handed to ``save()``.
        user_agent: User-agent string forwarded to ``filings``.

    Returns:
        bool: True as soon as one attempt succeeds, False when every attempt
        failed. (The function used to return None in both cases; callers that
        ignore the result are unaffected.)
    """
    description = f"{filing_type.name} filings for ticker {ticker}"

    for attempt in range(1, max_retries + 1):
        logging.info(f"Attempting to download {description} (Attempt {attempt})")
        try:
            my_filings = filings(cik_lookup=[ticker],
                                 filing_type=filing_type,
                                 user_agent=user_agent)
            my_filings.save(base_path)
        except (EDGARQueryError, NoFilingsError) as e:
            logging.error(f"Error downloading {description}: {e}")
        except Exception:
            # logging.exception keeps the traceback, without which an
            # unexpected failure cannot be diagnosed from the log file.
            logging.exception(f"Unexpected error downloading {description}")
        else:
            logging.info(f"Successfully downloaded {description}")
            return True

        # Only reached after a failed attempt; no pause after the last one.
        if attempt < max_retries:
            logging.info("Retrying in 5 seconds...")
            sleep(5)

    logging.error(f"Failed to download {description} after {max_retries} attempts")
    return False

# Destination directory and User-Agent for the downloads. This cell used
# base_path and user_agent without defining them, so on a fresh kernel it
# raised NameError, and in a reused kernel it silently picked up whatever
# another cell had last assigned.
# The directory matches base_dir used by the processing loop further down.
base_path = r'C:\Users\avani\Desktop\Thesis\mining'
# NOTE(review): set SEC_EDGAR_USER_AGENT to your contact string; falls back to an empty string.
user_agent = os.environ.get("SEC_EDGAR_USER_AGENT", "")

# Download filings for each company
for ticker in utility_tickers:
    download_filings(ticker, FilingType.FILING_10Q, base_path, user_agent)
    sleep(1)  # Pause between different filings
    download_filings(ticker, FilingType.FILING_10K, base_path, user_agent)
    sleep(1)  # Pause between different filings
    download_filings(ticker, FilingType.FILING_8K, base_path, user_agent)

print("Filings download process completed.")

def extract_data(file_path):
    """Read one downloaded filing and extract its header fields and main text.

    Args:
        file_path: Path of the filing text file.

    Returns:
        dict with the keys ``company_name``, ``filing_date``,
        ``accession_number``, ``form_type``, ``business_address``,
        ``document_type`` and ``document_text``. A field whose marker is not
        found in the file is ``None``. ``document_text`` is the content of the
        first ``<TEXT>...</TEXT>`` section after it has been passed through
        ``clean_html``.
    """
    # errors='replace': one stray non-UTF-8 byte must not abort the whole run.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        content = file.read()

    def first_group(pattern, flags=0):
        """Return the stripped first capture group of ``pattern``, or None."""
        match = re.search(pattern, content, flags)
        return match.group(1).strip() if match else None

    # The STREET 2 line is optional; requiring it dropped the whole address
    # for every filing whose header has no second street line.
    address = re.search(
        r'BUSINESS ADDRESS:\s+STREET 1:\s+(.+)\s+(?:STREET 2:\s+(.+)\s+)?'
        r'CITY:\s+(.+)\s+STATE:\s+(.+)\s+ZIP:\s+(\d+)',
        content)
    if address:
        business_address = ", ".join(part.strip() for part in address.groups() if part)
    else:
        business_address = None

    document_text = first_group(r'<TEXT>(.*?)</TEXT>', re.DOTALL)
    if document_text:
        document_text = clean_html(document_text)

    return {
        'company_name': first_group(r'COMPANY CONFORMED NAME:\s*(.+)'),
        'filing_date': first_group(r'<ACCEPTANCE-DATETIME>(\d+)'),
        'accession_number': first_group(r'ACCESSION NUMBER:\s*(.+)'),
        'form_type': first_group(r'FORM TYPE:\s*(.+)'),
        'business_address': business_address,
        'document_type': first_group(r'<TYPE>(.+)'),
        'document_text': document_text,
    }

def clean_html(html_content):
    """Convert markup to plain text with URLs removed and whitespace collapsed.

    Args:
        html_content: Markup string to parse with BeautifulSoup's ``html.parser``.

    Returns:
        The extracted text with every token starting with ``http`` deleted and
        each run of whitespace reduced to a single space, stripped at both ends.
        Punctuation and numbers are left in place.
    """
    plain = BeautifulSoup(html_content, 'html.parser').get_text(separator=' ', strip=True)

    # Drop URLs first, then tidy up the gaps they leave behind
    without_urls = re.sub(r'http\S+', '', plain)
    return re.sub(r'\s+', ' ', without_urls).strip()


def parse_document_text(document_text):
    """Pull the two marked sections out of a filing's text.

    A section is the text between ``financial statements:`` and
    ``end of financial statements`` (respectively ``management discussion:``
    and ``end of management discussion``), matched case-insensitively.

    Args:
        document_text: Filing text, or ``None``/empty when the filing had no
            text section.

    Returns:
        dict with the keys ``financial_statements`` and
        ``management_discussion``; each value is the stripped section text,
        or ``None`` when the markers are not found.
    """
    # A filing without a text section arrives as None; searching it would
    # raise TypeError, so report "nothing found" instead.
    if not document_text:
        return {'financial_statements': None, 'management_discussion': None}

    financial_statement_pattern = re.compile(r'(?<=financial statements:)(.*?)(?=end of financial statements)', re.DOTALL | re.IGNORECASE)
    management_discussion_pattern = re.compile(r'(?<=management discussion:)(.*?)(?=end of management discussion)', re.DOTALL | re.IGNORECASE)

    financial_statements = financial_statement_pattern.search(document_text)
    management_discussion = management_discussion_pattern.search(document_text)

    return {
        'financial_statements': financial_statements.group(1).strip() if financial_statements else None,
        'management_discussion': management_discussion.group(1).strip() if management_discussion else None,
    }

def initialize_database(db_path):
    """Create a fresh, empty schema in the SQLite database at ``db_path``.

    All three tables (``filings``, ``financial_statements``,
    ``management_discussions``) are dropped and recreated. Dropping only
    ``filings`` left old rows in the two child tables; because new filing ids
    start again at 1, those rows then pointed at unrelated filings.

    Args:
        db_path: Path of the SQLite database file (created if missing).
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Child tables first, then the table they reference
        for table in ('financial_statements', 'management_discussions', 'filings'):
            cursor.execute(f'DROP TABLE IF EXISTS {table}')

        cursor.execute('''CREATE TABLE filings (
                            id INTEGER PRIMARY KEY,
                            company_name TEXT,
                            filing_date TEXT,
                            accession_number TEXT,
                            form_type TEXT,
                            business_address TEXT,
                            document_type TEXT,
                            document_text TEXT
                        )''')
        cursor.execute('''CREATE TABLE financial_statements (
                            id INTEGER PRIMARY KEY,
                            filing_id INTEGER,
                            content TEXT,
                            FOREIGN KEY (filing_id) REFERENCES filings(id)
                        )''')
        cursor.execute('''CREATE TABLE management_discussions (
                            id INTEGER PRIMARY KEY,
                            filing_id INTEGER,
                            content TEXT,
                            FOREIGN KEY (filing_id) REFERENCES filings(id)
                        )''')

        conn.commit()
    finally:
        # Close the connection even when a statement fails
        conn.close()
    print("Database initialized with new schema.")

def save_to_database(data, db_path):
    """Insert one extracted filing, plus any parsed sections, into the database.

    Args:
        data: dict with the keys ``company_name``, ``filing_date``,
            ``accession_number``, ``form_type``, ``business_address``,
            ``document_type`` and ``document_text``.
        db_path: Path of the SQLite database; its tables must already exist.

    One row is written to ``filings``. A row is added to
    ``financial_statements`` / ``management_discussions`` only when
    ``parse_document_text`` finds the corresponding section.
    """
    document_text = data['document_text']
    # A filing without a text section has document_text None; there is
    # nothing to parse, and the filing row is still stored.
    if document_text:
        parsed_data = parse_document_text(document_text)
    else:
        parsed_data = {'financial_statements': None, 'management_discussion': None}

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        cursor.execute('''INSERT INTO filings (company_name, filing_date, accession_number, form_type, business_address, document_type, document_text)
                          VALUES (?, ?, ?, ?, ?, ?, ?)''',
                       (data['company_name'], data['filing_date'], data['accession_number'], data['form_type'], data['business_address'], data['document_type'], document_text))

        # Id of the row just inserted; links the section rows to it
        filing_id = cursor.lastrowid

        if parsed_data['financial_statements']:
            cursor.execute('''INSERT INTO financial_statements (filing_id, content)
                              VALUES (?, ?)''',
                           (filing_id, parsed_data['financial_statements']))
        if parsed_data['management_discussion']:
            cursor.execute('''INSERT INTO management_discussions (filing_id, content)
                              VALUES (?, ?)''',
                           (filing_id, parsed_data['management_discussion']))

        conn.commit()
    finally:
        # Close the connection even when an insert fails
        conn.close()

def save_data_to_file(output_file, db_path):
    """Dump every row of the ``filings`` table to a text file.

    Args:
        output_file: Path of the UTF-8 text file to (over)write; each line is
            ``str(row)`` of one ``filings`` row, in id order.
        db_path: Path of the SQLite database to read.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute('SELECT * FROM filings ORDER BY id')

        with open(output_file, 'w', encoding='utf-8') as f:
            # Iterate the cursor rather than fetchall(): each row carries a
            # whole document, so rows are written as they are read.
            for row in cursor:
                f.write(str(row) + '\n')
    finally:
        # Close the connection even when writing the file fails
        conn.close()

def clean_text_file(input_file, output_file, tag_prefix='fcx'):
    """Write a cleaned copy of a text file, line by line.

    From every line this removes ``<tag_prefix>:<word>`` tokens and
    ``http://`` / ``https://`` URLs, collapses runs of whitespace to a single
    space and strips the ends. Lines that end up empty are dropped.
    Punctuation and numbers are kept.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to (over)write.
        tag_prefix: Prefix of the ``prefix:word`` tokens to delete. Defaults
            to ``'fcx'``, the value that used to be hard-coded; pass another
            company's prefix when cleaning its filings, or ``None`` to keep
            such tokens.
    """
    tag_pattern = re.compile(r'\b' + re.escape(tag_prefix) + r':\w+\b') if tag_prefix else None

    # Read everything before writing so input_file and output_file may be the same path
    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    cleaned_lines = []
    for line in lines:
        cleaned_line = tag_pattern.sub('', line) if tag_pattern else line
        # https:// as well as http:// (only http:// was removed before)
        cleaned_line = re.sub(r'https?://\S+', '', cleaned_line)
        cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
        if cleaned_line:
            cleaned_lines.append(cleaned_line)

    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(cleaned_line + '\n' for cleaned_line in cleaned_lines)


def process_filings(filing_type, company_name, db_path, text_files_dir, output_file):
    """Load one company's downloaded filings into a database and export them.

    Steps: re-initialise the database, extract and store every ``.txt`` file
    found in ``text_files_dir``, dump the ``filings`` table to
    ``output_file``, then write a cleaned copy next to it with the file name
    prefixed by ``cleaned_``.

    Args:
        filing_type: Filing type label; used only in the progress message.
        company_name: Company label; used only in the progress message.
        db_path: Path of the SQLite database to (re)create.
        text_files_dir: Directory containing the downloaded ``.txt`` filings.
        output_file: Path of the text dump to write.
    """
    initialize_database(db_path)

    # os.listdir returns names in arbitrary order; sorting makes the row ids
    # and the dump order reproducible between runs.
    for filename in sorted(os.listdir(text_files_dir)):
        if filename.endswith('.txt'):
            file_path = os.path.join(text_files_dir, filename)
            data = extract_data(file_path)
            save_to_database(data, db_path)

    save_data_to_file(output_file, db_path)

    # Prefix only the file name: prefixing the whole path sent the cleaned
    # file to a wrong (usually non-existent) directory whenever output_file
    # had a directory part.
    output_dir, output_name = os.path.split(output_file)
    clean_text_file(output_file, os.path.join(output_dir, f"cleaned_{output_name}"))

    print(f"Data extraction and saving completed for {filing_type} filings of {company_name}.")
    print(f"Data has been written to {output_file}.")

# Companies to process: "name" is the per-company folder under base_dir,
# "ticker" is used in the database and output file names.
companies = [
    dict(name="fcx", ticker="FCX"),
    dict(name="nem", ticker="NEM"),
    dict(name="scco", ticker="SCCO"),
    dict(name="aa", ticker="AA"),
    dict(name="hl", ticker="HL"),
]

# Root directory holding one folder per company
base_dir = r'C:\Users\avani\Desktop\Thesis\mining'

# Process each (company, filing type) pair; only 10-K filings are enabled here
for company in companies:
    for filing_type in ("10-K",):
        text_files_dir = os.path.join(base_dir, company['name'], filing_type)
        db_path = os.path.join(base_dir, company['name'],
                               'test3_' + company["ticker"] + '_' + filing_type + '.db')
        output_file = company["ticker"] + '_' + filing_type + '.txt'

        process_filings(filing_type, company['name'], db_path, text_files_dir, output_file)

print("All filings processed.")


2024-07-27 16:47:06,471 - INFO - Attempting to download FILING_10Q filings for ticker FCX (Attempt 1)
  return BeautifulSoup(self.get_response(path, params, **kwargs).text,
310it [00:48,  6.46it/s]                                                                                               
2024-07-27 16:47:59,129 - INFO - Successfully downloaded FILING_10Q filings for ticker FCX
2024-07-27 16:48:00,136 - INFO - Attempting to download FILING_10K filings for ticker FCX (Attempt 1)
50it [00:17,  2.82it/s]                                                                                                
2024-07-27 16:48:19,373 - INFO - Successfully downloaded FILING_10K filings for ticker FCX
2024-07-27 16:48:20,376 - INFO - Attempting to download FILING_8K filings for ticker FCX (Attempt 1)
1620it [02:47,  9.69it/s]                                                                                              
2024-07-27 16:51:19,093 - INFO - Successfully downloaded FILING_8K filings for tic

Filings download process completed.
Database initialized with new schema.
Data extraction and saving completed for 8-K filings of fcx.
Data has been written to FCX_8-K.txt.
Database initialized with new schema.
Data extraction and saving completed for 10-K filings of fcx.
Data has been written to FCX_10-K.txt.
Database initialized with new schema.
Data extraction and saving completed for 10-Q filings of fcx.
Data has been written to FCX_10-Q.txt.
All filings processed.


In [3]:
import nest_asyncio
import logging
from secedgar import filings, FilingType
from secedgar.exceptions import EDGARQueryError, NoFilingsError
from time import sleep
import os
import re
import sqlite3
from bs4 import BeautifulSoup

# Let asyncio event loops be nested (nest_asyncio patch)
nest_asyncio.apply()

# Log at INFO level to a file and to the console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("filings_download.log"),
        logging.StreamHandler(),
    ],
)

# Maximum number of download attempts per ticker and filing type
max_retries = 3

# Tickers of the oil and gas companies to download.
# The list is still named utility_tickers because the download loop refers to it by that name.
utility_tickers = ["XOM", "CVX", "COP", "OXY", "MPC"]

# Function to download filings with retry logic and logging
def download_filings(ticker, filing_type, base_path, user_agent):
    """Download every filing of one type for one ticker, retrying on failure.

    Makes up to ``max_retries`` attempts (module-level setting), waits
    5 seconds between attempts and logs the outcome of each one.

    Args:
        ticker: Ticker symbol; passed to ``filings`` as a one-element ``cik_lookup`` list.
        filing_type: ``FilingType`` member; its ``.name`` appears in the log messages.
        base_path: Directory handed to ``save()``.
        user_agent: User-agent string forwarded to ``filings``.

    Returns:
        bool: True as soon as one attempt succeeds, False when every attempt
        failed. (The function used to return None in both cases; callers that
        ignore the result are unaffected.)
    """
    description = f"{filing_type.name} filings for ticker {ticker}"

    for attempt in range(1, max_retries + 1):
        logging.info(f"Attempting to download {description} (Attempt {attempt})")
        try:
            my_filings = filings(cik_lookup=[ticker],
                                 filing_type=filing_type,
                                 user_agent=user_agent)
            my_filings.save(base_path)
        except (EDGARQueryError, NoFilingsError) as e:
            logging.error(f"Error downloading {description}: {e}")
        except Exception:
            # logging.exception keeps the traceback, without which an
            # unexpected failure cannot be diagnosed from the log file.
            logging.exception(f"Unexpected error downloading {description}")
        else:
            logging.info(f"Successfully downloaded {description}")
            return True

        # Only reached after a failed attempt; no pause after the last one.
        if attempt < max_retries:
            logging.info("Retrying in 5 seconds...")
            sleep(5)

    logging.error(f"Failed to download {description} after {max_retries} attempts")
    return False

# Destination directory for the downloads and the User-Agent passed to download_filings
base_path = r'C:\Users\avani\Desktop\Thesis\oilandgas'
user_agent = "am1623@ic.ac.uk"

# For every ticker fetch the 10-Q, 10-K and 8-K filings in that order,
# pausing one second between consecutive filing types of the same ticker.
for ticker in utility_tickers:
    for position, form in enumerate((FilingType.FILING_10Q,
                                     FilingType.FILING_10K,
                                     FilingType.FILING_8K)):
        if position:
            sleep(1)
        download_filings(ticker, form, base_path, user_agent)

print("Filings download process completed.")

def extract_data(file_path):
    """Read one downloaded filing and extract its header fields and main text.

    Args:
        file_path: Path of the filing text file.

    Returns:
        dict with the keys ``company_name``, ``filing_date``,
        ``accession_number``, ``form_type``, ``business_address``,
        ``document_type`` and ``document_text``. A field whose marker is not
        found in the file is ``None``. ``document_text`` is the content of the
        first ``<TEXT>...</TEXT>`` section after it has been passed through
        ``clean_html``.
    """
    # errors='replace': one stray non-UTF-8 byte must not abort the whole run.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        content = file.read()

    def first_group(pattern, flags=0):
        """Return the stripped first capture group of ``pattern``, or None."""
        match = re.search(pattern, content, flags)
        return match.group(1).strip() if match else None

    # The STREET 2 line is optional; requiring it dropped the whole address
    # for every filing whose header has no second street line.
    address = re.search(
        r'BUSINESS ADDRESS:\s+STREET 1:\s+(.+)\s+(?:STREET 2:\s+(.+)\s+)?'
        r'CITY:\s+(.+)\s+STATE:\s+(.+)\s+ZIP:\s+(\d+)',
        content)
    if address:
        business_address = ", ".join(part.strip() for part in address.groups() if part)
    else:
        business_address = None

    document_text = first_group(r'<TEXT>(.*?)</TEXT>', re.DOTALL)
    if document_text:
        document_text = clean_html(document_text)

    return {
        'company_name': first_group(r'COMPANY CONFORMED NAME:\s*(.+)'),
        'filing_date': first_group(r'<ACCEPTANCE-DATETIME>(\d+)'),
        'accession_number': first_group(r'ACCESSION NUMBER:\s*(.+)'),
        'form_type': first_group(r'FORM TYPE:\s*(.+)'),
        'business_address': business_address,
        'document_type': first_group(r'<TYPE>(.+)'),
        'document_text': document_text,
    }

def clean_html(html_content):
    """Convert markup to plain text with URLs and numbers removed.

    Args:
        html_content: Markup string to parse with BeautifulSoup's ``html.parser``.

    Returns:
        The extracted text after deleting, in order: tokens starting with
        ``http``, standalone digit runs, and standalone tokens of three
        letters followed by two digits; each run of whitespace is then
        reduced to a single space and the ends are stripped.
    """
    text = BeautifulSoup(html_content, 'html.parser').get_text(separator=' ', strip=True)

    # Applied one after another, in this order
    removal_patterns = (
        r'http\S+',               # URLs
        r'\b\d{4,}\b',            # standalone numbers of four or more digits
        r'\b\d{1,2}\b',           # standalone one- or two-digit numbers
        r'\b\d+\b',               # any remaining standalone number
        r'\b[A-Za-z]{3}\d{2}\b',  # three letters + two digits, e.g. "Jan24"
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)

    # Collapse the gaps left by the removals
    return re.sub(r'\s+', ' ', text).strip()

def parse_document_text(document_text):
    """Pull the two marked sections out of a filing's text.

    A section is the text between ``financial statements:`` and
    ``end of financial statements`` (respectively ``management discussion:``
    and ``end of management discussion``), matched case-insensitively.

    Args:
        document_text: Filing text, or ``None``/empty when the filing had no
            text section.

    Returns:
        dict with the keys ``financial_statements`` and
        ``management_discussion``; each value is the stripped section text,
        or ``None`` when the markers are not found.
    """
    # A filing without a text section arrives as None; searching it would
    # raise TypeError, so report "nothing found" instead.
    if not document_text:
        return {'financial_statements': None, 'management_discussion': None}

    financial_statement_pattern = re.compile(r'(?<=financial statements:)(.*?)(?=end of financial statements)', re.DOTALL | re.IGNORECASE)
    management_discussion_pattern = re.compile(r'(?<=management discussion:)(.*?)(?=end of management discussion)', re.DOTALL | re.IGNORECASE)

    financial_statements = financial_statement_pattern.search(document_text)
    management_discussion = management_discussion_pattern.search(document_text)

    return {
        'financial_statements': financial_statements.group(1).strip() if financial_statements else None,
        'management_discussion': management_discussion.group(1).strip() if management_discussion else None,
    }

def initialize_database(db_path):
    """Create a fresh, empty schema in the SQLite database at ``db_path``.

    All three tables (``filings``, ``financial_statements``,
    ``management_discussions``) are dropped and recreated. Dropping only
    ``filings`` left old rows in the two child tables; because new filing ids
    start again at 1, those rows then pointed at unrelated filings.

    Args:
        db_path: Path of the SQLite database file (created if missing).
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Child tables first, then the table they reference
        for table in ('financial_statements', 'management_discussions', 'filings'):
            cursor.execute(f'DROP TABLE IF EXISTS {table}')

        cursor.execute('''CREATE TABLE filings (
                            id INTEGER PRIMARY KEY,
                            company_name TEXT,
                            filing_date TEXT,
                            accession_number TEXT,
                            form_type TEXT,
                            business_address TEXT,
                            document_type TEXT,
                            document_text TEXT
                        )''')
        cursor.execute('''CREATE TABLE financial_statements (
                            id INTEGER PRIMARY KEY,
                            filing_id INTEGER,
                            content TEXT,
                            FOREIGN KEY (filing_id) REFERENCES filings(id)
                        )''')
        cursor.execute('''CREATE TABLE management_discussions (
                            id INTEGER PRIMARY KEY,
                            filing_id INTEGER,
                            content TEXT,
                            FOREIGN KEY (filing_id) REFERENCES filings(id)
                        )''')

        conn.commit()
    finally:
        # Close the connection even when a statement fails
        conn.close()
    print("Database initialized with new schema.")

def save_to_database(data, db_path):
    """Insert one extracted filing, plus any parsed sections, into the database.

    Args:
        data: dict with the keys ``company_name``, ``filing_date``,
            ``accession_number``, ``form_type``, ``business_address``,
            ``document_type`` and ``document_text``.
        db_path: Path of the SQLite database; its tables must already exist.

    One row is written to ``filings``. A row is added to
    ``financial_statements`` / ``management_discussions`` only when
    ``parse_document_text`` finds the corresponding section.
    """
    document_text = data['document_text']
    # A filing without a text section has document_text None; there is
    # nothing to parse, and the filing row is still stored.
    if document_text:
        parsed_data = parse_document_text(document_text)
    else:
        parsed_data = {'financial_statements': None, 'management_discussion': None}

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        cursor.execute('''INSERT INTO filings (company_name, filing_date, accession_number, form_type, business_address, document_type, document_text)
                          VALUES (?, ?, ?, ?, ?, ?, ?)''',
                       (data['company_name'], data['filing_date'], data['accession_number'], data['form_type'], data['business_address'], data['document_type'], document_text))

        # Id of the row just inserted; links the section rows to it
        filing_id = cursor.lastrowid

        if parsed_data['financial_statements']:
            cursor.execute('''INSERT INTO financial_statements (filing_id, content)
                              VALUES (?, ?)''',
                           (filing_id, parsed_data['financial_statements']))
        if parsed_data['management_discussion']:
            cursor.execute('''INSERT INTO management_discussions (filing_id, content)
                              VALUES (?, ?)''',
                           (filing_id, parsed_data['management_discussion']))

        conn.commit()
    finally:
        # Close the connection even when an insert fails
        conn.close()

def save_data_to_file(output_file, db_path):
    """Dump every row of the `filings` table to a UTF-8 text file.

    Each row is written on its own line as the `str()` of the row tuple.

    Args:
        output_file: Path of the text file to create or overwrite.
        db_path: Path of the SQLite database file to read from.

    Raises:
        sqlite3.Error: If the query fails (for example, the table is missing).
            In that case the output file is not opened.
        OSError: If the output file cannot be written.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # Run the query before touching the output file so a failing query
        # never truncates an existing dump.
        cursor.execute('SELECT * FROM filings')

        with open(output_file, 'w', encoding='utf-8') as f:
            # Stream rows from the cursor rather than materialising the whole
            # table; rows carry full document text and can be large.
            for row in cursor:
                f.write(str(row) + '\n')
    finally:
        # Close the connection on both the success and the error path.
        conn.close()

def clean_text_file(input_file, output_file):
    """Write a cleaned copy of a text file, one surviving line per output line.

    For every input line, `fcx:`-prefixed tokens, `http://` URLs and standalone
    runs of digits are deleted, whitespace runs are collapsed to a single space
    and the line is stripped. Lines that end up empty are dropped.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to create or overwrite.
    """
    # Deleted in this order; every match is replaced by the empty string.
    removal_patterns = (r'\bfcx:\w+\b', r'http://\S+', r'\b\d+\b')

    # The input is read completely before the output is opened, so both
    # arguments may name the same file.
    with open(input_file, 'r', encoding='utf-8') as source:
        raw_lines = source.readlines()

    kept = []
    for raw in raw_lines:
        text = raw
        for pattern in removal_patterns:
            text = re.sub(pattern, '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        if not text:
            continue
        kept.append(text)

    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(text + '\n' for text in kept)

def process_filings(filing_type, company_name, db_path, text_files_dir, output_file):
    """Load every `.txt` filing in a directory into the database, then export it.

    Steps: initialise the database at `db_path`, run `extract_data` on each
    `.txt` file in `text_files_dir` and store the result with
    `save_to_database`, dump the `filings` table to `output_file`, and write a
    cleaned copy next to it with a `cleaned_` prefix on the file name.

    Args:
        filing_type: Filing type label; used only in the progress message.
        company_name: Company label; used only in the progress message.
        db_path: Path of the SQLite database file.
        text_files_dir: Directory that is scanned (non-recursively) for
            files whose name ends in '.txt'.
        output_file: Path of the raw text dump to write.

    Raises:
        FileNotFoundError: If `text_files_dir` does not exist.
    """
    initialize_database(db_path)

    # os.listdir returns entries in arbitrary order; sorting makes the
    # insertion order (and therefore the row ids) reproducible between runs.
    for filename in sorted(os.listdir(text_files_dir)):
        if filename.endswith('.txt'):
            file_path = os.path.join(text_files_dir, filename)
            data = extract_data(file_path)
            save_to_database(data, db_path)

    save_data_to_file(output_file, db_path)

    # Prefix only the file name, so an output_file that includes a directory
    # ("out/XOM.txt") yields "out/cleaned_XOM.txt" rather than "cleaned_out/XOM.txt".
    # For a bare file name the result is unchanged.
    output_dir, output_name = os.path.split(output_file)
    cleaned_file = os.path.join(output_dir, f"cleaned_{output_name}")
    clean_text_file(output_file, cleaned_file)

    print(f"Data extraction and saving completed for {filing_type} filings of {company_name}.")
    print(f"Data has been written to {output_file}.")

# Companies to process: "name" is the per-company sub-folder under base_dir,
# "ticker" is used in the database and output file names.
companies = [
    dict(name="xom", ticker="XOM"),
    dict(name="cvx", ticker="CVX"),
    dict(name="cop", ticker="COP"),
    dict(name="oxy", ticker="OXY"),
    dict(name="mpc", ticker="MPC"),
]

base_dir = r'C:\Users\avani\Desktop\Thesis\oilandgas'

for company in companies:
    # Everything for one company lives under <base_dir>/<name>.
    company_dir = os.path.join(base_dir, company['name'])
    for filing_type in ["10-K"]:
        db_path = os.path.join(company_dir, f'test3_{company["ticker"]}_{filing_type}.db')
        text_files_dir = os.path.join(company_dir, filing_type)
        # Relative name, so the dump lands in the current working directory.
        output_file = f'{company["ticker"]}_{filing_type}.txt'

        process_filings(
            filing_type,
            company['name'],
            db_path,
            text_files_dir,
            output_file,
        )

print("All filings processed.")


2024-07-02 12:48:11,065 - INFO - Attempting to download FILING_10Q filings for ticker XOM (Attempt 1)
310it [00:31,  9.95it/s]                                                                                               
2024-07-02 12:48:45,045 - INFO - Successfully downloaded FILING_10Q filings for ticker XOM
2024-07-02 12:48:46,053 - INFO - Attempting to download FILING_10K filings for ticker XOM (Attempt 1)
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:06<00:00,  9.79it/s]
2024-07-02 12:48:53,957 - INFO - Successfully downloaded FILING_10K filings for ticker XOM
2024-07-02 12:48:54,966 - INFO - Attempting to download FILING_8K filings for ticker XOM (Attempt 1)
1140it [01:55,  9.90it/s]                                                                                              
2024-07-02 12:50:58,697 - INFO - Successfully downloaded FILING_8K filings for ticker XOM
2024-07-02 12:50:58,697 - INFO - Attempting to download FILING_

Filings download process completed.
Database initialized with new schema.
Data extraction and saving completed for 8-K filings of xom.
Data has been written to XOM_8-K.txt.
Database initialized with new schema.
Data extraction and saving completed for 10-K filings of xom.
Data has been written to XOM_10-K.txt.
Database initialized with new schema.
Data extraction and saving completed for 10-Q filings of xom.
Data has been written to XOM_10-Q.txt.
Database initialized with new schema.
Data extraction and saving completed for 8-K filings of cvx.
Data has been written to CVX_8-K.txt.
Database initialized with new schema.
Data extraction and saving completed for 10-K filings of cvx.
Data has been written to CVX_10-K.txt.
Database initialized with new schema.
Data extraction and saving completed for 10-Q filings of cvx.
Data has been written to CVX_10-Q.txt.
Database initialized with new schema.
Data extraction and saving completed for 8-K filings of cop.
Data has been written to COP_8-K.tx

In [None]:
import nest_asyncio
import logging
from secedgar import filings, FilingType
from secedgar.exceptions import EDGARQueryError, NoFilingsError
from time import sleep
import os

# Patch asyncio via nest_asyncio so nested event loops are allowed
nest_asyncio.apply()

# Send INFO-and-above records to both a log file and the console
log_handlers = [
    logging.FileHandler("filings_download.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=log_handlers,
)

# Tickers of the utility companies whose filings are downloaded
utility_tickers = [
    "NEE",  # NextEra Energy
    "DUK",  # Duke Energy
    "EXC",  # Exelon Corporation
    "D",    # Dominion Energy
    "ED",   # Consolidated Edison
]

# Upper bound on download attempts per (ticker, filing type)
max_retries = 3

# Function to download filings with retry logic and logging
def download_filings(ticker, filing_type, base_path, user_agent, retry_delay=5):
    """Download all filings of one type for one ticker, retrying on failure.

    Makes up to `max_retries` (module-level) attempts. Every failure is
    logged; between attempts the function sleeps for `retry_delay` seconds.

    Args:
        ticker: Ticker symbol passed to `filings` as the single `cik_lookup` entry.
        filing_type: Filing type passed to `filings`; its `.name` is used in
            log messages (callers pass `FilingType` members).
        base_path: Directory handed to `my_filings.save`.
        user_agent: User-agent string passed to `filings`.
        retry_delay: Seconds to wait before the next attempt. Defaults to 5.

    Returns:
        True if an attempt succeeded, False if every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            logging.info(f"Attempting to download {filing_type.name} filings for ticker {ticker} (Attempt {attempt + 1})")
            my_filings = filings(cik_lookup=[ticker],
                                 filing_type=filing_type,
                                 user_agent=user_agent)
            my_filings.save(base_path)
            logging.info(f"Successfully downloaded {filing_type.name} filings for ticker {ticker}")
            return True
        except (EDGARQueryError, NoFilingsError) as e:
            logging.error(f"Error downloading {filing_type.name} filings for ticker {ticker}: {e}")
        except Exception as e:
            # Deliberately broad: any failure of a single attempt is logged
            # and retried rather than aborting the caller's loop.
            logging.error(f"Unexpected error for ticker {ticker}: {e}")

        # Shared back-off / give-up handling for both failure kinds above.
        if attempt < max_retries - 1:
            logging.info(f"Retrying in {retry_delay} seconds...")
            sleep(retry_delay)
        else:
            logging.error(f"Failed to download {filing_type.name} filings for ticker {ticker} after {max_retries} attempts")

    return False

# Root folder that receives the downloaded filings, and the user agent sent with requests
base_path = r'C:\Users\avani\Desktop\Thesis\utilities'
user_agent = "am1623@ic.ac.uk"

# Filing types fetched for every ticker, in this order
filing_types_to_fetch = (
    FilingType.FILING_10Q,
    FilingType.FILING_10K,
    FilingType.FILING_8K,
)

# Download filings for each company
for ticker in utility_tickers:
    for position, wanted_type in enumerate(filing_types_to_fetch):
        if position > 0:
            sleep(1)  # Pause between different filings (not before the first one)
        download_filings(ticker, wanted_type, base_path, user_agent)

print("Filings download process completed.")

# Companies to process: "name" is the per-company sub-folder under base_dir,
# "ticker" is used in the database and output file names.
companies = [
    dict(name="d", ticker="D"),
    dict(name="nee", ticker="NEE"),
    dict(name="duk", ticker="DUK"),
    dict(name="exc", ticker="EXC"),
    dict(name="ed", ticker="ED"),
]

base_dir = r'C:\Users\avani\Desktop\Thesis\utilities'

for company in companies:
    # Everything for one company lives under <base_dir>/<name>.
    company_dir = os.path.join(base_dir, company['name'])
    for filing_type in ["8-K", "10-K", "10-Q"]:
        db_path = os.path.join(company_dir, f'test3_{company["ticker"]}_{filing_type}.db')
        text_files_dir = os.path.join(company_dir, filing_type)
        # Relative name, so the dump lands in the current working directory.
        output_file = f'{company["ticker"]}_{filing_type}.txt'

        process_filings(
            filing_type,
            company['name'],
            db_path,
            text_files_dir,
            output_file,
        )

print("All filings processed.")
