# Pre-Processing Task Force on Climate-related Financial Disclosures

### This script demonstrates the pre-processing of TCF disclosures for two insurance companies, Travelers Cos (TRV) and St. James's Place, preparing them for further NLP analysis. Task Force on Climate-related Financial Disclosures, provide detailed information on how companies are addressing climate-related risks and opportunities. The transcripts were downloaded from each company's website. Their pre-processing is intricate due to the varied formats and detailed nature of the disclosures, necessitating meticulous handling to ensure data consistency for subsequent analysis.

## Libraries

In [4]:
import os
import fnmatch
import PyPDF2
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
from tqdm import tqdm
import spacy
from nltk.corpus import words
from collections import Counter

## Checking for TCFD Reports in Company Directories

The code defines a function to search for Task Force on Climate-related Financial Disclosures (TCFD) reports within a specified base directory. It looks through directories for banks and insurers, specifically searching for PDF files containing "tcfd report" in their names. Some companies include their TCFD reports inside their annual reports, but this script specifically searches for tailored TCFD reports to ensure focused and relevant data collection. The results are returned in a dictionary structure, organized by sector and company.

In [5]:
def check_tcfd_reports(base_dir):
    results = {}
    for sector in ["Banks", "Insurers"]:
        sector_path = os.path.join(base_dir, sector)
        if not os.path.isdir(sector_path):
            continue
        
        companies = os.listdir(sector_path)
        for company in companies:
            company_path = os.path.join(sector_path, company)
            if not os.path.isdir(company_path):
                continue
            
            tcfd_path = os.path.join(company_path, "TCFD")
            if not os.path.isdir(tcfd_path):
                continue
            
            # Look for PDF files with "tcfd report" in their name (case insensitive)
            for root, dirs, files in os.walk(tcfd_path):
                for file in files:
                    if file.lower().endswith(".pdf") and fnmatch.fnmatch(file.lower(), "*tcfd*.pdf"):
                        if sector not in results:
                            results[sector] = {}
                        if company not in results[sector]:
                            results[sector][company] = []
                        results[sector][company].append(os.path.join(root, file))
    
    return results

base_dir = 'ARP BOE'
report_files = check_tcfd_reports(base_dir)
for sector, companies in report_files.items():
    for company, files in companies.items():
        print(f"Sector: {sector}, Company: {company}")
        for file in files:
            print(f"  {file}")

Sector: Insurers, Company: ST. JAMES place
  ARP BOE\Insurers\ST. JAMES place\TCFD\sjp-tcfd-report-april2020.pdf
  ARP BOE\Insurers\ST. JAMES place\TCFD\sjp-tcfd-report-april2021.pdf
  ARP BOE\Insurers\ST. JAMES place\TCFD\SJP_TCFD_2023.pdf
  ARP BOE\Insurers\ST. JAMES place\TCFD\SJP_TCFD_Report_2022.pdf
Sector: Insurers, Company: Traveler Cos TRV
  ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2019.pdf
  ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2020.pdf
  ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2021.pdf
  ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2022.pdf
  ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2023.pdf


## Extracting text out of the PDFs 

In [6]:
# Traverses directories, extracts TCFD PDF text and saves as .txt files.
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            if reader.is_encrypted:
                reader.decrypt('')
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                text += page.extract_text()
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return text

def traverse_and_extract_text_for_companies(root_dir, output_dir, companies, max_files_per_folder=10):
    total_size = 0

    for sector in companies:
        for company in companies[sector]:
            tcfd_path = os.path.join(root_dir, sector, company, "TCFD")
            if os.path.isdir(tcfd_path):
                company_output_dir = os.path.join(output_dir, company)
                os.makedirs(company_output_dir, exist_ok=True)
                
                file_count = 0
                for root, dirs, files in os.walk(tcfd_path):
                    for file in files:
                        if file.lower().endswith(".pdf") and "tcfd" in file.lower() and file_count < max_files_per_folder:
                            pdf_path = os.path.join(root, file)
                            file_size = os.path.getsize(pdf_path)
                            text = extract_text_from_pdf(pdf_path)

                            if text:
                                # Save the text as a .txt file
                                output_file = os.path.join(company_output_dir, f"{os.path.splitext(file)[0]}.txt")
                                with open(output_file, 'w', encoding='utf-8') as txt_file:
                                    txt_file.write(text)
                                print(f"Saved text for {pdf_path} to {output_file}")

                                total_size += file_size
                                file_count += 1

                            if file_count >= max_files_per_folder:
                                break

    return total_size

# Root directory path
root_dir = "ARP BOE"
# Output directory path
output_dir = "Example TCFD Texts"

# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Specified companies
companies = {
    "Insurers": ["Traveler Cos TRV", "ST. JAMES place"],
}

total_size = traverse_and_extract_text_for_companies(root_dir, output_dir, companies)

Saved text for ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2019.pdf to Example TCFD Texts\Traveler Cos TRV\Travelers_TCFDReport2019.txt
Saved text for ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2020.pdf to Example TCFD Texts\Traveler Cos TRV\Travelers_TCFDReport2020.txt
Saved text for ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2021.pdf to Example TCFD Texts\Traveler Cos TRV\Travelers_TCFDReport2021.txt
Saved text for ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2022.pdf to Example TCFD Texts\Traveler Cos TRV\Travelers_TCFDReport2022.txt
Saved text for ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2023.pdf to Example TCFD Texts\Traveler Cos TRV\Travelers_TCFDReport2023.txt
Saved text for ARP BOE\Insurers\ST. JAMES place\TCFD\sjp-tcfd-report-april2020.pdf to Example TCFD Texts\ST. JAMES place\sjp-tcfd-report-april2020.txt
Saved text for ARP BOE\Insurers\ST. JAMES place\TCFD\sjp-tcfd-report-april2021.pdf to Example 

## Tailored Cleaning and Pre-processing

This script demonstrates the cleaning and pre-processing of the TCFDs for Travelers Cos (TRV) and St. James's Place. The script performs several tasks:

- Download Necessary NLTK Data: Ensures that the necessary stopwords and English words are downloaded from NLTK.
- Define Cleaning Functions: Includes functions to clean the text by removing headers, footers, special characters, non-English words and unnecessary spaces.
- Specific Cleaning for Each Company: Custom cleaning functions are tailored for the unique formats of the disclosures from each company.
- Save Cleaned Texts: The cleaned text files are saved to specified output directories, maintaining the directory structure from the input.

### Traveler Cos TRV

In [7]:
# Ensure the NLTK words corpus is downloaded
nltk.download('words')
english_vocab = set(words.words())

# Directory containing the text files
input_dir = "Example TCFD Texts/Traveler Cos TRV"
output_directory = 'Cleaned TCFD Texts/Traveler Cos TRV'

# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Define a function to clean the text
def clean_text(text, year):
    # Remove specific phrase with year
    text = re.sub(rf"Travelers Task Force on Climate-related Financial Disclosures Report\s*{year}\s*\d*", "", text)

    # Remove numbered list items ending with a dot (e.g., "3." but not "3.1")
    text = re.sub(r'\b\d+\.(?!\d)', "", text)

    # Remove bullet points
    text = re.sub(r'•', "", text)

    # Remove text starting from "Important Legal Information"
    text = re.split(r"Important Legal Information", text)[0]

    # Remove "Figure" followed by number and dot or if it is alone in a row
    text = re.sub(r'Figure(\s*\d+\.)?', "", text)

    # Remove bracketed numbers like [1]
    text = re.sub(r'\[\d+\]', "", text)

    # Remove inline numbers within words (e.g., "1kill")
    text = re.sub(r'(?<=\D)\d+(?=\D)', "", text)

    # Normalize spaced letters (e.g., e l e c t r i c v e h i c l e to electric vehicle)
    text = re.sub(r'\b(?:[a-zA-Z]\s)+[a-zA-Z]\b', lambda m: m.group(0).replace(' ', ''), text)

    # Split text into lines
    lines = text.splitlines()

    # Remove lines containing ".lnum"
    lines = [line for line in lines if ".lnum" not in line]

    # Remove lines based on year
    if year == "2021":
        lines = lines[22:]
    elif year == "2022":
        lines = lines[21:]
    elif year == "2023":
        lines = lines[21:]
    else:  # For other years like 2019 and 2020
        lines = lines[4:]

    # Filter non-English words and special characters
    def filter_line(line):
        filtered_words = []
        for word in re.findall(r'\b\w+\b', line):
            if word.lower() in english_vocab or word.isdigit():
                filtered_words.append(word)
        return " ".join(filtered_words)

    # Apply filter to each line to keep structure
    filtered_lines = [filter_line(line) for line in lines]

    # Remove empty lines
    filtered_lines = [line for line in filtered_lines if line.strip()]

    # Join lines back into text
    text = "\n".join(filtered_lines)

    return text

# Process each text file in the directory
for filename in os.listdir(input_dir):
    if filename.endswith(".txt"):
        file_path = os.path.join(input_dir, filename)

        # Extract the year from the filename (assuming the format is consistent)
        year_match = re.search(r'\d{4}', filename)
        if year_match:
            year = year_match.group(0)
        else:
            continue

        # Read the file
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Clean the text
        cleaned_text = clean_text(text, year)

        # Save the cleaned text back to a file
        cleaned_file_path = os.path.join(output_directory, f"cleaned_{filename}")
        with open(cleaned_file_path, 'w', encoding='utf-8') as file:
            file.write(cleaned_text)

        print(f"Processed {filename} and saved to {cleaned_file_path}")  

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\dimi3\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


Processed Travelers_TCFDReport2019.txt and saved to Cleaned TCFD Texts/Traveler Cos TRV\cleaned_Travelers_TCFDReport2019.txt
Processed Travelers_TCFDReport2020.txt and saved to Cleaned TCFD Texts/Traveler Cos TRV\cleaned_Travelers_TCFDReport2020.txt
Processed Travelers_TCFDReport2021.txt and saved to Cleaned TCFD Texts/Traveler Cos TRV\cleaned_Travelers_TCFDReport2021.txt
Processed Travelers_TCFDReport2022.txt and saved to Cleaned TCFD Texts/Traveler Cos TRV\cleaned_Travelers_TCFDReport2022.txt
Processed Travelers_TCFDReport2023.txt and saved to Cleaned TCFD Texts/Traveler Cos TRV\cleaned_Travelers_TCFDReport2023.txt


### St. James Place

In [8]:
# Ensure the NLTK words corpus is downloaded
nltk.download('words')
english_vocab = set(words.words())

# Directory containing the text files
input_dir = "Example TCFD Texts/ST. JAMES place"
output_directory = 'Cleaned TCFD Texts/ST. JAMES place'

# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Define a function to clean the text
def clean_text(text, year):
    # Remove specific phrase with year
    text = re.sub(rf"St. James’s Place TCFD Report\s*{year}\s*\d*", "", text)

    # Remove specific sections for 2020
    if year == "2020":
        text = re.sub(r'\b(?:Introduction|Governance|Strategy|Risk Management|Metrics & Targets|Glossary)\b', "", text)

    # Remove numbered list items ending with a dot (e.g., "3." but not "3.1")
    text = re.sub(r'\b\d+\.(?!\d)', "", text)

    # Remove bullet points and specific characters
    text = re.sub(r'•||l|©', "", text)

    # Normalize spaced letters (e.g., e l e c t r i c v e h i c l e to electric vehicle)
    text = re.sub(r'\b(?:[a-zA-Z]\s)+[a-zA-Z]\b', lambda m: m.group(0).replace(' ', ''), text)

    # Split text into lines
    lines = text.splitlines()

    # Remove first 60 and last 80 rows
    if len(lines) > 140:
        lines = lines[61:-80]

    # Define patterns for removal
    patterns = [
        r'^\s*[a-zA-Z0-9]\)\s*',  # Lines starting with "c)" or "1)"
        r'^\s*#\d+',              # Lines starting with "#3"
        r'^\s*Scope\b.*',         # Lines starting with "Scope"
        r'^\s*[%$\-].*',          # Lines starting with "%", "$", or "-"
        r'^\s*Pages\s*\d+–\d+\s*$', # Lines like "Pages 44–45"
        r'^\s*Figure(\s*\d+\.)?$', # "Figure" alone or followed by number
    ]

    # Filter non-English words and special characters
    def filter_line(line):
        filtered_words = []
        for word in re.findall(r'\b\w+\b', line):
            if word.lower() in english_vocab or word.isdigit():
                filtered_words.append(word)
        return " ".join(filtered_words)

    # Apply filters to each line
    filtered_lines = []
    for line in lines:
        # Remove lines with only numbers, only letters, or combinations like 'S 0 1'
        if (re.match(r'^[0-9\s]+$', line.strip()) or 
            re.match(r'^[a-zA-Z\s]+$', line.strip()) or 
            re.match(r'^[a-zA-Z0-9\s]+$', line.strip())):
            continue
        
        # Only keep lines that do not match removal criteria
        if not any(re.match(pattern, line) for pattern in patterns):
            filtered_line = filter_line(line)
            if filtered_line:
                filtered_lines.append(filtered_line)

    # Join lines back into text
    text = "\n".join(filtered_lines)

    return text

# Process each text file in the directory
for filename in os.listdir(input_dir):
    if filename.endswith(".txt"):
        file_path = os.path.join(input_dir, filename)

        # Extract the year from the filename (assuming the format is consistent)
        year_match = re.search(r'\d{4}', filename)
        if year_match:
            year = year_match.group(0)
        else:
            continue

        # Read the file
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Clean the text
        cleaned_text = clean_text(text, year)

        # Save the cleaned text back to a file
        cleaned_file_path = os.path.join(output_directory, f"cleaned_{filename}")
        with open(cleaned_file_path, 'w', encoding='utf-8') as file:
            file.write(cleaned_text)

        print(f"Processed {filename} and saved to {cleaned_file_path}")

Processed sjp-tcfd-report-april2020.txt and saved to Cleaned TCFD Texts/ST. JAMES place\cleaned_sjp-tcfd-report-april2020.txt
Processed sjp-tcfd-report-april2021.txt and saved to Cleaned TCFD Texts/ST. JAMES place\cleaned_sjp-tcfd-report-april2021.txt
Processed SJP_TCFD_2023.txt and saved to Cleaned TCFD Texts/ST. JAMES place\cleaned_SJP_TCFD_2023.txt
Processed SJP_TCFD_Report_2022.txt and saved to Cleaned TCFD Texts/ST. JAMES place\cleaned_SJP_TCFD_Report_2022.txt


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\dimi3\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


## Organizing Cleaned TCFD Text Data

The script extracts and organizes cleaned TCFD text data, preparing it for further analysis. It performs the following steps:

- Data Extraction: Reads the text from each file and extracts the year and company name from the filename and directory structure.
- Data Compilation: Compiles the extracted data into a list.
- DataFrame Creation: Creates a DataFrame with columns for the file name, year, company name, and text content.
- Data Sorting: Converts the 'year' column to integers, handling any non-numeric values, and sorts the DataFrame by company name and year.

In [9]:
# Directory containing the cleaned text files
input_dir = "Cleaned TCFD Texts"

# Initialize a list to hold the extracted data
extracted_data = []

# Iterate over each text file in the directory
for company_dir in os.listdir(input_dir):
    company_path = os.path.join(input_dir, company_dir)
    if os.path.isdir(company_path):
        for filename in os.listdir(company_path):
            if filename.endswith(".txt"):
                file_path = os.path.join(company_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()
                    
                    # Extract the year and company name from the filename
                    year_match = re.search(r'\d{4}', filename)
                    year = year_match.group(0) if year_match else None
                    company_name = company_dir.replace("_", " ")

                    extracted_data.append([filename, year, company_name, text])

# Create a DataFrame with the extracted data
df_horizontal = pd.DataFrame(extracted_data, columns=['file_name', 'year', 'company_name', 'text'])

# Convert 'year' column to integer for sorting, handling None values
df_horizontal['year'] = pd.to_numeric(df_horizontal['year'], errors='coerce')

# Sort the DataFrame by 'company_name' and then by 'year'
df_horizontal = df_horizontal.sort_values(by=['company_name', 'year']).reset_index(drop=True)

# Print the DataFrame
pd.set_option('display.max_columns', None)
df_horizontal

Unnamed: 0,file_name,year,company_name,text
0,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,Committee s\nof the Net\nSt s Pace Report 2020...
1,cleaned_sjp-tcfd-report-april2021.txt,2021,ST. JAMES place,46 to our 2021 report\nto St s Pace Group s re...
2,cleaned_SJP_TCFD_Report_2022.txt,2022,ST. JAMES place,change is fast becoming one\nour It is a that ...
3,cleaned_SJP_TCFD_2023.txt,2023,ST. JAMES place,approach We are to our\nwi have a effect over ...
4,cleaned_Travelers_TCFDReport2019.txt,2019,Traveler Cos TRV,Severe weather over the last two have\nthe of ...
5,cleaned_Travelers_TCFDReport2020.txt,2020,Traveler Cos TRV,Severe weather over the last two have\nthe of ...
6,cleaned_Travelers_TCFDReport2021.txt,2021,Traveler Cos TRV,additional uncertainty as to future and Climat...
7,cleaned_Travelers_TCFDReport2022.txt,2022,Traveler Cos TRV,of climate For example the frequency and or se...
8,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,Severe weather over the last two have the\nof ...


## Text Cleaning and Post-processing

The script loads a spaCy model and performs text cleaning and post-processing. It includes functions to remove unwanted characters, stop words, numbers, punctuation, and to lemmatize the text. The cleaned text is stored in a new column in the DataFrame.

In [10]:
# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Function to clean text in the text_series
def clean_text(text_series):
    # Convert text_series df to list
    text_list = text_series.to_list()

    # Remove whitespaces and trailing spaces
    def remove_whitespace(text):
        pattern = re.compile(r'\s+')
        without_whitespace = re.sub(pattern, ' ', text)
        text = without_whitespace.replace('?', ' ? ').replace(')', ') ')
        text = text.strip()
        return text

    text_list = list(map(lambda x: remove_whitespace(x), text_list))

    # Apply NLP pipeline to remove stop words, numbers, and lemmatize the words
    cleaned_text_list = []
    for text in tqdm(text_list):  # or tqdm.tqdm
        doc = nlp(text)
        cleaned_text = " ".join([
            token.lemma_
            for token in doc
            if not token.is_stop
            and not token.like_num
            and not token.is_punct
            and token.is_alpha
        ])
        cleaned_text_list.append(cleaned_text)
    return cleaned_text_list

# Function to post-process the DataFrame
def post_process(df):
    # Create a new column 'text_clean'
    df['text_clean'] = df['text']

    # Remove unwanted characters and numeric values
    df['text_clean'] = df['text_clean'].str.replace(',', '', regex=False)
    df['text_clean'] = df['text_clean'].str.replace('.', '', regex=False)
    df['text_clean'] = df['text_clean'].str.replace('(', '', regex=False)
    df['text_clean'] = df['text_clean'].str.replace(')', '', regex=False)
    df['text_clean'] = df['text_clean'].str.replace(r'\d+\.\d+', '', regex=True)
    df['text_clean'] = df['text_clean'].str.replace('\d+', '', regex=True)
    df['text_clean'] = df['text_clean'].astype(str)

    return df

# Assuming df_horizontal is your DataFrame
df_horizontal = post_process(df_horizontal)

# Clean the 'text_clean' column
df_horizontal['text_clean'] = clean_text(df_horizontal['text_clean'])

pd.set_option('display.max_columns', None)
print(df_horizontal['file_name'].nunique())
df_horizontal


100%|██████████| 9/9 [00:07<00:00,  1.23it/s]

9





Unnamed: 0,file_name,year,company_name,text,text_clean
0,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,Committee s\nof the Net\nSt s Pace Report 2020...,Committee s Net St s Pace Report worth year ti...
1,cleaned_sjp-tcfd-report-april2021.txt,2021,ST. JAMES place,46 to our 2021 report\nto St s Pace Group s re...,report St s Pace Group s report wi take action...
2,cleaned_SJP_TCFD_Report_2022.txt,2022,ST. JAMES place,change is fast becoming one\nour It is a that ...,change fast address Foster Director Business i...
3,cleaned_SJP_TCFD_2023.txt,2023,ST. JAMES place,approach We are to our\nwi have a effect over ...,approach wi effect time Maria Spooner Director...
4,cleaned_Travelers_TCFDReport2019.txt,2019,Traveler Cos TRV,Severe weather over the last two have\nthe of ...,severe weather future climate climate add freq...
5,cleaned_Travelers_TCFDReport2020.txt,2020,Traveler Cos TRV,Severe weather over the last two have\nthe of ...,severe weather future climate climate add freq...
6,cleaned_Travelers_TCFDReport2021.txt,2021,Traveler Cos TRV,additional uncertainty as to future and Climat...,additional uncertainty future climate governme...
7,cleaned_Travelers_TCFDReport2022.txt,2022,Traveler Cos TRV,of climate For example the frequency and or se...,climate example frequency severity hurricane t...
8,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,Severe weather over the last two have the\nof ...,severe weather climate example frequency sever...


## Noun Lemmatization of Cleaned TCFDs

The code applies lemmatization to extract nouns from the 'text_clean' column using spaCy and stores the results in a new column 'text_noun'.

In [11]:
# Load spaCy model
nlp = spacy.load("en_core_web_lg", disable=["parser", "ner"])

# Lemmatization function
def lemmatization(texts, allowed_postags=["NOUN"]):
    doc = nlp(texts)
    new_text = []
    for token in doc:
        if token.pos_ in allowed_postags:
            new_text.append(token.lemma_)
    final = " ".join(new_text)
    return final

# Apply lemmatization to the 'text_clean' column
df_horizontal['text_noun'] = df_horizontal['text_clean'].apply(lemmatization)

# Display the DataFrame
pd.set_option('display.max_columns', None)
print(df_horizontal['file_name'].nunique())
df_horizontal

9


Unnamed: 0,file_name,year,company_name,text,text_clean,text_noun
0,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,Committee s\nof the Net\nSt s Pace Report 2020...,Committee s Net St s Pace Report worth year ti...,year time commitment term change business repo...
1,cleaned_sjp-tcfd-report-april2021.txt,2021,ST. JAMES place,46 to our 2021 report\nto St s Pace Group s re...,report St s Pace Group s report wi take action...,report action change business home page report...
2,cleaned_SJP_TCFD_Report_2022.txt,2022,ST. JAMES place,change is fast becoming one\nour It is a that ...,change fast address Foster Director Business i...,address introduction statement business parent...
3,cleaned_SJP_TCFD_2023.txt,2023,ST. JAMES place,approach We are to our\nwi have a effect over ...,approach wi effect time Maria Spooner Director...,effect time business unit trust manager whist ...
4,cleaned_Travelers_TCFDReport2019.txt,2019,Traveler Cos TRV,Severe weather over the last two have\nthe of ...,severe weather future climate climate add freq...,weather climate climate severity insurance com...
5,cleaned_Travelers_TCFDReport2020.txt,2020,Traveler Cos TRV,Severe weather over the last two have\nthe of ...,severe weather future climate climate add freq...,weather climate climate severity uncertainty i...
6,cleaned_Travelers_TCFDReport2021.txt,2021,Traveler Cos TRV,additional uncertainty as to future and Climat...,additional uncertainty future climate governme...,uncertainty climate government catastrophe mod...
7,cleaned_Travelers_TCFDReport2022.txt,2022,Traveler Cos TRV,of climate For example the frequency and or se...,climate example frequency severity hurricane t...,example frequency severity wildfire time perio...
8,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,Severe weather over the last two have the\nof ...,severe weather climate example frequency sever...,weather climate example frequency severity wil...


## Tokenizing and Filtering

The script performs detailed text tokenization and filtering on TCFD disclosures. The steps involve defining functions to clean, tokenize text and removing general words that are not useful for analysis.

In [12]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def clean_tokens_noun(text_series):
    # Step 1: Convert text_series df to list
    text_list = text_series.to_list()

    # Step 2: Change the list to lower case
    text_list = list(map(lambda x: x.lower(), text_list))

    # Step 3: Remove whitespaces and trailing spaces
    def remove_whitespace(text):
        pattern = re.compile(r'\s+')
        without_whitespace = re.sub(pattern, ' ', text)
        text = without_whitespace.replace('?', ' ? ').replace(')', ') ')
        text = text.strip()
        return text

    text_list = list(map(lambda x: remove_whitespace(x), text_list))

    # Create column for cleaned text_list
    tokens, tmp_tokens = [], []
    for text in tqdm(text_list):
        tmp_tokens = [
            token.lemma_
            for token in nlp(text)
            if not token.is_stop
            and not token.like_num
            and not token.is_punct
            and token.is_alpha
        ]
        tokens.append(tmp_tokens)
        tmp_tokens = []
    return tokens

# Use apply to get the token of the text_noun column
df_horizontal['token'] = clean_tokens_noun(df_horizontal['text_noun'])
print(len(df_horizontal))
df_horizontal


100%|██████████| 9/9 [00:00<00:00, 11.53it/s]

9





Unnamed: 0,file_name,year,company_name,text,text_clean,text_noun,token
0,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,Committee s\nof the Net\nSt s Pace Report 2020...,Committee s Net St s Pace Report worth year ti...,year time commitment term change business repo...,"[year, time, commitment, term, change, busines..."
1,cleaned_sjp-tcfd-report-april2021.txt,2021,ST. JAMES place,46 to our 2021 report\nto St s Pace Group s re...,report St s Pace Group s report wi take action...,report action change business home page report...,"[report, action, change, business, home, page,..."
2,cleaned_SJP_TCFD_Report_2022.txt,2022,ST. JAMES place,change is fast becoming one\nour It is a that ...,change fast address Foster Director Business i...,address introduction statement business parent...,"[address, introduction, statement, business, p..."
3,cleaned_SJP_TCFD_2023.txt,2023,ST. JAMES place,approach We are to our\nwi have a effect over ...,approach wi effect time Maria Spooner Director...,effect time business unit trust manager whist ...,"[effect, time, business, unit, trust, manager,..."
4,cleaned_Travelers_TCFDReport2019.txt,2019,Traveler Cos TRV,Severe weather over the last two have\nthe of ...,severe weather future climate climate add freq...,weather climate climate severity insurance com...,"[weather, climate, climate, severity, insuranc..."
5,cleaned_Travelers_TCFDReport2020.txt,2020,Traveler Cos TRV,Severe weather over the last two have\nthe of ...,severe weather future climate climate add freq...,weather climate climate severity uncertainty i...,"[weather, climate, climate, severity, uncertai..."
6,cleaned_Travelers_TCFDReport2021.txt,2021,Traveler Cos TRV,additional uncertainty as to future and Climat...,additional uncertainty future climate governme...,uncertainty climate government catastrophe mod...,"[uncertainty, climate, government, catastrophe..."
7,cleaned_Travelers_TCFDReport2022.txt,2022,Traveler Cos TRV,of climate For example the frequency and or se...,climate example frequency severity hurricane t...,example frequency severity wildfire time perio...,"[example, frequency, severity, wildfire, time,..."
8,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,Severe weather over the last two have the\nof ...,severe weather climate example frequency sever...,weather climate example frequency severity wil...,"[weather, climate, example, frequency, severit..."


In [13]:
# Drop the words that are too general for earnings calls
general_words = [
    'lady', 'gentleman', 'presentation',
    'question', 'answer', 'slide', 'mm', 'mm_mm', 'guy', 'sir', ' ', 'ytd', 'host_sir',
    'bb', 'ty', 'word', 'year', 'quer', 'month', 'period', 'day', 'time', 'result',
    'investor', 'week', 'update', 'business', 'lot', 'ratio', 'rate', 'quarter',
    'number', 'point', 'term', 'thing', 'level', 'bit', 'sort', 'reason', 'management',
    'fact', 'case', 'area', 'people', 'sense', 'item', 'issue', 'market', 'meeting',
    'questions', 'answers', 'managements', 'discussion', 'section', 'presentation', 
    'speaker', 'participant', 'afternoon', 'morning', 'conference', 'today', 'lady', 
    'gentleman', 'presentation', 'question', 'answer', 'slide',
    'mm', 'mm_mm', 'guy', 'sir', 'host_sir', 'bb', 'ty', 'word', 'year', 'quer',
    'month', 'period', 'day', 'time', 'result', 'investor', 'week', 'update', 'business', 'lot', 'ratio', 'rate', 'quarter',
    'number', 'point', 'term', 'thing', 'level', 'bit', 'sort', 'reason', 'management',
    'fact', 'case', 'area', 'people', 'sense', 'item', 'issue', 'market', 'earnings',
    'report', 'financial', 'results', 'quarterly', 'performance', 'guidance', 'statement',
    'outlook', 'projection', 'profit', 'loss', 'revenue', 'sales', 'expense', 'income',
    'cash', 'flow', 'margin', 'growth', 'decline', 'increase', 'decrease', 'forecast',
    'expectation', 'trend', 'metric', 'indicator', 'shareholder', 'stock', 'price', 'value',
    'equity', 'debt', 'asset', 'liability', 'balance', 'sheet', 'capital', 'investment',
    'portfolio', 'dividend', 'yield', 'ratio', 'return', 'earnings', 'per', 'share', 'EPS',
    'acquisition', 'merger', 'synergy', 'integration', 'strategy', 'execution', 'plan',
    'objective', 'goal', 'target', 'vision', 'mission', 'operation', 'process', 'initiative',
    'efficiency', 'optimization', 'innovation', 'technology', 'product', 'service', 'customer',
    'client', 'market', 'segment', 'competition', 'competitor', 'industry', 'sector', 'environment',
    'regulation', 'compliance', 'risk', 'opportunity', 'challenge', 'threat', 'advantage',
    'disadvantage', 'strength', 'weakness', 'SWOT', 'analysis', 'review', 'summary',
    'highlight', 'detail', 'report', 'note', 'comment', 'announcement', 'release', 'update','tcfd, Severe wind and hail n an a n an a, Hurricane n a n a, Winter storm n a n a'
]

# Apply the filtering to your DataFrame
df_horizontal['token'] = df_horizontal['token'].apply(lambda x: [i for i in x if i not in general_words])

# Display the updated DataFrame
df_horizontal


Unnamed: 0,file_name,year,company_name,text,text_clean,text_noun,token
0,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,Committee s\nof the Net\nSt s Pace Report 2020...,Committee s Net St s Pace Report worth year ti...,year time commitment term change business repo...,"[commitment, change, scope, change, pace, addi..."
1,cleaned_sjp-tcfd-report-april2021.txt,2021,ST. JAMES place,46 to our 2021 report\nto St s Pace Group s re...,report St s Pace Group s report wi take action...,report action change business home page report...,"[action, change, home, page, desire, head, cha..."
2,cleaned_SJP_TCFD_Report_2022.txt,2022,ST. JAMES place,change is fast becoming one\nour It is a that ...,change fast address Foster Director Business i...,address introduction statement business parent...,"[address, introduction, parent, company, unit,..."
3,cleaned_SJP_TCFD_2023.txt,2023,ST. JAMES place,approach We are to our\nwi have a effect over ...,approach wi effect time Maria Spooner Director...,effect time business unit trust manager whist ...,"[effect, unit, trust, manager, whist, company,..."
4,cleaned_Travelers_TCFDReport2019.txt,2019,Traveler Cos TRV,Severe weather over the last two have\nthe of ...,severe weather future climate climate add freq...,weather climate climate severity insurance com...,"[weather, climate, climate, severity, insuranc..."
5,cleaned_Travelers_TCFDReport2020.txt,2020,Traveler Cos TRV,Severe weather over the last two have\nthe of ...,severe weather future climate climate add freq...,weather climate climate severity uncertainty i...,"[weather, climate, climate, severity, uncertai..."
6,cleaned_Travelers_TCFDReport2021.txt,2021,Traveler Cos TRV,additional uncertainty as to future and Climat...,additional uncertainty future climate governme...,uncertainty climate government catastrophe mod...,"[uncertainty, climate, government, catastrophe..."
7,cleaned_Travelers_TCFDReport2022.txt,2022,Traveler Cos TRV,of climate For example the frequency and or se...,climate example frequency severity hurricane t...,example frequency severity wildfire time perio...,"[example, frequency, severity, wildfire, clima..."
8,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,Severe weather over the last two have the\nof ...,severe weather climate example frequency sever...,weather climate example frequency severity wil...,"[weather, climate, example, frequency, severit..."


## Extracting and Cleaning Frequent Words

The script extracts the 50 most frequent words from the TCFD disclosures for each company and cleans the tokens by removing these frequent words. 

In [14]:
# Initialize a list to hold the results
most_frequent_words = []

# Iterate over each unique company
for company in df_horizontal['company_name'].unique():
    # Filter the DataFrame for the current company
    company_df = df_horizontal[df_horizontal['company_name'] == company]

    # Iterate over each text file for the current company
    for _, row in company_df.iterrows():
        text = row['text']
        file_name = row['file_name']

        # Tokenize and clean the text
        tokens = clean_tokens_noun(pd.Series([text]))[0]  # Using your existing function

        # Calculate word frequency
        word_freq = Counter(tokens).most_common(50)

        # Append the results to the list
        most_frequent_words.append([file_name, company, word_freq])

# Create a DataFrame with the results
frequent_words_df = pd.DataFrame(most_frequent_words, columns=['file_name', 'company_name', 'word_freq'])

# Display the DataFrame
frequent_words_df

100%|██████████| 1/1 [00:00<00:00,  7.02it/s]
100%|██████████| 1/1 [00:00<00:00,  3.82it/s]
100%|██████████| 1/1 [00:00<00:00,  3.58it/s]
100%|██████████| 1/1 [00:00<00:00,  2.48it/s]
100%|██████████| 1/1 [00:00<00:00,  3.32it/s]
100%|██████████| 1/1 [00:00<00:00,  3.39it/s]
100%|██████████| 1/1 [00:00<00:00,  1.75it/s]
100%|██████████| 1/1 [00:00<00:00,  1.77it/s]
100%|██████████| 1/1 [00:00<00:00,  1.71it/s]


Unnamed: 0,file_name,company_name,word_freq
0,cleaned_sjp-tcfd-report-april2020.txt,ST. JAMES place,"[(s, 73), (risk, 56), (pace, 53), (carbon, 45)..."
1,cleaned_sjp-tcfd-report-april2021.txt,ST. JAMES place,"[(risk, 98), (s, 77), (business, 70), (carbon,..."
2,cleaned_SJP_TCFD_Report_2022.txt,ST. JAMES place,"[(risk, 144), (s, 94), (investment, 93), (carb..."
3,cleaned_SJP_TCFD_2023.txt,ST. JAMES place,"[(risk, 193), (s, 118), (investment, 111), (bu..."
4,cleaned_Travelers_TCFDReport2019.txt,Traveler Cos TRV,"[(risk, 103), (climate, 100), (relate, 66), (c..."
5,cleaned_Travelers_TCFDReport2020.txt,Traveler Cos TRV,"[(climate, 103), (risk, 102), (relate, 67), (c..."
6,cleaned_Travelers_TCFDReport2021.txt,Traveler Cos TRV,"[(risk, 192), (climate, 185), (energy, 135), (..."
7,cleaned_Travelers_TCFDReport2022.txt,Traveler Cos TRV,"[(risk, 198), (climate, 186), (energy, 130), (..."
8,cleaned_Travelers_TCFDReport2023.txt,Traveler Cos TRV,"[(risk, 188), (climate, 169), (energy, 141), (..."


In [15]:
# Make sure to download the words corpus if you haven't already
nltk.download('words')

# Initialize a list to hold the results
most_frequent_words = []

# Iterate over each unique company
for company in df_horizontal['company_name'].unique():
    # Filter the DataFrame for the current company
    company_df = df_horizontal[df_horizontal['company_name'] == company]

    # Iterate over each text file for the current company
    for _, row in company_df.iterrows():
        text = row['text_noun']
        file_name = row['file_name']

        # Tokenize and clean the text
        tokens = clean_tokens_noun(pd.Series([text]))[0]  # Using your existing function

        # Calculate word frequency
        word_freq = Counter(tokens).most_common(50)

        # Append the results to the list
        most_frequent_words.append([file_name, company, word_freq])

# Create a DataFrame with the results
frequent_words_df = pd.DataFrame(most_frequent_words, columns=['file_name', 'company_name', 'word_freq'])

# Remove the most frequent words from tokens
cleaned_tokens = []

for index, row in df_horizontal.iterrows():
    text = row['text_noun']
    tokens = clean_tokens_noun(pd.Series([text]))[0]

    # Calculate word frequency for the current text
    word_freq = Counter(tokens).most_common(50)
    most_frequent_words = [word for word, freq in word_freq]

    # Remove the most frequent words from tokens
    tokens = [word for word in tokens if word not in most_frequent_words]

    # Append the cleaned tokens to the list
    cleaned_tokens.append(tokens)

# Update the 'token' column with cleaned tokens
df_horizontal['token'] = cleaned_tokens

# Display the DataFrame
df_horizontal

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\dimi3\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
100%|██████████| 1/1 [00:00<00:00, 36.25it/s]
100%|██████████| 1/1 [00:00<00:00, 19.31it/s]
100%|██████████| 1/1 [00:00<00:00, 17.15it/s]
100%|██████████| 1/1 [00:00<00:00, 10.76it/s]
100%|██████████| 1/1 [00:00<00:00, 18.97it/s]
100%|██████████| 1/1 [00:00<00:00, 14.32it/s]
100%|██████████| 1/1 [00:00<00:00,  7.70it/s]
100%|██████████| 1/1 [00:00<00:00,  7.47it/s]
100%|██████████| 1/1 [00:00<00:00,  7.09it/s]
100%|██████████| 1/1 [00:00<00:00, 32.10it/s]
100%|██████████| 1/1 [00:00<00:00, 20.41it/s]
100%|██████████| 1/1 [00:00<00:00, 16.39it/s]
100%|██████████| 1/1 [00:00<00:00, 10.38it/s]
100%|██████████| 1/1 [00:00<00:00, 21.10it/s]
100%|██████████| 1/1 [00:00<00:00, 15.52it/s]
100%|██████████| 1/1 [00:00<00:00,  6.99it/s]
100%|██████████| 1/1 [00:00<00:00,  7.90it/s]
100%|██████████| 1/1 [00:00<00:00,  7.09it/s]


Unnamed: 0,file_name,year,company_name,text,text_clean,text_noun,token
0,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,Committee s\nof the Net\nSt s Pace Report 2020...,Committee s Net St s Pace Report worth year ti...,year time commitment term change business repo...,"[time, summary, overview, roe, key, considerat..."
1,cleaned_sjp-tcfd-report-april2021.txt,2021,ST. JAMES place,46 to our 2021 report\nto St s Pace Group s re...,report St s Pace Group s report wi take action...,report action change business home page report...,"[action, home, desire, head, summary, factor, ..."
2,cleaned_SJP_TCFD_Report_2022.txt,2022,ST. JAMES place,change is fast becoming one\nour It is a that ...,change fast address Foster Director Business i...,address introduction statement business parent...,"[address, introduction, statement, parent, uni..."
3,cleaned_SJP_TCFD_2023.txt,2023,ST. JAMES place,approach We are to our\nwi have a effect over ...,approach wi effect time Maria Spooner Director...,effect time business unit trust manager whist ...,"[effect, unit, trust, singe, subsidiary, text,..."
4,cleaned_Travelers_TCFDReport2019.txt,2019,Traveler Cos TRV,Severe weather over the last two have\nthe of ...,severe weather future climate climate add freq...,weather climate climate severity insurance com...,"[casualty, role, organization, evaluation, ide..."
5,cleaned_Travelers_TCFDReport2020.txt,2020,Traveler Cos TRV,Severe weather over the last two have\nthe of ...,severe weather future climate climate add freq...,weather climate climate severity uncertainty i...,"[company, casualty, core, provide, price, prot..."
6,cleaned_Travelers_TCFDReport2021.txt,2021,Traveler Cos TRV,additional uncertainty as to future and Climat...,additional uncertainty future climate governme...,uncertainty climate government catastrophe mod...,"[government, experience, frequency, intensity,..."
7,cleaned_Travelers_TCFDReport2022.txt,2022,Traveler Cos TRV,of climate For example the frequency and or se...,climate example frequency severity hurricane t...,example frequency severity wildfire time perio...,"[frequency, severity, wildfire, period, severi..."
8,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,Severe weather over the last two have the\nof ...,severe weather climate example frequency sever...,weather climate example frequency severity wil...,"[frequency, severity, wildfire, period, severi..."


## Creating and Saving the Final DataFrames

This script processes the dataframe by splitting texts into chunks of about 150 words, treating each chunk as a paragraph. It groups data by company and applies the splits. The resulting final DataFrames are saved as separate CSV files for each company.

In [16]:
# Create the final DataFrame with the specified columns
tcfd = df_horizontal[['file_name', 'year', 'company_name', 'token', 'text']]

# Rename the columns according to your specifications
tcfd.columns = ['file_name', 'date', 'company_name', 'token', 'text']

# Display the DataFrame
tcfd

Unnamed: 0,file_name,date,company_name,token,text
0,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,"[time, summary, overview, roe, key, considerat...",Committee s\nof the Net\nSt s Pace Report 2020...
1,cleaned_sjp-tcfd-report-april2021.txt,2021,ST. JAMES place,"[action, home, desire, head, summary, factor, ...",46 to our 2021 report\nto St s Pace Group s re...
2,cleaned_SJP_TCFD_Report_2022.txt,2022,ST. JAMES place,"[address, introduction, statement, parent, uni...",change is fast becoming one\nour It is a that ...
3,cleaned_SJP_TCFD_2023.txt,2023,ST. JAMES place,"[effect, unit, trust, singe, subsidiary, text,...",approach We are to our\nwi have a effect over ...
4,cleaned_Travelers_TCFDReport2019.txt,2019,Traveler Cos TRV,"[casualty, role, organization, evaluation, ide...",Severe weather over the last two have\nthe of ...
5,cleaned_Travelers_TCFDReport2020.txt,2020,Traveler Cos TRV,"[company, casualty, core, provide, price, prot...",Severe weather over the last two have\nthe of ...
6,cleaned_Travelers_TCFDReport2021.txt,2021,Traveler Cos TRV,"[government, experience, frequency, intensity,...",additional uncertainty as to future and Climat...
7,cleaned_Travelers_TCFDReport2022.txt,2022,Traveler Cos TRV,"[frequency, severity, wildfire, period, severi...",of climate For example the frequency and or se...
8,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,"[frequency, severity, wildfire, period, severi...",Severe weather over the last two have the\nof ...


In [17]:
# Create the final DataFrame with the specified columns
tcfd = df_horizontal[['file_name', 'year', 'company_name', 'token', 'text']]

# Rename the columns according to your specifications
tcfd.columns = ['file_name', 'date', 'company_name', 'token', 'paragraph']

# Group by 'company_name' to create separate DataFrames for each company
company_dfs = {}
for company in tcfd['company_name'].unique():
    company_df = tcfd[tcfd['company_name'] == company][['file_name', 'date', 'token', 'paragraph']]
    company_name_cleaned = "_".join(company.split()[:2]).replace('(', '').replace(')', '').replace('.', '').replace(',', '').replace("'", "") + '_tcfd_df'
    company_dfs[company_name_cleaned] = company_df

# Function to get DataFrame by company name
def get_company_df(company_name):
    company_name_cleaned = "_".join(company_name.split()[:2]).replace('(', '').replace(')', '').replace('.', '').replace(',', '').replace("'", "") + '_tcfd_df'
    return company_dfs.get(company_name_cleaned, None)

# Function to split text into chunks of approximately 'chunk_size' words
def split_into_chunks(text, chunk_size=150):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

# Function to split tokens into chunks corresponding to text chunks
def split_tokens(tokens, chunk_size=150):
    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
    return chunks

# Function to apply the split to the DataFrame and expand the rows
def split_text_and_tokens_in_df(df, chunk_size=150):
    rows = []
    for _, row in df.iterrows():
        text_chunks = split_into_chunks(row['paragraph'], chunk_size)
        token_chunks = split_tokens(row['token'], chunk_size)
        for text_chunk, token_chunk in zip(text_chunks, token_chunks):
            new_row = row.copy()
            new_row['paragraph'] = text_chunk
            new_row['token'] = token_chunk
            rows.append(new_row)
    expanded_df = pd.DataFrame(rows)
    expanded_df.reset_index(drop=True, inplace=True)
    return expanded_df

# Apply the function to split the text and tokens into chunks and expand the DataFrame for each company
for company_name in company_dfs.keys():
    company_dfs[company_name] = split_text_and_tokens_in_df(company_dfs[company_name])

# Save each company's DataFrame to a CSV file
output_dir = "tcfd_company_csvs"
os.makedirs(output_dir, exist_ok=True)

for company_name, df in company_dfs.items():
    csv_path = os.path.join(output_dir, f"{company_name}.csv")
    df.to_csv(csv_path, index=False)