# Pre-Processing Task Force on Climate-related Financial Disclosures

### This script demonstrates the pre-processing of TCF disclosures for two insurance companies, Travelers Cos (TRV) and St. James's Place, preparing them for further NLP analysis. Task Force on Climate-related Financial Disclosures, provide detailed information on how companies are addressing climate-related risks and opportunities. The transcripts were downloaded from each company's website. Their pre-processing is intricate due to the varied formats and detailed nature of the disclosures, necessitating meticulous handling to ensure data consistency.

## Libraries

In [1]:
import os
import fnmatch
import PyPDF2
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
from tqdm import tqdm
import spacy
from nltk.corpus import words
from collections import Counter

## Checking for TCFD Reports in Company Directories

The code defines a function to search for Task Force on Climate-related Financial Disclosures (TCFD) reports within a specified base directory. It looks through directories for banks and insurers, specifically searching for PDF files containing "tcfd report" in their names. Some companies include their TCFD reports inside their annual reports, but this script specifically searches for tailored TCFD reports to ensure focused and relevant data collection. The results are returned in a dictionary structure, organized by sector and company.

In [2]:
def check_tcfd_reports(base_dir):
    results = {}
    for sector in ["Banks", "Insurers"]:
        sector_path = os.path.join(base_dir, sector)
        if not os.path.isdir(sector_path):
            continue
        
        companies = os.listdir(sector_path)
        for company in companies:
            company_path = os.path.join(sector_path, company)
            if not os.path.isdir(company_path):
                continue
            
            tcfd_path = os.path.join(company_path, "TCFD")
            if not os.path.isdir(tcfd_path):
                continue
            
            # Look for PDF files with "tcfd report" in their name (case insensitive)
            for root, dirs, files in os.walk(tcfd_path):
                for file in files:
                    if file.lower().endswith(".pdf") and fnmatch.fnmatch(file.lower(), "*tcfd*.pdf"):
                        if sector not in results:
                            results[sector] = {}
                        if company not in results[sector]:
                            results[sector][company] = []
                        results[sector][company].append(os.path.join(root, file))
    
    return results

base_dir = 'ARP BOE'
report_files = check_tcfd_reports(base_dir)
for sector, companies in report_files.items():
    for company, files in companies.items():
        print(f"Sector: {sector}, Company: {company}")
        for file in files:
            print(f"  {file}")

Sector: Insurers, Company: ST. JAMES place
  ARP BOE\Insurers\ST. JAMES place\TCFD\sjp-tcfd-report-april2020.pdf
  ARP BOE\Insurers\ST. JAMES place\TCFD\sjp-tcfd-report-april2021.pdf
  ARP BOE\Insurers\ST. JAMES place\TCFD\SJP_TCFD_2023.pdf
  ARP BOE\Insurers\ST. JAMES place\TCFD\SJP_TCFD_Report_2022.pdf
Sector: Insurers, Company: Traveler Cos TRV
  ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2019.pdf
  ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2020.pdf
  ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2021.pdf
  ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2022.pdf
  ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2023.pdf


## Extracting text out of the PDFs 

In [3]:
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            if reader.is_encrypted:
                reader.decrypt('')
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                text += page.extract_text()
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return text

def traverse_and_extract_text_for_companies(root_dir, output_dir, companies, max_files_per_folder=10):
    total_size = 0

    for sector in companies:
        for company in companies[sector]:
            tcfd_path = os.path.join(root_dir, sector, company, "TCFD")
            if os.path.isdir(tcfd_path):
                company_output_dir = os.path.join(output_dir, company)
                os.makedirs(company_output_dir, exist_ok=True)
                
                file_count = 0
                for root, dirs, files in os.walk(tcfd_path):
                    for file in files:
                        if file.lower().endswith(".pdf") and "tcfd" in file.lower() and file_count < max_files_per_folder:
                            pdf_path = os.path.join(root, file)
                            file_size = os.path.getsize(pdf_path)
                            text = extract_text_from_pdf(pdf_path)

                            if text:
                                # Save the text as a .txt file
                                output_file = os.path.join(company_output_dir, f"{os.path.splitext(file)[0]}.txt")
                                with open(output_file, 'w', encoding='utf-8') as txt_file:
                                    txt_file.write(text)
                                print(f"Saved text for {pdf_path} to {output_file}")

                                total_size += file_size
                                file_count += 1

                            if file_count >= max_files_per_folder:
                                break

    return total_size

# Root directory path
root_dir = "ARP BOE"
# Output directory path
output_dir = "TCFD Texts"

# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Specified companies
companies = {
    "Insurers": ["Traveler Cos TRV", "ST. JAMES place"],
}

total_size = traverse_and_extract_text_for_companies(root_dir, output_dir, companies)

Saved text for ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2019.pdf to TCFD Texts\Traveler Cos TRV\Travelers_TCFDReport2019.txt
Saved text for ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2020.pdf to TCFD Texts\Traveler Cos TRV\Travelers_TCFDReport2020.txt
Saved text for ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2021.pdf to TCFD Texts\Traveler Cos TRV\Travelers_TCFDReport2021.txt
Saved text for ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2022.pdf to TCFD Texts\Traveler Cos TRV\Travelers_TCFDReport2022.txt
Saved text for ARP BOE\Insurers\Traveler Cos TRV\TCFD\Travelers_TCFDReport2023.pdf to TCFD Texts\Traveler Cos TRV\Travelers_TCFDReport2023.txt
Saved text for ARP BOE\Insurers\ST. JAMES place\TCFD\sjp-tcfd-report-april2020.pdf to TCFD Texts\ST. JAMES place\sjp-tcfd-report-april2020.txt
Saved text for ARP BOE\Insurers\ST. JAMES place\TCFD\sjp-tcfd-report-april2021.pdf to TCFD Texts\ST. JAMES place\sjp-tcfd-report-april2021.txt

## Tailored Cleaning and Pre-processing

This script demonstrates the cleaning of the TCFDs for Travelers Cos (TRV) and St. James's Place. The script performs several tasks:

- Define Cleaning Functions: Includes functions to clean the text by removing headers, footers, special characters and unnecessary spaces.
- Specific Cleaning for Each Company: Custom cleaning functions are tailored for the unique formats of the disclosures from each company.
- Save Cleaned Texts: The cleaned text files are saved to specified output directories, maintaining the directory structure from the input.

### Traveler Cos TRV

In [4]:
# Directory containing the text files
input_dir = "TCFD Texts/Traveler Cos TRV"
output_directory = 'Cleaned TCFD Texts/Traveler Cos TRV'

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Define a function to clean the text
def clean_text(text, year):
    # Remove specific phrase with year
    text = re.sub(rf"Travelers Task Force on Climate-related Financial Disclosures Report\s*{year}\s*\d*", "", text)

    # Remove numbered list items ending with a dot (e.g., "3." but not "3.1")
    text = re.sub(r'\b\d+\.(?!\d)', "", text)

    # Remove bullet points
    text = re.sub(r'•', "", text)

    # Remove text starting from "Important Legal Information"
    text = re.split(r"Important Legal Information", text)[0]

    # Remove "Figure" followed by number and dot or if it is alone in a row
    text = re.sub(r'Figure(\s*\d+\.)?', "", text)

    # Remove bracketed numbers like [1]
    text = re.sub(r'\[\d+\]', "", text)

    # Remove inline numbers within words (e.g., "1kill")
    text = re.sub(r'(?<=\D)\d+(?=\D)', "", text)

    # Normalize spaced letters (e.g., e l e c t r i c v e h i c l e to electric vehicle)
    text = re.sub(r'\b(?:[a-zA-Z]\s)+[a-zA-Z]\b', lambda m: m.group(0).replace(' ', ''), text)

    # Perform additional string replacements
    text = re.sub(r'\*', '', text)
    text = re.sub(r'\[|\]', '', text)
    text = re.sub(r':', '', text)
    text = re.sub(r'- Q', '', text)
    text = re.sub(r'\}', '', text)
    text = re.sub(r'<|>', '', text)

    # Correct words with a full stop between them without space
    text = re.sub(r'(\w)\.(\w)', r'\1. \2', text)

    # Remove specific unwanted strings
    text = re.sub(r'‘‘’|’|n/an/a|n/a|®|-K|\(a\)|costs\.sustainability\.travelers\. com|- -|\(\)|\(\$,\)', '', text)

    # Remove patterns like /one. lnum, /two. lnum, /parenleft. lnum, etc.
    text = re.sub(r'/[a-zA-Z]+\. lnum', '', text)

    # Split text into lines
    lines = text.splitlines()

    # Remove rows based on the year
    if year == "2019" or year == "2020":
        lines = lines[3:]  # Remove the first 3 rows
    elif year == "2021":
        lines = lines[11:]  # Remove the first 11 rows
    elif year == "2022":
        lines = lines[13:]  # Remove the first 13 rows
    elif year == "2023":
        lines = lines[15:]  # Remove the first 15 rows

    # Remove rows that start with `.%,`, `$.,`, `–`, and any pattern with `(-in-)`
    def is_valid_line(line):
        line = line.strip()  # Remove leading and trailing whitespace
        
        # Check for patterns to remove
        if re.match(r'^\s*[\.\%,\$—]', line):
            return False
        
        # Remove lines with patterns like `.%`, `$.`, `(-in-)`, and consecutive symbols like `%%`, `$$`, `$,$,$,`
        if re.search(r'\.%|\$\.\s|\(-in-\)|%%|\$\$|\$, \$, \$,', line):
            return False
        
        # Remove lines with ™ symbols
        if '™' in line:
            return False
        
        # Remove lines with multiple consecutive commas or full stops
        if re.search(r'([.,]\s*){2,}', line):
            return False

        # Remove lines with only one word or only numbers
        if len(line.split()) <= 1 or re.fullmatch(r'\d+', line.strip()):
            return False

        return True

    # Apply the validation function to filter lines
    filtered_lines = [line for line in lines if is_valid_line(line)]

    # Join lines back into text
    text = "\n".join(filtered_lines)

    return text

# Process each text file in the directory
for filename in os.listdir(input_dir):
    if filename.endswith(".txt"):
        file_path = os.path.join(input_dir, filename)

        # Extract the year from the filename (assuming the format is consistent)
        year_match = re.search(r'\d{4}', filename)
        if year_match:
            year = year_match.group(0)
        else:
            continue

        # Read the file
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Clean the text
        cleaned_text = clean_text(text, year)

        # Save the cleaned text back to a file
        cleaned_file_path = os.path.join(output_directory, f"cleaned_{filename}")
        with open(cleaned_file_path, 'w', encoding='utf-8') as file:
            file.write(cleaned_text)

        print(f"Processed {filename} and saved to {cleaned_file_path}")

Processed Travelers_TCFDReport2019.txt and saved to Cleaned TCFD Texts/Traveler Cos TRV\cleaned_Travelers_TCFDReport2019.txt
Processed Travelers_TCFDReport2020.txt and saved to Cleaned TCFD Texts/Traveler Cos TRV\cleaned_Travelers_TCFDReport2020.txt
Processed Travelers_TCFDReport2021.txt and saved to Cleaned TCFD Texts/Traveler Cos TRV\cleaned_Travelers_TCFDReport2021.txt
Processed Travelers_TCFDReport2022.txt and saved to Cleaned TCFD Texts/Traveler Cos TRV\cleaned_Travelers_TCFDReport2022.txt
Processed Travelers_TCFDReport2023.txt and saved to Cleaned TCFD Texts/Traveler Cos TRV\cleaned_Travelers_TCFDReport2023.txt


### St. James Place

In [5]:
# Directory containing the text files
input_dir = "TCFD Texts/ST. JAMES place"
output_directory = 'Cleaned TCFD Texts/ST. JAMES place'

# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Define a function to clean the text
def clean_text(text, year):
    
    # Remove specific phrase with year
    text = re.sub(rf"St. James’s Place TCFD Report\s*{year}\s*\d*", "", text)

    # Remove specific sections for 2020, including pipes "|"
    if year == "2020":
        text = re.sub(r'\b(?:Introduction|Governance|Strategy|Risk Management|Metrics & Targets|Glossary)\b', "", text)
        text = text.replace("|", "")

        # Remove numbers at the beginning of a word (e.g., "9cute" becomes "cute")
        text = re.sub(r'\b\d+([a-zA-Z]+)', r'\1', text)
        
        # Remove patterns like "2           St. James’s Place    TCFD Report"
        text = re.sub(r'^\d+\s+St\. James’s Place\s+TCFD Report.*$', '', text, flags=re.MULTILINE)

    # Remove numbered list items ending with a dot (e.g., "3." but not "3.1")
    text = re.sub(r'\b\d+\.(?!\d)', "", text)

    # Remove bullet points and specific characters (but not "L" or "l")
    text = re.sub(r'•||©', "", text)

    # Normalize spaced letters (e.g., e l e c t r i c v e h i c l e to electric vehicle)
    text = re.sub(r'\b(?:[a-zA-Z]\s)+[a-zA-Z]\b', lambda m: m.group(0).replace(' ', ''), text)

    # Split text into lines
    lines = text.splitlines()

    # The first 80 and last 90 rows have been removed, containing contents and legal statements.
    if len(lines) > 170:
        lines = lines[80:-90]
    
    # Define patterns for removal
    patterns = [
        r'^\s*[a-zA-Z0-9]\)\s*',  # Lines starting with "c)" or "1)"
        r'^\s*#\d+',              # Lines starting with "#3"
        r'^\s*Scope\b.*',         # Lines starting with "Scope"
        r'^\s*[%$\-].*',          # Lines starting with "%", "$", or "-"
        r'^\s*Pages\s*\d+–\d+\s*$', # Lines like "Pages 44–45"
        r'^\s*Figure(\s*\d+\.)?$', # "Figure" alone or followed by number
    ]

    # Filter lines based on the defined patterns and only keep valid lines
    filtered_lines = []
    previous_line = None
    
    for line in lines:
        line = line.lstrip()  # Remove leading spaces

        # Remove lines with only numbers, only letters, or combinations like 'S 0 1'
        if (re.match(r'^[0-9\s,]+$', line.strip()) or 
            re.match(r'^[a-zA-Z\s]+$', line.strip()) or 
            re.match(r'^[a-zA-Z0-9\s,]+$', line.strip())):
            continue
        
        # Only keep lines that do not match removal criteria
        if not any(re.match(pattern, line) for pattern in patterns):
            filtered_lines.append(line)

    # Remove empty lines
    filtered_lines = [line for line in filtered_lines if line.strip()]

    # Join lines back into text
    text = "\n".join(filtered_lines)

    return text

# Process each text file in the directory
for filename in os.listdir(input_dir):
    if filename.endswith(".txt"):
        file_path = os.path.join(input_dir, filename)

        # Extract the year from the filename
        year_match = re.search(r'\d{4}', filename)
        if year_match:
            year = year_match.group(0)
        else:
            continue

        # Read the file
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Clean the text
        cleaned_text = clean_text(text, year)

        # Save the cleaned text back to a file
        cleaned_file_path = os.path.join(output_directory, f"cleaned_{filename}")
        with open(cleaned_file_path, 'w', encoding='utf-8') as file:
            file.write(cleaned_text)

        print(f"Processed {filename} and saved to {cleaned_file_path}")

Processed sjp-tcfd-report-april2020.txt and saved to Cleaned TCFD Texts/ST. JAMES place\cleaned_sjp-tcfd-report-april2020.txt
Processed sjp-tcfd-report-april2021.txt and saved to Cleaned TCFD Texts/ST. JAMES place\cleaned_sjp-tcfd-report-april2021.txt
Processed SJP_TCFD_2023.txt and saved to Cleaned TCFD Texts/ST. JAMES place\cleaned_SJP_TCFD_2023.txt
Processed SJP_TCFD_Report_2022.txt and saved to Cleaned TCFD Texts/ST. JAMES place\cleaned_SJP_TCFD_Report_2022.txt


## Organising Cleaned TCFD Text Data

The script extracts and organises cleaned TCFD text data. It performs the following steps:

- Data Extraction: Reads text files from directories, extracting the year from file names and the company name from folder names.
- Paragraph Identification: Splits text into lines and identifies paragraphs based on end-of-paragraph criteria. 
- DataFrame Creation: Constructs a DataFrame with columns: `file_name`, `date`, `company_name` and `paragraph`.
- Data Sorting: Converts the 'year' column to integers, sorts the DataFrame by company name and year, and ensures proper organisation for analysis.

In [6]:
# Directory containing the cleaned text files
input_dir = "Cleaned TCFD Texts"

# Initialize a list to hold the text data
text_data = []

# Function to determine if a line signifies the end of a paragraph
def is_end_of_paragraph(line, next_line):
    # End-of-paragraph criteria: line ends with a period and next line starts with a capital letter
    return bool(re.search(r'\.\s*$', line)) and next_line and next_line[0].isupper()

# Iterate over each company folder in the directory
for company_folder in os.listdir(input_dir):
    company_path = os.path.join(input_dir, company_folder)
    if os.path.isdir(company_path):
        # Iterate over each text file in the company folder
        for filename in os.listdir(company_path):
            if filename.endswith(".txt"):
                file_path = os.path.join(company_path, filename)
                
                # Extract the date from the filename (4-digit number at the end)
                date_match = re.search(r'(\d{4})\.txt$', filename)
                date = date_match.group(1) if date_match else None

                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()
                    lines = text.split('\n')

                    # Process the lines to create paragraphs
                    if len(lines) > 1:
                        company_name = company_folder.replace("_", " ")  # Get company name from folder
                        paragraph = ""
                        for i in range(2, len(lines)):
                            line = lines[i]
                            next_line = lines[i + 1] if i + 1 < len(lines) else ""
                            if is_end_of_paragraph(line, next_line):
                                paragraph += line + " "
                                text_data.append([filename, date, company_name, paragraph.strip()])
                                paragraph = ""
                            else:
                                paragraph += line + " "
                        
                        # Add the last paragraph if there is no trailing empty line
                        if paragraph:
                            text_data.append([filename, date, company_name, paragraph.strip()])

# Create a DataFrame with the specified structure
paragraphs_df = pd.DataFrame(text_data, columns=['file_name', 'date', 'company_name', 'paragraph'])

# Print the new DataFrame
pd.set_option('display.max_columns', None)
paragraphs_df

Unnamed: 0,file_name,date,company_name,paragraph
0,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,commitment to help build a long -term future f...
1,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,"Rosemary Hilary , Independent Non -executive ..."
2,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,Group’s risk management frameworkEnsuring exec...
3,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,IndividualCross -function working group to dis...
4,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,We have explained our approach to meeting thes...
...,...,...,...,...
1098,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,O&G Business Unit - Percentage of Total Travel...
1099,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,We expect our renewable energy book of busines...
1100,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,Travelers Underwriting Exposure to Carbon Int...
1101,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,The charts below illustrate the percentage of ...


Duplicate paragraphs are identified and removed, and the DataFrame is reset to ensure continuous indexing.

In [7]:
# Display the length of the DataFrame before removing duplicates
print("Before:", len(paragraphs_df))

# Create a copy of the DataFrame to check for duplicates
check_dup = paragraphs_df.copy()

# Check for duplicate paragraphs
check_dup['true_false'] = paragraphs_df.duplicated(subset=['paragraph'])

# Display the count of duplicate and unique paragraphs
print(check_dup['true_false'].value_counts())

# Keep only one row for each unique paragraph
paragraphs_df = paragraphs_df.drop_duplicates(subset=['paragraph'])

# Reset the index to make sure it's continuous
paragraphs_df = paragraphs_df.reset_index(drop=True)

# Display the length of the DataFrame after removing duplicates
print("After:", len(paragraphs_df))

Before: 1103
true_false
False    1059
True       44
Name: count, dtype: int64
After: 1059


In [8]:
paragraph_counts = paragraphs_df.groupby('file_name').size()
print(paragraph_counts)

file_name
cleaned_SJP_TCFD_2023.txt                152
cleaned_SJP_TCFD_Report_2022.txt         109
cleaned_Travelers_TCFDReport2019.txt      72
cleaned_Travelers_TCFDReport2020.txt      95
cleaned_Travelers_TCFDReport2021.txt     153
cleaned_Travelers_TCFDReport2022.txt     142
cleaned_Travelers_TCFDReport2023.txt     209
cleaned_sjp-tcfd-report-april2020.txt     39
cleaned_sjp-tcfd-report-april2021.txt     88
dtype: int64


In [9]:
# Display a random paragraph
with pd.option_context('display.max_colwidth', None):
    random_paragraph = paragraphs_df['paragraph'].sample(n=1).iloc[0]
random_paragraph

'With respect to our significant municipal bond portfolio, which  the impact of changing climate conditions on a given city, state  or region as part of our credit analysis. Since we assume catastrophe risks such as earthquakes and  windstorms in our capacity as an insurer, we also seek to manage  our portfolios credit risk to such events by assessing our  investment exposures to such catastrophes. In addition, for  municipal bond issuers in the Southwestern United States and  other areas of the country susceptible to drought, all investment  analyses include an assessment of water supply adequacy.'

## Text Cleaning and Post-processing

The script loads a spaCy model and performs text cleaning and post-processing. It includes functions to remove unwanted characters, stop words, numbers, punctuation, and to lemmatize the text. The cleaned text is stored in a new column in the DataFrame.

In [10]:
# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Function to clean text in the text_series
def clean_text(text_series):
    # Convert text_series df to list
    text_list = text_series.to_list()

    # Remove whitespaces and trailing spaces
    def remove_whitespace(text):
        pattern = re.compile(r'\s+')
        without_whitespace = re.sub(pattern, ' ', text)
        text = without_whitespace.replace('?', ' ? ').replace(')', ') ')
        text = text.strip()
        return text

    text_list = list(map(lambda x: remove_whitespace(x), text_list))

    # Apply NLP pipeline to remove stop words, numbers, and lemmatize the words
    cleaned_text_list = []
    for text in tqdm(text_list):  # or tqdm.tqdm
        doc = nlp(text)
        cleaned_text = " ".join([
            token.lemma_
            for token in doc
            if not token.is_stop
            and not token.like_num
            and not token.is_punct
            and token.is_alpha
        ])
        cleaned_text_list.append(cleaned_text)
    return cleaned_text_list

# Function to post-process the DataFrame
def post_process(df):
    # Create a new column 'paragraph_clean'
    df['paragraph_clean'] = df['paragraph']

    # Remove unwanted characters and numeric values
    df['paragraph_clean'] = df['paragraph_clean'].str.replace(',', '', regex=False)
    df['paragraph_clean'] = df['paragraph_clean'].str.replace('.', '', regex=False)
    df['paragraph_clean'] = df['paragraph_clean'].str.replace('(', '', regex=False)
    df['paragraph_clean'] = df['paragraph_clean'].str.replace(')', '', regex=False)
    df['paragraph_clean'] = df['paragraph_clean'].str.replace(r'\d+\.\d+', '', regex=True)
    df['paragraph_clean'] = df['paragraph_clean'].str.replace('\d+', '', regex=True)
    df['paragraph_clean'] = df['paragraph_clean'].astype(str)

    return df

# Apply post-processing to the DataFrame
paragraphs_df = post_process(paragraphs_df)

# Clean the 'paragraph_clean' column
paragraphs_df['paragraph_clean'] = clean_text(paragraphs_df['paragraph_clean'])

# Set display options and print DataFrame
pd.set_option('display.max_columns', None)
print(paragraphs_df['file_name'].nunique())
paragraphs_df

100%|██████████| 1059/1059 [00:24<00:00, 42.99it/s]

9





Unnamed: 0,file_name,date,company_name,paragraph,paragraph_clean
0,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,commitment to help build a long -term future f...,commitment help build long future Force Climat...
1,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,"Rosemary Hilary , Independent Non -executive ...",Rosemary Hilary Independent Non Director chair...
2,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,Group’s risk management frameworkEnsuring exec...,Group risk management frameworkensuring execut...
3,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,IndividualCross -function working group to dis...,individualcross work group discuss emerge ESG ...
4,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,We have explained our approach to meeting thes...,explain approach meet objective documentour pe...
...,...,...,...,...,...
1054,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,O&G Business Unit - Percentage of Total Travel...,Business Unit Percentage Total Travelers Domes...
1055,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,We expect our renewable energy book of busines...,expect renewable energy book business continue...
1056,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,Travelers Underwriting Exposure to Carbon Int...,traveler underwrite exposure Carbon Intensive ...
1057,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,The charts below illustrate the percentage of ...,chart illustrate percentage domestic premium a...


## Noun Lemmatisation of Cleaned TCFDs

The code applies lemmatisation to extract nouns from the 'paragraph_clean' column using spaCy and stores the results in a new column 'paragraph_noun'.

In [11]:
# Load spaCy model
nlp = spacy.load("en_core_web_lg", disable=["parser", "ner"])

# Lemmatization function
def lemmatization(texts, allowed_postags=["NOUN"]):
    doc = nlp(texts)
    new_text = []
    for token in doc:
        if token.pos_ in allowed_postags:
            new_text.append(token.lemma_)
    final = " ".join(new_text)
    return final

# Apply lemmatization to the 'paragraph_clean' column
paragraphs_df['paragraph_noun'] = paragraphs_df['paragraph_clean'].apply(lemmatization)

# Display the DataFrame
pd.set_option('display.max_columns', None)
print(paragraphs_df['file_name'].nunique())
paragraphs_df

9


Unnamed: 0,file_name,date,company_name,paragraph,paragraph_clean,paragraph_noun
0,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,commitment to help build a long -term future f...,commitment help build long future Force Climat...,commitment help recommendation climate risk di...
1,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,"Rosemary Hilary , Independent Non -executive ...",Rosemary Hilary Independent Non Director chair...,chair impact
2,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,Group’s risk management frameworkEnsuring exec...,Group risk management frameworkensuring execut...,risk management execution investment principle...
3,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,IndividualCross -function working group to dis...,individualcross work group discuss emerge ESG ...,work group activity climate risk opportunity m...
4,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,We have explained our approach to meeting thes...,explain approach meet objective documentour pe...,approach commitment governance framework frame...
...,...,...,...,...,...,...
1054,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,O&G Business Unit - Percentage of Total Travel...,Business Unit Percentage Total Travelers Domes...,business percentage total traveler traveler pr...
1055,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,We expect our renewable energy book of busines...,expect renewable energy book business continue...,energy book business time progress business
1056,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,Travelers Underwriting Exposure to Carbon Int...,traveler underwrite exposure Carbon Intensive ...,traveler exposure carbon sector classification...
1057,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,The charts below illustrate the percentage of ...,chart illustrate percentage domestic premium a...,chart percentage premium associate carbon sect...


In [12]:
# Drop the 'paragraph_noun' if it is not in English
paragraphs_df = paragraphs_df[paragraphs_df['paragraph_noun'].str.contains('[a-zA-Z]')]
paragraphs_df = paragraphs_df.reset_index(drop=True)

## Tokenising and Filtering

The script performs detailed text tokenisation and filtering on TCFD disclosures. The steps involve defining functions to clean, tokenise text and removing general words that are not useful for analysis.

In [13]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Load a set of common English words
nltk.download('words')
english_words = set(words.words())

def clean_tokens_noun(text_series):
    # Step 1: Convert text_series df to list
    text_list = text_series.to_list()

    # Step 2: Change the list to lower case
    text_list = [x.lower() for x in text_list]

    # Step 3: Remove whitespaces and trailing spaces
    def remove_whitespace(text):
        pattern = re.compile(r'\s+')
        without_whitespace = re.sub(pattern, ' ', text)
        text = without_whitespace.replace('?', ' ? ').replace(')', ') ')
        text = text.strip()
        return text

    text_list = [remove_whitespace(x) for x in text_list]

    # Step 4: Process tokens and remove non-English words, single letters, and numbers
    tokens = []
    for text in tqdm(text_list):
        doc = nlp(text)
        tmp_tokens = [
            token.lemma_
            for token in doc
            if not token.is_stop
            and not token.like_num
            and not token.is_punct
            and token.is_alpha
            and token.lemma_ in english_words  # Only include English words
            and len(token.lemma_) > 1           # Exclude single letters
        ]
        tokens.append(tmp_tokens)
    return tokens

# Apply the clean_tokens_noun function to get the tokens of the 'paragraph_noun' column
paragraphs_df['token'] = clean_tokens_noun(paragraphs_df['paragraph_noun'])

# Print the length of the DataFrame and display it
print(len(paragraphs_df))
paragraphs_df

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\dimi3\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
100%|██████████| 1054/1054 [00:05<00:00, 179.63it/s]

1054





Unnamed: 0,file_name,date,company_name,paragraph,paragraph_clean,paragraph_noun,token
0,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,commitment to help build a long -term future f...,commitment help build long future Force Climat...,commitment help recommendation climate risk di...,"[commitment, help, recommendation, climate, ri..."
1,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,"Rosemary Hilary , Independent Non -executive ...",Rosemary Hilary Independent Non Director chair...,chair impact,"[chair, impact]"
2,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,Group’s risk management frameworkEnsuring exec...,Group risk management frameworkensuring execut...,risk management execution investment principle...,"[risk, management, execution, investment, prin..."
3,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,IndividualCross -function working group to dis...,individualcross work group discuss emerge ESG ...,work group activity climate risk opportunity m...,"[work, group, activity, climate, risk, opportu..."
4,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,We have explained our approach to meeting thes...,explain approach meet objective documentour pe...,approach commitment governance framework frame...,"[approach, commitment, governance, framework, ..."
...,...,...,...,...,...,...,...
1049,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,O&G Business Unit - Percentage of Total Travel...,Business Unit Percentage Total Travelers Domes...,business percentage total traveler traveler pr...,"[business, percentage, total, traveler, travel..."
1050,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,We expect our renewable energy book of busines...,expect renewable energy book business continue...,energy book business time progress business,"[energy, book, business, time, progress, busin..."
1051,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,Travelers Underwriting Exposure to Carbon Int...,traveler underwrite exposure Carbon Intensive ...,traveler exposure carbon sector classification...,"[traveler, exposure, carbon, sector, classific..."
1052,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,The charts below illustrate the percentage of ...,chart illustrate percentage domestic premium a...,chart percentage premium associate carbon sect...,"[chart, percentage, premium, associate, carbon..."


In [14]:
# Create column for length analysis outcome
paragraphs_df['word_count'] = paragraphs_df["token"].apply(lambda x: len(x))
paragraphs_df['characters_count'] = paragraphs_df["token"].apply(lambda x: sum(len(word) for word in x))
paragraphs_df['sentence_count'] = paragraphs_df['paragraph'].apply(lambda x: len(str(x).split(".")))
paragraphs_df['avg_word_length'] = paragraphs_df['characters_count'] / paragraphs_df['word_count']
paragraphs_df['avg_sentence_length'] = paragraphs_df['word_count'] / paragraphs_df['sentence_count']
paragraphs_df

Unnamed: 0,file_name,date,company_name,paragraph,paragraph_clean,paragraph_noun,token,word_count,characters_count,sentence_count,avg_word_length,avg_sentence_length
0,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,commitment to help build a long -term future f...,commitment help build long future Force Climat...,commitment help recommendation climate risk di...,"[commitment, help, recommendation, climate, ri...",89,626,10,7.033708,8.900000
1,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,"Rosemary Hilary , Independent Non -executive ...",Rosemary Hilary Independent Non Director chair...,chair impact,"[chair, impact]",2,11,3,5.500000,0.666667
2,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,Group’s risk management frameworkEnsuring exec...,Group risk management frameworkensuring execut...,risk management execution investment principle...,"[risk, management, execution, investment, prin...",14,105,4,7.500000,3.500000
3,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,IndividualCross -function working group to dis...,individualcross work group discuss emerge ESG ...,work group activity climate risk opportunity m...,"[work, group, activity, climate, risk, opportu...",38,256,8,6.736842,4.750000
4,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,We have explained our approach to meeting thes...,explain approach meet objective documentour pe...,approach commitment governance framework frame...,"[approach, commitment, governance, framework, ...",68,514,13,7.558824,5.230769
...,...,...,...,...,...,...,...,...,...,...,...,...
1049,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,O&G Business Unit - Percentage of Total Travel...,Business Unit Percentage Total Travelers Domes...,business percentage total traveler traveler pr...,"[business, percentage, total, traveler, travel...",71,486,8,6.845070,8.875000
1050,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,We expect our renewable energy book of busines...,expect renewable energy book business continue...,energy book business time progress business,"[energy, book, business, time, progress, busin...",6,38,2,6.333333,3.000000
1051,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,Travelers Underwriting Exposure to Carbon Int...,traveler underwrite exposure Carbon Intensive ...,traveler exposure carbon sector classification...,"[traveler, exposure, carbon, sector, classific...",24,166,4,6.916667,6.000000
1052,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,The charts below illustrate the percentage of ...,chart illustrate percentage domestic premium a...,chart percentage premium associate carbon sect...,"[chart, percentage, premium, associate, carbon...",34,259,5,7.617647,6.800000


In [15]:
# Exclude rows where sentence_count is less than 3
paragraphs_df = paragraphs_df[paragraphs_df['sentence_count'] > 3]
# Reset the index
paragraphs_df = paragraphs_df.reset_index(drop=True)
# Display the length of the DataFrame
len(paragraphs_df)

624

In [16]:
# Drop the words that are too general for earnings calls
general_words = [
    'lady', 'gentleman', 'presentation',
    'question', 'answer', 'slide', 'mm', 'mm_mm', 'guy', 'sir', ' ', 'ytd', 'host_sir',
    'bb', 'ty', 'word', 'year', 'quer', 'month', 'period', 'day', 'time', 'result',
    'investor', 'week', 'update', 'business', 'lot', 'ratio', 'rate', 'quarter',
    'number', 'point', 'term', 'thing', 'level', 'bit', 'sort', 'reason', 'management',
    'fact', 'case', 'area', 'people', 'sense', 'item', 'issue', 'market', 'meeting',
    'questions', 'answers', 'managements', 'discussion', 'section', 'presentation', 
    'speaker', 'participant', 'afternoon', 'morning', 'conference', 'today', 'lady', 
    'gentleman', 'presentation', 'question', 'answer', 'slide',
    'mm', 'mm_mm', 'guy', 'sir', 'host_sir', 'bb', 'ty', 'word', 'year', 'quer',
    'month', 'period', 'day', 'time', 'result', 'investor', 'week', 'update', 'business', 'lot', 'ratio', 'rate', 'quarter',
    'number', 'point', 'term', 'thing', 'level', 'bit', 'sort', 'reason', 'management',
    'fact', 'case', 'area', 'people', 'sense', 'item', 'issue', 'market', 'earnings',
    'report', 'financial', 'results', 'quarterly', 'performance', 'guidance', 'statement',
    'outlook', 'projection', 'profit', 'loss', 'revenue', 'sales', 'expense', 'income',
    'cash', 'flow', 'margin', 'growth', 'decline', 'increase', 'decrease', 'forecast',
    'expectation', 'trend', 'metric', 'indicator', 'shareholder', 'stock', 'price', 'value',
    'equity', 'debt', 'asset', 'liability', 'balance', 'sheet', 'capital', 'investment',
    'portfolio', 'dividend', 'yield', 'ratio', 'return', 'earnings', 'per', 'share', 'EPS',
    'acquisition', 'merger', 'synergy', 'integration', 'strategy', 'execution', 'plan',
    'objective', 'goal', 'target', 'vision', 'mission', 'operation', 'process', 'initiative',
    'efficiency', 'optimization', 'innovation', 'technology', 'product', 'service', 'customer',
    'client', 'market', 'segment', 'competition', 'competitor', 'industry', 'sector', 'environment',
    'regulation', 'compliance', 'risk', 'opportunity', 'challenge', 'threat', 'advantage',
    'disadvantage', 'strength', 'weakness', 'SWOT', 'analysis', 'review', 'summary',
    'highlight', 'detail', 'report', 'note', 'comment', 'announcement', 'release', 'update','tcfd, Severe wind and hail n an a n an a, Hurricane n a n a, Winter storm n a n a'
]

# Apply the filtering to your DataFrame
paragraphs_df['token'] = paragraphs_df['token'].apply(lambda x: [i for i in x if i not in general_words])

# Display the updated DataFrame
paragraphs_df

Unnamed: 0,file_name,date,company_name,paragraph,paragraph_clean,paragraph_noun,token,word_count,characters_count,sentence_count,avg_word_length,avg_sentence_length
0,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,commitment to help build a long -term future f...,commitment help build long future Force Climat...,commitment help recommendation climate risk di...,"[commitment, help, recommendation, climate, di...",89,626,10,7.033708,8.900000
1,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,Group’s risk management frameworkEnsuring exec...,Group risk management frameworkensuring execut...,risk management execution investment principle...,"[principle, framework, place, climate, climate...",14,105,4,7.500000,3.500000
2,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,IndividualCross -function working group to dis...,individualcross work group discuss emerge ESG ...,work group activity climate risk opportunity m...,"[work, group, activity, climate, climate, team...",38,256,8,6.736842,4.750000
3,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,We have explained our approach to meeting thes...,explain approach meet objective documentour pe...,approach commitment governance framework frame...,"[approach, commitment, governance, framework, ...",68,514,13,7.558824,5.230769
4,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,We have facilitated regular cross -function di...,facilitate regular cross discussion identify p...,cross discussion eg increase frequency weather...,"[cross, frequency, weather, event, transition,...",33,233,11,7.060606,3.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
619,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,"Importantly, all of our top Oil & Gas distrib...",importantly Oil Gas distribution partner sell ...,distribution partner traveler insurance produc...,"[distribution, partner, traveler, insurance, f...",24,178,4,7.416667,6.000000
620,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,O&G Business Unit - Percentage of Total Travel...,Business Unit Percentage Total Travelers Domes...,business percentage total traveler traveler pr...,"[percentage, total, traveler, traveler, suppor...",71,486,8,6.845070,8.875000
621,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,Travelers Underwriting Exposure to Carbon Int...,traveler underwrite exposure Carbon Intensive ...,traveler exposure carbon sector classification...,"[traveler, exposure, carbon, classification, t...",24,166,4,6.916667,6.000000
622,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,The charts below illustrate the percentage of ...,chart illustrate percentage domestic premium a...,chart percentage premium associate carbon sect...,"[chart, percentage, premium, associate, carbon...",34,259,5,7.617647,6.800000


## Extracting and Cleaning Frequent Words

The script extracts the 20 most frequent words from the TCFD disclosures for each company per file and cleans the tokens by removing them. 

In [17]:
# Initialize a list to hold the results
most_frequent_words_per_file = []

# Iterate over each unique file
for _, row in paragraphs_df.iterrows():
    text = row['paragraph']
    file_name = row['file_name']
    company = row['company_name']

    # Tokenize and clean the text
    tokens = clean_tokens_noun(pd.Series([text]))[0]

    # Calculate word frequency
    word_freq = Counter(tokens).most_common(20)

    # Append the results to the list
    most_frequent_words_per_file.append([file_name, company, word_freq])

# Create a DataFrame with the results
frequent_words_per_file_df = pd.DataFrame(most_frequent_words_per_file, columns=['file_name', 'company_name', 'word_freq'])

# Display the DataFrame
pd.set_option('display.max_columns', None)
frequent_words_per_file_df

100%|██████████| 1/1 [00:00<00:00, 15.20it/s]
100%|██████████| 1/1 [00:00<00:00, 108.15it/s]
100%|██████████| 1/1 [00:00<00:00, 30.80it/s]
100%|██████████| 1/1 [00:00<00:00, 16.70it/s]
100%|██████████| 1/1 [00:00<00:00, 41.65it/s]
100%|██████████| 1/1 [00:00<00:00, 122.55it/s]
100%|██████████| 1/1 [00:00<00:00, 55.27it/s]
100%|██████████| 1/1 [00:00<00:00, 142.57it/s]
100%|██████████| 1/1 [00:00<00:00, 40.98it/s]
100%|██████████| 1/1 [00:00<00:00, 118.95it/s]
100%|██████████| 1/1 [00:00<00:00, 27.39it/s]
100%|██████████| 1/1 [00:00<00:00, 64.01it/s]
100%|██████████| 1/1 [00:00<00:00, 56.92it/s]
100%|██████████| 1/1 [00:00<00:00, 50.89it/s]
100%|██████████| 1/1 [00:00<00:00, 122.31it/s]
100%|██████████| 1/1 [00:00<00:00, 61.19it/s]
100%|██████████| 1/1 [00:00<00:00, 85.48it/s]
100%|██████████| 1/1 [00:00<00:00, 50.88it/s]
100%|██████████| 1/1 [00:00<00:00, 97.01it/s]
100%|██████████| 1/1 [00:00<00:00, 49.77it/s]
100%|██████████| 1/1 [00:00<00:00, 59.06it/s]
100%|██████████| 1/1 [00:00<0

Unnamed: 0,file_name,company_name,word_freq
0,cleaned_sjp-tcfd-report-april2020.txt,ST. JAMES place,"[(climate, 26), (risk, 19), (opportunity, 7), ..."
1,cleaned_sjp-tcfd-report-april2020.txt,ST. JAMES place,"[(risk, 3), (investment, 3), (responsible, 2),..."
2,cleaned_sjp-tcfd-report-april2020.txt,ST. JAMES place,"[(climate, 5), (risk, 4), (st, 3), (place, 3),..."
3,cleaned_sjp-tcfd-report-april2020.txt,ST. JAMES place,"[(climate, 10), (investment, 4), (risk, 3), (r..."
4,cleaned_sjp-tcfd-report-april2020.txt,ST. JAMES place,"[(risk, 3), (change, 2), (opportunity, 2), (bu..."
...,...,...,...
619,cleaned_Travelers_TCFDReport2023.txt,Traveler Cos TRV,"[(premium, 4), (insurance, 3), (oil, 2), (gas,..."
620,cleaned_Travelers_TCFDReport2023.txt,Traveler Cos TRV,"[(energy, 14), (renewable, 11), (traveler, 4),..."
621,cleaned_Travelers_TCFDReport2023.txt,Traveler Cos TRV,"[(carbon, 3), (premium, 3), (continue, 3), (tr..."
622,cleaned_Travelers_TCFDReport2023.txt,Traveler Cos TRV,"[(vehicle, 7), (percentage, 5), (premium, 4), ..."


In [18]:
# Initialize a list to hold the results
most_frequent_words_per_file = []

# Iterate over each unique file
for _, row in paragraphs_df.iterrows():
    text = row['paragraph']
    file_name = row['file_name']
    company = row['company_name']

    # Tokenize and clean the text
    tokens = clean_tokens_noun(pd.Series([text]))[0]

    # Calculate word frequency
    word_freq = Counter(tokens).most_common(50)
    most_frequent_words = set(word for word, freq in word_freq)

    # Append the results to the list
    most_frequent_words_per_file.append([file_name, company, most_frequent_words])

# Create a DataFrame with the results
frequent_words_per_file_df = pd.DataFrame(most_frequent_words_per_file, columns=['file_name', 'company_name', 'most_frequent_words'])

# Remove the most frequent words from each token
cleaned_tokens = []

for _, row in paragraphs_df.iterrows():
    text = row['paragraph']
    tokens = clean_tokens_noun(pd.Series([text]))[0]

    # Get the most frequent words for the current file
    file_name = row['file_name']
    company = row['company_name']
    
    # Filter to get frequent words for the current file
    frequent_words_series = frequent_words_per_file_df[
        (frequent_words_per_file_df['file_name'] == file_name) & 
        (frequent_words_per_file_df['company_name'] == company)
    ]['most_frequent_words']
    
    if len(frequent_words_series) > 0:
        frequent_words = frequent_words_series.values[0]  # Get the set of most frequent words
    else:
        frequent_words = set()

    # Remove the most frequent words from tokens
    cleaned_tokens.append([word for word in tokens if word not in frequent_words])

# Update the 'token' column with cleaned tokens
paragraphs_df['token'] = cleaned_tokens

# Display the DataFrame
pd.set_option('display.max_columns', None)
print(len(paragraphs_df))
paragraphs_df

100%|██████████| 1/1 [00:00<00:00, 42.14it/s]
100%|██████████| 1/1 [00:00<00:00, 65.04it/s]
100%|██████████| 1/1 [00:00<00:00, 48.15it/s]
100%|██████████| 1/1 [00:00<00:00, 47.50it/s]
100%|██████████| 1/1 [00:00<00:00, 97.24it/s]
100%|██████████| 1/1 [00:00<00:00, 133.57it/s]
100%|██████████| 1/1 [00:00<00:00, 65.86it/s]
100%|██████████| 1/1 [00:00<00:00, 120.82it/s]
100%|██████████| 1/1 [00:00<00:00, 48.14it/s]
100%|██████████| 1/1 [00:00<00:00, 127.05it/s]
100%|██████████| 1/1 [00:00<00:00, 49.38it/s]
100%|██████████| 1/1 [00:00<00:00, 76.75it/s]
100%|██████████| 1/1 [00:00<00:00, 134.44it/s]
100%|██████████| 1/1 [00:00<00:00, 64.05it/s]
100%|██████████| 1/1 [00:00<00:00, 139.10it/s]
100%|██████████| 1/1 [00:00<00:00, 82.31it/s]
100%|██████████| 1/1 [00:00<00:00, 138.59it/s]
100%|██████████| 1/1 [00:00<00:00, 76.34it/s]
100%|██████████| 1/1 [00:00<00:00, 110.72it/s]
100%|██████████| 1/1 [00:00<00:00, 65.78it/s]
100%|██████████| 1/1 [00:00<00:00, 80.69it/s]
100%|██████████| 1/1 [00:00

624





Unnamed: 0,file_name,date,company_name,paragraph,paragraph_clean,paragraph_noun,token,word_count,characters_count,sentence_count,avg_word_length,avg_sentence_length
0,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,commitment to help build a long -term future f...,commitment help build long future Force Climat...,commitment help recommendation climate risk di...,"[overall, relevant, appropriate, greenhouse, g...",89,626,10,7.033708,8.900000
1,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,Group’s risk management frameworkEnsuring exec...,Group risk management frameworkensuring execut...,risk management execution investment principle...,"[group, execution, responsible, principle, fra...",14,105,4,7.500000,3.500000
2,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,IndividualCross -function working group to dis...,individualcross work group discuss emerge ESG ...,work group activity climate risk opportunity m...,"[work, group, discuss, emerge, past, month, pr...",38,256,8,6.736842,4.750000
3,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,We have explained our approach to meeting thes...,explain approach meet objective documentour pe...,approach commitment governance framework frame...,"[explain, approach, meet, objective, performan...",68,514,13,7.558824,5.230769
4,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,We have facilitated regular cross -function di...,facilitate regular cross discussion identify p...,cross discussion eg increase frequency weather...,"[facilitate, regular, cross, discussion, physi...",33,233,11,7.060606,3.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
619,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,"Importantly, all of our top Oil & Gas distrib...",importantly Oil Gas distribution partner sell ...,distribution partner traveler insurance produc...,"[importantly, oil, distribution, partner, sell...",24,178,4,7.416667,6.000000
620,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,O&G Business Unit - Percentage of Total Travel...,Business Unit Percentage Total Travelers Domes...,business percentage total traveler traveler pr...,"[unit, percentage, total, domestic, renewable,...",71,486,8,6.845070,8.875000
621,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,Travelers Underwriting Exposure to Carbon Int...,traveler underwrite exposure Carbon Intensive ...,traveler exposure carbon sector classification...,"[underwrite, carbon, intensive, sector, global...",24,166,4,6.916667,6.000000
622,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,The charts below illustrate the percentage of ...,chart illustrate percentage domestic premium a...,chart percentage premium associate carbon sect...,"[chart, illustrate, percentage, domestic, prem...",34,259,5,7.617647,6.800000


In [19]:
print(len(paragraphs_df))
paragraphs_df['token_len'] = paragraphs_df['token'].apply(lambda x: len(x))
# Drop rows where the length of the token is less than 2
paragraphs_df = paragraphs_df[paragraphs_df['token_len'] > 2]
print(len(paragraphs_df))
# Recreate 'docs_tokens' from 'paragraphs_df'
temp_token = paragraphs_df['token'] # .apply(remove_brackets)
docs_tokens = []
for i in temp_token:
    docs_tokens.append(i)

624
619


## Creating and Saving the Final DataFrames

This script groups data by company and date and applies the splits. The resulting final DataFrames are saved as separate CSV files for each company.

In [20]:
# Create the final DataFrame with the specified columns
tcfd = paragraphs_df[['file_name', 'date', 'company_name', 'token', 'paragraph']]

# Rename the columns according to your specifications
tcfd.columns = ['file_name', 'year', 'company_name', 'token', 'paragraph']

# Display the DataFrame
tcfd

Unnamed: 0,file_name,year,company_name,token,paragraph
0,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,"[overall, relevant, appropriate, greenhouse, g...",commitment to help build a long -term future f...
1,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,"[group, execution, responsible, principle, fra...",Group’s risk management frameworkEnsuring exec...
2,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,"[work, group, discuss, emerge, past, month, pr...",IndividualCross -function working group to dis...
3,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,"[explain, approach, meet, objective, performan...",We have explained our approach to meeting thes...
4,cleaned_sjp-tcfd-report-april2020.txt,2020,ST. JAMES place,"[facilitate, regular, cross, discussion, physi...",We have facilitated regular cross -function di...
...,...,...,...,...,...
619,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,"[importantly, oil, distribution, partner, sell...","Importantly, all of our top Oil & Gas distrib..."
620,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,"[unit, percentage, total, domestic, renewable,...",O&G Business Unit - Percentage of Total Travel...
621,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,"[underwrite, carbon, intensive, sector, global...",Travelers Underwriting Exposure to Carbon Int...
622,cleaned_Travelers_TCFDReport2023.txt,2023,Traveler Cos TRV,"[chart, illustrate, percentage, domestic, prem...",The charts below illustrate the percentage of ...


In [21]:
# Function to clean text data
def clean_text(text):
    # Check if the input is a string or a list
    if isinstance(text, str):
        # Replace common problematic characters
        text = text.replace('β€™', "'")  # Replace specific problematic sequences
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
        text = text.strip()
        return text
    elif isinstance(text, list):
        # If input is a list, return a cleaned list of tokens
        return [clean_text(token) for token in text]
    return text  # Return as is if not string or list

# Rename columns as per your requirement
tcfd = paragraphs_df[['file_name', 'date', 'company_name', 'token', 'paragraph']]
tcfd.columns = ['file_name', 'year', 'company_name', 'token', 'paragraph']

# Clean text in 'paragraph' column
tcfd['paragraph'] = tcfd['paragraph'].apply(clean_text)

# Clean text in 'token' column
tcfd['token'] = tcfd['token'].apply(clean_text)

# Group by 'company_name' to create separate DataFrames for each company
company_dfs = {}
for company in tcfd['company_name'].unique():
    company_df = tcfd[tcfd['company_name'] == company][['file_name', 'year', 'token', 'paragraph']]
    
    # Sort DataFrame by 'year' in ascending order
    company_df = company_df.sort_values(by='year', ascending=True)
    
    # Clean company name for CSV file naming
    company_name_cleaned = "_".join(company.split()[:2]).replace('(', '').replace(')', '').replace('.', '').replace(',', '').replace("'", "") + '_df'
    
    # Store the DataFrame in the dictionary
    company_dfs[company_name_cleaned] = company_df

# Function to get DataFrame by company name
def get_company_df(company_name):
    company_name_cleaned = "_".join(company_name.split()[:2]).replace('(', '').replace(')', '').replace('.', '').replace(',', '').replace("'", "") + '_df'
    return company_dfs.get(company_name_cleaned, None)

# Save each company's DataFrame to a CSV file
output_dir = "tcfd_company_csvs"
os.makedirs(output_dir, exist_ok=True)

for company_name, df in company_dfs.items():
    csv_path = os.path.join(output_dir, f"{company_name}.csv")
    df.to_csv(csv_path, index=False, encoding='utf-8')  # Ensure UTF-8 encoding to handle special characters
    print(f"Saved {company_name} to {csv_path}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tcfd['paragraph'] = tcfd['paragraph'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tcfd['token'] = tcfd['token'].apply(clean_text)


Saved ST_JAMES_df to tcfd_company_csvs\ST_JAMES_df.csv
Saved Traveler_Cos_df to tcfd_company_csvs\Traveler_Cos_df.csv
