# Pre-Processing Bloomberg Earnings Calls

### This script demonstrates the pre-processing of earnings call transcripts for two insurance companies, Travelers Cos (TRV) and St. James's Place, preparing them for further NLP analysis. Each transcript contains two main sections: MD (Management Discussion) / Presentation and QA (Questions and Answers). The consistent format of these transcripts, downloaded from Bloomberg, makes them ideal for processing.

## Libraries

In [1]:
import os
import PyPDF2
import pandas as pd
import time
import re
import numpy as np
import spacy
from tqdm import tqdm
from collections import Counter

## Extracting text out of the PDFs 

In [2]:
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            if reader.is_encrypted:
                reader.decrypt('')
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                text += page.extract_text()
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return text

# Function to traverse the directory and extract text from PDFs of specified companies in bloomberg folders
def traverse_and_extract_text(root_dir, output_dir, companies):
    total_size = 0

    for root, dirs, files in os.walk(root_dir):
        if 'bloomberg' in root.lower():  # Check if 'bloomberg' is in the directory path
            for sector, company_list in companies.items():
                for company in company_list:
                    if company.lower() in root.lower():
                        for file in files:
                            if file.endswith(".pdf"):
                                pdf_path = os.path.join(root, file)
                                file_size = os.path.getsize(pdf_path)
                                text = extract_text_from_pdf(pdf_path)

                                if text:
                                    # Create subdirectory for the company if it doesn't exist
                                    company_dir = os.path.join(output_dir, company.replace(" ", "_"))
                                    if not os.path.exists(company_dir):
                                        os.makedirs(company_dir)

                                    # Save the text as a .txt file in the company subdirectory
                                    output_file = os.path.join(company_dir, f"{os.path.splitext(file)[0]}.txt")
                                    with open(output_file, 'w', encoding='utf-8') as txt_file:
                                        txt_file.write(text)
                                    print(f"Saved text for {pdf_path} to {output_file}")

                                    total_size += file_size

    return total_size

# Root directory path
root_dir = "ARP BOE"
# Output directory path
output_dir = "Earnings Calls Texts"

# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Specified companies
companies = {
    "Insurers": ["Traveler Cos TRV", "ST. JAMES place"],
}

# Extract text from PDFs and save them
total_size_processed = traverse_and_extract_text(root_dir, output_dir, companies)
print(f"Total size of processed files: {total_size_processed} bytes")

Saved text for ARP BOE\Insurers\ST. JAMES place\Bloomberg\St James_s Place PLC Earnings Call 2019731 SD000000002886567163.pdf to Earnings Calls Texts\ST._JAMES_place\St James_s Place PLC Earnings Call 2019731 SD000000002886567163.txt
Saved text for ARP BOE\Insurers\ST. JAMES place\Bloomberg\St James_s Place PLC Earnings Call 2020227 DN000000002799172133.pdf to Earnings Calls Texts\ST._JAMES_place\St James_s Place PLC Earnings Call 2020227 DN000000002799172133.txt
Saved text for ARP BOE\Insurers\ST. JAMES place\Bloomberg\St James_s Place PLC Earnings Call 2020728 DN000000002875532448.pdf to Earnings Calls Texts\ST._JAMES_place\St James_s Place PLC Earnings Call 2020728 DN000000002875532448.txt
Saved text for ARP BOE\Insurers\ST. JAMES place\Bloomberg\St James_s Place PLC Earnings Call 2021225 RT000000002951492856.pdf to Earnings Calls Texts\ST._JAMES_place\St James_s Place PLC Earnings Call 2021225 RT000000002951492856.txt
Saved text for ARP BOE\Insurers\ST. JAMES place\Bloomberg\St Jam

## Extracting and Categorising Participants from Companies Transcripts

This script processes text files to extract and categorise participants from the earnings call transcripts of each company. It defines a participants_list function to identify and list "Company Participants" and "Other Participants". The script then reads the text files, converts them to DataFrames and applies the participants_list function to extract participants. The participants are accumulated, duplicates are removed and the lists are saved as CSV files in the "Participants CSVs" directory.

In [3]:
# Function to extract participants list from the DataFrame
def participants_list(df):
    Participant_start_index = df.index[df.iloc[:, 0] == 'Company Participants'].tolist()
    Participant_middle_index = df.index[df.iloc[:, 0] == 'Other Participants'].tolist()

    Participant_end_index = df.index[df.iloc[:, 0] == 'Presentation'].tolist()
    if Participant_end_index == []:
        Participant_end_index = df.index[df.iloc[:, 0] == 'Presentation'].tolist()
        if Participant_end_index == []:
            Participant_end_index = df.index[df.iloc[:, 0] == 'Questions And Answers'].tolist()
            Participant_end_index = [Participant_end_index[-1]]
            if Participant_end_index == []:
                Participant_end_index = df.index[df.iloc[:, 0] == 'Q&A'].tolist()
                Participant_end_index = [Participant_end_index[-1]]
        else:
            Participant_end_index = [Participant_end_index[-1]]

    if Participant_middle_index == []:
        Participant_middle_index = Participant_end_index.copy()

    company_participants = df.loc[Participant_start_index[0]+1:Participant_middle_index[0]-1].copy()
    company_participants.drop(company_participants.index[company_participants.iloc[:, 0] == ''].tolist(), inplace=True)
    company_participants = company_participants.values.tolist()

    other_participants = df.loc[Participant_middle_index[0]+1:Participant_end_index[0]-1].copy()
    other_participants.drop(other_participants.index[other_participants.iloc[:, 0] == ''].tolist(), inplace=True)
    other_participants = other_participants.values.tolist()

    return df, company_participants, other_participants

# Input and output directories
input_dir = "Earnings Calls Texts"
output_dir = "Participants CSVs"

# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Initialise lists to accumulate participants for each company
company_participants_all = {}
other_participants_all = {}

# Process each text file in the input directory
for company in os.listdir(input_dir):
    company_path = os.path.join(input_dir, company)
    if os.path.isdir(company_path):
        for filename in os.listdir(company_path):
            if filename.endswith(".txt"):
                # Load the saved text file
                file_path = os.path.join(company_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()

                # Convert text to DataFrame
                temp_df = pd.DataFrame(text.split('\n'), columns=[0])

                # Apply the participants_list function
                _, company_participants, other_participants = participants_list(temp_df)
                
                # Initialize lists if not already present for the company
                if company not in company_participants_all:
                    company_participants_all[company] = []
                if company not in other_participants_all:
                    other_participants_all[company] = []

                # Accumulate participants for the company
                company_participants_all[company].extend(company_participants)
                other_participants_all[company].extend(other_participants)

# Save all participants to separate CSV files for each company
for company in company_participants_all:
    company_df = pd.DataFrame(company_participants_all[company], columns=['Company Participants']).drop_duplicates().reset_index(drop=True)
    other_df = pd.DataFrame(other_participants_all[company], columns=['Other Participants']).drop_duplicates().reset_index(drop=True)

    company_csv_path = os.path.join(output_dir, f'{company}_company_participants.csv')
    other_csv_path = os.path.join(output_dir, f'{company}_other_participants.csv')

    company_df.to_csv(company_csv_path, index=False)
    other_df.to_csv(other_csv_path, index=False)

    print(f"Saved company participants for {company} to {company_csv_path}")
    print(f"Saved other participants for {company} to {other_csv_path}")


Saved company participants for ST._JAMES_place to Participants CSVs\ST._JAMES_place_company_participants.csv
Saved other participants for ST._JAMES_place to Participants CSVs\ST._JAMES_place_other_participants.csv
Saved company participants for Traveler_Cos_TRV to Participants CSVs\Traveler_Cos_TRV_company_participants.csv
Saved other participants for Traveler_Cos_TRV to Participants CSVs\Traveler_Cos_TRV_other_participants.csv


## Cleaning and Extracting Information from Company Transcripts

This script processes text files to extract key information and clean the text for further analysis. It features two main functions: extract_info, which retrieves the date, company name, and ticker from the text, and cleaning_text, which removes unnecessary content and formats the transcript data. The script reads the text files, extracts relevant details, applies cleaning transformations, and then saves the cleaned text to a new directory. Additionally, it uses pre-extracted lists of participants to enhance the cleaning process.

In [4]:
# Function to extract date, company name, and ticker from the text
def extract_info(text):
    date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}')
    company_ticker_pattern = re.compile(r'(.*\s+\(\w+\s+Equity\))')

    date_match = date_pattern.search(text)
    company_ticker_match = company_ticker_pattern.search(text)

    date = date_match.group(0) if date_match else ''
    company_ticker = company_ticker_match.group(1) if company_ticker_match else ''

    company_name, ticker = '', ''
    if company_ticker:
        parts = company_ticker.rsplit('(', 1)
        company_name = parts[0].strip()
        ticker = parts[1].strip(')')

    return date, company_name, ticker

# Cleaning text function
def cleaning_text(df, date, company_name, ticker, company_participants_list, other_participants_list):
    # Remove the unnecessary string
    df[0] = df[0].str.replace('\n', '', regex=False)
    df[0] = df[0].str.replace('TRANSCRIPT', '', regex=False)
    df[0] = df[0].str.replace('\x0c\n', '', regex=False)
    df[0] = df[0].str.replace('FINAL', '', regex=False)
    df[0] = df[0].str.replace('*', '', regex=False)
    df[0] = df[0].str.replace('[', '', regex=False)
    df[0] = df[0].str.replace(']', '', regex=False)
    df[0] = df[0].str.replace(':', '', regex=False)
    df[0] = df[0].str.replace('A - ', '', regex=False)
    df[0] = df[0].str.replace('Q - ', '', regex=False)
    df[0] = df[0].str.replace('{BIO', '', regex=False)
    df[0] = df[0].str.replace('}', '', regex=False)
    df[0] = df[0].str.replace('<', '', regex=False)
    df[0] = df[0].str.replace('>', '', regex=False)
    df[0] = df[0].str.replace('GO', '', regex=False)

    # Remove occurrences of date, company name, and ticker
    df[0] = df[0].str.replace(date, '', regex=False)
    df[0] = df[0].str.replace(f"{company_name} ({ticker})", '', regex=False)
    
    # Drop rows that start with a 7 or 8-digit number
    df = df[~df[0].str.match(r'^\s*\d{7,8}', na=False)]

    # Remove names from the beginning of rows if they match any names in the lists
    all_participants_list = company_participants_list + other_participants_list
    def remove_participant_names(row):
        for name in all_participants_list:
            if row.startswith(name):
                return row[len(name):].strip()
        return row

    df[0] = df[0].apply(remove_participant_names)

    # Remove rows that contain 'Equity' within parentheses and move it below the date if it exists
    equity_rows = df[0].str.contains(r'\(.*Equity.*\)', na=False)
    if equity_rows.any():
        equity_info = df[equity_rows].iloc[0, 0]  # Extract the equity info
        df = df[~equity_rows]  # Remove the equity rows
    else:
        equity_info = ''

    # Drop rows that start with 'Operator'
    df = df[~df[0].str.match(r'^\s*Operator', na=False)]

    # After extracting the participants, we can drop those sections to make the transcript clearer
    df = df.reset_index(drop=True)
    # Drop rows from the third row till 'Presentation'
    presentation_index = df.index[df.iloc[:, 0].str.contains('Presentation')].tolist()
    if presentation_index:
        df = df.drop(range(2, presentation_index[0]))
    # Drop the row that ends with 'Investor Day' using regex
    df = df[~df[0].str.contains(r'Investor Day$')]
    # Drop the first row of the df
    df = df.reset_index(drop=True)
    df = df.iloc[1:, :]
    # Reset the index again to make sure the index is continuous for better processing
    df = df.reset_index(drop=True)

    # Using re to remove the unnecessary string
    def drop_unnecessary(x):
        page = re.findall(r'Page \d+ of \d+', x)
        Company_Name = re.findall(r'Company Name', x)
        Company_Ticker = re.findall(r'Company Ticker', x)
        Date = re.findall(r'Date', x)
        if page == [] and Company_Name == [] and Company_Ticker == [] and Date == []:
            return True
        else:
            return False

    true_false = df[0].apply(lambda x: drop_unnecessary(x))
    df = df[true_false]

    # Drop the final page declaration
    df = df[df[0] != 'This transcript may not be 100 percent accurate and may contain misspellings and ']
    df = df[df[0] != 'other inaccuracies. This transcript is provided "as is", without express or implied ']
    df = df[df[0] != 'warranties of any kind. Bloomberg retains all rights to this transcript and provides it ']
    df = df[df[0] != 'solely for your personal, non-commercial use. Bloomberg, its suppliers and third-']
    df = df[~df[0].str.contains(r'solely for your personal, non-commercial use\. Bloomberg, its suppliers and third-', regex=True)]
    df = df[df[0] != 'personal, non-commercial use. Bloomberg, its suppliers and third-party agents shall ']
    df = df[df[0] != 'party agents shall have no liability for errors in this transcript or for lost proﬁts, losses, ']
    df = df[df[0] != 'or direct, indirect, incidental, consequential, special or punitive damages in']
    df = df[~df[0].str.contains(r'or direct, indirect, incidental, consequential, special or punitive damages in(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
    df = df[df[0] != 'connection with the furnishing, performance or use of such transcript. Neither the ']
    df = df[~df[0].str.contains(r'connection with the furnishing, performance or use of such transcript\. Neither the(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
    df = df[df[0] != 'information nor any opinion expressed in this transcript constitutes a solicitation of ']
    df = df[df[0] != 'the purchase or sale of securities or commodities. Any opinion expressed in the ']
    df = df[df[0] != 'transcript does not necessarily reﬂect the views of Bloomberg LP. © COPYRIGHT ']
    df = df[df[0] != '2024, BLOOMBERG LP. All rights reserved. Any reproduction, redistribution or ']
    df = df[df[0] != 'retransmission is expressly prohibited.']

    # if could not be identified, we would apply re
    def drop_Bloomberg_mark(x):
        Bloomberg_mark = re.findall(r'reflect the views of Bloomberg LP', x)
        if Bloomberg_mark == []:
            return True
        else:
            return False

    true_false_bm = df[0].apply(lambda x: drop_Bloomberg_mark(x))
    df = df[true_false_bm]

    # Drop the empty row
    df = df[df[0] != '']

    # Reset the index to make sure the index is continuous for better processing
    df = df.reset_index(drop=True)

    # Add date and equity info to the top of the dataframe
    if date:
        df = pd.concat([pd.DataFrame([[date, company_name, ticker]], columns=[0, 1, 2]), df], ignore_index=True)
    if equity_info:
        df = pd.concat([pd.DataFrame([[equity_info]], columns=[0]), df], ignore_index=True)

    return df

# Path to the directory containing the text files
input_dir = "Earnings Calls Texts"
output_dir = "Cleaned Earnings Calls Texts"

# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Load participants lists from CSV files for all companies
all_company_participants = []
all_other_participants = []

# Traverse the Participants CSVs folder to load participants for each company
participants_dir = "Participants CSVs"
for file in os.listdir(participants_dir):
    if file.endswith("_company_participants.csv"):
        company_participants_df = pd.read_csv(os.path.join(participants_dir, file))
        all_company_participants.extend(company_participants_df['Company Participants'].tolist())
    elif file.endswith("_other_participants.csv"):
        other_participants_df = pd.read_csv(os.path.join(participants_dir, file))
        all_other_participants.extend(other_participants_df['Other Participants'].tolist())

# Process and clean all text files
for root, dirs, files in os.walk(input_dir):
    for filename in files:
        if filename.endswith(".txt"):
            # Load the saved text file
            file_path = os.path.join(root, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()

            # Extract date, company name, and ticker
            date, company_name, ticker = extract_info(text)

            # Apply the cleaning function
            temp_df = pd.DataFrame(text.split('\n'), columns=[0])
            cleaned_temp_df = cleaning_text(temp_df, date, company_name, ticker, all_company_participants, all_other_participants)
            cleaned_text = '\n'.join(cleaned_temp_df[0].tolist())

            # Determine the relative output path
            relative_path = os.path.relpath(root, input_dir)
            output_file_dir = os.path.join(output_dir, relative_path)
            if not os.path.exists(output_file_dir):
                os.makedirs(output_file_dir)

            # Save the cleaned text as .txt files
            cleaned_file_path = os.path.join(output_file_dir, f"cleaned_{filename}")
            with open(cleaned_file_path, 'w', encoding='utf-8') as txt_file:
                txt_file.write(cleaned_text)

            print(f"Saved cleaned text for file: {filename} to {cleaned_file_path}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indirect, incidental, consequential, special or punitive damages in(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
  df = df[~df[0].str.contains(r'connection with the furnishing, performance or use of such transcript\. Neither the(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indire

Saved cleaned text for file: St James_s Place PLC Earnings Call 2019731 SD000000002886567163.txt to Cleaned Earnings Calls Texts\ST._JAMES_place\cleaned_St James_s Place PLC Earnings Call 2019731 SD000000002886567163.txt
Saved cleaned text for file: St James_s Place PLC Earnings Call 2020227 DN000000002799172133.txt to Cleaned Earnings Calls Texts\ST._JAMES_place\cleaned_St James_s Place PLC Earnings Call 2020227 DN000000002799172133.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indirect, incidental, consequential, special or punitive damages in(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
  df = df[~df[0].str.contains(r'connection with the furnishing, performance or use of such transcript\. Neither the(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indire

Saved cleaned text for file: St James_s Place PLC Earnings Call 2020728 DN000000002875532448.txt to Cleaned Earnings Calls Texts\ST._JAMES_place\cleaned_St James_s Place PLC Earnings Call 2020728 DN000000002875532448.txt
Saved cleaned text for file: St James_s Place PLC Earnings Call 2021225 RT000000002951492856.txt to Cleaned Earnings Calls Texts\ST._JAMES_place\cleaned_St James_s Place PLC Earnings Call 2021225 RT000000002951492856.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indirect, incidental, consequential, special or punitive damages in(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
  df = df[~df[0].str.contains(r'connection with the furnishing, performance or use of such transcript\. Neither the(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indire

Saved cleaned text for file: St James_s Place PLC Earnings Call 2021728 RT000000002961195990.txt to Cleaned Earnings Calls Texts\ST._JAMES_place\cleaned_St James_s Place PLC Earnings Call 2021728 RT000000002961195990.txt
Saved cleaned text for file: St James_s Place PLC Earnings Call 2022224 DN000000002974539999.txt to Cleaned Earnings Calls Texts\ST._JAMES_place\cleaned_St James_s Place PLC Earnings Call 2022224 DN000000002974539999.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indirect, incidental, consequential, special or punitive damages in(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
  df = df[~df[0].str.contains(r'connection with the furnishing, performance or use of such transcript\. Neither the(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indire

Saved cleaned text for file: St James_s Place PLC Earnings Call 2022728 DN000000002988128310.txt to Cleaned Earnings Calls Texts\ST._JAMES_place\cleaned_St James_s Place PLC Earnings Call 2022728 DN000000002988128310.txt
Saved cleaned text for file: St James_s Place PLC Earnings Call 2023228 DN000000003004320905.txt to Cleaned Earnings Calls Texts\ST._JAMES_place\cleaned_St James_s Place PLC Earnings Call 2023228 DN000000003004320905.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indirect, incidental, consequential, special or punitive damages in(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
  df = df[~df[0].str.contains(r'connection with the furnishing, performance or use of such transcript\. Neither the(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indire

Saved cleaned text for file: St James_s Place PLC Earnings Call 2023727 DN000000003017788687.txt to Cleaned Earnings Calls Texts\ST._JAMES_place\cleaned_St James_s Place PLC Earnings Call 2023727 DN000000003017788687.txt
Saved cleaned text for file: St James_s Place PLC Earnings Call 2024228 DN000000003032811522.txt to Cleaned Earnings Calls Texts\ST._JAMES_place\cleaned_St James_s Place PLC Earnings Call 2024228 DN000000003032811522.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indirect, incidental, consequential, special or punitive damages in(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
  df = df[~df[0].str.contains(r'connection with the furnishing, performance or use of such transcript\. Neither the(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indire

Saved cleaned text for file: Travelers Cos IncThe Earnings Call 20191022 DN000000002732661665.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 20191022 DN000000002732661665.txt
Saved cleaned text for file: Travelers Cos IncThe Earnings Call 2019418 DN000000002625973556.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 2019418 DN000000002625973556.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indirect, incidental, consequential, special or punitive damages in(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
  df = df[~df[0].str.contains(r'connection with the furnishing, performance or use of such transcript\. Neither the(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indire

Saved cleaned text for file: Travelers Cos IncThe Earnings Call 2019723 RT000000002897838852.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 2019723 RT000000002897838852.txt
Saved cleaned text for file: Travelers Cos IncThe Earnings Call 20201020 DN000000002919519869.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 20201020 DN000000002919519869.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indirect, incidental, consequential, special or punitive damages in(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
  df = df[~df[0].str.contains(r'connection with the furnishing, performance or use of such transcript\. Neither the(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indire

Saved cleaned text for file: Travelers Cos IncThe Earnings Call 2020123 DN000000002782019895.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 2020123 DN000000002782019895.txt
Saved cleaned text for file: Travelers Cos IncThe Earnings Call 2020421 RT000000002827435712.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 2020421 RT000000002827435712.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indirect, incidental, consequential, special or punitive damages in(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
  df = df[~df[0].str.contains(r'connection with the furnishing, performance or use of such transcript\. Neither the(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indire

Saved cleaned text for file: Travelers Cos IncThe Earnings Call 20211019 DN000000002966110191.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 20211019 DN000000002966110191.txt
Saved cleaned text for file: Travelers Cos IncThe Earnings Call 2021121 RT000000002948798175.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 2021121 RT000000002948798175.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indirect, incidental, consequential, special or punitive damages in(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
  df = df[~df[0].str.contains(r'connection with the furnishing, performance or use of such transcript\. Neither the(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indire

Saved cleaned text for file: Travelers Cos IncThe Earnings Call 2021420 DN000000002958474957.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 2021420 DN000000002958474957.txt
Saved cleaned text for file: Travelers Cos IncThe Earnings Call 2021720 RT000000002960746057.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 2021720 RT000000002960746057.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indirect, incidental, consequential, special or punitive damages in(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
  df = df[~df[0].str.contains(r'connection with the furnishing, performance or use of such transcript\. Neither the(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indire

Saved cleaned text for file: Travelers Cos IncThe Earnings Call 20221019 DN000000002995149139.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 20221019 DN000000002995149139.txt
Saved cleaned text for file: Travelers Cos IncThe Earnings Call 2022120 RT000000002973194137.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 2022120 RT000000002973194137.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indirect, incidental, consequential, special or punitive damages in(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
  df = df[~df[0].str.contains(r'connection with the furnishing, performance or use of such transcript\. Neither the(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indire

Saved cleaned text for file: Travelers Cos IncThe Earnings Call 2022419 RT000000002979120029.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 2022419 RT000000002979120029.txt
Saved cleaned text for file: Travelers Cos IncThe Earnings Call 2022721 RT000000002987761887.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 2022721 RT000000002987761887.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indirect, incidental, consequential, special or punitive damages in(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
  df = df[~df[0].str.contains(r'connection with the furnishing, performance or use of such transcript\. Neither the(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indire

Saved cleaned text for file: Travelers Cos IncThe Earnings Call 20231018 DN000000003023919633.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 20231018 DN000000003023919633.txt
Saved cleaned text for file: Travelers Cos IncThe Earnings Call 2023124 RT000000003006445379.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 2023124 RT000000003006445379.txt
Saved cleaned text for file: Travelers Cos IncThe Earnings Call 2023419 DN000000003009088712.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 2023419 DN000000003009088712.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indirect, incidental, consequential, special or punitive damages in(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
  df = df[~df[0].str.contains(r'connection with the furnishing, performance or use of such transcript\. Neither the(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indire

Saved cleaned text for file: Travelers Cos IncThe Earnings Call 2023720 DN000000003017167658.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 2023720 DN000000003017167658.txt
Saved cleaned text for file: Travelers Cos IncThe Earnings Call 2024119 RT000000003030171756.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 2024119 RT000000003030171756.txt
Saved cleaned text for file: Travelers Cos IncThe Earnings Call 2024417 DN000000003036347706.txt to Cleaned Earnings Calls Texts\Traveler_Cos_TRV\cleaned_Travelers Cos IncThe Earnings Call 2024417 DN000000003036347706.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indirect, incidental, consequential, special or punitive damages in(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
  df = df[~df[0].str.contains(r'connection with the furnishing, performance or use of such transcript\. Neither the(\s+TRANSCRIPT\s+\d{4}-\d{2}-\d{2})?', regex=True)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[0] = df[0].apply(remove_participant_names)
  df = df[~df[0].str.contains(r'or direct, indire

## Extracting and Segregating Sections from Cleaned Transcripts

This script processes the cleaned text files to extract and segregate different sections for further analysis. For each file, it identifies and separates the 'Presentation' and 'Questions And Answers' sections. The script accumulates three types of text:

- Clear_MDQA: All the text in one file containing both the Presentation and QA sections.
- Clear_MD: Only the Presentation section.
- Clear_QA: Only the Questions and Answers section.

In [5]:
# Directories for input and output
input_dir = "Cleaned Earnings Calls Texts"
output_dir = "Clear Earnings Calls Texts"

# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Process each company folder within the input directory
for company_folder in os.listdir(input_dir):
    company_input_dir = os.path.join(input_dir, company_folder)
    company_output_dir = os.path.join(output_dir, company_folder)
    
    # Create company output directory if it doesn't exist
    if not os.path.exists(company_output_dir):
        os.makedirs(company_output_dir)

    # Initialise text accumulators for the current company
    clear_mdqa_text = []
    clear_md_text = []
    clear_qa_text = []

    # Process each text file in the company input directory
    for root, dirs, files in os.walk(company_input_dir):
        for filename in files:
            if filename.endswith(".txt"):
                file_path = os.path.join(root, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()
                    lines = text.split('\n')
                    temp_df = pd.DataFrame(lines, columns=[0])

                    # Find indices for 'Presentation' and 'Questions And Answers'
                    presentation_index = temp_df.index[temp_df[0].str.contains('Presentation', na=False)].tolist()
                    qa_index = temp_df.index[temp_df[0].str.contains('Questions And Answers', na=False)].tolist()

                    # Append to clear_mdqa_text (excluding 'Presentation' and 'Questions And Answers' rows)
                    for line in lines:
                        if 'Presentation' not in line and 'Questions And Answers' not in line:
                            clear_mdqa_text.append(line)

                    # Append to clear_md_text (from below 'Presentation' to above 'Questions And Answers')
                    if presentation_index and qa_index:
                        clear_md_text.extend(lines[presentation_index[0]+1:qa_index[0]])

                    # Append to clear_qa_text (from below 'Questions And Answers' to end)
                    if qa_index:
                        clear_qa_text.extend(lines[qa_index[0]+1:])

    # Save accumulated texts to respective files for the current company
    clear_mdqa_path = os.path.join(company_output_dir, f'{company_folder}_Clear_MDQA.txt')
    with open(clear_mdqa_path, 'w', encoding='utf-8') as file:
        file.write('\n'.join(clear_mdqa_text))

    clear_md_path = os.path.join(company_output_dir, f'{company_folder}_Clear_MD.txt')
    with open(clear_md_path, 'w', encoding='utf-8') as file:
        file.write('\n'.join(clear_md_text))

    clear_qa_path = os.path.join(company_output_dir, f'{company_folder}_Clear_QA.txt')
    with open(clear_qa_path, 'w', encoding='utf-8') as file:
        file.write('\n'.join(clear_qa_text))

    print(f"Saved {company_folder}_Clear_MDQA.txt to {clear_mdqa_path}")
    print(f"Saved {company_folder}_Clear_MD.txt to {clear_md_path}")
    print(f"Saved {company_folder}_Clear_QA.txt to {clear_qa_path}")

Saved ST._JAMES_place_Clear_MDQA.txt to Clear Earnings Calls Texts\ST._JAMES_place\ST._JAMES_place_Clear_MDQA.txt
Saved ST._JAMES_place_Clear_MD.txt to Clear Earnings Calls Texts\ST._JAMES_place\ST._JAMES_place_Clear_MD.txt
Saved ST._JAMES_place_Clear_QA.txt to Clear Earnings Calls Texts\ST._JAMES_place\ST._JAMES_place_Clear_QA.txt
Saved Traveler_Cos_TRV_Clear_MDQA.txt to Clear Earnings Calls Texts\Traveler_Cos_TRV\Traveler_Cos_TRV_Clear_MDQA.txt
Saved Traveler_Cos_TRV_Clear_MD.txt to Clear Earnings Calls Texts\Traveler_Cos_TRV\Traveler_Cos_TRV_Clear_MD.txt
Saved Traveler_Cos_TRV_Clear_QA.txt to Clear Earnings Calls Texts\Traveler_Cos_TRV\Traveler_Cos_TRV_Clear_QA.txt


# Structuring Earnings Call Transcripts into DataFrame

The code reads the text files, filtering out lines starting and ending with "Presentation" or "Questions And Answers," and stores the cleaned lines in a dictionary. Additionally, it creates a DataFrame with file names as columns and lines of text as rows, ensuring each column has an equal number of rows by padding shorter files with empty strings.

In [6]:
# Directory containing the cleaned text files
input_dir = "Cleaned Earnings Calls Texts"

# Initialise a dictionary to hold the text data
text_data = {}

# Check if the input directory exists and is not empty
if not os.path.exists(input_dir) or not os.listdir(input_dir):
    print(f"Directory {input_dir} does not exist or is empty.")
else:
    # Iterate over each company folder in the directory
    for company_folder in os.listdir(input_dir):
        company_path = os.path.join(input_dir, company_folder)
        if os.path.isdir(company_path):
            # Iterate over each text file in the company folder
            for filename in os.listdir(company_path):
                if filename.endswith(".txt"):
                    file_path = os.path.join(company_path, filename)
                    with open(file_path, 'r', encoding='utf-8') as file:
                        text = file.read()
                        lines = text.split('\n')
                        # Remove lines that begin and end with "Presentation" or "Questions And Answers"
                        lines = [line for line in lines if not (line.strip().startswith("Presentation") and line.strip().endswith("Presentation"))]
                        lines = [line for line in lines if not (line.strip().startswith("Questions And Answers") and line.strip().endswith("Questions And Answers"))]
                        text_data[f"{company_folder}/{filename}"] = lines

    # Check if text_data is empty
    if not text_data:
        print(f"No valid text files found in {input_dir}.")
    else:
        # Determine the maximum number of lines in the files
        max_lines = max(len(lines) for lines in text_data.values())

        # Create a DataFrame with the file names as columns
        df = pd.DataFrame({filename: lines + [''] * (max_lines - len(lines)) for filename, lines in text_data.items()})
df.head(20)

Unnamed: 0,ST._JAMES_place/cleaned_St James_s Place PLC Earnings Call 2019731 SD000000002886567163.txt,ST._JAMES_place/cleaned_St James_s Place PLC Earnings Call 2020227 DN000000002799172133.txt,ST._JAMES_place/cleaned_St James_s Place PLC Earnings Call 2020728 DN000000002875532448.txt,ST._JAMES_place/cleaned_St James_s Place PLC Earnings Call 2021225 RT000000002951492856.txt,ST._JAMES_place/cleaned_St James_s Place PLC Earnings Call 2021728 RT000000002961195990.txt,ST._JAMES_place/cleaned_St James_s Place PLC Earnings Call 2022224 DN000000002974539999.txt,ST._JAMES_place/cleaned_St James_s Place PLC Earnings Call 2022728 DN000000002988128310.txt,ST._JAMES_place/cleaned_St James_s Place PLC Earnings Call 2023228 DN000000003004320905.txt,ST._JAMES_place/cleaned_St James_s Place PLC Earnings Call 2023727 DN000000003017788687.txt,ST._JAMES_place/cleaned_St James_s Place PLC Earnings Call 2024228 DN000000003032811522.txt,...,Traveler_Cos_TRV/cleaned_Travelers Cos IncThe Earnings Call 20221019 DN000000002995149139.txt,Traveler_Cos_TRV/cleaned_Travelers Cos IncThe Earnings Call 2022120 RT000000002973194137.txt,Traveler_Cos_TRV/cleaned_Travelers Cos IncThe Earnings Call 2022419 RT000000002979120029.txt,Traveler_Cos_TRV/cleaned_Travelers Cos IncThe Earnings Call 2022721 RT000000002987761887.txt,Traveler_Cos_TRV/cleaned_Travelers Cos IncThe Earnings Call 20231018 DN000000003023919633.txt,Traveler_Cos_TRV/cleaned_Travelers Cos IncThe Earnings Call 2023124 RT000000003006445379.txt,Traveler_Cos_TRV/cleaned_Travelers Cos IncThe Earnings Call 2023419 DN000000003009088712.txt,Traveler_Cos_TRV/cleaned_Travelers Cos IncThe Earnings Call 2023720 DN000000003017167658.txt,Traveler_Cos_TRV/cleaned_Travelers Cos IncThe Earnings Call 2024119 RT000000003030171756.txt,Traveler_Cos_TRV/cleaned_Travelers Cos IncThe Earnings Call 2024417 DN000000003036347706.txt
0,St James's Place PLC (STJ LN Equity),St James's Place PLC (STJ LN Equity),St James's Place PLC (STJ LN Equity),St James's Place PLC (STJ LN Equity),St James's Place PLC (STJ LN Equity),St James's Place PLC (STJ LN Equity),St James's Place PLC (STJ LN Equity),St James's Place PLC (STJ LN Equity),St James's Place PLC (STJ LN Equity),St James's Place PLC (STJ LN Equity),...,Travelers Cos Inc/The (TRV US Equity),Travelers Cos Inc/The (TRV US Equity),Travelers Cos Inc/The (TRV US Equity),Travelers Cos Inc/The (TRV US Equity),Travelers Cos Inc/The (TRV US Equity),Travelers Cos Inc/The (TRV US Equity),Travelers Cos Inc/The (TRV US Equity),Travelers Cos Inc/The (TRV US Equity),Travelers Cos Inc/The (TRV US Equity),Travelers Cos Inc/The (TRV US Equity)
1,2019-07-31,2020-02-27,2020-07-28,2021-02-25,2021-07-28,2022-02-24,2022-07-28,2023-02-28,2023-07-27,2024-02-28,...,2022-10-19,2022-01-20,2022-04-19,2022-07-21,2023-10-18,2023-01-24,2023-04-19,2023-07-20,2024-01-19,2024-04-17
2,"Should we get started? So good morning, everyo...","Good morning everyone. It's half ten, so we sh...","Good morning everyone, and welcome to our 2020...",Good morning. I hope you're keeping safe and w...,"Good morning, and welcome to our 2021 Interim ...",Good morning and welcome to our Results Webcas...,"Good morning, and welcome to our 2022 Half-Yea...","Good morning, and welcome to our Full-Year Res...","Good morning, and welcome to our 2023 Half-Yea...","Good morning, everyone. It's my pleasure to ta...",...,"Good morning, ladies and gentlemen. Welcome to...","Good morning, ladies and gentlemen, and welcom...","Good morning, ladies and gentlemen. Welcome to...","Good morning, ladies and gentlemen. Welcome to...","Good morning, ladies and gentlemen. Welcome to...","Good morning, ladies and gentlemen. Welcome to...","Good morning, ladies and gentlemen. Welcome to...","Good morning, ladies and gentlemen. Welcome to...","Good morning, ladies and gentlemen. Welcome to...","Good morning, ladies and gentlemen. Welcome to..."
3,presentation. Adopting our usual format at the...,many familiar faces here today and thank you f...,"Given COVID-19, today's presentation has been ...","today's presentation has been pre-recorded, an...","run through the ﬂows, funds under management, ...",again been pre-recorded and we'll be hosting a...,"today's presentation has been prerecorded, and...",undoubtedly another extraordinary year. After ...,"presentation has been pre-recorded, and we wil...",presentation as CEO of St. James's Place. We h...,...,Teleconference for Travelers. We ask that you ...,Teleconference for Travelers. We ask that you ...,Teleconference for Travelers. (Operator Instru...,Teleconference for Travelers. We ask that you ...,Teleconference for Travelers. We ask that you ...,Teleconference for Travelers. We ask that you ...,Teleconference for Travelers. We ask that you ...,Teleconference for Travelers. We ask that you ...,Teleconference for Travelers. We ask that you ...,Teleconference for Travelers. We ask that you ...
4,hand over to Craig to run through the ﬁnancial...,presentation of the new decade will follow tha...,a live Q&A at 10.45 AM. The agenda for this mo...,a.m. This morning's meeting will be in three s...,"handing over to Craig, to cover the ﬁnancial r...","In 2020, at the height of the pandemic, St. Ja...","AM. The agenda for this morning. In a moment, ...",geopolitical conditions across the globe quick...,This morning's session will follow a familiar ...,include how we're dealing with two historical ...,...,of formal remarks at which time you will be gi...,"of formal remarks. At which time, you will be ...","being recorded on April 19, 2022.","of formal remarks, at which time you will be g...","of formal remarks, at which time, you will be ...","of formal remarks, at which time you will be g...","of formal remarks, at which time you will be g...","of the formal remarks, at which time we will b...","of formal remarks, at which time you will be g...","of formal remarks, at which time you will be g..."
5,developments and outlook. We'll follow this wi...,hand over to Craig to run through the ﬁnancial...,followed by Craig running through the ﬁnancial...,focus on the future before I provide a brief s...,"outlook, and lead the Q&A session, with the fu...",resilience during diﬃcult circumstances. Thank...,"ﬁgures, then hand over to Craig to cover the ﬁ...",as the year progressed. We had to contend with...,of our new business ﬁgures and how we prepared...,throughout this presentation how fundamentally...,...,"answer session. As a reminder, this conference...","answer session. As a reminder, this conference...","At this time, I would like to turn the confere...","answer session. As a reminder, this conference...","answer session. As a reminder, this conference...","answer session. As a reminder, this conference...","answer session. As a reminder, this conference...","and-answer session. As a reminder, this confer...","answer session. As a reminder, this conference...","answer session. As a reminder, this conference..."
6,There are also a number of my executive team a...,other key matters of note. I have a number of ...,"partnership is adapted to COVID-19, before I w...",2020 was an extraordinary year for individuals...,Having announced on our strategic goals in Feb...,"and employees, but importantly also the invest...","cover a number of other topics, the continued ...",and the conﬂict in Ukraine which combined to c...,regime. I will then hand over to Craig who wil...,of this businesses. We continue to attract str...,...,2022.,2022.,Vice President of Investor Relations. Ms.Golds...,"At this time, I would like to turn the confere...",2023.,"At this time, I would like to turn the confere...","At this time, I would like to turn the confere...",2023.,2024.,"At this time, I would like to turn the confere..."
7,Please do look them up over coﬀee at the end.,colleagues here today and they are very welcome.,The ﬁrst six months of 2020 has been an extrao...,have been disrupted and we've all needed to ad...,Capital Market event in May. Today's interim r...,over the last few years. Whilst 2021 was anoth...,prospects for an advised business like St. Jam...,And in the UK this was compounded by political...,results. We will then be back to me where I'll...,management and deliver robust underlying ﬁnanc...,...,"At this time, I would like to turn the confere...","At this time, I would like to turn the confere...","Thank you. Good morning, and welcome to Travel...",Vice President of Investor Relations. Ms.Golds...,"At this time, I would like to turn the confere...",Vice President of Investor Relations. Ms.Golds...,Vice President of Investor Relations. Ms. Gold...,At this time. I would like to turn the confere...,"At this time, I would like to turn the confere...",Vice President of Investor Relations. Ms. Gold...
8,So the ﬁrst six months. It's fair to say that ...,"Now, last year was a challenging year for the ...",and across the world. A six-months period of t...,distancing and had to embrace technology. Our ...,"shorter than normal, which will leave more tim...",continuing to navigate lockdowns and disruptio...,performed during other diﬃcult market conditio...,environment 2022 marked the second-best year f...,progress against our business priorities and h...,challenging market conditions.,...,Vice President of Investor Relations. Ms. Gold...,Vice President of Investor Relations. Ms.Golds...,"2022 results. We released our press release, ﬁ...",quarter 2022 results. We released our press re...,Vice President of Investor Relations. Ms. Gold...,"Thank you. Good morning, and welcome to Travel...",20602454,Vice-President of Investor Relations. Ms. Gold...,Vice President of Investor Relations. Ms.Golds...,"Thank you. Good morning, and welcome to Travel..."
9,unprecedented political uncertainty in the U.K...,investor sentiment being impacted by the uncer...,began the year with renewed conﬁdence and mome...,James's Place Community have worked commendabl...,Let's start by recapping on those medium-term ...,vaccination programs saw many economies reboun...,outlook.,Place history. I will recap on these ﬂows then...,SJP.,"Digging into those headlines a little more, we...",...,"2022 results. We released our press release, ﬁ...",20602454,at travelers.com under the Investors section.,webcast presentation earlier this morning. All...,20602454,quarter 2022 results. We released our press re...,Thank you. Good morning and welcome to Travele...,"Thank you. Good morning, and welcome to Travel...",quarter 2023 results. We released our press re...,"2024 results. We released our press release, ﬁ..."


In [7]:
len(df)

1092

The code below compiles the data into a DataFrame with columns for the text line, file name, date, and company name.

In [8]:
# Directory containing the cleaned text files
input_dir = "Cleaned Earnings Calls Texts"

# Initialise a list to hold the text data
text_data = []

# Iterate over each company folder in the directory
for company_folder in os.listdir(input_dir):
    company_path = os.path.join(input_dir, company_folder)
    if os.path.isdir(company_path):
        # Iterate over each text file in the company folder
        for filename in os.listdir(company_path):
            if filename.endswith(".txt"):
                file_path = os.path.join(company_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()
                    lines = text.split('\n')
                    # Extract company name and date
                    if len(lines) > 1:
                        company_name = lines[0]
                        date = lines[1]
                        # Remove the first two lines (company name and date)
                        lines = lines[2:]
                        # Remove lines that begin and end with "Presentation" or "Questions And Answers"
                        lines = [line for line in lines if not (line.strip().startswith("Presentation") and line.strip().endswith("Presentation"))]
                        lines = [line for line in lines if not (line.strip().startswith("Questions And Answers") and line.strip().endswith("Questions And Answers"))]
                        for line in lines:
                            text_data.append([line, filename, date, company_name])

# Create a DataFrame with the specified structure
horizontal_df = pd.DataFrame(text_data, columns=['line', 'file_name', 'date', 'company_name'])

# Print the new DataFrame
horizontal_df

Unnamed: 0,line,file_name,date,company_name
0,"Should we get started? So good morning, everyo...",cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity)
1,presentation. Adopting our usual format at the...,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity)
2,hand over to Craig to run through the ﬁnancial...,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity)
3,developments and outlook. We'll follow this wi...,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity)
4,There are also a number of my executive team a...,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity)
...,...,...,...,...
24177,Thank you. I will turn the call to Ms. Goldste...,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity)
24178,follow-up please feel free to reach out direct...,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity)
24179,day.,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity)
24180,This concludes today's conference call. We tha...,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity)


In [9]:
len(horizontal_df)

24182

#  Paragraph-Based DataFrame

The code combines lines into paragraphs. Each paragraph is then stored in a DataFrame along with its corresponding file name, date and company name. Duplicate paragraphs are identified and removed, and the DataFrame is reset to ensure continuous indexing.

In [10]:
# Directory containing the cleaned text files
input_dir = "Cleaned Earnings Calls Texts"

# Initialize a list to hold the text data
text_data = []

# Function to determine if a line ends with a period and the next line starts with a capital letter
def is_end_of_paragraph(line, next_line):
    return bool(re.search(r'\.\s*$', line)) and next_line and next_line[0].isupper()

# Iterate over each company folder in the directory
for company_folder in os.listdir(input_dir):
    company_path = os.path.join(input_dir, company_folder)
    if os.path.isdir(company_path):
        # Iterate over each text file in the company folder
        for filename in os.listdir(company_path):
            if filename.endswith(".txt"):
                file_path = os.path.join(company_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()
                    lines = text.split('\n')
                    # Remove lines that begin and end with "Presentation" or "Questions And Answers"
                    lines = [line for line in lines if not (line.strip().startswith("Presentation") and line.strip().endswith("Presentation"))]
                    lines = [line for line in lines if not (line.strip().startswith("Questions And Answers") and line.strip().endswith("Questions And Answers"))]

                    if len(lines) > 1:
                        company_name = lines[0]
                        date = lines[1]
                        paragraph = ""
                        for i in range(2, len(lines)):
                            line = lines[i]
                            next_line = lines[i + 1] if i + 1 < len(lines) else ""
                            if is_end_of_paragraph(line, next_line):
                                paragraph += line + " "
                                text_data.append([filename, date, company_name, paragraph.strip()])
                                paragraph = ""
                            else:
                                paragraph += line + " "
                        
                        # Add the last paragraph if there is no trailing empty line
                        if paragraph:
                            text_data.append([filename, date, company_name, paragraph.strip()])

# Create a DataFrame with the specified structure
paragraphs_df = pd.DataFrame(text_data, columns=['file_name', 'date', 'company_name', 'paragraph'])

# Print the new DataFrame
paragraphs_df

Unnamed: 0,file_name,date,company_name,paragraph
0,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),"Should we get started? So good morning, everyo..."
1,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),There are also a number of my executive team a...
2,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),Please do look them up over coﬀee at the end.
3,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),So the ﬁrst six months. It's fair to say that ...
4,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),New gross inﬂows for the six months was GBP 7....
...,...,...,...,...
4524,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),"Hi, good morning and thanks for squeezing me i..."
4525,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),I did notice that the renewal rate in home did...
4526,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),Thank you. It helps. Always much appreciated.
4527,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),Thank you. I will turn the call to Ms. Goldste...


In [11]:
# Display the length of the DataFrame before removing duplicates
print("Before:", len(paragraphs_df))

# Create a copy of the DataFrame to check for duplicates
check_dup = paragraphs_df.copy()

# Check for duplicate paragraphs
check_dup['true_false'] = paragraphs_df.duplicated(subset=['paragraph'])

# Display the count of duplicate and unique paragraphs
print(check_dup['true_false'].value_counts())

# Keep only one row for each unique paragraph
paragraphs_df = paragraphs_df.drop_duplicates(subset=['paragraph'])

# Reset the index to make sure it's continuous
paragraphs_df = paragraphs_df.reset_index(drop=True)

# Display the length of the DataFrame after removing duplicates
print("After:", len(paragraphs_df))

Before: 4529
true_false
False    4211
True      318
Name: count, dtype: int64
After: 4211


In [12]:
paragraph_counts = paragraphs_df.groupby('file_name').size()
print(paragraph_counts)

file_name
cleaned_St James_s Place PLC Earnings Call 2019731 SD000000002886567163.txt     151
cleaned_St James_s Place PLC Earnings Call 2020227 DN000000002799172133.txt     140
cleaned_St James_s Place PLC Earnings Call 2020728 DN000000002875532448.txt     159
cleaned_St James_s Place PLC Earnings Call 2021225 RT000000002951492856.txt     141
cleaned_St James_s Place PLC Earnings Call 2021728 RT000000002961195990.txt     170
cleaned_St James_s Place PLC Earnings Call 2022224 DN000000002974539999.txt     144
cleaned_St James_s Place PLC Earnings Call 2022728 DN000000002988128310.txt     130
cleaned_St James_s Place PLC Earnings Call 2023228 DN000000003004320905.txt     142
cleaned_St James_s Place PLC Earnings Call 2023727 DN000000003017788687.txt     135
cleaned_St James_s Place PLC Earnings Call 2024228 DN000000003032811522.txt     193
cleaned_Travelers Cos IncThe Earnings Call 20191022 DN000000002732661665.txt    188
cleaned_Travelers Cos IncThe Earnings Call 2019418 DN0000000026259

In [13]:
# Display a random paragraph
with pd.option_context('display.max_colwidth', None):
    random_paragraph = paragraphs_df['paragraph'].sample(n=1).iloc[0]
random_paragraph


'I think of that as a credit business. But maybe things have changed.'

# Cleaning and Lemmatising 

The script utilises spaCy to process and clean the text data. Initially, it defines a function to clean tokens by removing whitespace, stop words, numbers and punctuation while lemmatizing the text. Non-English paragraphs are filtered out, and the DataFrame is reset for continuous indexing.

In [14]:
# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Function to clean tokens in the text
def clean_tokens(text_series):
    # Convert text_series df to list
    text_list = text_series.to_list()

    # Remove whitespaces and trailing spaces
    def remove_whitespace(text):
        pattern = re.compile(r'\s+') 
        without_whitespace = re.sub(pattern, ' ', text)
        text = without_whitespace.replace('?', ' ? ').replace(')', ') ')
        text = text.strip()
        return text

    text_list = list(map(lambda x: remove_whitespace(x), text_list))

    # Apply NLP pipeline to remove stop words, numbers, and lemmatize the words
    tokens = []
    for text in tqdm(text_list): # or tqdm.tqdm
        tmp_tokens = [
            token.lemma_
            for token in nlp(text)
            if not token.is_stop 
            and not token.like_num
            and not token.is_punct
            and token.is_alpha
        ]
        tokens.append(tmp_tokens)
    return tokens 

# Function to post-process the DataFrame
def post_process(df):
    # Create a new column 'paragraph_clean'
    df['paragraph_clean'] = df['paragraph']
    
    # Remove unwanted characters and numeric values
    df['paragraph_clean'] = df['paragraph_clean'].str.replace(',', '', regex=False)
    df['paragraph_clean'] = df['paragraph_clean'].str.replace('.', '', regex=False)
    df['paragraph_clean'] = df['paragraph_clean'].str.replace('(', '', regex=False)
    df['paragraph_clean'] = df['paragraph_clean'].str.replace(')', '', regex=False)
    df['paragraph_clean'] = df['paragraph_clean'].str.replace(r'\d+\.\d+', '', regex=True)
    df['paragraph_clean'] = df['paragraph_clean'].str.replace('\d+', '', regex=True)
    df['paragraph_clean'] = df['paragraph_clean'].astype(str)
    
    return df

# Apply the post_process function to paragraphs_df
paragraphs_df = post_process(paragraphs_df)

pd.set_option('display.max_columns', None)
print(paragraphs_df['file_name'].nunique())
paragraphs_df

30


Unnamed: 0,file_name,date,company_name,paragraph,paragraph_clean
0,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),"Should we get started? So good morning, everyo...",Should we get started? So good morning everyon...
1,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),There are also a number of my executive team a...,There are also a number of my executive team a...
2,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),Please do look them up over coﬀee at the end.,Please do look them up over coﬀee at the end
3,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),So the ﬁrst six months. It's fair to say that ...,So the ﬁrst six months It's fair to say that w...
4,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),New gross inﬂows for the six months was GBP 7....,New gross inﬂows for the six months was GBP b...
...,...,...,...,...,...
4206,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),"Hi, good morning and thanks for squeezing me i...",Hi good morning and thanks for squeezing me in...
4207,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),I did notice that the renewal rate in home did...,I did notice that the renewal rate in home did...
4208,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),Thank you. It helps. Always much appreciated.,Thank you It helps Always much appreciated
4209,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),Thank you. I will turn the call to Ms. Goldste...,Thank you I will turn the call to Ms Goldstein...


In [15]:
# Load spaCy model
nlp = spacy.load("en_core_web_lg", disable=["parser", "ner"])

# Lemmatization function
def lemmatization(texts, allowed_postags=["NOUN"]):
    doc = nlp(texts)
    new_text = []
    for token in doc:
        if token.pos_ in allowed_postags:
            new_text.append(token.lemma_)
    final = " ".join(new_text)
    return final

# Apply lemmatization to the 'paragraph_clean' column
paragraphs_df['paragraph_noun'] = paragraphs_df['paragraph_clean'].apply(lemmatization)
paragraphs_df

Unnamed: 0,file_name,date,company_name,paragraph,paragraph_clean,paragraph_noun
0,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),"Should we get started? So good morning, everyo...",Should we get started? So good morning everyon...,morning result presentation format year fund ﬂ...
1,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),There are also a number of my executive team a...,There are also a number of my executive team a...,number team non - exec morning
2,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),Please do look them up over coﬀee at the end.,Please do look them up over coﬀee at the end,end
3,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),So the ﬁrst six months. It's fair to say that ...,So the ﬁrst six months It's fair to say that w...,month period uncertainty environment trade rel...
4,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),New gross inﬂows for the six months was GBP 7....,New gross inﬂows for the six months was GBP b...,gross inﬂow month % half ﬂow
...,...,...,...,...,...,...
4206,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),"Hi, good morning and thanks for squeezing me i...",Hi good morning and thanks for squeezing me in...,morning thank couple line question renewal pre...
4207,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),I did notice that the renewal rate in home did...,I did notice that the renewal rate in home did...,rate home bit anomaly drop property year fact ...
4208,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),Thank you. It helps. Always much appreciated.,Thank you It helps Always much appreciated,
4209,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),Thank you. I will turn the call to Ms. Goldste...,Thank you I will turn the call to Ms Goldstein...,call closing remark follow up day


In [16]:
# Drop the 'paragraph_noun' if it is not in English
paragraphs_df = paragraphs_df[paragraphs_df['paragraph_noun'].str.contains('[a-zA-Z]')]
paragraphs_df = paragraphs_df.reset_index(drop=True)

# Tokenising and Analysing

The script begins by converting text to lowercase and removing any extra whitespace. Following this, the script calculates the counts of words, characters, and sentences, excluding rows with fewer than three sentences. Additionally, overly general words are dropped from the tokens.

Next, the script analyses word frequencies for each company and removes the 50 most frequent words. It also filters out tokens with fewer than two characters. The result is a final cleaned and tokenised DataFrame.

In [17]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def clean_tokens_noun(text_series):
    # Step 1: Convert text_series df to list
    text_list = text_series.to_list()

    # Step 2: Change the list to lower case
    text_list = list(map(lambda x: x.lower(), text_list))

    # Step 3: Remove whitespaces and trailing spaces
    def remove_whitespace(text):
        pattern = re.compile(r'\s+')
        Without_whitespace = re.sub(pattern, ' ', text)
        text = Without_whitespace.replace('?', ' ? ').replace(')', ') ')
        text = text.strip()
        return text

    text_list = list(map(lambda x: remove_whitespace(x), text_list))
    
    # Expand the list of stopwords
    gist_file = open("gist_stopwords.txt", "r")
    try:
        content = gist_file.read()
        stopwords = content.split(",")
    finally:
        gist_file.close()

    stopwords = [i.replace('"', "").strip() for i in stopwords]
    # Add the stopwords to the list of stopwords
    for i in stopwords:
        nlp.Defaults.stop_words.add(i)

    # Create column for cleaned text_list
    tokens, tmp_tokens = [], []
    for text in tqdm(text_list):
        tmp_tokens = [
            token.lemma_
            for token in nlp(text)
            if not token.is_stop
            and not token.like_num
            and not token.is_punct
            and token.is_alpha
        ]
        tokens.append(tmp_tokens)
        tmp_tokens = []
    return tokens

In [18]:
# Use apply to get the token of the paragraph
paragraphs_df['token'] = clean_tokens_noun(paragraphs_df['paragraph_noun'])
print(len(paragraphs_df))
paragraphs_df

100%|██████████| 4102/4102 [00:15<00:00, 258.90it/s]

4102





Unnamed: 0,file_name,date,company_name,paragraph,paragraph_clean,paragraph_noun,token
0,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),"Should we get started? So good morning, everyo...",Should we get started? So good morning everyon...,morning result presentation format year fund ﬂ...,"[morning, result, presentation, format, year, ..."
1,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),There are also a number of my executive team a...,There are also a number of my executive team a...,number team non - exec morning,"[number, team, exec, morning]"
2,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),Please do look them up over coﬀee at the end.,Please do look them up over coﬀee at the end,end,[]
3,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),So the ﬁrst six months. It's fair to say that ...,So the ﬁrst six months It's fair to say that w...,month period uncertainty environment trade rel...,"[month, period, uncertainty, environment, trad..."
4,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),New gross inﬂows for the six months was GBP 7....,New gross inﬂows for the six months was GBP b...,gross inﬂow month % half ﬂow,"[gross, inﬂow, month, half, ﬂow]"
...,...,...,...,...,...,...,...
4097,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),We have time for one more question. It will --...,We have time for one more question It will -- ...,time question line line,"[time, question]"
4098,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),"Hi, good morning and thanks for squeezing me i...",Hi good morning and thanks for squeezing me in...,morning thank couple line question renewal pre...,"[morning, couple, question, renewal, premium, ..."
4099,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),I did notice that the renewal rate in home did...,I did notice that the renewal rate in home did...,rate home bit anomaly drop property year fact ...,"[rate, bit, anomaly, drop, property, year, fac..."
4100,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),Thank you. I will turn the call to Ms. Goldste...,Thank you I will turn the call to Ms Goldstein...,call closing remark follow up day,"[closing, remark, follow, day]"


In [19]:
# Create column for length analysis outcome
paragraphs_df['word_count'] = paragraphs_df["token"].apply(lambda x: len(x))
paragraphs_df['characters_count'] = paragraphs_df["token"].apply(lambda x: sum(len(word) for word in x))
paragraphs_df['sentence_count'] = paragraphs_df['paragraph'].apply(lambda x: len(str(x).split(".")))
paragraphs_df['avg_word_length'] = paragraphs_df['characters_count'] / paragraphs_df['word_count']
paragraphs_df['avg_sentence_length'] = paragraphs_df['word_count'] / paragraphs_df['sentence_count']
paragraphs_df

Unnamed: 0,file_name,date,company_name,paragraph,paragraph_clean,paragraph_noun,token,word_count,characters_count,sentence_count,avg_word_length,avg_sentence_length
0,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),"Should we get started? So good morning, everyo...",Should we get started? So good morning everyon...,morning result presentation format year fund ﬂ...,"[morning, result, presentation, format, year, ...",10,68,6,6.800000,1.666667
1,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),There are also a number of my executive team a...,There are also a number of my executive team a...,number team non - exec morning,"[number, team, exec, morning]",4,21,2,5.250000,2.000000
2,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),Please do look them up over coﬀee at the end.,Please do look them up over coﬀee at the end,end,[],0,0,2,,0.000000
3,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),So the ﬁrst six months. It's fair to say that ...,So the ﬁrst six months It's fair to say that w...,month period uncertainty environment trade rel...,"[month, period, uncertainty, environment, trad...",15,120,9,8.000000,1.666667
4,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),New gross inﬂows for the six months was GBP 7....,New gross inﬂows for the six months was GBP b...,gross inﬂow month % half ﬂow,"[gross, inﬂow, month, half, ﬂow]",5,22,4,4.400000,1.250000
...,...,...,...,...,...,...,...,...,...,...,...,...
4097,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),We have time for one more question. It will --...,We have time for one more question It will -- ...,time question line line,"[time, question]",2,12,5,6.000000,0.400000
4098,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),"Hi, good morning and thanks for squeezing me i...",Hi good morning and thanks for squeezing me in...,morning thank couple line question renewal pre...,"[morning, couple, question, renewal, premium, ...",43,260,13,6.046512,3.307692
4099,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),I did notice that the renewal rate in home did...,I did notice that the renewal rate in home did...,rate home bit anomaly drop property year fact ...,"[rate, bit, anomaly, drop, property, year, fac...",39,251,11,6.435897,3.545455
4100,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),Thank you. I will turn the call to Ms. Goldste...,Thank you I will turn the call to Ms Goldstein...,call closing remark follow up day,"[closing, remark, follow, day]",4,22,6,5.500000,0.666667


In [20]:
# Exclude rows where sentence_count is less than 3
paragraphs_df = paragraphs_df[paragraphs_df['sentence_count'] > 3]
# Reset the index
paragraphs_df = paragraphs_df.reset_index(drop=True)
# Display the length of the DataFrame
len(paragraphs_df)

2715

In [21]:
# Drop the words that are too general
general_words = [
    'afternoon', 'morning', 'conference', 'today', 'lady', 'gentleman', 'presentation',
    'question', 'answer', 'slide', 'mm', 'mm_mm', 'guy', 'sir', ' ', 'ytd', 'host_sir',
    'bb', 'ty', 'word', 'year', 'quer', 'month', 'period', 'day', 'time', 'result',
    'investor', 'week', 'update', 'business', 'lot', 'ratio', 'rate', 'quarter',
    'number', 'point', 'term', 'thing', 'level', 'bit', 'sort', 'reason', 'management',
    'fact', 'case', 'area', 'people', 'sense', 'item', 'issue', 'market', 'meeting',
    'questions', 'answers', 'managements', 'discussion', 'section', 'presentation', 'speaker', 'participant',  'afternoon', 'morning', 'conference','today','lady', 'gentleman', 'presentation',
    'question','answer', 'slide','proÔ¨Åt', 'eÔ¨Äect','proÔ¨Åtability','oÔ¨Ä','diÔ¨Äerent','eÔ¨Éciency','Ô¨Ågure','inÔ¨Çation','Ô¨Çow','conÔ¨Åt',
    'mm', 'mm_mm','guy','sir',' ','host_sir','bb','ty','word', 'year', 'quer','month','period', 'day', 'time','result', 'investor','week', 'update',
    'Ô¨Åre','diÔ¨Äerence','diÔ¨Écult','beneÔ¨Åt', 'business','lot','ratio','rate','quarter', 'number', 'point', 'term', 'thing', 'level', 
    'bit', 'sort', 'reason', 'management', 'fact', 'case', 'area', 'people', 'sense', 'item', 'issue', 'market'
]

paragraphs_df['token'] = paragraphs_df['token'].apply(lambda x: [i for i in x if i not in general_words])
paragraphs_df


Unnamed: 0,file_name,date,company_name,paragraph,paragraph_clean,paragraph_noun,token,word_count,characters_count,sentence_count,avg_word_length,avg_sentence_length
0,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),"Should we get started? So good morning, everyo...",Should we get started? So good morning everyon...,morning result presentation format year fund ﬂ...,"[format, fund, ﬂow, ﬁnancial, development, out...",10,68,6,6.800000,1.666667
1,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),So the ﬁrst six months. It's fair to say that ...,So the ﬁrst six months It's fair to say that w...,month period uncertainty environment trade rel...,"[uncertainty, environment, trade, relationship...",15,120,9,8.000000,1.666667
2,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),New gross inﬂows for the six months was GBP 7....,New gross inﬂows for the six months was GBP b...,gross inﬂow month % half ﬂow,"[gross, inﬂow, half, ﬂow]",5,22,4,4.400000,1.250000
3,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),"Looking back just ﬁve years ago, gross ﬂows fo...",Looking back just ﬁve years ago gross ﬂows for...,year ﬂow year -year period compound growth % a...,"[ﬂow, compound, growth, annum]",7,36,4,5.142857,1.750000
4,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),"Importantly, the continued strong retention of...",Importantly the continued strong retention of ...,retention client fund inﬂow period % fund mana...,"[retention, client, fund, inﬂow, fund, basis, ...",27,183,8,6.777778,3.375000
...,...,...,...,...,...,...,...,...,...,...,...,...
2710,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),We have time for one more question. It will --...,We have time for one more question It will -- ...,time question line line,[],2,12,5,6.000000,0.400000
2711,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),"Hi, good morning and thanks for squeezing me i...",Hi good morning and thanks for squeezing me in...,morning thank couple line question renewal pre...,"[couple, renewal, premium, change, auto, state...",43,260,13,6.046512,3.307692
2712,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),I did notice that the renewal rate in home did...,I did notice that the renewal rate in home did...,rate home bit anomaly drop property year fact ...,"[anomaly, drop, property, progress, insurance,...",39,251,11,6.435897,3.545455
2713,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),Thank you. I will turn the call to Ms. Goldste...,Thank you I will turn the call to Ms Goldstein...,call closing remark follow up day,"[closing, remark, follow]",4,22,6,5.500000,0.666667


In [22]:
# Initialize a dictionary to hold word frequencies for each company
company_word_freq = {}

# Iterate over each unique company
for company in paragraphs_df['company_name'].unique():
    # Filter the DataFrame for the current company
    company_df = paragraphs_df[paragraphs_df['company_name'] == company]

    # Combine tokens into a single list for the current company
    docs_tokens = []
    for tokens in company_df['token']:
        docs_tokens.extend(tokens)
    
    # Calculate word frequency for the current company
    word_freq = Counter(docs_tokens).most_common(50)
    company_word_freq[company] = word_freq

# Create a DataFrame for each company's word frequencies
company_word_freq_dfs = {}
for company, word_freq in company_word_freq.items():
    company_word_freq_dfs[company] = pd.DataFrame(word_freq, columns=['word', 'freq']).sort_values(by='freq', ascending=False)

# Display top 50 word frequencies for each company
company_word_freq_dfs

{"St James's Place PLC (STJ LN Equity)":            word  freq
 0        client   411
 1        growth   272
 2          cash   262
 3          fund   253
 4    investment   224
 5          cost   217
 6           ﬂow   177
 7          half   156
 8        advice   152
 9       partner   145
 10       change   129
 11      adviser   126
 12  partnership   122
 13      expense   113
 14      advisor   113
 15     guidance   110
 16       margin   107
 17     dividend   103
 18       charge   102
 19       impact    94
 20         plan    92
 21          tax    88
 22        model    85
 23  environment    80
 25      academy    78
 24       income    78
 27       future    77
 28     inﬂation    77
 26    gestation    77
 29        asset    73
 30       moment    69
 31        basis    68
 32         face    67
 33    retention    66
 34       target    64
 35  performance    63
 36     increase    59
 38      pension    58
 37        inﬂow    58
 39  shareholder    57
 40      outcome 

In [23]:
# Remove the 50 most frequent words
# Initialize a dictionary to hold word frequencies for each company
company_word_freq = {}

# Iterate over each unique company
for company in paragraphs_df['company_name'].unique():
    # Filter the DataFrame for the current company
    company_df = paragraphs_df[paragraphs_df['company_name'] == company]

    # Combine tokens into a single list for the current company
    docs_tokens = []
    for tokens in company_df['token']:
        docs_tokens.extend(tokens)
    
    # Calculate word frequency for the current company
    word_freq = Counter(docs_tokens).most_common(50)
    company_word_freq[company] = [word for word, freq in word_freq]

# Remove the most frequent words from the token column
mdy_list, tmp = [], []
for _, row in paragraphs_df.iterrows():
    company = row['company_name']
    review = row['token']
    word_list = company_word_freq[company]
    tmp = [word for word in review if word not in word_list]
    mdy_list.append(tmp)

paragraphs_df['token'] = mdy_list

In [24]:
print(len(paragraphs_df))
paragraphs_df['token_len'] = paragraphs_df['token'].apply(lambda x: len(x))
# Drop rows where the length of the token is less than 2
paragraphs_df = paragraphs_df[paragraphs_df['token_len'] > 2]
print(len(paragraphs_df))
# Recreate 'docs_tokens' from 'paragraphs_df'
temp_token = paragraphs_df['token'] # .apply(remove_brackets)
docs_tokens = []
for i in temp_token:
    docs_tokens.append(i)

2715
2232


# Preparing and Saving Final DataFrame

The script processes the cleaned and tokenised earnings call transcripts and sorts them by file name and date. It then groups the data by file name, date and company name, aggregating tokens and paragraphs. The tokens are flattened, and separate DataFrames are created for each company. Finally, the script saves each company's DataFrame to a CSV file in a designated directory.


In [25]:
# sort by the file_name and date 
fidf = paragraphs_df.sort_values(by=['file_name', 'date'])
fidf  

Unnamed: 0,file_name,date,company_name,paragraph,paragraph_clean,paragraph_noun,token,word_count,characters_count,sentence_count,avg_word_length,avg_sentence_length,token_len
0,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),"Should we get started? So good morning, everyo...",Should we get started? So good morning everyon...,morning result presentation format year fund ﬂ...,"[format, ﬁnancial, development, outlook]",10,68,6,6.800000,1.666667,4
1,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),So the ﬁrst six months. It's fair to say that ...,So the ﬁrst six months It's fair to say that w...,month period uncertainty environment trade rel...,"[uncertainty, trade, relationship, wealth, bac...",15,120,9,8.000000,1.666667,7
4,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),"Importantly, the continued strong retention of...",Importantly the continued strong retention of ...,retention client fund inﬂow period % fund mana...,"[track, record, start, track, record, percenta...",27,183,8,6.777778,3.375000,9
5,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),"So why is this? Well ﬁrst and foremost, St. Ja...",So why is this? Well ﬁrst and foremost St Jame...,relationship business % ﬂow client introductio...,"[relationship, introduction, life, journey, aﬀ...",26,175,8,6.730769,3.250000,12
6,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),Where gross ﬂows can be impacted is with discr...,Where gross ﬂows can be impacted is with discr...,ﬂow investment say bonus proceed disposal asse...,"[bonus, proceed, disposal, sale, individual, u...",18,122,7,6.777778,2.571429,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2708,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),"Okay, got it. And maybe secondly, I'm curious ...",Okay got it And maybe secondly I'm curious as ...,re - underwriting book takeaway month integration,"[underwriting, takeaway, integration]",5,40,7,8.000000,0.714286,3
2709,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),We're feeling really good about bringing and l...,We're feeling really good about bringing and l...,capability organization quality proﬁtability b...,"[capability, organization, quality, proﬁtabili...",12,97,7,8.083333,1.714286,7
2711,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),"Hi, good morning and thanks for squeezing me i...",Hi good morning and thanks for squeezing me in...,morning thank couple line question renewal pre...,"[couple, adequacy, couple, think, decline, pol...",43,260,13,6.046512,3.307692,14
2712,cleaned_Travelers Cos IncThe Earnings Call 202...,2024-04-17,Travelers Cos Inc/The (TRV US Equity),I did notice that the renewal rate in home did...,I did notice that the renewal rate in home did...,rate home bit anomaly drop property year fact ...,"[anomaly, drop, progress, insurance, coverage,...",39,251,11,6.435897,3.545455,15


In [26]:
# Copy fidf to fidfbase and select specific columns
fidfbase = fidf.copy()
fidfbase = fidfbase[['file_name', 'date', 'company_name', 'token', 'paragraph', 'paragraph_clean', 'paragraph_noun']]

# Groupby 'file_name', 'date', 'company_name' and aggregate tokens and paragraphs
token_grouped = fidfbase.groupby(['file_name', 'date', 'company_name'])['token'].apply(list).reset_index()
paragraph_grouped = fidfbase.groupby(['file_name', 'date', 'company_name'])['paragraph'].apply(list).reset_index()
paragraph_clean_grouped = fidfbase.groupby(['file_name', 'date', 'company_name'])['paragraph_clean'].apply(list).reset_index()
paragraph_noun_grouped = fidfbase.groupby(['file_name', 'date', 'company_name'])['paragraph_noun'].apply(list).reset_index()

# Merge token and paragraph lists into a single DataFrame
token_paragraph_merged = pd.merge(token_grouped, paragraph_grouped, on=['file_name', 'date', 'company_name'])

# Define function to flatten lists and remove brackets
def flatten_list(x):
    flattened_list = [item for sublist in x for item in sublist]
    return flattened_list

# Apply flatten_list to 'token' column in token_paragraph_merged
token_paragraph_merged['token'] = token_paragraph_merged['token'].apply(flatten_list)
token_paragraph_merged


Unnamed: 0,file_name,date,company_name,token,paragraph
0,cleaned_St James_s Place PLC Earnings Call 201...,2019-07-31,St James's Place PLC (STJ LN Equity),"[format, ﬁnancial, development, outlook, uncer...","[Should we get started? So good morning, every..."
1,cleaned_St James_s Place PLC Earnings Call 202...,2020-02-27,St James's Place PLC (STJ LN Equity),"[decade, format, ﬁnancial, matter, note, execu...","[Good morning everyone. It's half ten, so we s..."
2,cleaned_St James_s Place PLC Earnings Call 202...,2020-07-28,St James's Place PLC (STJ LN Equity),"[agenda, introduction, ﬁnancial, outlook, mome...","[Given COVID-19, today's presentation has been..."
3,cleaned_St James_s Place PLC Earnings Call 202...,2021-02-25,St James's Place PLC (STJ LN Equity),"[pandemic, review, focus, summary, individual,...",[Good morning. I hope you're keeping safe and ...
4,cleaned_St James_s Place PLC Earnings Call 202...,2021-07-28,St James's Place PLC (STJ LN Equity),"[outlook, session, team, combination, assumpti...","[Good morning, and welcome to our 2021 Interim..."
5,cleaned_St James_s Place PLC Earnings Call 202...,2022-02-24,St James's Place PLC (STJ LN Equity),"[height, resilience, circumstance, agility, em...","[In 2020, at the height of the pandemic, St. J..."
6,cleaned_St James_s Place PLC Earnings Call 202...,2022-07-28,St James's Place PLC (STJ LN Equity),"[agenda, ﬁgure, topic, medium, prospect, outlo...","[Good morning, and welcome to our 2022 Half-Ye..."
7,cleaned_St James_s Place PLC Earnings Call 202...,2023-02-28,St James's Place PLC (STJ LN Equity),"[start, globe, backdrop, uncertainty, history,...","[Good morning, and welcome to our Full-Year Re..."
8,cleaned_St James_s Place PLC Earnings Call 202...,2023-07-27,St James's Place PLC (STJ LN Equity),"[session, format, ﬁgure, regime, progress, pri...",[This morning's session will follow a familiar...
9,cleaned_St James_s Place PLC Earnings Call 202...,2024-02-28,St James's Place PLC (STJ LN Equity),"[pleasure, ceo, agenda, sight, headline, volum...","[Good morning, everyone. It's my pleasure to t..."


In [27]:
# Group by 'company_name' to create separate DataFrames for each company
company_dfs = {}
for company in fidfbase['company_name'].unique():
    company_df = fidfbase[fidfbase['company_name'] == company][['file_name', 'date', 'token', 'paragraph']]
    company_name_cleaned = "_".join(company.split()[:2]).replace('(', '').replace(')', '').replace('.', '').replace(',', '').replace("'", "") + '_df'
    company_dfs[company_name_cleaned] = company_df

# Example of accessing a specific company DataFrame
for company_name, df in company_dfs.items():
    print(f"{company_name}: {df.shape}")

# Function to get DataFrame by company name
def get_company_df(company_name):
    company_name_cleaned = "_".join(company_name.split()[:2]).replace('(', '').replace(')', '').replace('.', '').replace(',', '').replace("'", "") + '_df'
    return company_dfs.get(company_name_cleaned, None)

# Example usage:
df = get_company_df('Traveler Cos')
print(df)

# Save each company's DataFrame to a CSV file
output_dir = "company_csvs"
os.makedirs(output_dir, exist_ok=True)

for company_name, df in company_dfs.items():
    csv_path = os.path.join(output_dir, f"{company_name}.csv")
    df.to_csv(csv_path, index=False)
    print(f"Saved {company_name} to {csv_path}")

St_Jamess_df: (845, 4)
Travelers_Cos_df: (1387, 4)
None
Saved St_Jamess_df to company_csvs\St_Jamess_df.csv
Saved Travelers_Cos_df to company_csvs\Travelers_Cos_df.csv
