### ECT_LLM - Part 1. Data Conversion

In [None]:
import pandas as pd
import re
import os
import torch

print("PyTorch version:", torch.__version__)
print("CUDA version used by PyTorch:", torch.version.cuda)
print("Is CUDA available:", torch.cuda.is_available())

PyTorch version: 2.6.0+cu118
CUDA version used by PyTorch: 11.8
Is CUDA available: True


In [3]:
dir_txt = "I:/Data_for_practice/ECT2001_2021/"
dir_txt

'I:/Data_for_practice/ECT2001_2021/'

In [20]:
# data/2021/2021-Apr-01-AESE.OQ-138459206198-transcript.txt
# data/2002/2002-Oct-10-YUM.N-140433860222-transcript.txt
with open(dir_txt + "2021/2021-Apr-01-AESE.OQ-138459206198-transcript.txt", "r", encoding="utf-8") as file:
    data = file.read()

data[:500]



In [21]:
# Split the data by lines to analyze its structure
lines = data.split("\n")
# Remove empty lines
lines = [line for line in lines if line.strip() != ""]
# Check the first 50 lines to get an idea of the structure
lines

['Refinitiv StreetEvents Event Transcript',
 'E D I T E D   V E R S I O N',
 'Q4 2020 Allied Esports Entertainment Inc Earnings Call',
 'MARCH 31, 2021 / 9:00PM GMT',
 'Corporate Participants',
 ' * Kwok Leung Ng',
 '   Allied Esports Entertainment Inc. - CEO & Director',
 ' * Anthony A. Hung',
 '   Allied Esports Entertainment Inc. - CFO',
 'Conference Call Participiants',
 ' * Lasse Glassen',
 '   ADDO Investor Relations - MD',
 'Presentation',
 '--------------------------------------------------------------------------------',
 'Operator    [1]',
 '--------------------------------------------------------------------------------',
 "Greetings. Welcome to Allied Esports Entertainment's Fourth Quarter and Full Year 2020 Earnings Conference Call. (Operator Instructions) Please note, this conference is being recorded.",
 'I will now turn the conference over to your host, Lasse Glassen, Managing Director of Investor Relations. Thank you. You may begin.',
 '--------------------------------

In [30]:
# # Extracting the participants' details using a refined approach
# participants_data = []

# # Splitting the section by asterisks to isolate participants
# for line in data.split('*'):
#     if line.strip():  # If the line is not empty
#         parts = line.strip().split('\n')
#         if len(parts) >= 2:
#             name = parts[0].strip()
#             org_and_title = parts[1].split(' - ')
#             organization = org_and_title[0].strip()
#             title = org_and_title[1].strip() if len(org_and_title) > 1 else ""
#             participants_data.append([name, organization, title])

# # Convert the list to a DataFrame
# df_par = pd.DataFrame(participants_data, columns=['Name', 'Organization', 'Title'])
# df_par = df_par[df_par["Name"].str.strip() != "Refinitiv StreetEvents Event Transcript"]
# df_par

Unnamed: 0,Name,Organization,Title,Category
0,Kwok Leung Ng,Allied Esports Entertainment Inc.,CEO & Director,Corporate Participants
1,Anthony A. Hung,Allied Esports Entertainment Inc.,CFO,Corporate Participants
2,Lasse Glassen,ADDO Investor Relations,MD,Conference Call Participants


In [31]:
participants_data = []
category = None   
current_name = None  

for line in data.split("\n"):
    line = line.strip()
    
    if "Corporate Participants" in line:
        category = "Corporate Participants"
    elif "Conference Call Participiants" in line:
        category = "Conference Call Participants"
    
    elif line.startswith("*"):
        current_name = line[1:].strip()  
    
    elif current_name and " - " in line:
        org_and_title = line.split(" - ")
        organization = org_and_title[0].strip()
        title = org_and_title[1].strip() if len(org_and_title) > 1 else ""
        
        participants_data.append([current_name, organization, title, category])
        
        current_name = None

df_par = pd.DataFrame(participants_data, columns=['Name', 'Organization', 'Title', 'Category'])
df_par

Unnamed: 0,Name,Organization,Title,Category
0,Kwok Leung Ng,Allied Esports Entertainment Inc.,CEO & Director,Corporate Participants
1,Anthony A. Hung,Allied Esports Entertainment Inc.,CFO,Corporate Participants
2,Lasse Glassen,ADDO Investor Relations,MD,Conference Call Participants


In [35]:
# Define a function to extract structured data from the text
def extract_sections(lines):
    sections = []
    section = []
    for line in lines:
        if line.startswith("==="):
            if section:
                sections.append(section)
                section = []
        else:
            section.append(line.strip())
    if section:
        sections.append(section)
    return sections

# Extract sections from the lines
sections = extract_sections(lines)

# Extract structured data from sections
structured_data = []
current_speaker = None
current_position = None
speech = []

for section in sections:
    if "Participants" in section[0]:
        for i in range(1, len(section)):
            if section[i].startswith("*"):
                current_speaker = section[i].replace("*", "").strip()
                current_position = section[i+1].strip()
    elif "Presentation" in section[0] or "Questions and Answers" in section[0]:
        for line in section[1:]:
            if line.startswith("----"):
                if current_speaker and speech:
                    structured_data.append([current_speaker, current_position, " ".join(speech)])
                    speech = []
            elif line.startswith("Operator"):
                current_speaker = "Operator"
                current_position = ""
                speech = [line.replace("Operator", "").strip()]
            elif line.endswith("]"):
                # Extract speaker and position
                line_parts = line.split(",")
                current_speaker = line_parts[0].split("  ")[-1].strip()
                if len(line_parts) > 1:
                    current_position = line_parts[1].strip()
                else:
                    current_position = ""
            else:
                speech.append(line)
        if speech:
            structured_data.append([current_speaker, current_position, " ".join(speech)])
            speech = []

In [38]:
# Convert structured data into a DataFrame
df = pd.DataFrame(structured_data, columns=["Speaker", "Position", "Speech"])
df['Date'] = extracted_date
df["Speech"] = df["Speech"].apply(lambda x: re.sub(r'\[\d+\]', '', x).strip())
df["Position"] = df["Position"].apply(lambda x: re.sub(r'\[:digit:{1,}]', '', str(x)).strip())
df["Position"] = df["Position"].apply(lambda x: re.sub(r'\[\d+\]', '', x).strip())
#df[["Affiliation", "Position"]] = df["Position"].str.split(" - ", expand=True) 
df["Position"] = df["Position"].fillna("")

# Drop all rows after the found index (including the row with "Definitions")
definitions_index = df[df["Speech"].str.contains("Definitions")].index
if not definitions_index.empty:
    df = df.iloc[:definitions_index[0]]

# Remove empty rows
df = df[df["Speech"].str.strip() != ""]

# Rearrange columns
df

Unnamed: 0,Speaker,Position,Speech,Date
1,Operator,,Greetings. Welcome to Allied Esports Entertain...,"MARCH 31, 2021"
2,Lasse Glassen,ADDO Investor Relations - MD,"Thank you, operator. Good afternoon and welcom...","MARCH 31, 2021"
3,Kwok Leung Ng,Allied Esports Entertainment Inc. - CEO & Dire...,"Thank you, Lasse, and thank you, everyone, for...","MARCH 31, 2021"
4,Anthony A. Hung,Allied Esports Entertainment Inc. - CFO,"Thank you, Frank. Good afternoon, everyone, an...","MARCH 31, 2021"
6,Operator,,(Operator Instructions) There are no questions...,"MARCH 31, 2021"
7,Lasse Glassen,ADDO Investor Relations - MD,"Okay. Thank you for your support, everyone, an...","MARCH 31, 2021"
9,Operator,,Thank you. This does conclude today's conferen...,"MARCH 31, 2021"
