In [1]:
import os
import re
import pandas as pd

In [2]:
stock_symbol = 'RELIANCE.NS'

# Folder holding this symbol's XML filings. The listing is sorted because
# os.listdir returns entries in arbitrary order and the files are processed
# (and later turned into rows) in the order of this list.
save_dir = f'{stock_symbol}_xml_files'
xml_files = sorted(
    os.path.join(save_dir, f) for f in os.listdir(save_dir) if f.endswith('.xml')
)

for xml_file in xml_files:
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # encoding (the later clean-up cell also reads these files as UTF-8).
    with open(xml_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Locate the first line carrying the Symbol element; everything before it
    # is discarded.
    start_index = next((i for i, line in enumerate(lines) if '<in-bse-fin:Symbol contextRef="OneD">' in line), None)
    if start_index is None:
        print(f"Warning: {xml_file} does not contain the start tag. Skipping...")
        continue

    # Keep the lines from the start tag onwards and drop the 'in-bse-fin:'
    # namespace prefix. The start line loses its prefix too, so re-running this
    # cell on an already-cleaned file finds no start tag and skips it above.
    cleaned_lines = [line.replace('in-bse-fin:', '') for line in lines[start_index:]]

    # Overwrite the file in place with the trimmed content.
    with open(xml_file, 'w', encoding='utf-8') as file:
        file.writelines(cleaned_lines)

print("XML files cleaned!")

XML files cleaned!


In [3]:
def clean_xml_content(content: str) -> str:
    """Trim ``content`` to the span from its first '<' to its last '>'.

    Args:
        content: Raw text of an XML file, possibly with stray characters
            before the first tag or after the last one.

    Returns:
        The substring starting at the first '<' and ending at the last '>'
        (both inclusive). An empty string is returned when there is no '<',
        or when no '>' follows the first '<'.
    """
    first_tag = content.find('<')
    last_tag = content.rfind('>')

    # str.find / str.rfind return -1 when nothing is found. Used directly as a
    # slice start, -1 means "last character", which would leak a stray '>'
    # for input such as "ab>". Treat "no enclosing tag span" as empty instead.
    if first_tag == -1 or last_tag < first_tag:
        return ''

    return content[first_tag:last_tag + 1]

# Rewrite every file so that only its tag-delimited span remains.
for xml_path in xml_files:
    with open(xml_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()

    # Compute the trimmed text before reopening the file for writing, so the
    # file is only truncated once the replacement content is ready.
    trimmed_text = clean_xml_content(raw_text)

    with open(xml_path, 'w', encoding='utf-8') as target:
        target.write(trimmed_text)

In [4]:
for xml_file in xml_files:
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # encoding (the earlier clean-up cell also uses UTF-8 for these files).
    with open(xml_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Drop every contextRef="..." attribute together with its leading space.
    cleaned_content = re.sub(r' contextRef="[^"]*"', '', content)

    # Overwrite the file in place with the stripped content.
    with open(xml_file, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)

print("Removed contextRef attributes from XML files!")

Removed contextRef attributes from XML files!


In [5]:
for xml_file in xml_files:
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # encoding (the earlier clean-up cell also uses UTF-8 for these files).
    with open(xml_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Drop every unitRef="..." attribute together with its leading space.
    cleaned_content = re.sub(r' unitRef="[^"]*"', '', content)

    # Overwrite the file in place with the stripped content.
    with open(xml_file, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)

# Report the attribute this cell actually removes (unitRef, not contextRef).
print("Removed unitRef attributes from XML files!")

Removed contextRef attributes from XML files!


In [6]:
for xml_file in xml_files:
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # encoding (the earlier clean-up cell also uses UTF-8 for these files).
    with open(xml_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Drop every decimals="..." attribute together with its leading space.
    cleaned_content = re.sub(r' decimals="[^"]*"', '', content)

    # Overwrite the file in place with the stripped content.
    with open(xml_file, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)

# Report the attribute this cell actually removes (decimals, not contextRef).
print("Removed decimals attributes from XML files!")

Removed contextRef attributes from XML files!


In [7]:
for xml_file in xml_files:
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # encoding (the earlier clean-up cell also uses UTF-8 for these files).
    with open(xml_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Remove every occurrence of the literal root closing tag.
    content = content.replace('</xbrli:xbrl>', '')

    # Overwrite the file in place with the modified content.
    with open(xml_file, 'w', encoding='utf-8') as file:
        file.write(content)

print("Removed </xbrli:xbrl> from all XML files.")

Removed </xbrli:xbrl> from all XML files.


In [8]:
for xml_file in xml_files:
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # encoding (the earlier clean-up cell also uses UTF-8 for these files).
    with open(xml_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # The three steps below turn "<Tag>value</Tag>" into "Tag : value".
    # Order matters: closing tags must go first, while '<' and '>' still
    # delimit them.

    # 1. Removing all closing tags
    content = re.sub(r'</[^>]+>', '', content)

    # 2. Removing all leftover "<" (the start of each opening tag)
    content = content.replace('<', '')

    # 3. Replacing all ">" with " : " (the end of each opening tag becomes
    #    the key/value delimiter)
    content = content.replace('>', ' : ')

    # Overwrite the file in place with the flattened content.
    with open(xml_file, 'w', encoding='utf-8') as file:
        file.write(content)

print("Files have been cleaned.")

Files have been cleaned.


In [9]:
# One {key: value} dictionary per XML file; each becomes a DataFrame row.
all_data = []

for xml_file in xml_files:
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # encoding. The handle is only needed for the read, so it is closed
    # before the lines are parsed.
    with open(xml_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Dictionary to store data for the current XML file
    data_dict = {}
    for line in lines:
        # Skip empty lines
        if line.strip() == "":
            continue
        # Split the line at the delimiter to get the key and the value
        parts = line.split(" : ")
        # Lines that do not split into exactly one key and one value are
        # ignored. If a key repeats within a file, the last value wins.
        if len(parts) == 2:
            key, value = parts
            data_dict[key.strip()] = value.strip()
    all_data.append(data_dict)

# Create a DataFrame using the list of dictionaries (keys become columns)
df = pd.DataFrame(all_data)

# Optional: save the DataFrame to a CSV file
# df.to_csv("combined_data.csv", index=False)

In [10]:
# Source column name -> short label. The key order is also the column order
# of the resulting frame.
_column_labels = {
    'DateOnWhichPriorIntimationOfTheMeetingForConsideringFinancialResultsWasInformedToTheExchange': 'Announcement Date',
    'DateOfStartOfReportingPeriod': 'Start of Reporting Period',
    'DateOfEndOfReportingPeriod': 'End of Reporting Period',
    'NatureOfReportStandaloneConsolidated': 'Report Type',
    'Income': 'Revenue',
    'DilutedEarningsLossPerShareFromContinuingAndDiscontinuedOperations': 'EPS',
    'PaidUpValueOfEquityShareCapital': 'Paid-up Capital',
    'FaceValueOfEquityShareCapital': 'FV Capital',
}

# Keep only the mapped columns from `df`, then apply the short labels.
final_df = df[list(_column_labels)].rename(columns=_column_labels)

In [11]:
# Columns to parse as dates and as numbers, respectively. Values that cannot
# be parsed are coerced to NaT / NaN instead of raising.
date_columns = ['Announcement Date', 'Start of Reporting Period', 'End of Reporting Period']
numeric_columns = ['Revenue', 'EPS', 'Paid-up Capital', 'FV Capital']

# Dates are converted first, then numerics, one column at a time.
for columns, convert in ((date_columns, pd.to_datetime), (numeric_columns, pd.to_numeric)):
    for column in columns:
        final_df[column] = convert(final_df[column], errors='coerce')

# Derived metric: Revenue * FV Capital / Paid-up Capital.
final_df['RevenuePerShare'] = (
    final_df['Revenue']
    .mul(final_df['FV Capital'])
    .div(final_df['Paid-up Capital'])
)

In [12]:
# Keep only the rows whose report type is exactly 'Consolidated'.
is_consolidated = final_df['Report Type'] == 'Consolidated'
final_df_2 = final_df[is_consolidated]

# Select the exported columns and give the two date columns their output names.
export = (
    final_df_2[['Announcement Date', 'End of Reporting Period', 'RevenuePerShare', 'EPS']]
    .rename(columns={'Announcement Date': 'Date', 'End of Reporting Period': 'QuarterEnding'})
)

# Write the result next to the notebook, without the index column.
export.to_csv(f'{stock_symbol}_Fundamental_data.csv', index=False)