**Import packages**

In [1]:

import json
import os
import re
import xml.etree.ElementTree as ET    # Used for parsing XML files (like BeautifulSoup for HTML)
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import pandas
import pandas as pd
import requests
from IPython.display import Markdown
from markdown import markdown

---

<br>

# Fetch PDF

(not used)

In [3]:
# 1. Define URL to PDF (or XML) file
pdf_url = 'https://nsearchives.nseindia.com/corporate/BFUTILITIE_27112025155734_BRSR_BFUL_27112025.pdf'
pdf_file = pdf_url.split('/')[-1].split('.')[0]  # File name from the URL, without the extension

# 2. Make GET request to fetch the PDF file.
#    A timeout is set so a stalled connection raises instead of hanging the notebook.
response = requests.get(
    pdf_url,
    headers={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'},
    timeout=30,
)

# 3. Check if our request was successful (Status 200), if so, save to a file.
if response.status_code == 200:
    # Build the output name from `pdf_file` so that changing `pdf_url` above
    # also changes where the download is written.
    with open(f'{pdf_file}.pdf', 'wb') as f:
        f.write(response.content)
    print('PDF downloaded successfully.')
else:
    print('Failed to download file. Status code:', response.status_code)

PDF downloaded successfully.


<br>

### Fetch XML

In [4]:
# 1. Define URL to XML file
xml_url = 'https://nsearchives.nseindia.com/corporate/xbrl/BRSR_1580445_27112025035801_WEB.xml'
xml_file = xml_url.split('/')[-1]  # Last path segment of the URL, used as the local file name

# 2. Make GET request to fetch the XML file.
#    A timeout is set so a stalled connection raises instead of hanging the notebook.
response = requests.get(
    xml_url,
    headers={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'},
    timeout=30,
)

# 3. Check if our request was successful (Status 200), if so, save to a file.
if response.status_code == 200:
    with open(xml_file, 'wb') as f:
        f.write(response.content)

    print('XML (XBRL) downloaded successfully.')
else:
    print('Failed to download file. Status code:', response.status_code)

XML (XBRL) downloaded successfully.


<br><br><br>

# Extract from XML

Convert XBRL file to a flat DataFrame:

- Every row is a value given in the report.
- `Element Name` corresponds to each unique table (or headline value, like company name).
- `Period/Context` relates to each value in a given table.


So, to filter the dataframe for a specific value, we need to provide both the `Element Name` and `Period/Context`.

<br>

Interesting tables in report:
- Table 20:
  - 20A: `NumberOfEmployeesOrWorkersIncludingDifferentlyAbled`
  - 20B: `NumberOfDifferentlyAbledEmployeesOrWorkers`

In [14]:
tree = ET.parse(xml_file)
root = tree.getroot()

# Tags that describe the document structure rather than reported facts
skip_tags = {
    'xbrl',
    'context',
    'entity',
    'identifier',
    'period',
    'startDate',
    'endDate',
    'instant',
    'scenario',
    'typedMember',
    'unit',
    'measure',
    'schemaRef',
    'segment',
}

rows = []
for elem in root.iter():
    # Drop the '{namespace}' prefix, if any, to keep only the local tag name
    local_name = elem.tag.rpartition('}')[2]
    text = elem.text
    attributes = elem.attrib
    context_ref = attributes.get('contextRef', '')

    is_structural = local_name in skip_tags
    # Domain definitions mention 'Domain' both in the tag and in the text
    is_domain_definition = bool(text) and 'Domain' in local_name and 'Domain' in text

    # A fact must carry a contextRef and have text (an empty string still counts)
    if is_structural or is_domain_definition or not context_ref or text is None:
        continue

    rows.append({
        'Element Name': local_name,
        'Period/Context': context_ref,
        'Unit': attributes.get('unitRef', ''),
        'Decimals': attributes.get('decimals', ''),
        'Value': text.strip(),
    })

df = pd.DataFrame(rows)
df

Unnamed: 0,Element Name,Period/Context,Unit,Decimals,Value
0,CorporateIdentityNumber,DCYMain,,,L74899DL1993GOI053677
1,NameOfTheCompany,ICYMain,,,IFCI Limited
2,DateOfIncorporation,DCYMain,,,1993-05-21
3,AddressOfRegisteredOfficeOfCompany,DCYMain,,,"IFCI Limited, IFCI Tower, 61 Nehru Place, New ..."
4,AddressOfCorporateOfficeOfCompany,DCYMain,,,"IFCI Limited, IFCI Tower, 61 Nehru Place, New ..."
...,...,...,...,...,...
1499,MechanismsInPlaceToInformConsumersOfAnyRiskOfD...,DCYMain,,,Any risk of disruption/discontinuation of esse...
1500,DoesTheEntityDisplayProductInformationOnThePro...,DCYMain,,,Yes
1501,DetailsOfProductInformationOnTheProductOverAnd...,DCYMain,,,The Company being an NBFC offers various finan...
1502,DidYourEntityCarryOutAnySurveyWithRegardToCons...,DCYMain,,,


Elements, Period/Context:
- CorporateIdentityNumber, DCYMain
- NameOfTheCompany, ICYMain

In [None]:
# Show every fact reported under one context.
# The column is named 'Period/Context' (there is no 'Period' column, so the
# previous lookup raised KeyError).
# NOTE(review): 'ICYMain' is taken from the list in the note above — confirm this
# is the context that was meant to be inspected here.
df[df['Period/Context'] == 'ICYMain']

In [60]:
# Case-insensitive search of element names for a piece of text
name_matches = df['Element Name'].str.contains('employee', case=False)
df[name_matches]

Unnamed: 0,Element Name,Period/Context,Unit,Decimals,Value
39,NumberOfEmployeesOrWorkersIncludingDifferently...,D_Gender_PermanentEmployees_TableA,pure,0,116
40,NumberOfEmployeesOrWorkersIncludingDifferently...,D_Male_PermanentEmployees_TableA,pure,0,77
41,PercentageOfEmployeesOrWorkersIncludingDiffere...,D_Male_PermanentEmployees_TableA,pure,INF,0.6638
42,NumberOfEmployeesOrWorkersIncludingDifferently...,D_Female_PermanentEmployees_TableA,pure,0,39
43,PercentageOfEmployeesOrWorkersIncludingDiffere...,D_Female_PermanentEmployees_TableA,pure,INF,0.3363
...,...,...,...,...,...
1198,MedianOfRemunerationOrSalaryOrWagesOfEmployees...,D_Female_p5,INR,0,34
1199,NumberOfEmployeesOtherThanBodAndKMPForRemunera...,D_OtherGender_p5,pure,0,0
1200,MedianOfRemunerationOrSalaryOrWagesOfEmployees...,D_OtherGender_p5,INR,0,0
1241,AverageNumberOfFemaleEmployeesOrWorkersAtTheBe...,DCYMain,pure,0,0


In [15]:
# Distinct element names, in order of first appearance
pd.unique(df['Element Name'])

array(['CorporateIdentityNumber', 'NameOfTheCompany',
       'DateOfIncorporation', 'AddressOfRegisteredOfficeOfCompany',
       'AddressOfCorporateOfficeOfCompany', 'EMailOfTheCompany',
       'TelephoneOfCompany', 'WebsiteOfCompany',
       'DateOfStartOfFinancialYear', 'DateOfEndOfFinancialYear',
       'DateOfStartOfPreviousYear', 'DateOfEndOfPreviousYear',
       'DateOfStartOfPriorToPreviousYear',
       'DateOfEndOfPriorToPreviousYear', 'ValueOfSharesPaidUp',
       'NameOfContactPerson', 'ContactNumberOfContactPerson',
       'EMailOfContactPerson', 'ReportingBoundary',
       'WhetherTheCompanyHasUndertakenAssessmentOrAssuranceOfTheBRSRCore',
       'NotesGeneralDisclosureExplanatoryTextBlock',
       'NameOfStockExchangeWhereTheCompanyIsListed',
       'DescriptionOfMainActivity', 'DescriptionOfBusinessActivity',
       'PercentageOfTotalTurnoverForBusinessActivities',
       'ProductOrServiceSoldByTheEntity',
       'NICCodeOfProductOrServiceSoldByTheEntity',
       'Percent

In [31]:
# Distinct contexts, in order of first appearance
pd.unique(df['Period/Context'])

array(['DCYMain', 'ICYMain', 'D_StockExchangeAxis1',
       'D_StockExchangeAxis2', 'D_BusinessActivities1',
       'D_ProductServiceSold1', 'D_Plant_National', 'D_Office_National',
       'D_Location_National', 'D_Plant_International',
       'D_Office_International', 'D_Location_International',
       'D_Gender_PermanentEmployees_TableA',
       'D_Male_PermanentEmployees_TableA',
       'D_Female_PermanentEmployees_TableA',
       'D_OtherGender_PermanentEmployees_TableA',
       'D_Gender_OtherThanPermanentEmployees_TableA',
       'D_Male_OtherThanPermanentEmployees_TableA',
       'D_Female_OtherThanPermanentEmployees_TableA',
       'D_OtherGender_OtherThanPermanentEmployees_TableA',
       'D_Gender_Employees_TableA', 'D_Male_Employees_TableA',
       'D_Female_Employees_TableA', 'D_OtherGender_Employees_TableA',
       'D_Gender_PermanentWorkers_TableA',
       'D_Male_PermanentWorkers_TableA',
       'D_Female_PermanentWorkers_TableA',
       'D_OtherGender_PermanentWorkers_T

In [17]:
# Rows belonging to the "Employees and workers (including differently abled)" table
is_table_20a = df['Element Name'].eq('NumberOfEmployeesOrWorkersIncludingDifferentlyAbled')
df.loc[is_table_20a]


Unnamed: 0,Element Name,Period/Context,Unit,Decimals,Value
39,NumberOfEmployeesOrWorkersIncludingDifferently...,D_Gender_PermanentEmployees_TableA,pure,0,116
40,NumberOfEmployeesOrWorkersIncludingDifferently...,D_Male_PermanentEmployees_TableA,pure,0,77
42,NumberOfEmployeesOrWorkersIncludingDifferently...,D_Female_PermanentEmployees_TableA,pure,0,39
44,NumberOfEmployeesOrWorkersIncludingDifferently...,D_OtherGender_PermanentEmployees_TableA,pure,0,0
46,NumberOfEmployeesOrWorkersIncludingDifferently...,D_Gender_OtherThanPermanentEmployees_TableA,pure,0,23
47,NumberOfEmployeesOrWorkersIncludingDifferently...,D_Male_OtherThanPermanentEmployees_TableA,pure,0,19
49,NumberOfEmployeesOrWorkersIncludingDifferently...,D_Female_OtherThanPermanentEmployees_TableA,pure,0,4
51,NumberOfEmployeesOrWorkersIncludingDifferently...,D_OtherGender_OtherThanPermanentEmployees_TableA,pure,0,0
53,NumberOfEmployeesOrWorkersIncludingDifferently...,D_Gender_Employees_TableA,pure,0,139
54,NumberOfEmployeesOrWorkersIncludingDifferently...,D_Male_Employees_TableA,pure,0,96


<br>

Extract differently abled count

In [29]:
# Rows belonging to the differently abled employees/workers table
is_table_20b = df['Element Name'].eq('NumberOfDifferentlyAbledEmployeesOrWorkers')
df.loc[is_table_20b]

Unnamed: 0,Element Name,Period/Context,Unit,Decimals,Value
81,NumberOfDifferentlyAbledEmployeesOrWorkers,D_Gender_PermanentEmployees_TableB,pure,0,1
82,NumberOfDifferentlyAbledEmployeesOrWorkers,D_Male_PermanentEmployees_TableB,pure,0,0
84,NumberOfDifferentlyAbledEmployeesOrWorkers,D_Female_PermanentEmployees_TableB,pure,0,1
86,NumberOfDifferentlyAbledEmployeesOrWorkers,D_OtherGender_PermanentEmployees_TableB,pure,0,0
88,NumberOfDifferentlyAbledEmployeesOrWorkers,D_Gender_OtherThanPermanentEmployees_TableB,pure,0,0
89,NumberOfDifferentlyAbledEmployeesOrWorkers,D_Male_OtherThanPermanentEmployees_TableB,pure,0,0
91,NumberOfDifferentlyAbledEmployeesOrWorkers,D_Female_OtherThanPermanentEmployees_TableB,pure,0,0
93,NumberOfDifferentlyAbledEmployeesOrWorkers,D_OtherGender_OtherThanPermanentEmployees_TableB,pure,0,0
95,NumberOfDifferentlyAbledEmployeesOrWorkers,D_Gender_Employees_TableB,pure,0,1
96,NumberOfDifferentlyAbledEmployeesOrWorkers,D_Male_Employees_TableB,pure,0,0


In [36]:
def _lookup_value(element_name, context):
    """Return the first 'Value' in `df` matching the element name and context.

    Raises IndexError when no row matches.
    """
    selector = (df['Element Name'] == element_name) & (df['Period/Context'] == context)
    return df.loc[selector, 'Value'].iloc[0]


# Headline figures: total employees, differently abled employees, company name
total_employees_value = _lookup_value(
    'NumberOfEmployeesOrWorkersIncludingDifferentlyAbled', 'D_Gender_Employees_TableA'
)
total_diff_employees_value = _lookup_value(
    'NumberOfDifferentlyAbledEmployeesOrWorkers', 'D_Gender_Employees_TableB'
)
company_name = _lookup_value('NameOfTheCompany', 'ICYMain')

print(
    f'Company: {company_name}',
    f'Employees, total: {total_employees_value}',
    f'Employees, disabled: {total_diff_employees_value}',
    sep='\n',
)

# One-row summary table for this firm
pd.DataFrame({
    'company': [company_name],
    'total': [total_employees_value],
    'disabled': [total_diff_employees_value],
})

Company: IFCI Limited
Employees, total: 139
Employees, disabled: 1


Unnamed: 0,company,total,disabled
0,IFCI Limited,139,1


In [None]:


# NOTE(review): this looks like a leftover scratch cell — the bare string only echoes
# an element name and assigns nothing. Confirm whether the cell can be removed.
'NumberOfDifferentlyAbledEmployeesOrWorkers'

<br>
<br>
<br>

## Loop through all firms

In [21]:
# 1. Read in file with links to all firms report files
all_firms_df = pd.read_csv('all_firm_links.csv')
all_firms_df = all_firms_df.rename(columns={'**XBRL \n': 'xml_urls'}) # Rename a column in dataframe

# 2. Make new folders to hold raw XML files and parsed CSV files
xml_dir = Path('firm_data_xml')
csv_dir = Path('firm_data_csv')
xml_dir.mkdir(exist_ok=True)
csv_dir.mkdir(exist_ok=True)

# Header sent with every download request
request_headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}

# Structural elements to skip (not actual data facts); built once, outside the loop
skip_tags = {
    'xbrl', 'context', 'entity', 'identifier', 'period', 'startDate',
    'endDate', 'instant', 'scenario', 'typedMember', 'unit', 'measure',
    'schemaRef', 'segment'
}


def xbrl_facts_to_frame(xml_path):
    """Parse the XML file at `xml_path` and return its facts as a DataFrame.

    One row is produced per element that has a `contextRef` attribute and text,
    excluding tags listed in `skip_tags` and domain definition elements.
    Columns: 'Element Name', 'Period/Context', 'Unit', 'Decimals', 'Value'.

    Raises xml.etree.ElementTree.ParseError if the file is not well-formed XML.
    """
    rows = []
    for elem in ET.parse(xml_path).getroot().iter():
        # Extract local name from namespaced tag
        tag = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag

        # Skip structural elements
        if tag in skip_tags:
            continue

        # Skip domain definition elements
        if 'Domain' in tag and elem.text and 'Domain' in elem.text:
            continue

        # Only include actual facts (elements with contextRef)
        ctx = elem.attrib.get('contextRef', '')
        if ctx and elem.text is not None:
            rows.append({
                'Element Name': tag,
                'Period/Context': ctx,
                'Unit': elem.attrib.get('unitRef', ''),
                'Decimals': elem.attrib.get('decimals', ''),
                'Value': elem.text.strip()
            })
    return pd.DataFrame(rows)


# 3. Download and parse the first 10 firms (rows without a URL are skipped)
for x in all_firms_df['xml_urls'][:10].dropna():
    print(x)

    # 3a. File name is the last path segment of the URL
    xml_file = x.split('/')[-1]
    xml_path = xml_dir / xml_file

    # 3b. Fetch the XML file; the timeout stops one stalled download from hanging the loop
    try:
        response = requests.get(x, headers=request_headers, timeout=30)
    except requests.RequestException as error:
        print('Failed to download file. Error:', error)
        continue

    # 3c. Only a successful response (Status 200) is saved and parsed;
    #     otherwise move on to the next firm instead of parsing a missing/stale file.
    if response.status_code != 200:
        print('Failed to download file. Status code:', response.status_code)
        continue

    xml_path.write_bytes(response.content)
    print('XML (XBRL) downloaded successfully.')

    # 3d. Parse the file that was just saved (inside firm_data_xml/, not the working directory)
    try:
        data_temp = xbrl_facts_to_frame(xml_path)
    except ET.ParseError as error:
        print('Failed to parse XML file. Error:', error)
        continue

    # 3e. Save to csv. The stem is computed first so the f-string has no nested
    #     quotes, which is a syntax error before Python 3.12.
    csv_stem = xml_file.split('.')[0]
    data_temp.to_csv(csv_dir / f'{csv_stem}.csv', index=False)
    print("CSV file extracted successfully")

https://nsearchives.nseindia.com/corporate/xbrl/BRSR_1580445_27112025035801_WEB.xml
XML (XBRL) downloaded successfully.
CSV file extracted successfully
https://nsearchives.nseindia.com/corporate/xbrl/BRSR_1579180_24112025024100_WEB.xml
XML (XBRL) downloaded successfully.
CSV file extracted successfully
https://nsearchives.nseindia.com/corporate/xbrl/BRSR_1561070_30102025125314_WEB.xml
XML (XBRL) downloaded successfully.
CSV file extracted successfully
https://nsearchives.nseindia.com/corporate/xbrl/BRSR_1556251_24102025034723_WEB.xml
XML (XBRL) downloaded successfully.
CSV file extracted successfully
https://nsearchives.nseindia.com/corporate/xbrl/BRSR_1556193_24102025025946_WEB.xml
XML (XBRL) downloaded successfully.
CSV file extracted successfully
https://nsearchives.nseindia.com/corporate/xbrl/BRSR_1550931_16102025074159_WEB.xml
XML (XBRL) downloaded successfully.
CSV file extracted successfully
https://nsearchives.nseindia.com/corporate/xbrl/BRSR_1550623_16102025055155_WEB.xml
XML 

Loop through all parsed CSV files (one per firm) and, for each:
- extract total employees
- extract differently abled employees.

In [39]:
# Preview of the (empty) summary table layout
summary_columns = ['company', 'total', 'disabled']
pd.DataFrame(columns=summary_columns)

Unnamed: 0,company,total,disabled


In [56]:
# Use `listdir` method to get a list of all files in a particular folder
all_csv_files = os.listdir('firm_data_csv') # List of files in this folder.

# Make an empty dataframe with three columns
combined_data = pd.DataFrame(columns=['company', 'total', 'disabled'])

# Loop through
for csv_file in all_csv_files:
  if csv_file == '.ipynb_checkpoints':
    continue

  # 1. Load file into dataframe (making sure to add folder before file name)
  temp_df = pd.read_csv(f'firm_data_csv/{csv_file}')

  # 2. Extract values (as we did with one company)
  # 2a. Get total employees
  total_employees_row = temp_df[(temp_df['Element Name'] == 'NumberOfEmployeesOrWorkersIncludingDifferentlyAbled') & (temp_df['Period/Context'] == 'D_Gender_Employees_TableA')].copy()
  total_employees_value = total_employees_row['Value'].iloc[0] # Get first value in 'Value' column

  # 2b. Get total differently abled employees
  total_diff_employees_row = temp_df[(temp_df['Element Name'] == 'NumberOfDifferentlyAbledEmployeesOrWorkers') & (temp_df['Period/Context'] == 'D_Gender_Employees_TableB')]
  total_diff_employees_value = total_diff_employees_row['Value'].iloc[0]

  # 2c. Get company name
  company_name_row = temp_df[(temp_df['Element Name'] == 'NameOfTheCompany') & (temp_df['Period/Context'] == 'ICYMain')].copy()
  company_name = company_name_row['Value'].iloc[0]


  # 3. Add to a combined dataframe (first make dataframe with one row, then add to combined)
  df_new_row = pd.DataFrame([[company_name, total_employees_value, total_diff_employees_value]], columns=['company', 'total', 'disabled'])
  combined_data = pd.concat([combined_data, df_new_row], ignore_index=True)



In [59]:
# Cast both counts to integers, then add the disabled share of total employees.
# 'disabled_percent' holds disabled / total, i.e. a fraction rather than a value out of 100.
combined_data = (
    combined_data
    .astype({'disabled': int, 'total': int})
    .assign(disabled_percent=lambda frame: frame['disabled'] / frame['total'])
)
combined_data

Unnamed: 0,company,total,disabled,disabled_percent
0,BF UTILITIES LIMITED,7,0,0.0
1,THE ANDHRA SUGARS LIMITED,651,3,0.004608
2,Sirca Paints India Limited,483,0,0.0
3,Indraprastha Gas Limited,720,4,0.005556
4,Network People Services Technologies Limited,324,1,0.003086
5,HMT LIMITED,29,0,0.0
6,IFCI Limited,139,1,0.007194
7,Accelya Solutions India Limited,1351,4,0.002961
8,KIOCL Limited,249,7,0.028112
9,THE NEW INDIA ASSURANCE COMPANY LIMITED,10951,321,0.029312


In [None]:
# Mount Google Drive at '/content/drive'.
# NOTE(review): `google.colab` is presumably only importable inside a Google Colab
# runtime — confirm before running this cell in another environment.
from google.colab import drive
drive.mount('/content/drive')