In [1]:
import pandas as pd
from datetime import datetime
from CustomSession import CustomSession

In [2]:
class BSE(CustomSession):
    """Session wrapper for the BSE India API that fetches annual-report links."""

    def __init__(self):
        """
        Initializes the BSE object with custom session and BSE-specific headers.

        The headers below are passed to ``CustomSession.__init__``; the API base
        URL is stored on the instance as ``_base_url``.
        """
        super().__init__(headers={
            'authority': 'api.bseindia.com',
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'en-US,en;q=0.9',
            'dnt': '1',
            'origin': 'https://www.bseindia.com',
            'referer': 'https://www.bseindia.com/',
            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-site',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
        })

        self._base_url = 'https://api.bseindia.com/BseIndiaAPI/api/'

    def extract_annual_report_links(self, bse_code: str, from_year: str = None, to_year: str = str(datetime.now().year)):
        """
        Fetches the links of annual reports for a company based on its BSE code.

        Args:
            bse_code (str): The BSE code of the company (sent as ``scripcode``).
            from_year (str): First year to include (inclusive). ``None`` means
                no lower bound.
            to_year (str): Last year to include (inclusive). The signature
                default is computed once, when the class is defined; passing
                ``None`` explicitly resolves to the current year at call time.

        Returns:
            List[Dict]: One dict per matching report with the keys 'bse_code',
            'year' (as returned by the API) and 'link' (the 'PDFDownload'
            value). An empty list is returned when the year bounds are invalid
            or the API yields no usable data.
        """
        # Validate the bounds before touching the network. TypeError is caught
        # as well as ValueError so that non-string, non-numeric inputs are
        # reported the same way as malformed strings.
        try:
            first_year = 0 if from_year is None else int(from_year)
            last_year = datetime.now().year if to_year is None else int(to_year)
        except (TypeError, ValueError):
            print(f"Invalid year format: from_year={from_year}, to_year={to_year}")
            return []

        response = self.hit_and_get_data(
            self._base_url + 'AnnualReport_New/w',
            params={'scripcode': bse_code},
        )

        # NOTE(review): hit_and_get_data lives in CustomSession (not visible
        # here); guard in case it returns None on a failed request -- confirm.
        if response is None:
            print(f"No data returned for BSE code: {bse_code}")
            return []

        report_data = []
        # `or []` also covers a 'Table' key that is present but null.
        for report in response.get('Table') or []:
            year = report.get('Year')
            link = report.get('PDFDownload', '')

            # A missing (None) or non-numeric year cannot be range-checked.
            # Skip the record instead of comparing against an unbound or
            # stale value left over from a previous iteration.
            try:
                year_int = int(year)
            except (TypeError, ValueError):
                print(f"Skipping invalid year: {year}")
                continue

            if first_year <= year_int <= last_year:
                report_data.append({
                    'bse_code': bse_code,
                    'year': year,
                    'link': link
                })

        return report_data



In [3]:
def create_annual_report_dataset(bse_codes: list, from_year: str = None, to_year: str = str(datetime.now().year), output_file: str = 'annual_reports.csv'):
    """
    Fetches annual reports for a list of companies and writes them to one CSV.

    Args:
        bse_codes (list): List of BSE codes to fetch reports for.
        from_year (str): Start year for fetching reports (optional).
        to_year (str): End year for fetching reports. The default is the
            current year as computed when this function is defined.
        output_file (str): Path to the CSV file where data will be saved.

    Returns:
        None: The function saves the dataset as a CSV file with the columns
        'bse_code', 'year' and 'link'.
    """
    # Column order matches the dicts produced by BSE.extract_annual_report_links.
    report_columns = ['bse_code', 'year', 'link']

    bse_session = BSE()
    all_reports = []

    for code in bse_codes:
        print(f"Fetching reports for BSE code: {code}")
        all_reports.extend(
            bse_session.extract_annual_report_links(bse_code=code, from_year=from_year, to_year=to_year)
        )

    # Pin the columns so that an empty result still yields a CSV with a proper
    # header row instead of a header-less file.
    reports_df = pd.DataFrame(all_reports, columns=report_columns)

    reports_df.to_csv(output_file, index=False)
    print(f"Dataset saved to {output_file}")

In [4]:
if __name__ == "__main__":
    # Read the company list; its 'BSE Code' column supplies the codes to query.
    df = pd.read_csv('refined_companies_tick.csv')
    bse_codes = df['BSE Code'].tolist()

    # Only the upper year bound is given; from_year keeps its default (None).
    create_annual_report_dataset(bse_codes, to_year='2023', output_file='bse_annual_reports(4).csv')

Fetching reports for BSE code: 500410
Fetching reports for BSE code: 500003
Fetching reports for BSE code: 590006
Fetching reports for BSE code: 532141
Fetching reports for BSE code: 502330
Fetching reports for BSE code: 500877
Fetching reports for BSE code: 500101
Fetching reports for BSE code: 500477
Fetching reports for BSE code: 500023
Fetching reports for BSE code: 500820
Fetching reports for BSE code: 500472
Fetching reports for BSE code: 506820
Fetching reports for BSE code: 500027
Fetching reports for BSE code: 500490
Fetching reports for BSE code: 500031
Fetching reports for BSE code: 500033
Fetching reports for BSE code: 502355
Fetching reports for BSE code: 500041
Fetching reports for BSE code: 500042
Fetching reports for BSE code: 500043
Fetching reports for BSE code: 506285
Fetching reports for BSE code: 509480
Fetching reports for BSE code: 503960
Fetching reports for BSE code: 500493
Fetching reports for BSE code: 505688
Fetching reports for BSE code: 500335
Fetching rep