In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Screener.in company pages to scrape (consolidated view where one is listed)
urls = [
    'https://www.screener.in/company/' + page
    for page in (
        'VOLTAS/consolidated/',
        'BLUESTARCO/consolidated/',
        'CROMPTON/consolidated/',
        'ORIENTELEC/',
        'HAVELLS/consolidated/',
        'SYMPHONY/consolidated/',
        'WHIRLPOOL/',
    )
]

# Accumulator for the scraped records; the scraper appends one dict per row
data = []

# Helper: parse one table cell into a float (None when it holds no number)
def _cell_to_float(td, percent=False):
    """Return the numeric value of a table cell, or None if it is not a number.

    Thousands separators are removed before parsing. When ``percent`` is
    true, a '%' sign is removed as well and the value is divided by 100 so
    that it is returned as a decimal fraction.
    """
    text = td.get_text(strip=True).replace(',', '')
    if percent:
        text = text.replace('%', '')
    try:
        number = float(text)
    except ValueError:
        return None
    return number / 100 if percent else number


# Function to scrape a single URL
def scrape_company_data(url, timeout=30):
    """Scrape the profit-and-loss figures of one company page.

    Downloads ``url``, locates every table body inside the
    ``<section id="profit-loss">`` element and, for each one, appends three
    records (one per year label) to the module-level ``data`` list. Each
    record has the keys 'Timestamp', 'Company', 'Sales', 'Net Profit',
    'OPM' and 'EPS'; a metric that is missing or not numeric is stored as
    None.

    Parameters
    ----------
    url : str
        Address of the company page. The company symbol is taken from the
        last path segment, ignoring a trailing 'consolidated' segment.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    None
        Results are appended to ``data`` as a side effect.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status; this replaces the
        previous behaviour of silently producing no rows for that company.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Company symbol = last non-empty path segment, skipping 'consolidated'.
    # Works with or without a trailing slash on the URL.
    segments = [part for part in url.split('/') if part]
    if segments and segments[-1].lower() == 'consolidated':
        segments.pop()
    company_name = segments[-1].upper() if segments else ''

    # NOTE(review): the year labels are hard-coded and are paired with the
    # three columns that precede the last one; presumably the last column is
    # a trailing-period figure. Confirm this against the live page layout.
    years = ('Mar 2022', 'Mar 2023', 'Mar 2024')

    for section in soup.find_all('section', id='profit-loss'):
        for holder in section.find_all('div', class_='responsive-holder fill-card-width'):
            for table in holder.find_all('table'):
                for tbody in table.find_all('tbody'):
                    # Defaults used when a metric row is absent from the table
                    sales = [None] * 3
                    net_profit = [None] * 3
                    opm = [None] * 3
                    eps = [None] * 3

                    for tr in tbody.find_all('tr'):
                        cells = tr.find_all('td')
                        # Need the label cell plus at least four value cells
                        if len(cells) <= 4:
                            continue

                        label = cells[0].get_text(strip=True).lower()
                        # Three value columns immediately before the last one
                        value_cells = cells[-4:-1]

                        # Every metric is parsed cell by cell, so a single
                        # unparsable cell no longer discards its neighbours.
                        if 'opm' in label:
                            opm = [_cell_to_float(td, percent=True) for td in value_cells]
                        elif 'sales' in label:
                            sales = [_cell_to_float(td) for td in value_cells]
                        elif 'net profit' in label:
                            net_profit = [_cell_to_float(td) for td in value_cells]
                        elif 'eps' in label:
                            eps = [_cell_to_float(td) for td in value_cells]

                    # One record per year for this company
                    for i, year in enumerate(years):
                        data.append({
                            'Timestamp': year,
                            'Company': company_name,
                            'Sales': sales[i],
                            'Net Profit': net_profit[i],
                            'OPM': opm[i],  # stored as a decimal fraction (7% -> 0.07)
                            'EPS': eps[i]
                        })

# Scrape every company page in turn; each call appends its records to `data`
for url in urls:
    scrape_company_data(url)

# Assemble the collected records into a single table
df = pd.DataFrame(data=data)

# Keep the frame as the cell's last expression so the notebook renders it
df

Unnamed: 0,Timestamp,Company,Sales,Net Profit,OPM,EPS
0,Mar 2022,VOLTAS,7934.0,506.0,0.07,15.23
1,Mar 2023,VOLTAS,9499.0,136.0,0.05,4.08
2,Mar 2024,VOLTAS,12481.0,248.0,0.03,7.62
3,Mar 2022,BLUESTARCO,6064.0,168.0,0.06,8.71
4,Mar 2023,BLUESTARCO,7977.0,401.0,0.06,20.79
5,Mar 2024,BLUESTARCO,9685.0,414.0,0.07,20.18
6,Mar 2022,CROMPTON,5394.0,578.0,0.14,9.13
7,Mar 2023,CROMPTON,6870.0,476.0,0.11,7.28
8,Mar 2024,CROMPTON,7313.0,442.0,0.1,6.84
9,Mar 2022,ORIENTELEC,2448.0,127.0,0.09,5.97


In [3]:
# Write the scraped table to disk as CSV, leaving out the index column
df.to_csv(
    'Battle_Performance_Stats.csv',
    index=False,
)