In [321]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
import undetected_chromedriver as uc
import time


In [322]:
def select_fund(driver, wait, fund_name, index=0):
    """
    Fills the specified fund search box with the given fund name and selects the matched suggestion.
    
    Args:
        driver: Selenium WebDriver instance.
        wait: WebDriverWait instance.
        fund_name: Name of the fund to search.
        index: Index (0-4) for the respective fund search input box.
    """
    try:
        # Determine correct input box ID
        input_id = "peer-fund-search" if index == 0 else f"peer-fund-search{index+1}"
        fund_input = wait.until(EC.presence_of_element_located((By.ID, input_id)))
        
        fund_input.click()
        fund_input.clear()
        fund_input.send_keys(fund_name)
        print(f"🔍 Searching for: {fund_name} in #{input_id}")

        # Wait for suggestions
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".tt-dataset-fund_search .tt-suggestion")))
        time.sleep(5)  # Wait for suggestions to stabilize

        # Get all suggestions
        suggestions = driver.find_elements(By.CSS_SELECTOR, ".tt-dataset-fund_search .tt-suggestion")

        for suggestion in suggestions:
            suggestion_text = suggestion.text.strip().lower()
            if fund_name.lower().replace("direct", "dir")[:15] in suggestion_text:
                suggestion.click()
                print(f"✅ Selected: {suggestion.text.strip()}")
                return
        print(f"❌ No matching suggestion found for: {fund_name}")

    except Exception as e:
        print(f"❌ Error in select_fund [{index}]: {e}")


In [323]:
# def scrape_full_page_text(driver):
#     try:
#         body = driver.find_element(By.TAG_NAME, "body")
#         return body.text
#     except Exception as e:
#         print("❌ Error scraping page text:", e)
#         return ""

In [324]:
from bs4 import BeautifulSoup
import pandas as pd

def extract_comparison_table_basics(html: str) -> pd.DataFrame:
    soup = BeautifulSoup(html, "html.parser")

    table = soup.find("table", {"id": "peer-comparison-tab"})
    if not table:
        raise ValueError("❌ 'peer-comparison-tab' table not found.")

    thead = table.find("thead")
    if thead is None:
        raise ValueError("❌ Table header ('thead') not found.")

    header_cells = thead.find_all("th")[1:]  # skip first empty cell
    fund_names = [th.get_text(strip=True) for th in header_cells]

    tbody = table.find("tbody")
    if tbody is None:
        raise ValueError("❌ Table body ('tbody') not found.")

    rows = []
    for tr in tbody.find_all("tr"):
        cells = tr.find_all("td")
        if len(cells) < len(fund_names) + 1:
            continue
        metric = cells[0].get_text(strip=True)
        values = [td.get_text(strip=True) for td in cells[1:]]
        rows.append([metric] + values)

    df = pd.DataFrame(rows, columns=["Metric"] + fund_names)

    mf_basics = df.copy()

    if 'Metric' in mf_basics.columns:
        mf_basics.set_index('Metric', inplace=True)

    # Drop the unwanted rows
    mf_basics_cleaned = mf_basics.drop(["VR Rating", "Our Opinion"])

    # Transpose the DataFrame: rows ↔ columns
    mf_basics_transposed = mf_basics_cleaned.transpose()

    # Reset index (optional, for cleaner display)
    mf_basics_transposed.reset_index(inplace=True)
    mf_basics_transposed.rename(columns={"index": "Fund Name"}, inplace=True)

    # Show the result
    # print(mf_basics_transposed)

    return mf_basics_transposed



In [325]:
from bs4 import BeautifulSoup
import pandas as pd

def extract_trailing_returns_table(html: str) -> pd.DataFrame:
    soup = BeautifulSoup(html, "html.parser")

    table = soup.find("table", {"id": "trailingReturnTabs"})
    if not table:
        raise ValueError("❌ Trailing Returns table not found.")

    # Extract headers
    thead = table.find("thead")
    if not thead:
        raise ValueError("❌ Table header ('thead') not found in trailing returns.")

    header_cells = thead.find_all("th")[1:]  # skip first empty cell
    fund_names = [th.get_text(strip=True) for th in header_cells]

    # Extract body
    tbody = table.find("tbody")
    if not tbody:
        raise ValueError("❌ Table body ('tbody') not found in trailing returns.")

    rows = []
    for tr in tbody.find_all("tr"):
        tds = tr.find_all("td")
        if len(tds) < len(fund_names) + 1:
            continue
        metric = tds[0].get_text(strip=True)
        values = [td.get_text(strip=True) for td in tds[1:]]
        rows.append([metric] + values)

    # Create DataFrame
    df = pd.DataFrame(rows, columns=["Return Period"] + fund_names)
    
    # Set "Return Period" as index
    df.set_index("Return Period", inplace=True)

    # Transpose and clean for easier comparison
    trailing_returns = df.transpose().reset_index()
    trailing_returns.rename(columns={"index": "Fund Name"}, inplace=True)

    # print(trailing_returns)
    return trailing_returns


In [326]:
def extract_risk_ratios_table(html: str) -> pd.DataFrame:
    from bs4 import BeautifulSoup
    import pandas as pd

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", {"id": "riskRatiosTabs"})
    if not table:
        raise ValueError("❌ Risk Ratios table not found.")


    # Try finding all rows in the table
    rows = table.find_all("tr")
    if not rows:
        raise ValueError("❌ No rows found in table.")

    # Find header row (the one with fund names)
    header_row = rows[0]
    header_cells = header_row.find_all("th")[1:]  # Skip the first blank th
    if not header_cells:
        raise ValueError("❌ No header cells found.")
    
    fund_names = [cell.get_text(strip=True) for cell in header_cells]

    # Extract the data rows
    data = []
    for row in rows[1:]:
        cells = row.find_all(["td", "th"])
        if len(cells) < len(fund_names) + 1:
            continue  # Skip incomplete rows

        metric_name = cells[0].get_text(strip=True)
        values = [cell.get_text(strip=True) for cell in cells[1:]]
        data.append([metric_name] + values)

    # Form DataFrame
    df = pd.DataFrame(data, columns=["Metric"] + fund_names)
    df = df.set_index("Metric").transpose().reset_index()
    df = df.rename(columns={"index": "Fund Name"})

    # print(df)
    return df


In [327]:
from bs4 import BeautifulSoup
import pandas as pd

def extract_asset_allocation_table(html: str) -> pd.DataFrame:
    soup = BeautifulSoup(html, "html.parser")

    table = soup.find("table", {"id": "asssetAllocationTabs"})
    if not table:
        raise ValueError("❌ Asset Allocation table not found.")

    # Extract column headers (fund names)
    headers = [th.get_text(strip=True) for th in table.find("thead").find_all("th")[1:]]  # skip first blank column

    # Extract rows
    row_labels = []
    data = []
    for tr in table.find("tbody").find_all("tr"):
        tds = tr.find_all("td")
        row_labels.append(tds[0].get_text(strip=True))
        values = [td.get_text(strip=True) for td in tds[1:]]
        data.append(values)

    # Create DataFrame
    df = pd.DataFrame(data, index=row_labels, columns=headers)
    df = df.transpose().reset_index()
    df.rename(columns={"index": "Fund Name"}, inplace=True)
    # print(df)
    return df.reset_index()


In [328]:
from bs4 import BeautifulSoup
import pandas as pd

def extract_sector_distribution_table(html: str) -> pd.DataFrame:
    soup = BeautifulSoup(html, "html.parser")

    table = soup.find("table", {"id": "sectorDistributionTabs"})
    if not table:
        raise ValueError("❌ Sector Distribution table not found.")

    # Extract headers (fund names)
    headers = [th.get_text(strip=True) for th in table.find("thead").find_all("th")[1:]]  # Skip first "Sectors"

    # Extract rows
    data = []
    row_labels = []

    for tr in table.find("tbody").find_all("tr"):
        tds = tr.find_all("td")
        sector_name = tds[0].get_text(strip=True)
        values = [td.get_text(strip=True) for td in tds[1:]]
        row_labels.append(sector_name)
        data.append(values)

    # Create DataFrame
    df = pd.DataFrame(data, index=row_labels, columns=headers)

    df = df.transpose().reset_index()
    df.rename(columns={"index": "Fund Name"}, inplace=True)

    # print(df)
    return df.reset_index()


In [329]:
from bs4 import BeautifulSoup
import pandas as pd

def extract_fund_holdings_summary_table(html: str) -> pd.DataFrame:
    soup = BeautifulSoup(html, "html.parser")

    table = soup.find("table", {"id": "holdingtables"})
    if not table:
        raise ValueError("❌ Fund Holdings Summary table not found.")

    # Extract fund names from the <thead>
    headers = [th.get_text(strip=True) for th in table.find("thead").find_all("th")[1:]]  # Skip first empty column

    # Extract metric rows from the <tbody>
    data = []
    row_labels = []

    for tr in table.find("tbody").find_all("tr"):
        tds = tr.find_all("td")
        metric_name = tds[0].get_text(strip=True)
        values = [td.get_text(strip=True) for td in tds[1:]]
        row_labels.append(metric_name)
        data.append(values)

    # Create DataFrame
    df = pd.DataFrame(data, index=row_labels, columns=headers)
    df = df.transpose().reset_index()
    df.rename(columns={"index": "Fund Name"}, inplace=True)

    # print(df)
    return df.reset_index(drop=True)


In [330]:
# from langchain_community.chat_models import ChatOllama
# from langchain.schema import SystemMessage, HumanMessage
# import pandas as pd
# import json

# def extract_risk_table(html_snippet: str) -> pd.DataFrame:
#     """
#     Uses LLaMA 3 via Ollama to convert scraped mutual fund HTML/text into a structured DataFrame.

#     Parameters:
#         html_snippet (str): Scraped text or HTML content of mutual fund comparison.

#     Returns:
#         pd.DataFrame: Structured data as a DataFrame.
#     """
#     print("🔍 Extracting mutual fund comparison table using LLaMA 3...")
#     llm = ChatOllama(model="llama3")

#     prompt = f"""
#         Below is raw text scraped from a mutual fund comparison webpage.

#         Please return valid, complete json file :
#         - Strictly return the JSON data without any additional text.
#         - Each row should contain the fund names like Nippon India Large Cap Dir, HDFC Large Cap Dir those availabe on the html. No additional funds.
#         - metrics like Mean (%), Std Dev (%), Sharpe, Sortino, Beta, Alpha should be captured in columns from the html page. No self calculated metrics. 
#         - Use only actual data. Do not use ellipses (...) or placeholders.
#         - No explanations, no markdown, no comments, no print statements. Strictly follow this. Only return the json file.
#         - Do not include Markdown formatting or text like 'Here is the JSON:, Here is the valid, complete JSON file: and all.
#         - Don't generate any data from self. if data not available just appear it as it is.

#         Raw data:
#         ---
#         {html_snippet[:3000]}
#         ---
#         """

#     messages = [
#         SystemMessage(content="You are a helpful data scientist."),
#         HumanMessage(content=prompt)
#     ]

#     response = llm.invoke(messages)

#     try:
#         data = json.loads(response.content.strip())
#         df = pd.DataFrame(data)
#         print(df)
#         return df
#     except Exception as e:
#         raise RuntimeError(f"❌ Failed to parse JSON from LLaMA output.\nError: {e}\nOutput:\n{response.content}")

In [332]:
import time
import os
import undetected_chromedriver as uc
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Assumes select_fund and all extract_*_table functions are already defined

def compare_and_extract_funds(funds_to_search):
    options = uc.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--start-maximized")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/117.0.0.0 Safari/537.36")
    prefs = {
        "credentials_enable_service": False,
        "profile.password_manager_enabled": False
    }
    options.add_experimental_option("prefs", prefs)

    driver = uc.Chrome(options=options)
    wait = WebDriverWait(driver, 20)

    try:
        # 🔐 LOGIN
        driver.get("https://www.valueresearchonline.com/login")
        wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'Log in with password')]"))).click()
        wait.until(EC.presence_of_element_located((By.ID, "username"))).send_keys("taswinikumar69@gmail.com")
        wait.until(EC.element_to_be_clickable((By.ID, "proceed-btn"))).click()
        time.sleep(1)
        wait.until(EC.presence_of_element_located((By.ID, "login_password"))).send_keys("Value*0321")
        wait.until(EC.element_to_be_clickable((By.ID, "login-btn"))).click()
        wait.until(EC.presence_of_element_located((By.ID, "navbarDropdown-my-investment")))
        print("✅ Logged in successfully")

        # 🔄 FUND COMPARE
        driver.get("https://www.valueresearchonline.com/funds/fund-compare/")
        wait.until(EC.presence_of_element_located((By.ID, "navbarDropdown-my-investment")))

        for idx, fund in enumerate(funds_to_search[:5]):
            select_fund(driver, wait, fund, index=idx)
            time.sleep(1)

        try:
            compare_btn = wait.until(EC.presence_of_element_located((By.ID, "compare_fund")))
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", compare_btn)
            time.sleep(1)
            compare_btn.click()
            print("✅ Clicked 'Compare These Funds'")
        except Exception as e:
            driver.execute_script("arguments[0].click();", compare_btn)

        time.sleep(4)
        os.makedirs("data", exist_ok=True)

        # 📊 BASIC & RETURN TABLES
        html = driver.page_source
        mf_basics = extract_comparison_table_basics(html)
        mf_basics.to_csv("data/mf_basics.csv", index=False)
        print("✅ Basic table saved")

        mf_return = extract_trailing_returns_table(html)
        mf_return.to_csv("data/mf_return.csv", index=False)
        print("✅ Return table saved")

        # 📉 RISK RATIOS
        driver.execute_script("document.querySelector(\"a[href='#riskRatiosTab']\").click()")
        wait.until(EC.presence_of_element_located((By.XPATH, "//table[@id='riskRatiosTabs']//tr[td]")))
        mf_risk = extract_risk_ratios_table(driver.page_source)
        mf_risk.to_csv("data/mf_risk.csv", index=False)
        print("✅ Risk Ratio table saved")

        # 📊 ASSET ALLOCATION
        driver.execute_script("document.querySelector(\"a[href='#asssetAllocationTab']\").click()")
        wait.until(EC.presence_of_element_located((By.XPATH, "//table[@id='asssetAllocationTabs']//tr[td]")))
        mf_asset = extract_asset_allocation_table(driver.page_source)
        mf_asset.to_csv("data/mf_asset_allocation.csv", index=False)
        print("✅ Asset Allocation table saved")

        # 📊 SECTOR DISTRIBUTION
        driver.execute_script("document.querySelector(\"a[href='#sectorDistributionTab']\").click()")
        wait.until(EC.presence_of_element_located((By.XPATH, "//table[@id='sectorDistributionTabs']//tr[td]")))
        df_sector = extract_sector_distribution_table(driver.page_source)
        df_sector.to_csv("data/mf_sector_distribution.csv", index=False)
        print("✅ Sector Distribution table saved")

        # 📋 HOLDINGS
        driver.execute_script("document.querySelector(\"a[href='#holdingsTab']\").click()")
        wait.until(EC.presence_of_element_located((By.XPATH, "//table[@id='holdingtables']//tr[td]")))
        df_holdings = extract_fund_holdings_summary_table(driver.page_source)
        df_holdings.to_csv("data/mf_holdings.csv", index=False)
        print("✅ Holdings table saved")

        # ✅ Merge all tables on 'Fund Name'
        for df in [mf_basics, mf_return, mf_risk, mf_asset, df_sector, df_holdings]:
            df.columns = df.columns.str.strip()  # Clean column names

        merged_df = mf_basics \
            .merge(mf_return, on="Fund Name", how="outer") \
            .merge(mf_risk, on="Fund Name", how="outer") \
            .merge(mf_asset, on="Fund Name", how="outer") \
            .merge(df_sector, on="Fund Name", how="outer") \
            .merge(df_holdings, on="Fund Name", how="outer")

        merged_df.to_csv("data/mf_merged.csv", index=False)
        print("✅ Tables merged and saved successfully")

    except Exception as e:
        print("❌ Error:", e)
    finally:
        driver.quit()


In [333]:
fund_list = [
        "Nippon India Large Cap Fund Dir",
        "HDFC Large Cap Dir"
    ]
compare_and_extract_funds(fund_list)

✅ Logged in successfully
🔍 Searching for: Nippon India Large Cap Fund Dir in #peer-fund-search
❌ No matching suggestion found for: Nippon India Large Cap Fund Dir
🔍 Searching for: HDFC Large Cap Dir in #peer-fund-search2
✅ Selected: 
✅ Basic table saved
✅ Return table saved
✅ Risk Ratio table saved
✅ Asset Allocation table saved
✅ Sector Distribution table saved
✅ Holdings table saved
✅ Tables merged and saved successfully
