In [4]:
import sys, os
import hashlib
import tempfile
import subprocess

def get_hash(file):
  
    BUF_SIZE = 65536
    sha256 = hashlib.sha256()
  
    with open(file, 'rb') as f:
        while True:
            data = f.read(BUF_SIZE)
            if not data:
                break
            sha256.update(data)
  
    return sha256.hexdigest()

def is_file_modified(filename, file_path, url):
    # Create a temporary file path with .xlsx suffix which will automatically get deleted once operation is completed
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file_path = os.path.join(temp_dir, "temp.xlsx")
        command = f"wget -O {temp_file_path} {url}"
        subprocess.run(command, shell=True, check=True)
        f1=get_hash(temp_file_path)

    f2=get_hash(file_path)

    if f1==f2:
        print(f"File '{filename}' hasn't been modified. Skipping replacing")
        return False
    else:
        print(f"File '{filename}' has been modified")
        return True
        command = f"wget -O '{file_path}' '{url}'"  # wget command to download and save with specific filename
        os.system(command)  # Execute the command
        print(f"Re-Downloaded and saved: {filename} from {url}")

def nepali_to_english_number(nepali_str):
    nepali_num_map = str.maketrans('०१२३४५६७८९', '0123456789')
    return nepali_str.translate(nepali_num_map)



In [11]:
from pathlib import Path
import uuid
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import os, tempfile, subprocess, time
from Levenshtein import jaro

# Set up Chrome options for headless mode (if you want it headless)
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options) # Initialize WebDriver with options

# Function to check if the target title is uploaded to IDMS Portal since last run
def check_title_and_modified_date(url):
    '''
    Function to check url: https://data.<IDMS municipaltiy>.gov.np/datasets?sort=metadata_modified+desc 
    Checks:
        1. Last Modified: <Date>
        2. If Last Modified > Defined Condition and Title is in [List of Titles to Scrape] Then:
            Returns url_lists to follow the link where we can find csv, excel files in the url_lists
    '''
    try:
        # Request the page content
        response = requests.get(url+'/datasets?formats=XLSX&sort=title_string+desc')
        response.raise_for_status()  # Check if the request was successful
        soup = BeautifulSoup(response.text, "html.parser")

        # Initialize a list to store datasets
        url_list=[]

        # # Example: Find all dataset entries (modify selectors based on actual structure)
        dataset_items = soup.find_all("div", class_="datasetList_datasetCard__Wg_Mx")  # Adjust based on actual HTML structure
        for item in dataset_items:
            link = item.find("a")

            a_href = link.get("href") # We got href
            a_text = link.get_text(strip=True) # We got title of href

            # Find Modified Date
            for p in item.find_all("p"):
                if "Last Modified" in p.text:
                    text=p.text
            # Use regex to extract the date part only
            match = re.search(r'\b[A-Za-z]{3} \d{2} \d{4}\b', text)
            date_str = match.group()  # This will print only the date part
            # Convert date string to datetime object
            modified_date = datetime.strptime(date_str, "%b %d %Y")

            url_list.append([url+a_href, a_text, modified_date])

        return url_list
            # break
            # if a_text=='२०७८ को जनगणना अनुसार तुलसीपुर  उप–महानगरपालिकाको विस्तृत जनसांख्यिक विवरण' and modified_date<datetime.now():
            #     return (a_href, a_text, modified_date) 

    except requests.RequestException as e:
        print(f"Error fetching data from {url}: {e}")
        return None

# Function to get csv data from the url if there has been files uploaded to the url found by check_title_and_modified_date function
def get_csv_links(a_href, a_text, modified_date):

    csv_links=[]

    # Navigate to the page
    driver.get(a_href)  # Replace with the actual URL

    # Find the button containing the span with the text "API" and click it
    try:
        api_buttons = driver.find_elements(By.XPATH, "//button[.//span[text()='API']]")

        for api_button in api_buttons:
            api_button.click()
            print("Clicked the API button successfully.")

            # Wait for the modal or API link to appear
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.CLASS_NAME, "api_apiEndpoint__wY3N_"))
            )
            
            # Locate the <a> tag within the modal and get the href attribute
            api_link = driver.find_element(By.XPATH, "//div[@class='api_apiEndpoint__wY3N_']//a")
            link_url = api_link.get_attribute("href")
            print("API Link URL:", link_url)
            csv_links.append(link_url)
            # Locate and click the close button to close the modal
            close_button = driver.find_element(By.XPATH, "//button[@aria-label='Close']")
            close_button.click()
            print("Closed the modal.")

            # Add a brief pause if needed
            WebDriverWait(driver, 1)
        
        return csv_links
    except Exception as e:
        print("Error clicking the API button:", e)

def get_excel_links(a_href, a_text, modified_date):

    '''
    Function the depends upon check_title_and_modified_date() function 
    Finds:
        1. Finds Preview button for Excel files only (Excludes CSVs, PDFs) and it's title 
        2. Returns Dictionary with mapping
            {
                'name of file': 'link to the file',
                'स्वास्थ्य कर्मचारी विवरण (२०८०-०८१)': 'https://dms.lekbeshimun.gov.np/dataset/e4a37dfb-f978-46b4-b402-9cdacc08e972/resource/cd7a5513-ee59-48cc-8655-a07315b366e3/download/-copy-2.xlsx', 
                'स्वास्थ्य कर्मचारी विवरण (२०७८-०७९)': 'https://dms.lekbeshimun.gov.np/dataset/e4a37dfb-f978-46b4-b402-9cdacc08e972/resource/8e988fbe-5e0e-4529-ae5e-9ec2a30034ae/download/-33.xlsx'
            }
    '''

    excel_title_links={}
    
    # response = requests.get(a_href)
    # response.raise_for_status()  # Check if the request was successful
    # soup = BeautifulSoup(response.text, "html.parser")

    # print(soup.find("div",class_="ant-row Resource_files__tBhWR"))
    # return
    # Navigate to the page
    driver.get(a_href)  # Replace with the actual URL

    try:
        h4 = driver.find_elements(
            By.XPATH,
            "//div[contains(@class, 'Resource_files__tBhWR')]" +
            "[div[contains(@class, 'ant-col ant-col-xs-4 ant-col-lg-2') and .//*[name()='svg' and @fill='purple']]]" +
            "//h4"
        )

        preview_excel_buttons = driver.find_elements(
            By.XPATH,
            "//div[contains(@class, 'Resource_files__tBhWR')]" +
            "[div[contains(@class, 'ant-col ant-col-xs-4 ant-col-lg-2') and .//*[name()='svg' and @fill='purple']]]" +
            "//button[div[contains(@class, 'Resource_btndiv__uJ7Uo')]//span[text()='Preview']]"
        )

        print("h4==previewbutton",len(h4)==len(preview_excel_buttons))

        # print(h4,h4[0].text)
        # print(preview_excel_buttons)
        for i, preview_excel_button in enumerate(preview_excel_buttons):
            driver.maximize_window()
            try:
                # print(preview_excel_button)
                preview_excel_button.click()
                # print("Clicked the API button successfully.")
                
                # Wait for the modal or API link to appear
                WebDriverWait(driver, 20).until(
                    EC.visibility_of_element_located((By.CLASS_NAME, "xlsx_container__5SzzP"))
                )
                # time.sleep(1)
                # Locate the iframe and retrieve the src attribute
                iframe = driver.find_element(By.XPATH, "//div[@class='xlsx_container__5SzzP']//iframe[@title='PDF Preview']")
                iframe_src = iframe.get_attribute("src")
                # print(f"Iframe src link: {iframe_src} and title: {h4[i].text}")
                excel_title_links[h4[i].get_attribute("innerText")]=iframe_src.split("src=")[-1]

                # driver.save_screenshot(f"debug_screenshot{i}{str(uuid.uuid4())}.png")
                # Wait for either of the buttons to be present
                close_button = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, "//button[contains(@class, 'preview_close_btn__2sBmv') or contains(@class, 'xlsx_close_btn__MKD_c') or contains(@class,'preview_modal__footer__button__Mo1ZZ')]"))
                )

                # Resize the button with JavaScript
                driver.execute_script("""
                    arguments[0].style.width = '100px';
                    arguments[0].style.height = '100px';
                    arguments[0].style.fontSize = '100px';  // Optional: Make the text larger if needed
                """, close_button)

                # Fallback JavaScript click
                driver.execute_script("arguments[0].click();", close_button)
                # driver.save_screenshot(f"debug_screenshot{i}{str(uuid.uuid4())}.png")
                # WebDriverWait(driver, timeout=5).until(lambda x: x.find_element_by_xpath("//button[@class='xlsx_close_btn__MKD_c']"))
                # Locate and click the element
                # close_button = WebDriverWait(driver, 20).until(
                #     EC.element_to_be_clickable((By.XPATH, "//button[@class='xlsx_close_btn__MKD_c']"))
                # )

                # ActionChains(driver).click(close_button).perform()

                # Use JavaScript to click the button
                # close_button = WebDriverWait(driver, 10).until(
                #     EC.presence_of_element_located((By.XPATH, "//button[@class='xlsx_close_btn__MKD_c']"))
                # )
                # driver.execute_script("arguments[0].click();", close_button)

                # close_button = WebDriverWait(driver, 20).until(
                #     EC.element_to_be_clickable((By.XPATH, "//button[@class='xlsx_close_btn__MKD_c']"))
                # )

                # # time.sleep(1)
                # driver.execute_script("arguments[0].scrollIntoView();", close_button)/sv
                # driver.execute_script("arguments[0].click();", close_button)

                # close_button = driver.find_element(By.XPATH, "//button[@class='xlsx_close_btn__MKD_c']/svg") # preview_close_btn__2sBmv
                # close_button.click()
                # print("Closed the modal.")

                # Add a brief pause if needed
                WebDriverWait(driver, 1)
            
            except Exception as e:
                driver.save_screenshot(f"debug_screenshot{str(uuid.uuid4())}.png")
                print(e)
                # continue
        return excel_title_links
    except Exception as e:
        print(e)
        
    
# def save_file_to_location(excel_title_links, city):
#     path="/home/oem/wiseyak/abhi/IDMS-backend/crawler/tmp"

#     for key, url in excel_title_links.items():
#         filename = f"{key}.xlsx"  # Set filename with .xlsx extension
#         file_path = os.path.join(path, filename)  # Full path to save the file
        
#         # Check if the file already exists
#         if os.path.exists(file_path):
            
#             print(f"File '{filename}' already exists.")
#             file_modified=is_file_modified(filename=filename,file_path=file_path, url=url)

#             if file_modified:
#                 command = f"wget -O '{file_path}' '{url}'"  # wget command to download and save with specific filename
#                 os.system(command)  # Execute the command
#         else:
#             command = f"wget -O '{file_path}' '{url}'"  # wget command to download and save with specific filename
#             os.system(command)  # Execute the command
#             print(f"Downloaded and saved: {filename} from {url}")

def save_file_to_location(excel_title_links, city):
    filename_mappings = {
        'local_activities': ['गत आ.व. ०८०।०८१ को मुख्या क्रियाकलाप अनुसार', 'चालु  आ.व. ०८१।०८२ को मुख्या क्रियाकलाप अनुसार'],
        'quadrimester_expense': ['चालु आ.व. ०८१।०८२ को चौमासिक खर्च विवरण', 'गत आ.व. ०८०।०८१ को चौमासिक खर्च विवरण'],
        'health_employee_details': ['स्वास्थ्य कर्मचारी विवरण (२०७८-०७९)'],
        '': []
    }

    base_path = f"/home/oem/wiseyak/abhi/IDMS-backend/data/{city}"

    for key, url in excel_title_links.items():
        topic = None
        folder_name = None

        for category, titles in filename_mappings.items():
            for title in titles:
                if jaro(key, title) > 0.9:
                    topic = category
                    filename_portion = nepali_to_english_number(key.split(' ')[3]).replace('(', '').replace(')', '')
                    parts = filename_portion.split('-')
                    folder_name = f"{parts[0]}-{parts[1][1:]}"  # Add '2' and clean
                    folder_name = folder_name.replace(".xlsx", "")
                    break

            if topic:
                break

        # Default handling if no match
        if not topic or not folder_name:
            print(f"Could not determine topic or folder name for {key}. Skipping.")
            continue

        folder_path = f"{base_path}/{topic}/{folder_name}"
        file_path = f"{folder_path}/{key}.xlsx"

        # Create folder structure
        Path(folder_path).mkdir(parents=True, exist_ok=True)

        # Check if the file already exists
        if os.path.exists(file_path):
            print(f"File '{file_path}' already exists.")
            if is_file_modified(filename=f"{key}.xlsx", file_path=file_path, url=url):
                command = f"wget -O '{file_path}' '{url}'"
                os.system(command)
                print(f"File '{file_path}' updated.")
        else:
            command = f"wget -O '{file_path}' '{url}'"
            os.system(command)
            print(f"Downloaded and saved: '{file_path}' from {url}.")

if __name__=='__main__':
    # define urls
    urls={
        'https://data.lekbeshimun.gov.np':'lekbeshi',
        'https://data.tulsipurmun.gov.np':'tulsipur',
        'https://data.birgunjmun.gov.np':'birgunj'
    }

    # urls=[
    #     'https://data.lekbeshimun.gov.np',
    #     'https://data.tulsipurmun.gov.np',
    #     'https://data.birgunjmun.gov.np'
    #     ]
    
    for url, city in urls.items():
        url_list = check_title_and_modified_date(url)

        for url in url_list:
            a_href, a_text, modified_date=url
            print(a_href, a_text, modified_date)
            excel_title_links=get_excel_links(a_href,a_text,modified_date)
            print(excel_title_links)
            save_file_to_location(excel_title_links, city)
            
            # csv_links=get_csv_links(a_href, a_text, modified_date)
            # print(csv_links)

            time.sleep(1)
        time.sleep(1)

    # Close the WebDriver
    driver.quit()

https://data.lekbeshimun.gov.np/datasets/health-dpt-data स्वास्थ्य कर्मचारी विवरण (२०७८-७९) (२०८०-०८१) 2024-10-30 00:00:00
h4==previewbutton True
{'स्वास्थ्य कर्मचारी विवरण (२०८०-०८१)': 'https://dms.lekbeshimun.gov.np/dataset/e4a37dfb-f978-46b4-b402-9cdacc08e972/resource/cd7a5513-ee59-48cc-8655-a07315b366e3/download/-copy-2.xlsx', 'स्वास्थ्य कर्मचारी विवरण (२०७८-०७९)': 'https://dms.lekbeshimun.gov.np/dataset/e4a37dfb-f978-46b4-b402-9cdacc08e972/resource/8e988fbe-5e0e-4529-ae5e-9ec2a30034ae/download/-33.xlsx'}
Downloaded and saved: '/home/oem/wiseyak/abhi/IDMS-backend/data/lekbeshi/health_employee_details/2080-81/स्वास्थ्य कर्मचारी विवरण (२०८०-०८१).xlsx' from https://dms.lekbeshimun.gov.np/dataset/e4a37dfb-f978-46b4-b402-9cdacc08e972/resource/cd7a5513-ee59-48cc-8655-a07315b366e3/download/-copy-2.xlsx.
Downloaded and saved: '/home/oem/wiseyak/abhi/IDMS-backend/data/lekbeshi/health_employee_details/2078-79/स्वास्थ्य कर्मचारी विवरण (२०७८-०७९).xlsx' from https://dms.lekbeshimun.gov.np/datas

--2024-11-15 13:38:01--  https://dms.lekbeshimun.gov.np/dataset/e4a37dfb-f978-46b4-b402-9cdacc08e972/resource/cd7a5513-ee59-48cc-8655-a07315b366e3/download/-copy-2.xlsx
Resolving dms.lekbeshimun.gov.np (dms.lekbeshimun.gov.np)... 103.175.192.149
Connecting to dms.lekbeshimun.gov.np (dms.lekbeshimun.gov.np)|103.175.192.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24444 (24K) [application/vnd.openxmlformats-officedocument.spreadsheetml.sheet]
Saving to: ‘/home/oem/wiseyak/abhi/IDMS-backend/data/lekbeshi/health_employee_details/2080-81/स्वास्थ्य कर्मचारी विवरण (२०८०-०८१).xlsx’

     0K .......... .......... ...                             100% 76.9M=0s

2024-11-15 13:38:02 (76.9 MB/s) - ‘/home/oem/wiseyak/abhi/IDMS-backend/data/lekbeshi/health_employee_details/2080-81/स्वास्थ्य कर्मचारी विवरण (२०८०-०८१).xlsx’ saved [24444/24444]

--2024-11-15 13:38:02--  https://dms.lekbeshimun.gov.np/dataset/e4a37dfb-f978-46b4-b402-9cdacc08e972/resource/8e988fbe-5e0e-4529

https://data.lekbeshimun.gov.np/datasets/administration-data लेकवेशी कर्मचारी विवरण २०७९ 2023-09-13 00:00:00
h4==previewbutton True
{'लेकवेशी कर्मचारी विवरण २०७९.xlsx': 'https://dms.lekbeshimun.gov.np/dataset/c8784466-9ae2-496e-a8a3-d17dd6fd8128/resource/0aa4f7c4-bfa3-4289-a8e1-9373a2fe3f34/download/-.xlsx'}
Could not determine topic or folder name for लेकवेशी कर्मचारी विवरण २०७९.xlsx. Skipping.
https://data.tulsipurmun.gov.np/datasets/wardwise-disability-id-card-data Wardwise Disability ID Card Data 2024-07-19 00:00:00
h4==previewbutton True
{'Disability ID Card Report upto fiscal year 2080.81.xlsx': 'https://dms.tulsipurmun.gov.np/dataset/bbbd7b66-2d8e-4bfd-b9ed-d2f7679cf599/resource/3f95728f-916d-417d-9557-e67abdbd7110/download/disability-id-card-report-upto-fiscal-year-2080.81.xlsx'}
Could not determine topic or folder name for Disability ID Card Report upto fiscal year 2080.81.xlsx. Skipping.


KeyboardInterrupt: 

In [None]:
from Levenshtein import jaro

filename_mappings={
    'local_activities':['गत आ.व. ०८०।०८१ को मुख्या क्रियाकलाप अनुसार','चालु  आ.व. ०८१।०८२ को मुख्या क्रियाकलाप अनुसार'],
    'quadrimester_expense':['चालु आ.व. ०८१।०८२ को चौमासिक खर्च विवरण','गत आ.व. ०८०।०८१ को चौमासिक खर्च विवरण'],
    'health_employee_details':['स्वास्थ्य कर्मचारी विवरण (२०७८-०७९)'],
    '':[]
}

filename='स्वास्थ्य कर्मचारी विवरण (२०८०-०८१)'

for k, v in filename_mappings.items():
    for i in v:
        if jaro(filename,i)>0.9:
            print("filename Belongs to", k)
            filename=nepali_to_english_number(i.split(' ')[3]).replace('(','').replace(')','')

            # Add '2' at the first position and after '-'
            parts = filename.split('-')
            folder_name = f"{parts[0]}-{parts[1][1:]}"  # Add '2' at the first part and remove the first character of the second part
            print("To folder", folder_name)

Title Belongs to health_employee_details
To folder 2078-79


In [None]:
def save_file_to_location(excel_title_links, city):
    path=f"/home/oem/wiseyak/abhi/IDMS-backend/data/{city}"

    for key, url in excel_title_links.items():
        filename = f"{key}.xlsx"  # Set filename with .xlsx extension

        for k, v in filename_mappings.items():
            for i in v:
                if jaro(filename,i)>0.9:
                    print("filename Belongs to", k)
                    filename=nepali_to_english_number(i.split(' ')[3]).replace('(','').replace(')','')

                    # Add '2' at the first position and after '-'
                    parts = filename.split('-')
                    folder_name = f"{parts[0]}-{parts[1][1:]}"  # Add '2' at the first part and remove the first character of the second part
                    print("To folder", folder_name)

                    file_path = os.path.join(f"path/folder_name", filename)  # Full path to save the file
                    
                    # Check if the file already exists
                    if os.path.exists(file_path):
                        
                        print(f"File '{filename}' already exists.")
                        file_modified=is_file_modified(filename=filename,file_path=file_path, url=url)

                        if file_modified:
                            command = f"wget -O '{file_path}' '{url}'"  # wget command to download and save with specific filename
                            os.system(command)  # Execute the command
                    else:
                        command = f"wget -O '{file_path}' '{url}'"  # wget command to download and save with specific filename
                        os.system(command)  # Execute the command
                        print(f"Downloaded and saved: {filename} from {url}")

# Tests

In [115]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Set up Chrome options for headless mode (if you want it headless)
chrome_options = Options()
chrome_options.add_argument("--headless")

# Initialize WebDriver with options
driver = webdriver.Chrome(options=chrome_options)

# Navigate to the page
driver.get("https://data.tulsipurmun.gov.np/datasets/tulsipur")  # Replace with the actual URL

# Find the button containing the span with the text "API" and click it
try:
    # Locate the button by searching for a span with text "API"
    api_buttons = driver.find_elements(By.XPATH, "//button[.//span[text()='API']]")

    for api_button in api_buttons:
        api_button.click()
        print("Clicked the API button successfully.")

        # Wait for the modal or API link to appear
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.CLASS_NAME, "api_apiEndpoint__wY3N_"))
        )
        
        # Locate the <a> tag within the modal and get the href attribute
        api_link = driver.find_element(By.XPATH, "//div[@class='api_apiEndpoint__wY3N_']//a")
        link_url = api_link.get_attribute("href")
        print("API Link URL:", link_url)

        # Locate and click the close button to close the modal
        close_button = driver.find_element(By.XPATH, "//button[@aria-label='Close']")
        close_button.click()
        print("Closed the modal.")

        # Add a brief pause if needed
        WebDriverWait(driver, 1)
        
except Exception as e:
    print("Error clicking the API button:", e)

# Close the WebDriver
driver.quit()

Clicked the API button successfully.
API Link URL: https://data.tulsipurmun.gov.np/api/v1/data_search?id=ea59a17f-0e43-4073-80cc-397d61047f5c&sort=_id%20asc
Closed the modal.
Clicked the API button successfully.
API Link URL: https://data.tulsipurmun.gov.np/api/v1/data_search?id=e3e8fd44-2e8e-4396-a291-3be6cbf7116f&sort=_id%20asc
Closed the modal.
Clicked the API button successfully.
API Link URL: https://data.tulsipurmun.gov.np/api/v1/data_search?id=a761c6cf-ecf3-4fe6-94df-501c7cc11591&sort=_id%20asc
Closed the modal.
Clicked the API button successfully.
API Link URL: https://data.tulsipurmun.gov.np/api/v1/data_search?id=2055b7fb-f8e7-423e-a8ae-afc00e2c3ea1&sort=_id%20asc
Closed the modal.
Clicked the API button successfully.
API Link URL: https://data.tulsipurmun.gov.np/api/v1/data_search?id=ac10776f-b420-45e5-a52c-b9813127890e&sort=_id%20asc
Closed the modal.
Clicked the API button successfully.
API Link URL: https://data.tulsipurmun.gov.np/api/v1/data_search?id=c4835460-50d5-4f7d-a

# Cosine Similarity

In [4]:
from Levenshtein import jaro

jaro("hello world!", "Hello worlds")

0.888888888888889

In [14]:
# jaro("०७९–०८० तारजाली वितरण","०८०–०८१ तारजाली वितरण")

jaro("Number of students in Private schools","Number of Students in Public Schools")


0.8147147147147148

In [27]:
jaro('चालु आ.व. ०८१।०८२ को मुख्या क्रियाकलाप अनुसार','गत आ.व. ०८०।०८१ को मुख्या क्रियाकलाप अनुसार') 

0.7897071490094746

# Test cases for topics

In [None]:
from Levenshtein import jaro

excel_title_links={
                'स्वास्थ्य कर्मचारी विवरण (२०७८-०७९)': 'https://example.com/',
                'गत आ.व. ०८०।०८१ को मुख्या क्रियाकलाप अनुसार': 'https://dms.lekbeshimun.gov.np/dataset/e4a37dfb-f978-46b4-b402-9cdacc08e972/resource/cd7a5513-ee59-48cc-8655-a07315b366e3/download/-copy-2.xlsx', 
                'चालु आ.व. ०८१।०८२ को चौमासिक खर्च विवरण': 'https://dms.lekbeshimun.gov.np/dataset/e4a37dfb-f978-46b4-b402-9cdacc08e972/resource/8e988fbe-5e0e-4529-ae5e-9ec2a30034ae/download/-33.xlsx'
}

filename_mappings = {
        'local_activities': ['गत आ.व. ०८०।०८१ को मुख्या क्रियाकलाप अनुसार', 'चालु  आ.व. ०८१।०८२ को मुख्या क्रियाकलाप अनुसार'],
        'quadrimester_expense': ['चालु आ.व. ०८१।०८२ को चौमासिक खर्च विवरण', 'गत आ.व. ०८०।०८१ को चौमासिक खर्च विवरण'],
        'health_employee_details': ['स्वास्थ्य कर्मचारी विवरण (२०७८-०७९)'],
        '': []
}

base_path = f"/home/oem/wiseyak/abhi/IDMS-backend/data/lekbeshi"

for key, url in excel_title_links.items():
    topic = None
    folder_name = None

    for category, titles in filename_mappings.items():
        for title in titles:
            if jaro(key, title) > 0.9:
                topic = category

                if topic =='health_employee_details':
                    filename_portion = nepali_to_english_number(key.split(' ')[3]).replace('(', '').replace(')', '')
                    parts = filename_portion.split('-')
                    folder_name = f"{parts[0]}-{parts[1][1:]}"  # Add '2' and clean
                    folder_name = folder_name.replace(".xlsx", "")
                    break

                elif topic in ['quadrimester_expense', 'local_activities']:
                    filename_portion = nepali_to_english_number(key.split(' ')[2]).replace('।','-')
                    parts = filename_portion.split('-')
                    folder_name = f"2{parts[0]}-{parts[1][1:]}"  # Add '2' and clean
                    break

        if topic:
            break

    # Default handling if no match
    if not topic or not folder_name:
        print(f"Could not determine topic or folder name for {key}. Skipping.")
        continue

    folder_path = f"{base_path}/{topic}/{folder_name}"
    file_path = f"{folder_path}/{key}.xlsx"

    print(f"folder_path: {folder_path}")
    print(f"file_path: {file_path}")

    # # Create folder structure
    # Path(folder_path).mkdir(parents=True, exist_ok=True)

    # # Check if the file already exists
    # if os.path.exists(file_path):
    #     print(f"File '{file_path}' already exists.")
    #     if is_file_modified(filename=f"{key}.xlsx", file_path=file_path, url=url):
    #         command = f"wget -O '{file_path}' '{url}'"
    #         os.system(command)
    #         print(f"File '{file_path}' updated.")
    # else:
    #     command = f"wget -O '{file_path}' '{url}'"
    #     os.system(command)
    #     print(f"Downloaded and saved: '{file_path}' from {url}.")

folder_path: /home/oem/wiseyak/abhi/IDMS-backend/data/lekbeshi/health_employee_details/2078-79
file_path: /home/oem/wiseyak/abhi/IDMS-backend/data/lekbeshi/health_employee_details/2078-79/स्वास्थ्य कर्मचारी विवरण (२०७८-०७९).xlsx
folder_path: /home/oem/wiseyak/abhi/IDMS-backend/data/lekbeshi/local_activities/2080-81
file_path: /home/oem/wiseyak/abhi/IDMS-backend/data/lekbeshi/local_activities/2080-81/गत आ.व. ०८०।०८१ को मुख्या क्रियाकलाप अनुसार.xlsx
folder_path: /home/oem/wiseyak/abhi/IDMS-backend/data/lekbeshi/quadrimester_expense/2081-82
file_path: /home/oem/wiseyak/abhi/IDMS-backend/data/lekbeshi/quadrimester_expense/2081-82/चालु आ.व. ०८१।०८२ को चौमासिक खर्च विवरण.xlsx
