In [3]:
pip install webdriver-manager


Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd
import time
from datetime import datetime, timedelta
import os
from bs4 import BeautifulSoup

def get_issuers(driver):
    """Collect issuer codes from the MSE symbol-history page.

    Loads the page in ``driver``, reads the text of every element matched by the
    ``.form-control option`` CSS selector and keeps only the entries whose text
    is purely alphabetic (``str.isalpha``), so empty labels and texts containing
    digits or spaces are left out.

    Args:
        driver: Selenium WebDriver (or compatible object) exposing ``get`` and
            ``find_elements``.

    Returns:
        list[str]: The option texts that passed the filter, in page order.
    """
    driver.get('https://www.mse.mk/en/stats/symbolhistory/REPL')

    codes = []
    for entry in driver.find_elements(By.CSS_SELECTOR, ".form-control option"):
        # Only purely alphabetic labels are treated as issuer codes.
        if entry.text.isalpha():
            codes.append(entry.text)
    return codes

def check_last_date(issuer_code, data_file="stock_market_data123.csv"):
    """Return the first date for which ``issuer_code`` still needs data.

    The CSV at ``data_file`` is scanned for rows whose ``Issuer_Code`` equals
    ``issuer_code``. If any of them carries a parseable ``Date``, the day after
    the latest one is returned. Otherwise (missing/empty file, unreadable file,
    unknown issuer, no parseable date) the result is today minus 3650 days,
    i.e. roughly ten years back.

    Dates are accepted in both ``YYYY-MM-DD`` and ``MM/DD/YYYY`` form, because
    the data file can contain either layout (or a mix of both).

    Args:
        issuer_code: Issuer code to look up in the ``Issuer_Code`` column.
        data_file: Path of the CSV file holding previously collected rows.

    Returns:
        datetime.date: Start date for the next retrieval. Always a plain
        ``date`` so callers can compare it with ``datetime.now().date()``.
    """
    if os.path.exists(data_file) and os.path.getsize(data_file) > 0:
        try:
            # Read dates as text and parse them explicitly; letting read_csv
            # guess breaks as soon as the column mixes two layouts.
            data = pd.read_csv(data_file, dtype={'Date': str})
            issuer_rows = data[data['Issuer_Code'] == issuer_code]

            raw_dates = issuer_rows['Date'].astype(str).str.strip()
            parsed = pd.to_datetime(raw_dates, format='%Y-%m-%d', errors='coerce')
            parsed = parsed.fillna(
                pd.to_datetime(raw_dates, format='%m/%d/%Y', errors='coerce')
            )

            last_date = parsed.max()  # NaT when there is nothing usable
            if not pd.isna(last_date):
                return last_date.date() + timedelta(days=1)  # Next date for data retrieval
        except Exception as e:
            # Best effort: an unreadable file is treated like "no data yet".
            print(f"Error reading data file: {e}")
    return datetime.now().date() - timedelta(days=365*10)  # If no data, go back 10 years

def save_data_to_csv(new_data, data_file="stock_market_data123.csv"):
    """Merge ``new_data`` into the CSV at ``data_file`` and rewrite the file.

    Existing rows (if the file exists and is non-empty) are combined with the
    new rows, duplicates on (``Issuer_Code``, ``Date``) are dropped keeping the
    most recent row, and the result is written back without the index.

    The ``Date`` column of both the existing and the new rows is normalised to
    ``YYYY-MM-DD`` text before merging. Without that, freshly scraped
    ``MM/DD/YYYY`` strings never compare equal to already stored dates, so
    duplicates survive and the file ends up with two date layouts that later
    reads cannot parse. Values that match neither layout are kept verbatim.

    Args:
        new_data: Rows to add - a list of dicts (or anything
            ``pd.DataFrame`` accepts). When empty, nothing is written.
        data_file: Path of the CSV file to update.

    Returns:
        None. Read and write failures are reported via ``print``.
    """
    def _with_iso_dates(frame):
        # Rewrite the Date column as YYYY-MM-DD strings, accepting ISO and MM/DD/YYYY input.
        if 'Date' not in frame.columns:
            return frame
        raw_dates = frame['Date'].astype(str).str.strip()
        parsed = pd.to_datetime(raw_dates, format='%Y-%m-%d', errors='coerce')
        parsed = parsed.fillna(
            pd.to_datetime(raw_dates, format='%m/%d/%Y', errors='coerce')
        )
        iso_dates = parsed.dt.strftime('%Y-%m-%d')
        # Keep the original value wherever neither layout matched.
        return frame.assign(Date=iso_dates.where(parsed.notna(), frame['Date']))

    new_frame = pd.DataFrame(new_data)
    if new_frame.empty:
        # Nothing scraped: do not create or rewrite the file for no reason.
        print(f"No new data to save to {data_file}")
        return
    new_frame = _with_iso_dates(new_frame)

    combined_data = new_frame
    if os.path.exists(data_file) and os.path.getsize(data_file) > 0:
        try:
            # Dates are read as text so the normalisation above decides how to parse them.
            existing_data = _with_iso_dates(pd.read_csv(data_file, dtype={'Date': str}))
            combined_data = pd.concat([existing_data, new_frame], ignore_index=True)
            combined_data = combined_data.drop_duplicates(
                subset=['Issuer_Code', 'Date'], keep='last'
            )
        except Exception as e:
            # Same fallback as before: an unreadable file is replaced by the new rows.
            print(f"Error reading existing data from {data_file}: {e}")
            combined_data = new_frame

    try:
        combined_data.to_csv(data_file, index=False)
        print(f"Data successfully saved to {data_file}")
    except Exception as e:
        print(f"Error saving data to {data_file}: {e}")
        
def fill_missing_data(driver, issuer_codes, data_file="stock_market_data123.csv"):
    """Scrape the missing price history for every issuer and append it to the CSV.

    For each code the start date comes from ``check_last_date`` and the end
    date is today. The symbol-history page is loaded, the issuer is chosen in
    the ``Code`` drop-down and the form is submitted once per calendar year in
    the range, with the first and last year clipped to the start and end date.
    Every result row with at least nine cells is turned into a dict and the
    rows collected for the issuer are passed to ``save_data_to_csv``.

    Issuers whose start date lies after today are skipped without touching the
    browser. Issuers that cannot be selected on the page are skipped as well.

    Args:
        driver: Selenium WebDriver used for all page interaction.
        issuer_codes: Iterable of issuer codes to process.
        data_file: CSV path forwarded to ``check_last_date`` and
            ``save_data_to_csv``.

    Returns:
        None.
    """
    for code in issuer_codes:
        all_data = []
        start_date = check_last_date(code, data_file)
        # The start date may arrive as a datetime subclass (e.g. a pandas
        # Timestamp); reduce it to a plain date so it compares with end_date.
        if isinstance(start_date, datetime):
            start_date = start_date.date()
        end_date = datetime.now().date()

        if start_date > end_date:
            # Already current: avoid a page load plus a guaranteed timeout.
            print(f"Data for {code} is already up to date")
            continue

        driver.get('https://www.mse.mk/en/stats/symbolhistory/REPL')
        wait = WebDriverWait(driver, 5)
        try:
            select_elementCode = wait.until(EC.presence_of_element_located((By.ID, "Code")))
            Select(select_elementCode).select_by_value(code)
        except (TimeoutException, NoSuchElementException):
            # One issuer that cannot be selected must not abort the whole run.
            print(f"Could not select code {code}, skipping")
            continue
        print(f"Processing code: {code}")

        for year in range(start_date.year, end_date.year + 1):
            try:
                date_inputFrom = wait.until(EC.presence_of_element_located((By.ID, "FromDate")))
                date_inputTo = wait.until(EC.presence_of_element_located((By.ID, "ToDate")))
                date_inputFrom.clear()
                date_inputTo.clear()

                # First/last year of the range are clipped to the real start/end date.
                date_inputFrom.send_keys(f"{start_date.strftime('%m/%d/%Y')}" if year == start_date.year else f"01/01/{year}")
                date_inputTo.send_keys(f"{end_date.strftime('%m/%d/%Y')}" if year == end_date.year else f"12/31/{year}")

                button = driver.find_element(By.CLASS_NAME, "btn-primary-sm")
                button.click()

                # until() only returns a truthy value; with no matching rows it
                # raises TimeoutException, which is handled below.
                table_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//table[@id='resultsTable']//tbody//tr")))
                for row in table_rows:
                    cols = row.find_elements(By.TAG_NAME, "td")
                    if len(cols) >= 9:
                        data_dict = {
                            "Issuer_Code": code,
                            "Date": cols[0].text.strip(),
                            "Last_Price": cols[1].text.strip(),
                            "Max_Price": cols[2].text.strip(),
                            "Min_Price": cols[3].text.strip(),
                            "Average_Price": cols[4].text.strip(),
                            "Percent": cols[5].text.strip(),
                            "Quantity": cols[6].text.strip(),
                            "Traffic": cols[7].text.strip(),
                            "Sum_Traffic": cols[8].text.strip()
                        }
                        all_data.append(data_dict)
                print(f"Data found for {code} in year {year}")
            except (TimeoutException, NoSuchElementException):
                print(f"No data for code {code} in year {year}")
                continue

        save_data_to_csv(all_data, data_file)
        print(f"Data for {code} saved to {data_file}")

# Browser options must be complete BEFORE the driver is created: arguments
# added to the Options object afterwards are never passed to Chrome.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-extensions")
# NOTE(review): "--disable-images" may not be a switch Chrome recognises;
# confirm, or switch to a supported way of disabling image loading.
chrome_options.add_argument("--disable-images")  # To avoid loading images

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

start_time = time.time()

try:
    issuer_codes = get_issuers(driver)
    fill_missing_data(driver, issuer_codes)
finally:
    # Always release the browser, even when scraping fails part-way.
    driver.quit()

execution_time = time.time() - start_time
print(f"Execution Time: {execution_time:.2f} seconds")
print("Data collection complete.")