# Setup
Configuration, Logger, Counter and Downloader

In [None]:
import os
import logging

import numpy as np
import pandas as pd

from core.config import Config, split_in_chunks
from data.download import LSEGDataDownloader

# Export the configuration directory through RD_LIB_CONFIG_PATH.
# NOTE(review): presumably read by the LSEG data library used by
# LSEGDataDownloader; it is set after the imports above, so confirm the
# library does not read it at import time.
os.environ["RD_LIB_CONFIG_PATH"] = "/Configuration"

# Project configuration object; every later cell reads its settings from it.
config = Config()

# Layout of one log record: timestamp, left-aligned level name, message.
LOG_FORMAT = "%(asctime)s %(levelname)-8s %(message)s"
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"

# Route root-logger output to the file and level chosen in the configuration.
logging.basicConfig(
    filename=config.log_file,
    encoding="utf-8",
    level=config.log_level,
    format=LOG_FORMAT,
    datefmt=LOG_DATE_FORMAT,
)

# Root logger, shared by the remaining cells.
logger = logging.getLogger()

## Downloading time-series data

In [None]:
from core.exceptions import DataValidationError, DataDownloadError

# Download the historic (time series) data chunk by chunk and write one CSV
# per company into config.filtered_historic_dir.

# Identifiers recorded in the removed-companies file (one per line). Blank
# lines are skipped: the except-handler below appends entries with a leading
# newline, which can leave empty lines in the file.
with open(config.removed_companies_file, "r") as file:
    removed_companies: list[str] = [line.strip() for line in file if line.strip()]

# Exclude previously removed companies. The membership test must be done per
# company; testing the whole list against config.companies is always true and
# filters nothing.
removed_lookup: set[str] = set(removed_companies)
companies: list[str] = [company for company in config.companies if company not in removed_lookup]
companies_chunks: list[list[str]] = split_in_chunks(
    companies,
    chunk_size=config.companies_chunk_size_historic,
    chunk_limit=config.chunk_limit
)

# NOTE(review): the first 11 chunks are skipped, presumably to resume an
# earlier, interrupted run. The chunk index `i` passed to
# standardize_historic_data therefore starts at 0 for chunk 11 - confirm this
# is intended, and reset the offset to 0 for a full download.
resume_from_chunk: int = 11
# Upper bound on retries per chunk; each retry drops one rejected company.
max_attempts: int = 100

with LSEGDataDownloader(config) as downloader:
    for i, company_chunk in enumerate(companies_chunks[resume_from_chunk:]):
        for _attempt in range(max_attempts):
            # A chunk can become empty once every company in it was rejected;
            # indexing it below would raise an uncaught IndexError.
            if not company_chunk:
                print("Skipping chunk: no companies left to download.")
                break
            try:
                print(f"Downloading historic data: for {company_chunk[0]} to {company_chunk[-1]}")
                data: pd.DataFrame = downloader.download_historic_from(company_chunk, config.historic_features)
                standardized_histordict: dict[str, pd.DataFrame] = downloader.standardize_historic_data(data, i)
                # Release the raw frame before writing the per-company files.
                del data
                for key, new_df in standardized_histordict.items():
                    new_df.to_csv(config.filtered_historic_dir / f"company-{key}.csv")
                break
            except DataValidationError as e:
                # Drop the rejected company from this chunk, persist it so
                # later runs skip it as well, then retry the smaller chunk.
                # NOTE(review): assumes __getcompanies__() returns a single
                # identifier contained in company_chunk - confirm.
                rejected = e.__getcompanies__()
                company_chunk.remove(rejected)
                with open(config.removed_companies_file, "a") as file:
                    file.write(f"\n{rejected}")
                print(f"Removed {rejected} Try again.")
        else:
            # Every attempt ended in a validation error.
            raise DataDownloadError("Too many attempts to download historic data", company_chunk)

## Downloading static data

In [None]:
from core.exceptions import DataValidationError, DataDownloadError

# Download the static data chunk by chunk and write one CSV per company into
# config.filtered_static_dir.

# Identifiers recorded in the removed-companies file (one per line). Blank
# lines are skipped: the except-handler below appends entries with a leading
# newline, which can leave empty lines in the file.
with open(config.removed_companies_file, "r") as file:
    removed_companies: list[str] = [line.strip() for line in file if line.strip()]

# Exclude previously removed companies. The membership test must be done per
# company; testing the whole list against config.companies is always true and
# filters nothing.
removed_lookup: set[str] = set(removed_companies)
companies: list[str] = [company for company in config.companies if company not in removed_lookup]
companies_chunks: list[list[str]] = split_in_chunks(
    companies,
    chunk_size=config.companies_chunk_size_static,
    chunk_limit=config.chunk_limit
)

# Upper bound on retries per chunk; each retry drops one rejected company.
max_attempts: int = 100

with LSEGDataDownloader(config) as downloader:
    for i, company_chunk in enumerate(companies_chunks):
        for _attempt in range(max_attempts):
            # A chunk can become empty once every company in it was rejected;
            # indexing it below would raise an uncaught IndexError.
            if not company_chunk:
                print("Skipping chunk: no companies left to download.")
                break
            try:
                print(f"Downloading static data: for {company_chunk[0]} to {company_chunk[-1]}")
                statdict: dict[str, pd.DataFrame] = downloader.download_static_from(company_chunk, config.static_features)
                for name, frame in statdict.items():
                    frame.to_csv(config.filtered_static_dir / f"company-{name}.csv")
                break
            except DataValidationError as e:
                # Drop the rejected company from this chunk, persist it so
                # later runs skip it as well, then retry the smaller chunk.
                # NOTE(review): assumes __getcompanies__() returns a single
                # identifier contained in company_chunk - confirm.
                rejected = e.__getcompanies__()
                company_chunk.remove(rejected)
                with open(config.removed_companies_file, "a") as file:
                    file.write(f"\n{rejected}")
                print(f"Removed {rejected} Try again.")
        else:
            # Every attempt ended in a validation error.
            raise DataDownloadError("Too many attempts to download static data", company_chunk)

# Other: consistency checks on the downloaded files

In [None]:
import re

# Compare which companies have a CSV in the "historic" directory against those
# with a CSV in the "static" directory, and collect the ones missing on either
# side.

# File-name pattern "company-<name>.csv". The dot is escaped and the pattern is
# anchored at the end so only the real ".csv" extension is stripped.
_COMPANY_FILE_PATTERN = re.compile(r"company-+(.*)\.csv$")


def _extract_company_names(file_names: list[str]) -> list[str]:
    """Return the company part of every file name that matches the pattern.

    Files that do not follow the "company-<name>.csv" scheme are skipped
    instead of raising an IndexError.
    """
    matches = (_COMPANY_FILE_PATTERN.search(file_name) for file_name in file_names)
    return [match.group(1) for match in matches if match is not None]


dir1 = config.dataset_dir / "historic"
dir2 = config.dataset_dir / "static"
files1: list[str] = [file.name for file in dir1.glob("*.csv")]
names1: list[str] = _extract_company_names(files1)
files2: list[str] = [file.name for file in dir2.glob("*.csv")]
names2: list[str] = _extract_company_names(files2)

# Sets give constant-time membership tests; the result lists keep the order of
# the directory listings.
known1: set[str] = set(names1)
known2: set[str] = set(names2)
not_in1: list[str] = [name for name in names2 if name not in known1]  # static only
not_in2: list[str] = [name for name in names1 if name not in known2]  # historic only
not_in_both: list[str] = not_in1 + not_in2  # present in exactly one directory

In [None]:
def is_unique(s: pd.Series) -> bool:
    """Return True when every value in *s* equals its first value.

    Intended as a row-wise predicate for ``DataFrame.apply(..., axis=1)``,
    which passes each row as a Series. Anything with a ``to_numpy()`` method
    is accepted.

    An empty input is treated as having a single (vacuous) value and yields
    True instead of raising an IndexError. NaN never compares equal to
    itself, so input containing NaN yields False.
    """
    a: np.ndarray = s.to_numpy()
    if a.size == 0:
        # Nothing to compare against; indexing a[0] would fail.
        return True
    # Broadcast the first entry against the whole array.
    return bool((a[0] == a).all())
# Evaluate is_unique once per row, then keep the rows where it returned True,
# i.e. the rows whose values all equal the row's first value.
# NOTE(review): the name "without_same_results" suggests those rows should be
# excluded rather than kept - confirm whether the mask needs to be inverted.
# all_static_frame is expected to be defined by an earlier cell.
same_value_mask = all_static_frame.apply(is_unique, axis=1)
without_same_results: pd.DataFrame = all_static_frame[same_value_mask]