# Setup
Configuration, Logger, Counter and Downloader

In [1]:
import os
import logging

import numpy as np
import pandas as pd

from core.config import Config, split_in_chunks
from data.download import LSEGDataDownloader

# Tell the data library where its configuration lives.
# NOTE(review): this is set after `data.download` has already been imported;
# if the underlying library reads the variable at import time, the assignment
# has to move above the imports — confirm.
os.environ["RD_LIB_CONFIG_PATH"] = "/Configuration"

# Project-wide settings (paths, log level, company lists, ...).
config = Config()

# Root-logger configuration: one timestamped, level-tagged line per record,
# written to the log file named in the config.
log_settings = {
    "filename": config.log_file,
    "encoding": "utf-8",
    "level": config.log_level,
    "format": "%(asctime)s %(levelname)-8s %(message)s",
    "datefmt": "%Y-%m-%d %H:%M:%S",
}
logging.basicConfig(**log_settings)
logger = logging.getLogger()

## Downloading time-series data

In [2]:
from core.exceptions import DataValidationError, DataDownloadError

# Companies rejected by validation in earlier runs, one identifier per line.
# Blank lines are skipped: new entries are appended below with a leading "\n",
# so the file can contain empty lines.
with open(config.removed_companies_file, "r") as file:
    removed_companies: list[str] = [line.strip() for line in file if line.strip()]

# Drop every previously removed company. (The earlier condition tested whether
# the whole `removed_companies` list was an element of `config.companies`,
# which is always true, so nothing was ever filtered out.)
removed_lookup: set[str] = set(removed_companies)
companies: list[str] = [company for company in config.companies if company not in removed_lookup]
companies_chunks: list[list[str]] = split_in_chunks(
    companies,
    chunk_size=config.companies_chunk_size_historic,
    chunk_limit=config.chunk_limit
)

with LSEGDataDownloader(config) as downloader:
    # Iterate over the filtered chunks built above so that removed companies
    # are not requested again (same pattern as the static download cell).
    for i, company_chunk in enumerate(companies_chunks):
        # Each chunk gets two attempts; `break` on success skips the `else`.
        for attempt in range(2):
            try:
                print(f"Downloading historic data: for {company_chunk[0]} to {company_chunk[-1]}")
                standardized_histordict: dict[str, pd.DataFrame] = downloader.download_historic_from(company_chunk, config.historic_features, i)
                # One CSV per company, named after the dictionary key.
                for key, new_df in standardized_histordict.items():
                    new_df.to_csv(config.eda_filtered_historic_dir / f"company-{key}.csv")
                break
            except DataValidationError as e:
                # Drop the offending company from this chunk, remember it for
                # future runs, and retry the (now smaller) chunk.
                rejected = e.__getcompanies__()
                company_chunk.remove(rejected)
                with open(config.removed_companies_file, "a") as file:
                    file.write(f"\n{rejected}")
                print(f"Removed {rejected} Try again.")
                continue
            except Exception:
                # Keep the best-effort retry, but record the cause (with
                # traceback) in the log instead of discarding it.
                logger.exception("Error downloading historic data (attempt %d) for chunk %d", attempt + 1, i)
                print(f"Error downloading historic data: {company_chunk}")
        else:
            # Reached only when no attempt ended with `break`.
            raise DataDownloadError("Too many attempts to download historic data", company_chunk)

Downloading historic data: for 000063.SZ to 068270.KS
Error downloading historic data: ['000063.SZ', '000100.KS', '000120.KS', '000150.KS', '0002.HK', '000210.KS', '000270.KS', '0004.HK', '000660.KS', '000720.KS', '000810.KS', '000880.KS', '000895.SZ', '0010.HK', '0012.HK', '0013.HK', '001740.KS', '0019.HK', '002129.SZ', '002340.SZ', '002352.SZ', '002380.KS', '002459.SZ', '002475.SZ', '002506.SZ', '002555.SZ', '002790.KS', '002797.SZ', '002916.SZ', '002939.SZ', '003550.KS', '003670.KS', '004000.KS', '004020.KS', '005300.KS', '005380.KS', '005490.KS', '005930.KS', '005940.KS', '006360.KS', '006400.KS', '006800.KS', '007310.KS', '0083.HK', '009150.KS', '009240.KS', '009540.KS', '009830.KS', '010060.KS', '0101.HK', '010130.KS', '010140.KS', '010620.KS', '011070.KS', '011170.KS', '011200.KS', '011790.KS', '012330.KS', '012450.KS', '015760.KS', '016360.KS', '0175.HK', '017670.KS', '018260.KS', '018880.KS', '0200.HK', '021240.KS', '023530.KS', '024110.KS', '028050.KS', '028260.KS', '028670.K

DataDownloadError: Connection failed for ['000063.SZ', '000100.KS', '000120.KS', '000150.KS', '0002.HK', '000210.KS', '000270.KS', '0004.HK', '000660.KS', '000720.KS', '000810.KS', '000880.KS', '000895.SZ', '0010.HK', '0012.HK', '0013.HK', '001740.KS', '0019.HK', '002129.SZ', '002340.SZ', '002352.SZ', '002380.KS', '002459.SZ', '002475.SZ', '002506.SZ', '002555.SZ', '002790.KS', '002797.SZ', '002916.SZ', '002939.SZ', '003550.KS', '003670.KS', '004000.KS', '004020.KS', '005300.KS', '005380.KS', '005490.KS', '005930.KS', '005940.KS', '006360.KS', '006400.KS', '006800.KS', '007310.KS', '0083.HK', '009150.KS', '009240.KS', '009540.KS', '009830.KS', '010060.KS', '0101.HK', '010130.KS', '010140.KS', '010620.KS', '011070.KS', '011170.KS', '011200.KS', '011790.KS', '012330.KS', '012450.KS', '015760.KS', '016360.KS', '0175.HK', '017670.KS', '018260.KS', '018880.KS', '0200.HK', '021240.KS', '023530.KS', '024110.KS', '028050.KS', '028260.KS', '028670.KS', '0288.HK', '0293.HK', '029780.KS', '030200.KS', '0316.HK', '032640.KS', '032830.KS', '0330.HK', '033780.KS', '034220.KS', '0345.HK', '034730.KS', '035420.KS', '035720.KS', '036460.KS', '036570.KS', '0384.HK', '0388.HK', '042660.KS', '042670.KS', '047040.KS', '047050.KS', '051900.KS', '051910.KS', '055550.KS', '0656.HK', '066570.KS', '068270.KS'] with []

## Downloading static data

In [None]:
from core.exceptions import DataValidationError, DataDownloadError

# Companies rejected by validation in earlier runs, one identifier per line.
# Blank lines are skipped: new entries are appended below with a leading "\n",
# so the file can contain empty lines.
with open(config.removed_companies_file, "r") as file:
    removed_companies: list[str] = [line.strip() for line in file if line.strip()]

# Drop every previously removed company. (The earlier condition tested whether
# the whole `removed_companies` list was an element of `config.companies`,
# which is always true, so nothing was ever filtered out.)
removed_lookup: set[str] = set(removed_companies)
companies: list[str] = [company for company in config.companies if company not in removed_lookup]
companies_chunks: list[list[str]] = split_in_chunks(
    companies,
    chunk_size=config.companies_chunk_size_static,
    chunk_limit=config.chunk_limit
)

with LSEGDataDownloader(config) as downloader:
    for i, company_chunk in enumerate(companies_chunks):
        # Up to 100 attempts per chunk: every validation failure removes one
        # company and retries; `break` on success skips the `else`.
        for attempt in range(100):
            try:
                print(f"Downloading static data: for {company_chunk[0]} to {company_chunk[-1]}")
                statdict: dict[str, pd.DataFrame] = downloader.download_static_from(company_chunk, config.static_features)
                # One CSV per company, named after the dictionary key.
                for name, frame in statdict.items():
                    frame.to_csv(config.filtered_static_dir / f"company-{name}.csv")
                break
            except DataValidationError as e:
                # Drop the offending company from this chunk, remember it for
                # future runs, and retry the (now smaller) chunk.
                rejected = e.__getcompanies__()
                company_chunk.remove(rejected)
                with open(config.removed_companies_file, "a") as file:
                    file.write(f"\n{rejected}")
                print(f"Removed {rejected} Try again.")
                continue
        else:
            # Reached only when no attempt ended with `break`.
            raise DataDownloadError("Too many attempts to download static data", company_chunk)

# Other: consistency checks on the downloaded data

In [None]:
import re

# Directories expected to hold one "company-<name>.csv" file per company.
dir1 = config.dataset_dir / "historic"
dir2 = config.dataset_dir / "static"

# The dot is escaped and the pattern anchored at the end so that only a
# literal ".csv" suffix is stripped from the file name.
company_file_pattern = re.compile(r"company-+(.*)\.csv$")


def _company_names(file_names: list[str]) -> list[str]:
    """Return the company identifier of every file name that follows the
    "company-<name>.csv" scheme; other file names are skipped rather than
    raising an IndexError."""
    matches = (company_file_pattern.search(file_name) for file_name in file_names)
    return [match.group(1) for match in matches if match is not None]


files1: list[str] = [file.name for file in dir1.glob("*.csv")]
names1: list[str] = _company_names(files1)
files2: list[str] = [file.name for file in dir2.glob("*.csv")]
names2: list[str] = _company_names(files2)

# Set lookups keep the comparison linear; the result lists keep file order.
known1: set[str] = set(names1)
known2: set[str] = set(names2)
not_in1: list[str] = [name for name in names2 if name not in known1]  # static only
not_in2: list[str] = [name for name in names1 if name not in known2]  # historic only
not_in_both: list[str] = not_in1 + not_in2  # present in exactly one directory

In [None]:
def is_unique(s: pd.Series) -> bool:
    """Return True when every value in ``s`` equals its first value.

    Intended for ``DataFrame.apply(..., axis=1)``, which passes each row as a
    Series.

    Args:
        s: The values to compare.

    Returns:
        True if all values equal the first one, or if ``s`` is empty;
        False otherwise. NaN never compares equal to itself, so input
        containing NaN yields False.
    """
    a: np.ndarray = s.to_numpy()
    # Nothing to compare: avoid the IndexError that ``a[0]`` would raise.
    if a.size == 0:
        return True
    return bool((a[0] == a).all())
# Row-wise mask: True for rows in which `is_unique` reports that all values
# are equal.
# NOTE(review): `all_static_frame` is not defined in the cells visible here —
# confirm it is created before this cell runs on a fresh kernel.
# NOTE(review): the mask KEEPS rows whose values are all equal, while the
# result is named "without_same_results" — confirm whether it should be negated.
rows_with_equal_values = all_static_frame.apply(is_unique, axis=1)
without_same_results: pd.DataFrame = all_static_frame[rows_with_equal_values]