# Setup
Configuration, Logger, Counter and Downloader

In [None]:
import os
import logging

import numpy as np
import pandas as pd

from core.config import Config
from data.cleaning import read_all_static_csv, read_all_historic_csv
from data.download import LSEGDataDownloader

# Set RD_LIB_CONFIG_PATH for this process.
# NOTE(review): presumably read by the LSEG data library to locate its
# configuration directory - confirm.
os.environ["RD_LIB_CONFIG_PATH"] = "/Configuration"

# Project configuration; supplies the log file, log level and data directories
# used throughout this notebook.
config = Config()

# Send log records to the configured file with a timestamped, level-padded format.
logging.basicConfig(
    filename=config.log_file,
    encoding="utf-8",
    level=config.log_level,
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Root logger, shared by the cells below.
logger = logging.getLogger()

## Downloading time series Data

In [None]:
# Use the downloader as a context manager so it is released when the block exits.
with LSEGDataDownloader(config) as downloader:
    logger.info("Downloading historic frames from LSEG database A-Z")
    for chunk in config.companies_historic_chunks:
        # historic_result is rebound on every iteration, so only the result of
        # the last chunk is still bound once the loop has finished.
        historic_result: dict[str, pd.DataFrame] = downloader.download_all_historic_chunks(chunk)

## Downloading static Data

## Merging Frames

In [None]:
# Load the static frames from config.static_dir as a dict of DataFrames and
# stack them row-wise (pd.concat's default axis) into a single frame.
static_dictionary: dict[str, pd.DataFrame] = read_all_static_csv(config.static_dir)
all_static_frame: pd.DataFrame = pd.concat(objs=static_dictionary.values())

In [None]:
# Load the time series frames from config.historic_dir as a dict of DataFrames
# and stack them row-wise (pd.concat's default axis) into a single frame.
historic_dictionary: dict[str, pd.DataFrame] = read_all_historic_csv(config.historic_dir)
all_historic_frame: pd.DataFrame = pd.concat(objs=historic_dictionary.values())

## Analysing Frames

### Counting shared static columns/features across companies

In [None]:
from collections import Counter

# Tally how often each column name occurs across all static frames.
static_counter: Counter = Counter(
    column_name
    for static_frame in static_dictionary.values()
    for column_name in static_frame.columns.to_list()
)

# (column name, occurrence count) pairs, most frequent first.
most_common_static_columns: list[tuple[str, int]] = static_counter.most_common()

In [None]:
# Build a histogram of static features: for each 10 % bucket, count how many
# features occur in that share of the frames held in static_dictionary.
bar_data: pd.DataFrame = pd.DataFrame(most_common_static_columns, columns=["Features", "sameFeatureCount"])
bar_data = bar_data.drop(columns="Features")

# Convert the absolute occurrence count into a percentage of all static frames.
bar_data["sameFeatureCount"] = bar_data["sameFeatureCount"] / len(static_dictionary) * 100

# Ten right-closed buckets: (0, 10], (10, 20], ..., (90, 100].
labels = [f"{i} - {i + 10}" for i in range(0, 100, 10)]
# Fix: after dropping "Features" the only column is "sameFeatureCount"; the
# previous lookup with the integer key 1 raised a KeyError.
bar_data["companyCountIn%"] = pd.cut(bar_data["sameFeatureCount"], range(0, 101, 10), labels=labels, right=True)

# Number of features per bucket; the bucket labels are converted to plain
# strings for the plot's x axis.
grouped: pd.DataFrame = bar_data.groupby("companyCountIn%").count().reset_index(names="companyCountIn%")
grouped["companyCountIn%"] = grouped["companyCountIn%"].astype(str)

import plotly.express as px
fig = px.bar(grouped, x="companyCountIn%", y="sameFeatureCount", color="sameFeatureCount", color_continuous_scale="mint", text_auto=True)
fig.update_traces(cliponaxis=False)
fig.update_xaxes(title='Anzahl Unternehmen in %')
fig.update_yaxes(title='Anzahl gleicher statischer Features')
fig.show()

In [None]:
# Count NaN in the whole static frame and filter

### Counting shared time series columns/features across companies

In [None]:
from collections import Counter

# Tally how often each column name occurs across all time series frames.
historic_counter: Counter = Counter(
    column_name
    for historic_frame in historic_dictionary.values()
    for column_name in historic_frame.columns.to_list()
)

# (column name, occurrence count) pairs, most frequent first.
most_common_historic_columns: list[tuple[str, int]] = historic_counter.most_common()

# Column names that occur in fewer than 100 frames.
filtered_columns: list[str] = [name for name, occurrences in most_common_historic_columns if occurrences < 100]

# Remember the number of frames, then drop the name historic_dictionary; the
# length is still needed for the percentage calculation in the next cell.
historic_dictionary_len = len(historic_dictionary)
del historic_dictionary

In [None]:
# Build a histogram of time series features: for each 10 % bucket, count how
# many features occur in that share of the historic frames.
bar_data: pd.DataFrame = pd.DataFrame(most_common_historic_columns, columns=["Features", "sameFeatureCount"])
bar_data = bar_data.drop(columns="Features")

# Convert the absolute occurrence count into a percentage of all historic frames.
bar_data["sameFeatureCount"] = bar_data["sameFeatureCount"] / historic_dictionary_len * 100

# Ten right-closed buckets: (0, 10], (10, 20], ..., (90, 100].
labels = [f"{i} - {i + 10}" for i in range(0, 100, 10)]
bar_data["companyCountIn%"] = pd.cut(bar_data["sameFeatureCount"], range(0, 101, 10), labels=labels, right=True)

# Number of features per bucket; the bucket labels are converted to plain
# strings for the plot's x axis.
grouped: pd.DataFrame = bar_data.groupby("companyCountIn%").count().reset_index(names="companyCountIn%")
grouped["companyCountIn%"] = grouped["companyCountIn%"].astype(str)

import plotly.express as px
fig = px.bar(
    grouped,
    x="companyCountIn%",
    y="sameFeatureCount",
    color="sameFeatureCount",
    color_continuous_scale="mint",
    text_auto=True,
)
fig.update_traces(cliponaxis=False)
fig.update_xaxes(title='Anzahl Unternehmen in %')
fig.update_yaxes(title='Anzahl gleicher zeitreihen Features')
fig.show()

In [None]:
# Count NaN in the whole time series frame and filter

In [None]:
# Remove the columns listed in filtered_columns from the merged time series frame.
reduced_historic: pd.DataFrame = all_historic_frame.drop(filtered_columns, axis="columns")

# Other Functions

In [None]:
import re

# Compare which company names have a CSV in the "historic" directory, in the
# "static" directory, or in both.
# File names are expected to look like "company-<name>.csv". The dot is escaped
# so that only a literal ".csv" suffix matches.
company_file_pattern = re.compile(r"company-+(.*)\.csv")

dir1 = config.dataset_dir / "historic"
dir2 = config.dataset_dir / "static"
files1: list[str] = [file.name for file in dir1.glob("*.csv")]
files2: list[str] = [file.name for file in dir2.glob("*.csv")]

# CSV files that do not follow the naming scheme are skipped instead of
# aborting the cell with an IndexError.
names1: list[str] = [match.group(1) for match in map(company_file_pattern.search, files1) if match]
names2: list[str] = [match.group(1) for match in map(company_file_pattern.search, files2) if match]

# Sets give constant-time membership tests; the result lists keep the order in
# which the names were found.
known_names1: set[str] = set(names1)
known_names2: set[str] = set(names2)
not_in1: list[str] = [name for name in names2 if name not in known_names1]
not_in2: list[str] = [name for name in names1 if name not in known_names2]

# Names present in exactly one of the two directories.
not_in_both: list[str] = not_in1 + not_in2

In [None]:
def is_unique(s: pd.Series) -> bool:
    """Return True when every value in *s* equals its first value.

    Intended for row-wise use via ``DataFrame.apply(..., axis=1)``, where each
    row arrives as a Series.

    Args:
        s: The values to compare; anything with a ``to_numpy()`` method works.

    Returns:
        True if all values equal the first one, or if *s* is empty;
        False otherwise. NaN compares unequal to itself, so a row containing
        NaN is never reported as uniform.
    """
    a: np.ndarray = s.to_numpy()
    # An empty input has no first element to compare against; treat it as
    # trivially uniform instead of raising IndexError.
    if a.size == 0:
        return True
    return bool((a[0] == a).all())
# Row mask: True where is_unique reports that all values in the row are equal.
# NOTE(review): the mask keeps the rows whose values are all the same, which
# looks like the opposite of what the name "without_same_results" suggests - confirm.
uniform_row_mask = all_static_frame.apply(is_unique, axis=1)
without_same_results: pd.DataFrame = all_static_frame[uniform_row_mask]