In [1]:
from core import Config
from data.cleaning import read_all_static_csv
import pandas as pd

# Project configuration object; the cells below read their directory and
# file paths from its attributes.
config = Config()

# Load the static CSV data found under `config.static_dir`. The result is
# annotated as a mapping from a string key to one DataFrame each.
static_dictionary: dict[str, pd.DataFrame] = read_all_static_csv(
    config.static_dir
)

# Stack all loaded frames row-wise into one frame. `pd.concat` aligns on the
# union of columns, so a column missing from one frame is filled with NaN
# for that frame's rows.
all_static_frame: pd.DataFrame = pd.concat(list(static_dictionary.values()))

In [2]:
from collections import Counter
# Remember how many frames were loaded; the dictionary itself is released
# below, but the count is still needed to turn occurrences into percentages.
static_dictionary_len = len(static_dictionary)

# Tally how often each column name occurs across all loaded frames.
static_counter: Counter = Counter(
    column
    for dataframe in static_dictionary.values()
    for column in dataframe.columns.to_list()
)

# The per-file frames are no longer needed once the tally is built.
del static_dictionary

# (column name, occurrence count) pairs, most frequent first.
most_common_static_columns: list[tuple[str, int]] = static_counter.most_common()

In [3]:
# One row per feature (column name) with the number of loaded frames it
# occurs in, most frequent first.
bar_data: pd.DataFrame = pd.DataFrame(most_common_static_columns, columns=["Features", "sameFeatureCount"])

# Convert the raw count into a whole percentage of all loaded frames, rounded
# down. Pure integer arithmetic is used on purpose: the float form
# `int(x / n * 100)` can end up one below the exact value because of rounding
# (e.g. 29 / 100 * 100 == 28.999999999999996 -> 28).
bar_data["sameFeatureCount"] = bar_data["sameFeatureCount"] * 100 // static_dictionary_len

# Ten bins of width 10 covering 0..100 percent, labelled "0 - 10", ..., "90 - 100".
labels = [f"{lower} - {lower + 10}" for lower in range(0, 100, 10)]

# `include_lowest=True` closes the first bin on the left so that a feature
# with 0 % (possible after rounding down) lands in "0 - 10" instead of
# becoming NaN and silently disappearing from the grouped counts.
bar_data["companyCountIn%"] = pd.cut(
    bar_data["sameFeatureCount"],
    range(0, 101, 10),
    labels=labels,
    include_lowest=True,
)

# Number of features per percentage bin. `observed=False` is passed
# explicitly so that empty bins are kept as zero rows regardless of the
# pandas version's default for categorical groupers.
grouped: pd.DataFrame = (
    bar_data.groupby("companyCountIn%", observed=False)
    .count()
    .reset_index(names="companyCountIn%")
)
# Plain strings instead of a categorical for the x axis of the chart.
grouped["companyCountIn%"] = grouped["companyCountIn%"].astype(str)

# Despite its name, `mask` is a Series of feature names (not a boolean mask):
# every feature whose rounded-down percentage is below 90.
mask = bar_data.loc[bar_data["sameFeatureCount"] < 90, "Features"]

# Remove those features from the combined frame. Re-binding the name (rather
# than `inplace=True`) together with `errors="ignore"` keeps this cell safe to
# re-run: columns that were already dropped are skipped instead of raising
# a KeyError.
all_static_frame = all_static_frame.drop(columns=mask, errors="ignore")

In [5]:
import plotly.express as px

# Bar chart of `grouped`: one bar per percentage bin, bar height and colour
# both taken from the "sameFeatureCount" column. The `update_*` calls return
# the figure itself, so they are chained directly onto the constructor.
fig = (
    px.bar(
        grouped,
        x="companyCountIn%",
        y="sameFeatureCount",
        color="sameFeatureCount",
        color_continuous_scale="mint",
        text_auto=True,
        labels={
            "companyCountIn%": "Anzahl Unternehmen in %",
            "sameFeatureCount": "Anzahl überschneidender statischer Features",
        },
    )
    # Let the value labels extend past the plotting area instead of being clipped.
    .update_traces(cliponaxis=False)
    # Hide the colour bar.
    .update_coloraxes(showscale=False)
)

# Save a static PNG under <data_dir>/figures, then render the figure inline.
fig.write_image(config.data_dir / "figures" / "static-same-features-count.png")
fig.show()

## Working on Filtered Dataset

In [None]:
# Reload the static CSV data, this time from `config.filtered_static_dir`,
# and rebuild the combined frame from it. From here on `all_static_frame`
# refers to this filtered data set, not to the frame built in the first cell.
static_dictionary = read_all_static_csv(config.filtered_static_dir)
all_static_frame = pd.concat(list(static_dictionary.values()))

# Only the combined frame is used below; release the per-file frames.
del static_dictionary

In [None]:
# Count the missing values per column of the whole static frame and keep only
# the columns that have none.
nan_count: pd.Series = all_static_frame.isna().sum()
filtered: pd.DataFrame = all_static_frame.filter(nan_count[nan_count == 0].index)

# Alphabetically sorted names of the remaining features. The names are read
# from `filtered` (not from `all_static_frame`) so that the NaN filter above
# actually determines which features end up in the list.
filtered_static_feature_list: list[str] = sorted(filtered.columns.to_list())

## Write list of features

In [None]:
# Write the feature names to `config.static_features_file`, one per line and
# without a trailing newline. The encoding is pinned to UTF-8 so the bytes on
# disk do not depend on the platform's default locale encoding.
with open(config.static_features_file, "w", encoding="utf-8") as file:
    file.write("\n".join(str(name) for name in filtered_static_feature_list))