In [None]:
from data.cleaning import read_all_historic_csv
from core import Config
import pandas as pd

config = Config()
historic_dictionary: dict[str, pd.DataFrame] = read_all_historic_csv(config.historic_dir)
all_historic_frame: pd.DataFrame = pd.concat(historic_dictionary.values())

In [None]:
from collections import Counter
historic_counter: Counter = Counter()
for dataframe in historic_dictionary.values():
    historic_counter.update(dataframe.columns.to_list())
most_common_historic_columns: list[tuple[str, int]] = historic_counter.most_common()
historic_dictionary_len = len(historic_dictionary)
del historic_dictionary

In [None]:
bar_data: pd.DataFrame = pd.DataFrame(most_common_historic_columns, columns=["Features", "sameFeatureCount"])
bar_data["sameFeatureCount"] = bar_data["sameFeatureCount"].apply(lambda x: x/historic_dictionary_len * 100)
labels = ["{0} - {1}".format(i, i + 10) for i in range(0, 100, 10)]
bar_data["companyCountIn%"] = pd.cut(bar_data["sameFeatureCount"], range(0, 101, 10), labels=labels, right=True)
grouped: pd.DataFrame = bar_data.groupby("companyCountIn%").count().reset_index(names="companyCountIn%")
grouped["companyCountIn%"] = grouped["companyCountIn%"].astype(str)
mask = bar_data.loc[bar_data["sameFeatureCount"] < 90, "Features"]
all_historic_frame.drop(columns=mask, inplace=True)

In [None]:
import plotly.express as px
fig = px.bar(
    grouped,
    x="companyCountIn%",
    y="sameFeatureCount",
    color="sameFeatureCount",
    color_continuous_scale="mint",
    text_auto=True,
    labels={
        "companyCountIn%": "Anzahl Unternehmen in %",
        "sameFeatureCount": "Anzahl überschneidender zeitreihen Features",
    }
)
fig.update_coloraxes(showscale=False)
fig.write_image(config.data_dir / "figures" / "historic-same-features-count.png")
fig.show()

In [None]:
# figure out the desired Scope 3.1 purchased goods and services value and how it was reported
scope3_1_count_per_year: pd.Series = (
    all_historic_frame["TR.UpstreamScope3PurchasedGoodsAndServices"].notna().groupby(level=1).sum()
)
fig = px.bar(
    scope3_1_count_per_year,
    x=scope3_1_count_per_year.index,
    y="TR.UpstreamScope3PurchasedGoodsAndServices",
    color="TR.UpstreamScope3PurchasedGoodsAndServices",
    color_continuous_scale="mint",
    text_auto=True,
    labels={
        "TR.UpstreamScope3PurchasedGoodsAndServices": "Anzahl Reports von Scope 3.1",
        "Date": "Jahr",
    }
)
fig.update_coloraxes(showscale=False)
fig.write_image(config.data_dir / "figures" / "scope3_1_count_per_year.png")
fig.show()

## Working with filtered Dataset

In [None]:
from data.cleaning import read_all_historic_csv
from core import Config
import pandas as pd

config = Config()
filtered_historic_frame = pd.read_csv(config.filtered_dir / "filtered_historic.csv", index_col=[0, 1])

## Filter companies with many NaN values (over 83%)

In [39]:
nan_counts = filtered_historic_frame.isna().sum(axis=1).groupby(level=0).sum().sort_values(ascending=False)
nan_counts.name = "NaN Count"
# companies with NaN values over 83% (Why 83%? Because with 82% there would fall to many companies out of the data)
companies_with_nan_over_83 = nan_counts[nan_counts > (filtered_historic_frame.shape[1] * (2024-2016+1)) * 0.83].to_frame()

In [44]:
import plotly.express as px
labels = ["{0} - {1}".format(i, i + 10) for i in range(0, 100, 10)]
bar_data = pd.DataFrame()
bar_data["Nan Count"] = pd.cut(companies_with_nan_over_83, range(0, 101, 10), labels=labels, right=True)
fig = px.bar(
    companies_with_nan_over_83,
    x=companies_with_nan_over_83.index,
    y="NaN Count",
    color="NaN Count",
    color_continuous_scale="mint",
    text_auto=True,
    labels={
        "NaN Count": "Anzahl NaN Werte",
        "Company": "Unternehmen",
    }
)
fig.show()

ValueError: Input array must be 1 dimensional

## Converting dtypes

In [None]:
converted = filtered_historic_frame.convert_dtypes()
g = converted.columns.to_series().groupby(converted.dtypes.apply(lambda x: x.name))
converted[g.get_group("string")] = converted[g.get_group("string")].astype("category")
my_list = list(converted.select_dtypes(include='boolean').columns)
types = list(converted.dtypes.values)

## Impute values

In [None]:
#remove values with only one unique value
boolean_columns = converted.select_dtypes('bool').columns
is_single_value = [
    col for col in boolean_columns
    if converted[col].dropna().nunique() == 1
]
for col in is_single_value:
    print(f"{converted[col].value_counts()}")
#is_single_value_df: pd.DataFrame = filtered_historic_frame_converted.loc[:, is_single_value]
#filtered_historic_frame_converted.columns[is_single_value].dtype
#filtered_historic_frame_converted.drop(columns=is_single_value_df.columns, inplace=True)

In [None]:
boolean_cols = converted.select_dtypes('bool')
summary = pd.DataFrame({
    'True': boolean_cols.sum(),
    'False': (~boolean_cols).sum()
})

In [None]:
# Count NaN in the whole time series frame and filter
nan_count: pd.Series = filtered_historic_frame.isna().sum()
# filter how many NaN per year
nan_per_year: pd.DataFrame = filtered_historic_frame.isna().groupby(level=1).sum()
nan_per_year_and_column: pd.Series = nan_per_year.sum(axis=1)
# filter features with threshold NaNs
filtered: pd.DataFrame = filtered_historic_frame.drop(nan_count.index[nan_count > 15000], axis=1)

### Write List of filterted features

In [None]:
filtered_time_series_feature_list: list[str] = all_historic_frame.columns.to_list()
filtered_time_series_feature_list.sort()
with open(config.historic_features_file, "w") as file:
    file.write("\n".join(str(i) for i in filtered_time_series_feature_list))

# TODO
- convert categorical data to category
- Check how balanced categorical data is