In [None]:
from data.cleaning import read_all_historic_csv
from core import Config
import pandas as pd

# Project configuration; it supplies every input and output path used in this notebook.
config = Config()
# Read the filtered historic CSV files into a dictionary of DataFrames
# (presumably one frame per company — TODO confirm against read_all_historic_csv).
historic_dictionary: dict[str, pd.DataFrame] = read_all_historic_csv(
    config.filtered_dir_historic
)
# Stack all frames row-wise into a single frame; each frame keeps its own index.
# NOTE(review): `names` only labels index levels that are created from `keys`;
# as no keys are passed here it looks like a no-op — confirm before relying on it.
all_historic_frame: pd.DataFrame = pd.concat(
    objs=list(historic_dictionary.values()),
    names=[0, 1],
)

# Exploratory Data Analysis

In [None]:
# Print a concise summary of the combined frame (index, columns, dtypes, memory usage).
all_historic_frame.info()

## Filter out every company that never reports a Scope 3.1 value

In [None]:
# Scope 3.1 column of the combined frame; rows are grouped below by the outer index level
# (presumably the company identifier — TODO confirm).
scope31: pd.Series = all_historic_frame["TR.UpstreamScope3PurchasedGoodsAndServices"]
# Keep only the rows of those groups in which every Scope 3.1 value is missing ...
nan_index = scope31.groupby(level=0).filter(lambda group: group.isna().all())
# ... and remove exactly these rows from the combined frame.
all_historic_frame.drop(index=nan_index.index, inplace=True)

## Detect the features that most companies have in common

In [None]:
from collections import Counter

# Tally, for every column name, in how many of the loaded frames it occurs.
historic_counter: Counter = Counter(
    column_name
    for frame in historic_dictionary.values()
    for column_name in frame.columns
)
# Pairs of (column name, number of frames containing it), most frequent first.
most_common_historic_columns: list[tuple[str, int]] = historic_counter.most_common()
# Remember how many frames were loaded, then drop the dictionary reference.
historic_dictionary_len = len(historic_dictionary)
del historic_dictionary

In [None]:
import plotly.express as px

# Count the non-missing Scope 3.1 values for every value of the second index level
# (the year, according to the axis label used below).
scope31_column = "TR.UpstreamScope3PurchasedGoodsAndServices"
scope31_reported = all_historic_frame[scope31_column].notna()
scope3_1_count_per_year: pd.Series = scope31_reported.groupby(level=1).sum()

axis_labels = {
    "TR.UpstreamScope3PurchasedGoodsAndServices": "Anzahl Reports von Scope 3.1",
    "Date": "Jahr",
}
fig = px.bar(
    scope3_1_count_per_year,
    x=scope3_1_count_per_year.index,
    y=scope31_column,
    color=scope31_column,
    color_continuous_scale="mint",
    text_auto=True,
    labels=axis_labels,
)
# The colour encodes the same count as the bar height, so the colour bar is hidden.
fig.update_coloraxes(showscale=False)
figure_path = config.data_dir / "figures" / "scope3_1_count_per_year.png"
fig.write_image(figure_path)
fig.show()

Counting how many companies reported Scope 3.1 per year shows that the first values appear only in 2016. The number of reports then rises gradually to 2705 companies in 2023 and falls to 1431 in 2024, probably because many companies had not yet had time to disclose the value. Because Scope 3.1 is our target value, this leaves a large amount of missing data that has to be imputed properly.

In [None]:
# Restrict the frame to rows whose 'Date' index level lies within 2016-01-01 .. 2024-12-31 (inclusive).
start_date = pd.Timestamp('2016-01-01')
end_date = pd.Timestamp('2024-12-31')
report_dates = all_historic_frame.index.get_level_values('Date')
mask = (report_dates >= start_date) & (report_dates <= end_date)
all_historic_frame = all_historic_frame[mask]

In [None]:
# One row per feature with the share (in percent) of loaded frames that contain it.
bar_data: pd.DataFrame = pd.DataFrame(most_common_historic_columns, columns=["Features", "sameFeatureCount"])
bar_data["sameFeatureCount"] = bar_data["sameFeatureCount"] / historic_dictionary_len * 100
# Sort the shares into ten-point buckets (0, 10], (10, 20], ..., (90, 100].
bucket_edges = range(0, 101, 10)
labels = [f"{lower} - {lower + 10}" for lower in range(0, 100, 10)]
bar_data["companyCountIn%"] = pd.cut(bar_data["sameFeatureCount"], bucket_edges, labels=labels, right=True)
# Number of features per bucket; the bucket labels are turned into plain strings for plotting.
features_per_bucket = bar_data.groupby("companyCountIn%").count()
grouped: pd.DataFrame = features_per_bucket.reset_index(names="companyCountIn%")
grouped["companyCountIn%"] = grouped["companyCountIn%"].astype(str)
# Names of the features that occur in fewer than 90 percent of the frames.
mask = bar_data.loc[bar_data["sameFeatureCount"] < 90, "Features"]

In [None]:
# Bar chart: number of features per ten-point share bucket.
bucket_axis_labels = {
    "companyCountIn%": "Anzahl Unternehmen in %",
    "sameFeatureCount": "Anzahl überschneidender zeitreihen Features",
}
fig = px.bar(
    data_frame=grouped,
    x="companyCountIn%",
    y="sameFeatureCount",
    color="sameFeatureCount",
    color_continuous_scale="mint",
    text_auto=True,
    labels=bucket_axis_labels,
)
# The colour repeats the bar height, so the colour bar is hidden.
fig.update_coloraxes(showscale=False)
bucket_figure_path = config.data_dir / "figures" / "historic-same-features-count.png"
fig.write_image(bucket_figure_path)
fig.show()

In [None]:
# Same table as before, but with the share truncated to a whole percent.
bar_data: pd.DataFrame = pd.DataFrame(most_common_historic_columns, columns=["Features", "sameFeatureCount"])
bar_data["sameFeatureCount"] = bar_data["sameFeatureCount"].map(
    lambda count: int(count / historic_dictionary_len * 100)
)
# One-point buckets (90, 91], ..., (99, 100], labelled with their upper edge.
# NOTE(review): a share truncated to exactly 90 lies outside the first bucket and gets no label.
labels = [str(percent) for percent in range(91, 101)]
bar_data["companyCountIn%"] = pd.cut(bar_data["sameFeatureCount"], range(90, 101), labels=labels)
features_per_bucket = bar_data.groupby("companyCountIn%").count()
grouped: pd.DataFrame = features_per_bucket.reset_index(names="companyCountIn%")
grouped["companyCountIn%"] = grouped["companyCountIn%"].astype(str)
# Feature names by share: below 90 percent, and from 90 up to (but excluding) 98 percent.
share = bar_data["sameFeatureCount"]
mask_under_90 = bar_data.loc[share < 90, "Features"]
mask_90_97 = bar_data.loc[(share >= 90) & (share < 98), "Features"]

In [None]:
# Bar chart: number of features per one-point share bucket above 90 percent.
detail_axis_labels = {
    "sameFeatureCount": "Anzahl überschneidender statischer Features",
    "companyCountIn%": "Anzahl Unternehmen in %",
}
fig = px.bar(
    data_frame=grouped,
    x="companyCountIn%",
    y="sameFeatureCount",
    color="sameFeatureCount",
    color_continuous_scale="mint",
    text_auto=True,
    labels=detail_axis_labels,
)
# Let the value labels extend beyond the plotting area instead of being clipped.
fig.update_traces(cliponaxis=False)
# The colour repeats the bar height, so the colour bar is hidden.
fig.update_coloraxes(showscale=False)
fig.show()

Features are reported very unevenly across companies. 623 features are reported by fewer than 10% of the companies; the next peak is 710 features that are reported by more than 90% of all companies. Of these 710, the majority — 623 features — lie in the 99–100% range.

In [None]:
# Remove, in place, every feature listed in mask_under_90, i.e. the features that occur
# in fewer than 90 percent of the loaded frames.
all_historic_frame.drop(columns=mask_under_90, inplace=True)

## Domain Knowledge Filtering

In [None]:
# Select only the features listed in mask_90_97 (truncated share from 90 up to, but excluding, 98 percent).
# NOTE(review): the result is not used by any later cell visible in this notebook.
all_historic_frame2 = all_historic_frame[mask_90_97]

In [None]:
# Let pandas choose the best nullable dtype for every column before persisting the frame.
all_historic_frame = all_historic_frame.convert_dtypes()
all_historic_frame.to_csv(config.eda_filtered_dir / "eda_filtered_historic.csv")
# Save the dtypes next to the data so that they are known when the file is read again.
# The file holds one dtype name per row, in column order; `index=False` means the column
# names themselves are not stored.
# `to_csv` returns None when it is given a path, so its result is deliberately not assigned.
all_historic_frame.dtypes.to_csv(config.eda_filtered_dir / "eda_filtered_historic_dtypes.csv", index=False)

## Working with filtered Dataset

In [None]:
from core import Config
import pandas as pd

config = Config()
historic_csv_path = config.eda_filtered_dir / "eda_filtered_historic.csv"
# One dtype name per data column, in column order (the file stores no column names).
dlist = pd.read_csv(config.eda_filtered_dir / "eda_filtered_historic_dtypes.csv")
dtype_names = dlist.iloc[:, -1].astype(str).to_list()
# Header-only read: the first two CSV columns are the index levels, the rest are data columns.
data_columns = pd.read_csv(historic_csv_path, nrows=0).columns[2:]
if len(dtype_names) != len(data_columns):
    raise ValueError(
        f"dtype file lists {len(dtype_names)} dtypes but the data file has {len(data_columns)} columns"
    )
# Passing the raw ndarray as `dtype` made pandas fall back to the array's own dtype (object)
# for every column; an explicit column -> dtype mapping applies the saved dtypes instead.
dtype_by_column = dict(zip(data_columns, dtype_names))
# read_csv rejects datetime dtypes in `dtype`; such columns are parsed via `parse_dates`.
date_columns = [name for name, dtype_name in dtype_by_column.items() if dtype_name.startswith("datetime")]
for name in date_columns:
    del dtype_by_column[name]
all_historic_frame = pd.read_csv(
    historic_csv_path,
    dtype=dtype_by_column,
    index_col=[0, 1],
    parse_dates=date_columns or False,
)

## Impute values

In [None]:
# Split the combined frame into one frame per value of the outer index level.
# The outer level is dropped from each piece because it is kept as the dictionary key.
historic_dictionary: dict[str, pd.DataFrame] = {
    id_key: company_frame.droplevel(level=0)
    for id_key, company_frame in all_historic_frame.groupby(level=0)
}

If a column of a company's dataframe contains only one distinct reported value, fill the NaN values of that column with this value. It is unlikely that the value was different in the years for which it is missing.

In [None]:
# For every frame: a column with exactly one distinct non-missing value gets its gaps
# filled with that value; columns with several or no values are left untouched.
for company_id, company_frame in historic_dictionary.items():
    distinct_counts = company_frame.nunique(dropna=True)
    for column_name in distinct_counts.index[distinct_counts == 1]:
        observed = company_frame[column_name].dropna()
        company_frame[column_name] = company_frame[column_name].fillna(observed.iloc[0])
    historic_dictionary[company_id] = company_frame

In [None]:
# Re-assemble the per-key frames into one frame: the dictionary keys become the outer index
# level and the two index levels are named 0 and 1
# (presumably the source of the 0 / 1 names used in the Scope 3.1 cells — TODO confirm).
all_historic_frame = pd.concat(historic_dictionary, names=[0, 1])

In [None]:
# Detect columns with only boolean-like strings and convert dtype to boolean
# Each set is one accepted spelling pair; a column must stick to a single pair.
bool_strings = [{'true', 'false'}, {'True', 'False'}, {'yes', 'no'}, {'Yes', 'No'}, {'1', '0'}]


def is_bool_string_col(series):
    """Return True if the non-missing values of `series` all belong to one boolean spelling pair.

    Args:
        series: pandas Series to inspect; missing values are ignored.

    Returns:
        bool: True when at least one non-missing value exists and every distinct value
        is contained in a single set of `bool_strings`; False otherwise. A column without
        any non-missing value returns False, because the empty set would otherwise count
        as a subset of every spelling pair and flag all-NaN columns as boolean.
    """
    values = set(series.dropna().unique())
    if not values:
        return False
    return any(values <= s for s in bool_strings)

# Every accepted spelling mapped onto its boolean value; built once outside the loop.
bool_string_values = {'true': True, 'True': True, 'yes': True, 'Yes': True, '1': True,
                      'false': False, 'False': False, 'no': False, 'No': False, '0': False}
text_columns = all_historic_frame.select_dtypes(include=['object', 'string']).columns
for col in text_columns:
    column = all_historic_frame[col]
    if not is_bool_string_col(column):
        continue
    # The nullable 'boolean' dtype keeps missing values; use 'bool' for non-nullable booleans.
    all_historic_frame[col] = column.replace(bool_string_values).astype('boolean')

In [None]:
# Column names grouped by the name of their dtype, kept for interactive inspection.
dtype_names = all_historic_frame.dtypes.apply(lambda dtype: dtype.name)
g = all_historic_frame.columns.to_series().groupby(dtype_names)
# Sub-frame with the columns matched by pandas' 'bool' dtype selector.
boolean_cols = all_historic_frame.select_dtypes(include='bool')

In [None]:
# Remove the columns that hold exactly one distinct non-missing value.
distinct_per_column = all_historic_frame.nunique(dropna=True)
is_single_value = distinct_per_column.index[distinct_per_column == 1].to_list()
all_historic_frame.drop(columns=is_single_value, inplace=True)

In [None]:
# Import the class directly so that the name `config` is never bound to the module.
from core.config import Config

config = Config()
# Sort a copy of the column labels. `columns.values` exposes the Index's underlying array,
# and sorting that array in place would relabel the frame's columns without moving their data.
column_names = sorted(all_historic_frame.columns.to_list())
# Write one feature name per line.
with open(config.eda_features_file_historic, "w") as file:
    file.write("\n".join(str(i) for i in column_names))

In [None]:
# Sub-frame containing only the columns whose dtype is `object`.
object_cols = all_historic_frame.select_dtypes('object')

In [None]:
# Count the NaN values of every object-dtype column and keep the counts as a Series
# (indexed by column name).
nan_count: pd.Series = object_cols.isna().sum()

In [None]:
# Columns with more missing values than this are left out of `filtered`.
nan_threshold = 15000
missing = all_historic_frame.isna()
# Missing values per column over the whole frame.
nan_count: pd.Series = missing.sum()
# Missing values per column for every value of the second index level ...
nan_per_year: pd.DataFrame = missing.groupby(level=1).sum()
# ... and summed over all columns, i.e. one total per value of that level.
nan_per_year_and_column: pd.Series = nan_per_year.sum(axis=1)
too_sparse_columns = nan_count.index[nan_count > nan_threshold]
filtered: pd.DataFrame = all_historic_frame.drop(columns=too_sparse_columns)

In [None]:
# Number of distinct non-missing Scope 3.1 values of every frame in historic_dictionary,
# as a list in dictionary order.
scope31_count: list[int] = [
    company_frame["TR.UpstreamScope3PurchasedGoodsAndServices"].nunique()
    for company_frame in historic_dictionary.values()
]

##  Impute Scope 3.1

In [None]:
import plotly.express as px
scope31_column = "TR.UpstreamScope3PurchasedGoodsAndServices"
# Build the long-format frame this section works on directly from the combined frame,
# so the cell also runs on a fresh kernel: one row per reported value, with the two
# index levels as the columns 0 and 1.
scope31 = all_historic_frame[[scope31_column]].dropna().reset_index(names=[0, 1])
scope31[scope31_column] = scope31[scope31_column].astype(float)
# Min-max normalise the values within every group of column 0.
# A group with a single distinct value yields NaN (0 / 0).
scope31["value_norm"] = scope31.groupby(0)[scope31_column].transform(
    lambda x: (x - x.min()) / (x.max() - x.min())
)
# Plot only the first 200 distinct entries of column 0 to keep the figure readable.
subset = scope31[scope31[0].isin(scope31[0].unique()[:200])]
fig = px.line(
    subset,
    x=1,
    y='value_norm',
    # One line per entry of column 0; colouring by the continuous value itself would put
    # every distinct value into its own trace, so no trend line would be drawn.
    color=0,
    title='Data Trends Over Years',
    template='plotly_dark'
)
fig.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Min-max normalise the Scope 3.1 values within every group of column 0.
scope31["value_norm"] = (
    scope31.groupby(0)["TR.UpstreamScope3PurchasedGoodsAndServices"]
    .transform(lambda values: (values - values.min()) / (values.max() - values.min()))
)
# Wide table: one row per entry of column 0, one column per entry of column 1.
pivot = scope31.pivot(index=0, columns=1, values='value_norm')
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(pivot, cmap='mako', cbar=True, ax=heatmap_ax)
heatmap_ax.set_title('Data Coverage and Trends by Company')
plt.show()

In [74]:
# Median of the normalised Scope 3.1 value for every entry of column 1, drawn as a line.
median_by_year = scope31.groupby(1)["value_norm"].median()
agg = median_by_year.reset_index()
median_fig = px.line(agg, x=1, y="value_norm", title="Median Value per Year")
median_fig.show()

### Write list of filtered features

In [None]:
# Alphabetically sorted names of all columns that survived the filtering above.
filtered_time_series_feature_list: list[str] = sorted(all_historic_frame.columns.to_list())
# Write one feature name per line.
feature_lines = "\n".join(map(str, filtered_time_series_feature_list))
with open(config.historic_features_file, "w") as file:
    file.write(feature_lines)

# TODO
- convert categorical data to category
- Check how balanced categorical data is