In [None]:
from core import Config
from data.cleaning import read_all_static_csv
import pandas as pd

# Project configuration; supplies the directories used throughout this notebook.
config = Config()
# One DataFrame per static CSV, keyed by a string
# (presumably one file per company -- TODO confirm against read_all_static_csv).
static_dictionary: dict[str, pd.DataFrame] = read_all_static_csv(config.static_dir)
# Stack all frames row-wise into a single table; the original row labels are kept.
all_static_frame: pd.DataFrame = pd.concat(list(static_dictionary.values()), axis=0)

In [None]:
from collections import Counter
# Tally how often each column name occurs across all loaded frames.
static_counter: Counter = Counter(
    column_name
    for frame in static_dictionary.values()
    for column_name in frame.columns.to_list()
)
# Remember the number of frames before releasing them; it is the denominator for the shares.
static_dictionary_len = len(static_dictionary)
# Release the per-file frames to free memory.
del static_dictionary
# (column name, occurrence count) pairs, most frequent first.
most_common_static_columns: list[tuple[str, int]] = static_counter.most_common()

In [None]:
# Columns present in fewer than this share (in %) of the loaded frames are dropped below.
MIN_COVERAGE_PERCENT = 90

# One row per column name; the occurrence count is turned into a whole-number percentage
# of all loaded frames (truncated, not rounded).
bar_data: pd.DataFrame = pd.DataFrame(most_common_static_columns, columns=["Features", "sameFeatureCount"])
bar_data["sameFeatureCount"] = (bar_data["sameFeatureCount"] / static_dictionary_len * 100).astype(int)

# Bucket the percentages into ten bins of width 10 for the bar chart.
labels = ["{0} - {1}".format(i, i + 10) for i in range(0, 100, 10)]
# include_lowest=True: the bins are right-closed, so without it a share that truncates to 0 %
# falls outside the first bin, becomes NaN and silently disappears from the counts.
bar_data["companyCountIn%"] = pd.cut(
    bar_data["sameFeatureCount"], range(0, 101, 10), labels=labels, include_lowest=True
)
# observed=False keeps empty buckets in the result (count 0) independent of the pandas default.
grouped: pd.DataFrame = (
    bar_data.groupby("companyCountIn%", observed=False).count().reset_index(names="companyCountIn%")
)
grouped["companyCountIn%"] = grouped["companyCountIn%"].astype(str)

# Despite its name, `mask` holds the *names* of the columns below the coverage threshold.
mask = bar_data.loc[bar_data["sameFeatureCount"] < MIN_COVERAGE_PERCENT, "Features"]
# Re-bind instead of mutating in place and tolerate already-removed columns,
# so that re-running this cell does not raise a KeyError.
all_static_frame = all_static_frame.drop(columns=mask, errors="ignore")

In [None]:
import plotly.express as px

# Bar chart of `grouped`: coverage bucket on the x axis, number of features in that bucket
# on the y axis (also used for the bar colour).
fig = px.bar(
    grouped,
    x="companyCountIn%",
    y="sameFeatureCount",
    color="sameFeatureCount",
    color_continuous_scale="mint",
    text_auto=True,
    labels={
        "sameFeatureCount": "Anzahl überschneidender statischer Features",
        "companyCountIn%": "Anzahl Unternehmen in %"
    },
)
# Let the value labels extend beyond the plotting area instead of being clipped.
fig.update_traces(cliponaxis=False)
# The colour encodes the same value as the bar height, so the colour bar is hidden.
fig.update_coloraxes(showscale=False)

# write_image does not create missing directories; make sure the target folder exists.
# NOTE(review): assumes config.data_dir is a pathlib.Path (it is combined with "/") -- confirm.
figure_path = config.data_dir / "figures" / "static-same-features-count.png"
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_image(figure_path)
fig.show()

In [None]:
# Manually removed columns:
# - Identifiers are dropped because they carry no information beyond the instrument id.
# - Columns with too many distinct values are dropped as well: when (almost) every value is
#   different, the model cannot learn which values go with a high or low Scope 3.1.
# Each column is listed exactly once.
remove_cols: list[str] = [
    'TR.TickerSymbolCode',  # identifier
    'TR.TickerSymbolOld',  # identifier
    'TR.IssuerTickerCode',  # identifier
    'CUSIP (extended)',  # identifier
    'Main Phone Number',  # useless information
    'Ticker Symbol',  # identifier
    'TR.SEDOLCode',  # identifier
    'Contact Postal Code',  # too many distinct values
    'TR.WertCode',  # identifier
    'TR.RegistrationNumber',  # identifier
    'TR.OrgFoundedDay',
    'Primary Issue RIC',
    'TR.ExchangeName',
    'TR.PrimaryInstrument',
    'TR.PrimaryRICCode',
    'TR.InstrumentName',
    'Contact Title',
    'TR.ExchangeRegion',
    'TR.InstrumentCommonName',
    'Web Link to Home Page',
    'TR.RelatedOrgName',
    'Co. Business Summary',
    'Contact Street Address',
    'TR.FormerlyKnownAsName',
    'TR.CmnLegalAddressLine1',
    'TR.HQAddressLine1',
    'TR.AlsoKnownAsName',
    'TR.PrimaryIssueRICCode',
    'TR.BusinessSummary',
    'TR.PrimaryQuote',
    'TR.CommonName',
    'Contact Name',
    'TR.LegalAddressLine1',
    'TR.SegmentRevenueEstBrokerName',
    'Contact Email Address',
    'TR.OrganizationWebsite',
    'Contact Phone Number',
    'TR.CommonHQAddressLine2',
    'TR.ShortExchangeName',
    'Main Fax Number',
    'TR.LegalAddressLine2',
    'TR.CompanyEmail'
]

In [None]:
# Drop the manually selected columns. errors="ignore" is deliberate: a listed column may
# already be gone (the coverage filter in an earlier cell removes columns too), and
# re-running this cell must not raise a KeyError. The trade-off is that a misspelled
# name in remove_cols is skipped silently.
all_static_frame = all_static_frame.drop(columns=remove_cols, errors="ignore")

# Let pandas pick the best nullable dtype for every column.
converted: pd.DataFrame = all_static_frame.convert_dtypes()

# Store every column that was inferred as "string" as a categorical instead.
# The guard covers the case that no string column exists at all.
string_columns = [column for column, dtype in converted.dtypes.items() if dtype.name == "string"]
if string_columns:
    converted[string_columns] = converted[string_columns].astype("category")

# Columns still typed as plain `object` after the conversion, kept in a separate frame for inspection.
list_of_type = list(converted.select_dtypes(include=['object']).columns)
sub_df = converted[list_of_type]

In [None]:
converted.to_csv(config.filtered_dir / "manually_filtered_static.csv")

In [11]:
#safe dtypes as files to make it possible to know the dtypes of the file while reading
dlist = converted.dtypes.to_csv(config.filtered_dir / "static_dtypes.csv", index=False)

In [15]:
import pandas as pd
from core import Config

config = Config()
dlist = pd.read_csv(config.filtered_dir / "static_dtypes.csv")
filtered_static_frame: pd.DataFrame = pd.read_csv(config.filtered_dir / "manually_filtered_static.csv", index_col=0, dtype=dlist.values)

In [None]:
# Count the missing values per column and build a frame from the columns that have none.
# NOTE(review): this cell reads all_static_frame, which only exists if the earlier cells ran
# in this kernel; the frame reloaded as filtered_static_frame is not used here -- confirm.
nan_count: pd.Series = all_static_frame.isna().sum()
complete_columns = nan_count.loc[nan_count.eq(0)].index
filtered: pd.DataFrame = all_static_frame.filter(items=complete_columns)
# NOTE(review): the feature list is taken from *all* columns of all_static_frame,
# not from `filtered` -- confirm this is intended.
filtered_static_feature_list: list[str] = sorted(all_static_frame.columns.to_list())

## Write list of features

In [None]:
# Write the sorted feature names to the configured file, one name per line
# (no trailing newline after the last entry).
with open(config.static_features_file, mode="w") as feature_file:
    feature_file.write("\n".join(map(str, filtered_static_feature_list)))