In [1]:
from core import Config
from data.cleaning import read_all_static_csv
import pandas as pd

# Load every filtered static CSV into a name -> DataFrame mapping and stack all
# frames into one combined frame for the analysis below.
# NOTE(review): presumably one DataFrame per company, keyed by its identifier —
# confirm against read_all_static_csv.
config = Config()
static_dictionary: dict[str, pd.DataFrame] = read_all_static_csv(config.filtered_dir_static)
all_static_frame: pd.DataFrame = pd.concat(static_dictionary.values())

# Exploratory Data Analysis

In [2]:
# Print a summary of the combined frame: index range, column count, dtypes and memory usage.
all_static_frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2853 entries, SCATC.OL to 8219.T
Columns: 482 entries, Contact Postal Code to Web Link to Home Page
dtypes: bool(6), float64(311), int64(18), object(147)
memory usage: 10.4+ MB


## Detect the features that most companies have in common

In [3]:
from collections import Counter

# Count, for every column name, in how many of the loaded frames it appears.
static_counter: Counter = Counter(
    column
    for frame in static_dictionary.values()
    for column in frame.columns.to_list()
)
# Keep the number of frames before the mapping is released; the later cells
# divide by it to turn the counts into percentages.
static_dictionary_len = len(static_dictionary)
del static_dictionary
# (feature name, number of frames containing it) pairs, most frequent first.
most_common_static_columns: list[tuple[str, int]] = static_counter.most_common()

In [4]:
# Turn the per-feature frame counts into coverage percentages and bucket them
# into 10%-wide bins for the bar chart.
bar_data: pd.DataFrame = pd.DataFrame(most_common_static_columns, columns=["Features", "sameFeatureCount"])
# Integer floor division keeps the truncated percentage exact; the float form
# int(x / n * 100) can end up one too low (e.g. 29 / 100 * 100 -> 28.999... -> 28).
bar_data["sameFeatureCount"] = bar_data["sameFeatureCount"] * 100 // static_dictionary_len
labels = ["{0} - {1}".format(i, i + 10) for i in range(0, 100, 10)]
# Bins are right-closed: (0, 10], (10, 20], ..., (90, 100]. include_lowest=True
# also closes the first bin on the left, so features whose percentage truncates
# to 0 are counted instead of being turned into NaN and dropped by the groupby.
bar_data["companyCountIn%"] = pd.cut(
    bar_data["sameFeatureCount"],
    range(0, 101, 10),
    labels=labels,
    include_lowest=True,
)
# observed=False is stated explicitly so empty bins stay in the result (and on
# the x-axis) regardless of the pandas default for categorical groupers.
grouped: pd.DataFrame = (
    bar_data
    .groupby("companyCountIn%", observed=False)
    .count()
    .reset_index(names="companyCountIn%")
)
# Plain strings instead of a categorical for the plot axis.
grouped["companyCountIn%"] = grouped["companyCountIn%"].astype(str)

In [5]:
import plotly.express as px

# Bar chart: number of features per company-coverage bucket.
axis_titles = {
    "sameFeatureCount": "Anzahl überschneidender statischer Features",
    "companyCountIn%": "Anzahl Unternehmen in %",
}
fig = px.bar(
    grouped,
    x="companyCountIn%",
    y="sameFeatureCount",
    color="sameFeatureCount",
    color_continuous_scale="mint",
    text_auto=True,
    labels=axis_titles,
)
# Do not clip the bar value labels at the axes, and hide the colour bar.
fig.update_traces(cliponaxis=False)
fig.update_coloraxes(showscale=False)
# Store the chart as a PNG next to the data before displaying it.
figure_path = config.data_dir / "figures" / "static-same-features-count.png"
fig.write_image(figure_path)
fig.show()

491 features are reported by 90% or more of the samples (companies).

### Inspect common features between 90 and 100

In [6]:
# Zoom into the upper coverage range with 1%-wide bins and collect the feature
# names of the coverage groups that are handled separately afterwards.
bar_data: pd.DataFrame = pd.DataFrame(most_common_static_columns, columns=["Features", "sameFeatureCount"])
# Integer floor division keeps the truncated percentage exact; the float form
# int(x / n * 100) can end up one too low (e.g. 29 / 100 * 100 -> 28.999... -> 28).
bar_data["sameFeatureCount"] = bar_data["sameFeatureCount"] * 100 // static_dictionary_len
labels = ["{0}".format(i) for i in range(91, 101, 1)]
# Right-closed bins (90, 91], ..., (99, 100]: each label is the integer
# percentage it holds; values of 90 and below fall outside and become NaN.
bar_data["companyCountIn%"] = pd.cut(bar_data["sameFeatureCount"], range(90, 101, 1), labels=labels)
# observed=False is stated explicitly so empty bins stay in the result (and on
# the x-axis) regardless of the pandas default for categorical groupers.
grouped: pd.DataFrame = (
    bar_data
    .groupby("companyCountIn%", observed=False)
    .count()
    .reset_index(names="companyCountIn%")
)
grouped["companyCountIn%"] = grouped["companyCountIn%"].astype(str)
# Despite the "mask" prefix these are Series of feature names, not boolean masks.
mask_under_90 = bar_data.loc[bar_data["sameFeatureCount"] < 90, "Features"]
mask_90_97 = bar_data.loc[(bar_data["sameFeatureCount"] >= 90) & (bar_data["sameFeatureCount"] < 98), "Features"]

In [7]:
# Bar chart: number of features per 1% coverage bucket.
axis_titles = {
    "sameFeatureCount": "Anzahl überschneidender statischer Features",
    "companyCountIn%": "Anzahl Unternehmen in %",
}
fig = px.bar(
    grouped,
    x="companyCountIn%",
    y="sameFeatureCount",
    color="sameFeatureCount",
    color_continuous_scale="mint",
    text_auto=True,
    labels=axis_titles,
)
# Do not clip the bar value labels at the axes, and hide the colour bar.
fig.update_traces(cliponaxis=False)
fig.update_coloraxes(showscale=False)
fig.show()

In [8]:
# Remove every feature whose coverage percentage is below 90.
# Reassigning with errors="ignore" (instead of an in-place drop) keeps the cell
# re-runnable: a second execution no longer raises KeyError for columns that
# were already removed.
all_static_frame = all_static_frame.drop(columns=mask_under_90, errors="ignore")

We can see that most of these features even settle between 98% and 100%.

All features below 90% are dropped, because too much missing data would otherwise have to be imputed.

Features between 90% and 97% will be closely inspected in the domain knowledge filtering:

['TR.NACEClassification', 'TR.OrgFoundedDay', 'TR.MemberIndexRic', 'TR.MemberIndexDate', 'TR.CmnLegalAddressLine1', 'TR.LegalAddressLine1',
'TR.CompanyIncorpDate', 'TR.HeadquartersCity', 'Contact City Address', 'TR.CompanyParentType', 'TR.HQAddressCity',
'TR.TaxAuthority', 'TR.TaxAuthorityName', 'TR.HQAddressPostalCode', 'Contact Postal Code', 'TR.RegistrationNumber', 'TR.RegistrationCity',
'TR.LegalAddressCity', 'TR.DUNSNUMBER', 'TR.PricePctChgRelIdx5Y', 'TR.AlsoKnownAsName', 'Main Phone Number', 'TR.HeadquartersPhone',
'TR.WertCode', 'TR.IssuerTickerCode', 'TR.PriceNetChg5Y', 'TR.PricePctChg5Y', 'TR.LegalAddressPostalCode']


## Domain Knowledge Filtering

When inspecting the features with 90% - 97% coverage, we can remove all but ['TR.NACEClassification', 'TR.CompanyParentType']

### Manually removed features

In [9]:
from data.constants.constants_features import REPRESENTATIVE_INFO, IDENTIFIER_INFO, CONTACT_INFO, SUMMED_DATA, TOO_PRECISE_DATA, REDUNDANT_DATA, DATES, OTHER_DATA, UNSURE

# Union of all feature groups imported from constants_features that are removed manually.
remove_cols: set = set(
    REPRESENTATIVE_INFO + IDENTIFIER_INFO + CONTACT_INFO +
    SUMMED_DATA + TOO_PRECISE_DATA + REDUNDANT_DATA + DATES +
    OTHER_DATA + UNSURE
)
# Restrict the removal to columns that are still present, so drop() never sees
# an unknown label (also keeps the cell re-runnable).
reducing_set: set = set(all_static_frame.columns) & remove_cols
# A sorted list gives drop() a deterministic, list-like argument instead of a set.
all_static_frame.drop(columns=sorted(reducing_set), inplace=True)

### Save as csv

In [10]:
# Let pandas pick the best nullable dtype per column, then persist the frame.
eda_dir = config.eda_filtered_dir
all_static_frame = all_static_frame.convert_dtypes()
all_static_frame.to_csv(eda_dir / "eda_filtered_static.csv")
# Save the dtypes in a companion file so they can be restored when the CSV is read back.
dtypes = all_static_frame.dtypes.to_frame(name='dtypes').reset_index()
dtypes.to_csv(eda_dir / "eda_filtered_static_dtypes.csv")

# Work with filtered dataset

In [1]:
import pandas as pd
from core import Config

config = Config()
# Companion file written together with the filtered CSV: one row per column
# with its name first and its dtype second.
dtypes: pd.DataFrame = pd.read_csv(
    config.eda_filtered_dir / "eda_filtered_static_dtypes.csv",
    index_col=0,
)
# Column name -> dtype string, in the form expected by read_csv(dtype=...).
dtypes_dict: dict[str, str] = {
    row[0]: row[1] for row in dtypes.itertuples(index=False)
}

# Re-read the filtered frame with the stored dtypes applied.
filtered_df: pd.DataFrame = pd.read_csv(
    config.eda_filtered_dir / "eda_filtered_static.csv",
    dtype=dtypes_dict,
    index_col=0,
)

# Samples without any GICS Sector are removed from the working frame.
samples_without_gics_sector = [
    '80CX.L', 'DADA.OQ', 'BSIF.L', 'GCPI.L', 'ETWO.N', 'HICL.L', 'ORIT.L',
    'GREENY.BR', '42TE.L', 'TSETNQJ.J', 'DLAR.L', 'BRWM.L', 'SYNCS.L',
    'SAIN.L', '2888.TW',
]
filtered_df.drop(index=samples_without_gics_sector, inplace=True)

### Fill with Median or most common values (mode)

In [3]:
# Fill missing categorical values with the most common value (mode) of the
# group the sample belongs to.
# Example: a missing 'Currency Code' is replaced by the most frequent currency
# among the samples that share the same 'TR.HQCountryCode'.

# column with missing values -> column whose groups provide the mode
fill_key_by_modes_of_value: dict[str, str] = {
    'Currency Code': 'TR.HQCountryCode',
    'TR.AssetCategory': 'TR.GICSSectorCode',
    'TR.BusinessSector': 'TR.GICSSectorCode',
    'TR.BusinessSectorScheme': 'TR.GICSSectorCode',
    'TR.CompanyParentType': 'TR.GICSSectorCode',
    'TR.HeadquartersRegionAlt': 'TR.HQCountryCode',
    'TR.InstrumentType': 'TR.GICSSectorCode',
    'TR.OrganizationType': 'TR.GICSSectorCode',
    'TR.PriceMainIndex': 'TR.HQCountryCode',
    'TR.RelatedOrgISO2': 'TR.HQCountryCode',
    'TR.RelatedOrgType': 'TR.GICSSectorCode',
}
modes: dict[str, str]
mask: pd.Series
for missing_value_col, col in fill_key_by_modes_of_value.items():
    modes = {}
    for group_key, group_values in filtered_df.groupby(col)[missing_value_col]:
        group_mode = group_values.mode()
        # mode() ignores missing values, so a group without a single observed
        # value yields an empty Series; skip it (its rows simply stay missing)
        # instead of failing with IndexError on .iloc[0].
        if not group_mode.empty:
            modes[group_key] = group_mode.iloc[0]
    # Only rows that are actually missing are overwritten.
    mask = filtered_df[missing_value_col].isna()
    filtered_df.loc[mask, missing_value_col] = filtered_df.loc[mask, col].map(modes)

# Fill missing 'Total Share Float' values with the (integer-truncated) median
# of the sample's GICS sector.
medians: dict[str, int] = {}
for sector_code, sector_floats in filtered_df.groupby('TR.GICSSectorCode')['Total Share Float']:
    sector_median = sector_floats.dropna().median()
    # A sector without any observed value has a missing median; int() would
    # raise on it, so such sectors are skipped and their rows stay missing.
    if pd.notna(sector_median):
        medians[sector_code] = int(sector_median)
mask = filtered_df['Total Share Float'].isna()
filtered_df.loc[mask, 'Total Share Float'] = filtered_df.loc[mask, 'TR.GICSSectorCode'].map(medians)

In [4]:
from data.constants.constants_hq import HQ
# Fill missing headquarters cities from the HQ constant.
# NOTE(review): presumably HQ maps a sample's index label to its city — confirm in constants_hq.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is chained assignment, which warns in pandas 2.x and leaves
# filtered_df unchanged once Copy-on-Write is active.
filtered_df['TR.HeadquartersCity'] = filtered_df['TR.HeadquartersCity'].fillna(HQ)

## Filter companies with many NaN values (over 10%)

In [None]:
# Number of missing values per company (row), largest first.
# NOTE(review): this inspects all_static_frame, not the imputed filtered_df
# built above — confirm that is the intended frame.
nan_counts = all_static_frame.isna().sum(axis=1).sort_values(ascending=False)
# companies with NaN values over 10% of all columns
nan_limit = all_static_frame.shape[1] * 0.1
companies_with_nan_values_over_10 = nan_counts.loc[nan_counts > nan_limit]

In [None]:
# Count the NaN values of every feature (column) in the static frame.
nan_count: pd.Series = all_static_frame.isna().sum()
# Alphabetically sorted names of all remaining features.
filtered_static_feature_list: list[str] = sorted(all_static_frame.columns.to_list())
complete_features = nan_count.loc[nan_count.eq(0)].index
incomplete_features = nan_count.loc[nan_count.ne(0)].index
# This is a dataframe without NaN values
filtered: pd.DataFrame = all_static_frame.filter(items=complete_features)
# Counterpart: only the features that contain at least one NaN value.
not_none_filtered: pd.DataFrame = all_static_frame.filter(items=incomplete_features)

## Write list of features

In [None]:
# Write the feature names to the configured file, one name per line.
# The explicit encoding makes the output independent of the platform default,
# so non-ASCII feature names are written identically everywhere.
with open(config.static_features_file, "w", encoding="utf-8") as file:
    file.write("\n".join(str(feature) for feature in filtered_static_feature_list))