In [1]:
from core import Config
from data.cleaning import read_all_static_csv
import pandas as pd

# Read every static CSV found under the configured directory and stack the
# resulting frames into one table (presumably one frame per company - verify
# against `read_all_static_csv`).
config = Config()
static_dictionary: dict[str, pd.DataFrame] = read_all_static_csv(config.static_dir)
# Row-wise concatenation; the resulting columns are the union of all frames' columns.
all_static_frame: pd.DataFrame = pd.concat(
    [static_frame for static_frame in static_dictionary.values()],
    axis=0,
)

# Detect the columns that are present for most companies (at least 90%)

In [None]:
from collections import Counter
# Tally how often each column name occurs across all static frames.
static_counter: Counter = Counter(
    column_name
    for static_frame in static_dictionary.values()
    for column_name in static_frame.columns.to_list()
)
# Remember the number of frames before the dictionary is released; it is the
# denominator for the percentage calculation further down.
static_dictionary_len = len(static_dictionary)
# Drop the reference to the individual frames. NOTE: after this statement the
# cell can only be re-run once the loading cell has been executed again.
del static_dictionary
# (column name, occurrence count) pairs, most frequent first.
most_common_static_columns: list[tuple[str, int]] = static_counter.most_common()

In [None]:
# Build the data for the "how many companies share a feature" histogram and
# drop every feature that is present in fewer than 90% of the static frames.
bar_data: pd.DataFrame = pd.DataFrame(most_common_static_columns, columns=["Features", "sameFeatureCount"])
# Convert the raw occurrence count into a whole percentage (rounded down).
# Integer arithmetic is used on purpose: the float form int(x / n * 100)
# suffers from rounding error and turns e.g. 29 of 100 into 28.
bar_data["sameFeatureCount"] = bar_data["sameFeatureCount"] * 100 // static_dictionary_len
labels = [f"{lower} - {lower + 10}" for lower in range(0, 100, 10)]
# include_lowest=True keeps a value of exactly 0% in the first bin; without it
# pd.cut would map 0 to NaN and the feature would vanish from the histogram.
bar_data["companyCountIn%"] = pd.cut(
    bar_data["sameFeatureCount"],
    bins=range(0, 101, 10),
    labels=labels,
    include_lowest=True,
)
# observed=False keeps empty bins so that the chart always shows all ten ranges.
grouped: pd.DataFrame = bar_data.groupby("companyCountIn%", observed=False).count().reset_index()
grouped["companyCountIn%"] = grouped["companyCountIn%"].astype(str)
# Names of the features that occur in fewer than 90% of the frames.
mask = bar_data.loc[bar_data["sameFeatureCount"] < 90, "Features"]
# Re-bind instead of dropping in place and ignore labels that are already gone,
# so that re-running this cell does not raise a KeyError.
all_static_frame = all_static_frame.drop(columns=mask, errors="ignore")

In [None]:
import plotly.express as px

# Bar chart of the binned data: x = percentage range of companies,
# y = number of static features falling into that range.
fig = px.bar(
    grouped,
    x="companyCountIn%",
    y="sameFeatureCount",
    color="sameFeatureCount",
    color_continuous_scale="mint",
    text_auto=True,
    labels={
        "sameFeatureCount": "Anzahl überschneidender statischer Features",
        "companyCountIn%": "Anzahl Unternehmen in %"
    },
)
# Allow the bar value labels to be drawn outside the plotting area.
fig.update_traces(cliponaxis=False)
# The colour only mirrors the bar height, so the colour bar is hidden.
fig.update_coloraxes(showscale=False)
# Make sure the target directory exists before exporting; writing the image
# fails if the "figures" folder is missing.
# assumes config.data_dir is a pathlib.Path (it is combined with "/") - TODO confirm
figures_dir = config.data_dir / "figures"
figures_dir.mkdir(parents=True, exist_ok=True)
fig.write_image(figures_dir / "static-same-features-count.png")
fig.show()

In [19]:
# Exploration helper: collect all remaining column labels that contain "CUSIP".
is_cusip_column = all_static_frame.columns.str.contains("CUSIP")
columns_contain = all_static_frame.columns[is_cusip_column]

# Domain Knowledge Filtering

In [20]:
# Columns that are removed by hand based on domain knowledge. The trailing
# comments give the Refinitiv field code behind a human-readable column name
# or a short reason for the removal. Every label is listed exactly once.
remove_cols: list[str] = [
    # Contact - HQ address is more important than contact address
    'Contact Postal Code', #TR.CompanyCntPostalCodeAddr
    'Contact Country Name', #TR.CompanyCntName, TR.CompanyCntCountryAddr
    'Contact Street Address', #TR.CompanyCntStreetAddr
    'Contact City Address', #TR.CompanyCntCityAddr
    'Contact Email Address', #TR.CompanyCntEmail
    'Contact Title', #TR.CompanyCntTitle
    'Contact Phone Number', #TR.CompanyPhoneCnt

    # Mail, Telephone, etc. - Unnecessary information
    'Main Fax Number', #TR.CompanyPhoneFax
    'Main Phone Number', #TR.CompanyPhoneMain
    'TR.HeadquartersPhone',
    'Web Link to Home Page', #TR.CompanyLinkHome
    'TR.CompanyEmail',
    'TR.OrganizationWebsite',

    # Identifier - Redundant information, because we use the RIC (Refinitiv instrument code) as id for financial instruments
    'TR.CommonName',
    'Company Name', #TR.CompanyName
    'TR.AlsoKnownAsName',
    'TR.FormerlyKnownAsName',
    'TR.InstrumentName',
    'TR.InstrumentCommonName',
    'TR.ExchangeName',
    'TR.ShortExchangeName',
    'TR.OrganizationName',
    'TR.RelatedOrgName',
    'TR.RICCode',
    'Primary Issue RIC', #TR.PrimaryRIC
    'TR.PrimaryQuote',
    'TR.PrimaryRICCode',
    'TR.PrimaryInstrument',
    'TR.PrimaryIssueRICCode',
    'Ticker Symbol', #TR.TickerSymbol
    'TR.TickerSymbolOld',
    'TR.TickerSymbolCode',
    'TR.IssuerTickerCode',
    'TR.WertCode',
    'TR.SEDOLCode',
    'CUSIP (extended)', #TR.CUSIPExtended
    'TR.RegistrationNumber',
    'TR.InstrumentTypeCode',
    'TR.ISINCode',
    'TR.IssuerPICode',
    'TR.ExchangeMarketIdCode',
    'TR.ValorenCode',
    'TR.LipperRICCode',
    'TR.CommonCode',

    # Data that can't be put into categorical variables
    'Co. Business Summary', #TR.CompanyInfoBusSummary
    'TR.BusinessSummary',

    # Data that is too specific
    'TR.CmnLegalAddressLine1',
    'TR.CommonHQAddressLine2',
    'TR.LegalAddressLine1',
    'TR.LegalAddressLine2',
    'TR.HQAddressLine1',
    'TR.OrgFoundedDay',

    # Redundant information
    'TR.RegCountryCode', #instead, use TR.HQCountryCode
    'TR.ExchangeRegion', #instead, use TR.HQCountryCode
    'TR.HeadquartersRegion', #instead, use TR.HQCountryCode
    'TR.NAICSSectorCode', #instead, use TR.NAICSSectorAllCode
    'TR.LegalAddressPostalCode', #instead, use TR.HQAddressPostalCode

    # Other
    'TR.SegmentRevenueEstBrokerName', #The name of the broker forecasting the estimate.
    'TR.MICName', #MIC name
    'TR.TaxAuthorityName', #The authority to which the respective Organization pays taxes.
    'TR.PriceMainIndexRIC', #The main Index RIC applicable to the entity in question and used as the benchmark in values like Dividend Yield Relative to Primary Index.
    'TR.OrgProviderTypeCode', #Organization Provider type code.
    'TR.CUSIPCode',
    'TR.CinCUSIPCode', #Committee on Uniform Securities Identification Procedures Identifier for Non US & Canadian companies.
    'TR.ExchangeCountryCode', #ISO2 country code where the instrument trades.
    'TR.OrganizationStatusCode', #Indicates whether the Organization is active in the real world.
    'TR.EquityLocalCode', #Local code
    'TR.AssetIDCode', #The Refinitiv Fixed Income identifier.
]
# Codes and classifiers for which it is still undecided whether they should be
# kept. This list is only recorded here; it is not part of `remove_cols`.
unsure: list[str] = [
    'TR.OrgSubtypeCode', 'TR.TRBCBusinessSectorAllCode', 'TR.ICBSectorCode',
    'TR.NAICSSectorAllCode', 'TR.HQAddressPostalCode', 'TR.TRBCActivityAllCode',
    'TR.ICBSupersectorCode', 'TR.NAICSInternationalIndustryAllCode',
    'TR.TRBCEconSectorAllCode', 'TR.ICBIndustryCode', 'TR.HQCountryCode',
    'TR.CompanyReportCurrency', 'TR.NAICSSubsectorCode', 'TR.TRBCIndustryCode',
    'TR.NAICSInternationalIndustryCode', 'TR.TRBCActivityCode', 'TR.TRBCEconSectorCode',
    'TR.GICSIndustryGroupCode', 'TR.TRBCIndustryGroupAllCode', 'TR.OrganizationTypeCode',
    'TR.InstrumentListingStatusCode', 'TR.NAICSIndustryGroupAllCode',
    'TR.NAICSIndustryGroupCode', 'TR.TRBCIndustryGroupCode', 'TR.NAICSSubsectorAllCode',
    'TR.GICSSectorCode', 'TR.ICBSubsectorCode', 'TR.NAICSNationalIndustryAllCode',
    'TR.TRBCIndustryAllCode', 'TR.GICSSubIndustryCode', 'TR.OrgTypeCode',
    'TR.AssetCategoryCode', 'TR.NAICSNationalIndustryCode', 'TR.TRBCBusinessSectorCode',
    'TR.GICSIndustryCode', 'TR.SICIndustryCode', 'TR.SICDivisionCode',
    'TR.SICIndustryGroupCode', 'TR.SICMajorGroupCode'
]

In [21]:
# Non-destructive preview of the domain-knowledge filter: `drop` returns a new
# frame here, `all_static_frame` itself is left untouched.
second = all_static_frame.drop(labels=remove_cols, axis="columns")

In [None]:
# Apply the domain-knowledge filter and normalise the column dtypes.
# The frame is re-bound instead of modified in place and labels that are
# already missing are ignored, so the cell can be re-run without a KeyError.
# NOTE: errors="ignore" also hides misspelled labels in `remove_cols`.
all_static_frame = all_static_frame.drop(columns=remove_cols, errors="ignore")
# Let pandas pick the best nullable dtype for every column.
converted: pd.DataFrame = all_static_frame.convert_dtypes()
# Columns that ended up with the pandas "string" dtype are stored as categories.
string_columns = [name for name, dtype in converted.dtypes.items() if dtype.name == "string"]
# Guard against the case that no column has the "string" dtype at all.
if string_columns:
    converted[string_columns] = converted[string_columns].astype("category")
# Columns that are still of dtype "object" after the conversion, kept in a
# separate frame for manual inspection.
list_of_type = list(converted.select_dtypes(include=['object']).columns)
sub_df = converted[list_of_type]

### Save as CSV

In [None]:
# Persist the manually filtered frame (the index is written as first column).
converted.to_csv(config.filtered_dir / "manually_filtered_static.csv")
# Save the dtypes next to the data so that they can be restored when the CSV
# is read back. The column names are written as well ("column", "dtype");
# without them a dtype cannot be mapped back to its column. The result of
# to_csv is not kept: it is None when a path is given.
converted.dtypes.rename("dtype").to_csv(config.filtered_dir / "static_dtypes.csv", index_label="column")

# Work with filtered dataset

In [None]:
import pandas as pd
from core import Config

# Reload the manually filtered static data together with its saved dtypes.
config = Config()
dlist = pd.read_csv(config.filtered_dir / "static_dtypes.csv")
static_csv_path = config.filtered_dir / "manually_filtered_static.csv"
# Read only the header to learn the data column names (the index column is excluded).
static_columns = pd.read_csv(static_csv_path, index_col=0, nrows=0).columns
# The dtype names are stored in the last column of the dtype file, in the same
# order as the data columns. Taking the last column works for a file that only
# holds the dtype names as well as for one with a leading column-name column.
dtype_names = dlist.iloc[:, -1]
if len(dtype_names) != len(static_columns):
    raise ValueError(
        f"static_dtypes.csv lists {len(dtype_names)} dtypes, "
        f"but the data file has {len(static_columns)} columns"
    )
# read_csv expects either one dtype or a column -> dtype mapping; a bare array
# of dtype names is not applied per column.
dtype_by_column = dict(zip(static_columns, dtype_names))
filtered_static_frame: pd.DataFrame = pd.read_csv(static_csv_path, index_col=0, dtype=dtype_by_column)

## Filter companies with many NaN values (over 10%)

In [None]:
# Number of missing values per row of the reloaded, filtered frame, largest
# first. The reloaded `filtered_static_frame` is used so that this section does
# not depend on variables left over from the earlier part of the notebook.
nan_counts = filtered_static_frame.isna().sum(axis=1).sort_values(ascending=False)
# A row exceeds the limit when more than 10% of its columns are missing.
max_nan_per_row = filtered_static_frame.shape[1] * 0.1
# companies with NaN values over 10% (computed for inspection only; nothing is removed here)
companies_with_nan_values_over_10 = nan_counts[nan_counts > max_nan_per_row]

In [None]:
# Count the missing values per column of the reloaded, filtered frame. The
# reloaded `filtered_static_frame` is used so that this section does not depend
# on variables left over from the earlier part of the notebook.
nan_count: pd.Series = filtered_static_frame.isna().sum()
# Alphabetically sorted names of all remaining features.
filtered_static_feature_list: list[str] = sorted(filtered_static_frame.columns.to_list())
# Columns without any NaN value
filtered: pd.DataFrame = filtered_static_frame.filter(items=nan_count[nan_count == 0].index)
# Columns that contain at least one NaN value
not_none_filtered: pd.DataFrame = filtered_static_frame.filter(items=nan_count[nan_count != 0].index)

## Write list of features

In [None]:
# Persist the feature names, one per line and without a trailing newline.
feature_lines = "\n".join(map(str, filtered_static_feature_list))
with open(config.static_features_file, "w") as file:
    file.write(feature_lines)