In [9]:
from pyexpat import features

from scipy.cluster.hierarchy import leaves_list

from core import Config
from data.cleaning import read_all_static_csv
import pandas as pd

# Project configuration object; it supplies the directories used throughout this notebook.
config = Config()

# Mapping of name -> DataFrame returned by the project helper for `config.static_dir`
# (presumably one frame per static CSV file - verify against `read_all_static_csv`).
static_dictionary: dict[str, pd.DataFrame] = read_all_static_csv(
    config.static_dir
)

# Stack all individual frames row-wise (pd.concat default axis) into one combined frame.
all_static_frame: pd.DataFrame = pd.concat(
    objs=static_dictionary.values(),
)

# Exploratory Data Analysis

In [2]:
# Print a concise summary of the combined frame: index range, column count, dtypes and memory usage.
all_static_frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2865 entries, SCATC.OL to 8219.T
Columns: 696 entries, Non audit to Audit Fees Ratio Growth to TR.CriticalCountry5
dtypes: bool(4), float64(379), int64(21), object(292)
memory usage: 15.2+ MB


## Detect the features that most companies have in common

In [10]:
from collections import Counter

# Number of frames that were loaded; later cells use it to turn counts into percentages.
static_dictionary_len = len(static_dictionary)

# For every column name, count in how many of the loaded frames it occurs.
static_counter: Counter = Counter()
for dataframe in static_dictionary.values():
    static_counter.update(list(dataframe.columns))

# (column name, number of frames) pairs, ordered from most to least frequent.
most_common_static_columns: list[tuple[str, int]] = static_counter.most_common()

# Release the per-file frames. Note: after this statement the cell cannot be re-run
# (NameError) until the loading cell has been executed again.
del static_dictionary

In [11]:
# One row per feature (column name) together with the number of frames that report it.
bar_data: pd.DataFrame = pd.DataFrame(most_common_static_columns, columns=["Features", "sameFeatureCount"])
# Convert the raw count into a whole-number percentage of all frames (rounded down).
# Integer arithmetic is used on purpose: `int(x / n * 100)` suffers from float artefacts,
# e.g. 29 / 100 * 100 == 28.999999999999996, which would be truncated to 28.
bar_data["sameFeatureCount"] = bar_data["sameFeatureCount"] * 100 // static_dictionary_len
# Left-closed 10%-wide bins: [0, 10), [10, 20), ..., [90, 100]. The last edge is 101 so that
# 100% lands in the "90 - 100" bin. With right-closed bins, 0% became NaN and exactly 90% was
# counted as "80 - 90", which contradicts the `< 90` cut-off used for dropping features.
labels = ["{0} - {1}".format(i, i + 10) for i in range(0, 100, 10)]
bin_edges = [*range(0, 100, 10), 101]
bar_data["companyCountIn%"] = pd.cut(bar_data["sameFeatureCount"], bin_edges, right=False, labels=labels)
# Number of features per coverage bin. `observed=False` is spelled out so that empty bins are
# kept on the x-axis regardless of the pandas version's default.
grouped: pd.DataFrame = (
    bar_data.groupby("companyCountIn%", observed=False).count().reset_index(names="companyCountIn%")
)
# Plain strings instead of a categorical for the plotting cell.
grouped["companyCountIn%"] = grouped["companyCountIn%"].astype(str)

In [5]:
import plotly.express as px

# Axis titles (German) shown in place of the raw column names.
axis_labels = {
    "sameFeatureCount": "Anzahl überschneidender statischer Features",
    "companyCountIn%": "Anzahl Unternehmen in %",
}
# Bar chart: number of features per coverage bin; the bar colour encodes the same count.
fig = px.bar(
    data_frame=grouped,
    x="companyCountIn%",
    y="sameFeatureCount",
    color="sameFeatureCount",
    color_continuous_scale="mint",
    text_auto=True,
    labels=axis_labels,
)
# Do not clip the bar text at the axes, and hide the colour bar.
fig.update_traces(cliponaxis=False)
fig.update_coloraxes(showscale=False)
# Store the figure as a PNG below the data directory, then render it inline.
figure_path = config.data_dir / "figures" / "static-same-features-count.png"
fig.write_image(figure_path)
fig.show()

491 of the features are reported by 90% or more of the samples (companies).

### Inspect common features between 90 and 100

In [12]:
# One row per feature (column name) together with the number of frames that report it.
bar_data: pd.DataFrame = pd.DataFrame(most_common_static_columns, columns=["Features", "sameFeatureCount"])
# Whole-number percentage of frames reporting the feature (rounded down). Integer arithmetic
# avoids float artefacts of `int(x / n * 100)` (e.g. 29 / 100 * 100 == 28.999999999999996 -> 28),
# which matters here because the result decides which features fall below the 90% cut-off.
bar_data["sameFeatureCount"] = bar_data["sameFeatureCount"] * 100 // static_dictionary_len
# One left-closed bin per whole percent from 90 to 100: [90, 91), ..., [100, 101).
# Starting at 90 keeps the chart consistent with `mask_90_97`, which includes exactly 90%.
labels = ["{0}".format(i) for i in range(90, 101, 1)]
bar_data["companyCountIn%"] = pd.cut(
    bar_data["sameFeatureCount"], list(range(90, 102, 1)), right=False, labels=labels
)
# Number of features per percent bin; `observed=False` keeps empty bins on the x-axis
# regardless of the pandas version's default.
grouped: pd.DataFrame = (
    bar_data.groupby("companyCountIn%", observed=False).count().reset_index(names="companyCountIn%")
)
grouped["companyCountIn%"] = grouped["companyCountIn%"].astype(str)
# Despite the "mask" prefix these are Series of feature NAMES, not boolean masks.
# Features below 90% coverage (dropped later) ...
mask_under_90 = bar_data.loc[bar_data["sameFeatureCount"] < 90, "Features"]
# ... and features with 90% - 97% coverage (inspected manually).
mask_90_97 = bar_data.loc[(bar_data["sameFeatureCount"] >= 90) & (bar_data["sameFeatureCount"] < 98), "Features"]

In [7]:
# Axis titles (German) shown in place of the raw column names.
axis_labels = {
    "sameFeatureCount": "Anzahl überschneidender statischer Features",
    "companyCountIn%": "Anzahl Unternehmen in %",
}
# Bar chart: number of features per percent bin; the bar colour encodes the same count.
fig = px.bar(
    data_frame=grouped,
    x="companyCountIn%",
    y="sameFeatureCount",
    color="sameFeatureCount",
    color_continuous_scale="mint",
    text_auto=True,
    labels=axis_labels,
)
# Do not clip the bar text at the axes, and hide the colour bar.
fig.update_traces(cliponaxis=False)
fig.update_coloraxes(showscale=False)
fig.show()

In [14]:
# Drop all features whose coverage is below 90%. The frame is re-bound instead of mutated
# in place and missing labels are ignored, so re-running this cell is a no-op rather than
# a KeyError for columns that were already removed.
all_static_frame = all_static_frame.drop(columns=mask_under_90, errors="ignore")

We can see that most features even settle between 98% and 100%.

All features below 90% coverage are dropped, because too much missing data would otherwise have to be imputed.

Features between 90% and 97% will be closely inspected in the domain knowledge filtering:

['TR.NACEClassification', 'TR.OrgFoundedDay', 'TR.MemberIndexRic', 'TR.MemberIndexDate', 'TR.CmnLegalAddressLine1', 'TR.LegalAddressLine1',
'TR.CompanyIncorpDate', 'TR.HeadquartersCity', 'Contact City Address', 'TR.CompanyParentType', 'TR.HQAddressCity',
'TR.TaxAuthority', 'TR.TaxAuthorityName', 'TR.HQAddressPostalCode', 'Contact Postal Code', 'TR.RegistrationNumber', 'TR.RegistrationCity',
'TR.LegalAddressCity', 'TR.DUNSNUMBER', 'TR.PricePctChgRelIdx5Y', 'TR.AlsoKnownAsName', 'Main Phone Number', 'TR.HeadquartersPhone',
'TR.WertCode', 'TR.IssuerTickerCode', 'TR.PriceNetChg5Y', 'TR.PricePctChg5Y', 'TR.LegalAddressPostalCode']


## Domain Knowledge Filtering

When inspecting the features with 90% - 97% coverage, we can remove all but ['TR.NACEClassification', 'TR.CompanyParentType']

### Manually removed features

In [27]:
# Contact / legal / registration address columns. They are removed because the HQ address is
# considered more important than the contact address.
# A trailing comment names the corresponding TR.* field(s) of a plain-text column name.
representative_info: list[str] = [
    # --- contact details ---
    "Contact Postal Code",  # TR.CompanyCntPostalCodeAddr
    "Contact Country Name",  # TR.CompanyCntName, TR.CompanyCntCountryAddr
    "Contact Street Address",  # TR.CompanyCntStreetAddr
    "Contact City Address",  # TR.CompanyCntCityAddr
    "Contact Email Address",  # TR.CompanyCntEmail
    "Contact Title",  # TR.CompanyCntTitle
    "Contact Phone Number",  # TR.CompanyPhoneCnt
    # --- legal address ---
    "TR.LegalAddressCity",
    "TR.CmnLegalAddressLine1",
    "TR.CmnLegalAddressLine2",
    "TR.CmnLegalAddressLine3",
    "TR.LegalAddressLine1",
    "TR.LegalAddressLine2",
    "TR.LegalAddressLine3",
    "TR.LegalAddressPostalCode",
    "TR.LegalAddressStateProvince",
    # --- registration ---
    "TR.RegistrationCountry",
    "TR.RegistrationCity",
]
# Mail, telephone, fax and website columns - regarded as unnecessary information.
# A trailing comment names the corresponding TR.* field of a plain-text column name.
contact_info: list[str] = [
    "Main Fax Number",  # TR.CompanyPhoneFax
    "Main Phone Number",  # TR.CompanyPhoneMain
    "TR.HeadquartersPhone",
    "Web Link to Home Page",  # TR.CompanyLinkHome
    "TR.CompanyEmail",
    "TR.OrganizationWebsite",
]
# Identifier columns (names, tickers, security codes, internal ids).
# They are redundant because the RIC (Refinitiv instrument code) is used as the id for
# financial instruments. A trailing comment names the TR.* field of a plain-text column name.
# Every name is listed exactly once ('TR.SEDOLCode' used to appear twice).
identifier_info: list[str] = [
    # Names
    'TR.CommonName',
    'Company Name',  #TR.CompanyName
    'TR.AlsoKnownAsName',
    'TR.FormerlyKnownAsName',
    'TR.InstrumentName',
    'TR.InstrumentCommonName',
    'TR.ExchangeName',
    'TR.ShortExchangeName',
    'TR.OrganizationName',
    'TR.RelatedOrgName',
    # RICs, quotes and tickers
    'TR.RICCode',
    'Primary Issue RIC',  #TR.PrimaryRIC
    'TR.PrimaryQuote',
    'TR.PrimaryRICCode',
    'TR.PrimaryInstrument',
    'TR.PrimaryIssueRICCode',
    'Ticker Symbol',  #TR.TickerSymbol
    'TR.TickerSymbolOld',
    'TR.TickerSymbolCode',
    'TR.IssuerTickerCode',
    # Security / registration codes
    'TR.WertCode',
    'TR.SEDOLCode',
    'CUSIP (extended)',  #TR.CUSIPExtended
    'TR.RegistrationNumber',
    'TR.InstrumentTypeCode',
    'TR.ISINCode',
    'TR.IssuerPICode',
    'TR.ExchangeMarketIdCode',
    'TR.ValorenCode',
    'TR.LipperRICCode',
    'TR.CommonCode',
    'TR.DUNSNUMBER',
    'TR.CIKNUMBER',
    'TR.InstrumentDescription',
    'TR.MemberIndexRic',
    # Internal ids
    'ORG ID',
    'TR.QuoteID',
    'TR.InstrumentID',
    'TR.MXID',
    'TR.EstimateId',
]
# Columns removed because they are summaries / derived statistics rather than static facts:
# free-text summaries plus rolling price, volume, volatility and risk figures.
# The order of the entries carries no meaning; the list is only concatenated into `remove_cols`.
summed_data: list[str] = [
    # Free-text data that can't be put into categorical variables
    'Co. Business Summary',  #TR.CompanyInfoBusSummary
    'TR.BusinessSummary',
    'TR.CompanyInfoFinSummary',
    # Values that are not time series data but only aggregates (averages, changes) over a recent
    # window - they seem to be all float types
    'TR.PricePctChgRelIdx5Y',
    'TR.PricePctChg5Y',
    'TR.PriceNetChg5Y',
    'TR.PriceAvgPctDiff90D',
    'TR.AvgMonthlyVolume13W',
    'TR.PriceAvgNetDiff2D',
    'TR.AvgDailyVolume80D',
    'TR.PriceAvgPctDiff40D',
    'TR.AvgDailyVolume3M',
    'TR.AvgDailyVolume90D',
    'TR.AvgDirMovIdxRating30D',
    'TR.PriceAvgNetDiff100D',
    'TR.AvgDailyValTraded30D',
    'TR.PriceAvgPctDiff240D',
    'TR.PriceAvgNetDiff50D',
    'TR.PriceAvg20D',
    'TR.PriceAvgPctDiff25D',
    'TR.PriceAvgPctDiff160D',
    'TR.AvgDailyVolume2D',
    'TR.AvgDailyVolume120D',
    'TR.PriceAvg10D',
    'TR.AvgDailyVolume25D',
    'TR.MovAvgIntersect60D200D',
    'TR.PriceAvgNetDiff10D',
    'TR.PriceAvgPctDiff250D',
    'TR.PriceAvgPctDiff180D',
    'TR.AvgDailyVolume150D',
    'TR.AvgDailyVolume5D',
    'TR.PriceAvgNetDiff30D',
    'TR.PriceAvgPctDiff200D',
    'TR.PriceAvgPctDiff20D',
    'TR.PriceAvg160D',
    'TR.PriceAvgPctDiff60D',
    'TR.PriceAvg40D',
    'TR.PriceAvg90D',
    'TR.AvgDailyVolume30D',
    'TR.PriceAvg25D',
    'TR.AvgDailyVolume10Day',
    'TR.AvgDailyVolume60D',
    'TR.PriceAvgPctDiff30D',
    'TR.PriceAvg180D',
    'TR.PriceAvgNetDiff25D',
    'TR.PriceAvgNetDiff180D',
    'TR.AvgDailyVolume6M',
    'TR.PriceAvgPctDiff10D',
    'TR.AvgDailyVolume180D',
    'TR.PriceAvg250D',
    'TR.AvgDailyVolume200D',
    'TR.PriceAvg6M',
    'TR.AvgDirMovIdxRating14D',
    'TR.AvgDailyVolume50D',
    'TR.PriceAvg2D',
    'TR.PriceAvgNetDiff200D',
    'TR.AvgDailyVolume160D',
    'TR.PriceAvgNetDiff240D',
    'TR.PriceAvgNetDiff250D',
    'TR.AvgDailyVolume5DPrior5D',
    'TR.PriceAvgNetDiff40D',
    'TR.PriceAvgNetDiff90D',
    'TR.AvgDailyVolume20D',
    'TR.PriceAvg100D',
    'TR.PriceAvg80D',
    'TR.AvgMonthlyVolume3M',
    'TR.PriceAvgPctDiff100D',
    'TR.PriceAvgNetDiff160D',
    'TR.PriceAvgPctDiff50D',
    'TR.PriceAvgNetDiff120D',
    'TR.PriceAvg240D',
    'TR.AvgDailyValTraded5D',
    'TR.PriceAvgPctDiff5D',
    'TR.AvgDailyVolume40D',
    'TR.PriceAvgNetDiff20D',
    'TR.MovAvgIntersect30D200D',
    'TR.PriceAvgNetDiff5D',
    'TR.PriceAvgNetDiff150D',
    'TR.AvgDailyValTraded52W',
    'TR.AvgDailyVolume250D',
    'TR.PriceAvg120D',
    'TR.MovAvgIntersect30D60D',
    'TR.AvgDailyVolume100D',
    'TR.PriceAvg3M',
    'TR.AvgDailyValTraded20D',
    'TR.PriceAvg30D',
    'TR.AvgDirMovIdxRating9D',
    'TR.AvgDailyVolume13W',
    'TR.PriceAvgPctDiff2D',
    'TR.AvgDailyVolume240D',
    'TR.PriceAvg60D',
    'TR.PriceAvgNetDiff60D',
    'TR.PriceAvg5D',
    'TR.PriceAvgPctDiff80D',
    'TR.PriceAvgNetDiff80D',
    'TR.PriceAvgPctDiff120D',
    'TR.PriceAvgPctDiff150D',
    'TR.MovAvgCDSignal',
    'TR.MovAvgCDLine2',
    'TR.MovAvgCDLine1',
    'TR.AvgMonthlyValTraded1Year',
    'TR.RelPricePctChg26W',
    'TR.VolumePctChg5D30D',
    'TR.PricePctChg90D',
    'TR.PriceNetChg1D',
    'TR.PricePctChg20D',
    'TR.PricePctChg8M',
    'TR.PricePctChg4M',
    'TR.PricePctChg11M',
    'TR.PriceNetChg5D',
    'TR.PriceNetChg3M',
    'TR.PriceNetChg30D',
    'TR.PricePctChg9M',
    'TR.PriceNetChg2M',
    'TR.VolumeNetChg5D30D',
    'TR.PricePctChg100D',
    'TR.PricePctChgYTD',
    'TR.RelPricePctChg1D',
    'TR.PriceNetChg8M',
    'TR.PriceNetChg20D',
    'TR.PriceNetChg240D',
    'TR.PricePctChg10M',
    'TR.PricePctChg4W',
    'TR.RelPricePctChg52W',
    'TR.PricePctChg6M',
    'TR.PricePctChg5D',
    'TR.RelPricePctChg13W',
    'TR.PricePctChg7M',
    'TR.PricePctChg120D',
    'TR.PricePctChgRelIdxYTD',
    'TR.PricePctChg26W',
    'TR.PriceNetChg90D',
    'TR.PricePctChg5M',
    'TR.PricePctChg40D',
    'TR.PricePctChg60D',
    'TR.PriceNetChg9M',
    'TR.PricePctChg25D',
    'TR.PricePctChg250D',
    'TR.PricePctChg50D',
    'TR.PricePctChgWTD',
    'TR.PricePctChg2D',
    'TR.PriceNetChg5M',
    'TR.PricePctChgRelIdx1Y',
    'TR.PricePctChgRelIdx5D',
    'TR.PricePctChg52W',
    'TR.PriceNetChg250D',
    'TR.PriceNetChg10D',
    'TR.PriceNetChg50D',
    'TR.PricePctChg52WkHigh',
    'TR.PricePctChg1D',
    'TR.RelPricePctChg4W',
    'TR.PricePctChg80D',
    'TR.PriceNetChg80D',
    'TR.PriceNetChg10M',
    'TR.PriceNetChg100D',
    'TR.PricePctChg1Y',
    'TR.PricePctChgRelIdx3M',
    'TR.PriceNetChg120D',
    'TR.PriceNetChg1M',
    'TR.PriceNetChg200D',
    'TR.PriceNetChg150D',
    'TR.PriceNetChg7M',
    'TR.PricePctChg2M',
    'TR.PriceNetChg4M',
    'TR.PriceNetChg40D',
    'TR.PricePctChgRelIdx1M',
    'TR.PricePctChg150D',
    'TR.PricePctChgQTD',
    'TR.PricePctChg3M',
    'TR.PricePctChg13W',
    'TR.PricePctChg200D',
    'TR.PricePctChg30D',
    'TR.PriceNetChg1Y',
    'TR.PriceNetChg25D',
    'TR.PriceRelSMAPctChg200D',
    'TR.PriceNetChg2D',
    'TR.PriceNetChg180D',
    'TR.PricePctChg10D',
    'TR.PricePctChgMTD',
    'TR.PriceNetChg11M',
    'TR.PricePctChg180D',
    'TR.VolumePctChg1D',
    'TR.PricePctChg1M',
    'TR.PricePctChg52WkLow',
    'TR.PriceNetChg160D',
    'TR.PricePctChg160D',
    'TR.PriceNetChg6M',
    'TR.RelPricePctChgYTD',
    'TR.PricePctChg240D',
    'TR.PriceNetChg60D',
    'TR.AlphaMthlyUp5Y',
    'TR.SharpeRatioWklyDown3Y',
    'TR.BetaFiveYear',
    'TR.BetaWklyUp3Y',
    'TR.SharpeRatioWklyUp3Y',
    'TR.SharpeRatioWklyDown2Y',
    'TR.BetaWklyAdj2Y',
    'TR.SharpeRatioWkly3Y',
    'TR.AlphaWklyUp3Y',
    'TR.AlphaWklyUp2Y',
    'TR.BetaWklyUp2Y',
    'TR.AlphaMthly5Y',
    'TR.SharpeRatioWkly2Y',
    'TR.AlphaWkly3Y',
    'TR.BetaWkly3Y',
    'TR.AlphaWklyDown3Y',
    'TR.AlphaWklyDown2Y',
    'TR.SharpeRatioMthlyUp5Y',
    'TR.BetaWklyDown2Y',
    'TR.BetaFiveYearAdj',
    'TR.AlphaWkly2Y',
    'TR.SharpeRatioMthly5Y',
    'TR.SharpeRatioMthlyDown5Y',
    'TR.BetaWklyAdj3Y',
    'TR.SharpeRatioWklyUp2Y',
    'TR.AlphaMthlyDown5Y',
    'TR.BetaWkly2Y',
    'TR.BetaWklyDown3Y',
    'Average Daily Value Traded – 1 Week',
    'TR.Price52WeekHigh',
    'TR.Price52WkHighFlg5D',
    'TR.Price52WeekLow',
    'TR.High1W',
    'TR.RSIWilder3D',
    'TR.RSIWilder14D',
    'TR.Low1W',
    'TR.RSIWilder30D',
    'TR.SortinoRatio156W',
    'TR.Price52WkLowFlg1D',
    'TR.Volume3WSum',
    'TR.Price52WkHighFlg1D',
    'TR.RSIWilder9D',
    '1 Week Total Return Cross Asset',
    'TR.Price52WkLowFlg5D',
    'Average Daily Value Traded – 3 Months',
    'TR.RSISimple30D',
    'TR.VolatilityCloseToClose120D',
    'TR.Volatility100D',
    'TR.Price200DayAverage',
    'TR.Price150DayAverage',
    'TR.Volatility260D',
    'TR.DirMovIdxDiPlus',
    'TR.PriceDeviation',
    'TR.Volatility120D',
    'TR.Volatility25D',
    'TR.Volatility80D',
    'TR.Price50DayAverage',
    'TR.VolumeSum10D',
    'TR.VolumeBlockSum10D',
    'Average Daily Value Traded – 6 Months',
    'TR.RSIExp14D',
    'TR.VolumeDeviation',
    'TR.Volatility250D',
    'TR.VolumeNonBlockSum10D',
    'TR.Volatility150D',
    'TR.Volatility160D',
    'TR.RSISimple14D',
    'TR.RSISimple9D',
    'TR.Volatility5D',
    'TR.Volatility2D',
    'TR.BetaDaily180D',
    'TR.RSIExp30D',
    'TR.VolatilityCloseToClose90D',
    'TR.Volatility40D',
    'TR.Volatility200D',
    'TR.Volatility20D',
    # NOTE(review): by its name this looks like an identifier rather than a price/volume
    # statistic - confirm whether it belongs in `identifier_info` instead.
    'TR.OrganizationID',
    'TR.Liquidity10DAmt',
    'TR.Volatility180D',
    'TR.Volatility240D',
    'TR.RSISimple3D',
    'TR.RSIExp9D',
    'TR.VolatilityCloseToClose20D',
    'TR.BetaDaily90D',
    'TR.RSIExp3D',
    'Average Daily Value Traded – 2 Months',
    'TR.Volatility60D',
    'TR.Volatility10D',
    'TR.VolatilityCloseToClose60D',
    'TR.Volatility50D',
    'TR.BetaDown',
    'TR.Volatility90D',
    'TR.DirMovIdxDiMinus',
    'TR.Liquidity10DVol',
    'TR.Volatility30D',
    'TR.SortinoRatio60M',
    'TR.High1M',
    'TR.Low1M',
    '3-year Price PCT Change',
    'TR.BollingerLowBand',
    'TR.BollingerMidBand',
    'TR.BollingerUpBand',
    'TR.BetaUp',
    'TR.MoneyFlowTotalVol',
    'TR.MoneyFlowNonBlockVol',
    'TR.StopAndReversalPoint',
    'TR.SettlementPeriod',
]
# Street-level address lines: too specific / too distinctive per company to be useful.
# Some of these names are also listed in `representative_info`.
too_precise_data: list[str] = [
    "TR.CmnLegalAddressLine1",
    "TR.CommonHQAddressLine1",
    "TR.CommonHQAddressLine2",
    "TR.CommonHQAddressLine3",
    "TR.LegalAddressLine1",
    "TR.LegalAddressLine2",
    "TR.HQAddressLine1",
]
# Columns that duplicate information carried by another column.
# The trailing comment of each entry names the column that should be used instead.
redundant_data: list[str] = [
    # --- location ---
    "TR.ExchangeCode",  # instead, use TR.HQCountryCode
    "TR.RegCountryCode",  # instead, use TR.HQCountryCode
    "TR.RegStateProvince",  # instead, use TR.HQStateProvince
    "TR.HQAddressStateProvince",  # instead, use TR.HQStateProvince
    "TR.ExchangeRegion",  # instead, use TR.HQCountryCode
    "TR.HQAddressCity",  # instead, use TR.HeadquartersCity
    "TR.HeadquartersRegion",  # instead, use TR.HQCountryCode
    "TR.LegalAddressCountryISO",  # instead, use TR.HQCountryCode
    "TR.HQAddressCountryISO",  # instead, use TR.HQCountryCode
    "TR.LegalAddressPostalCode",  # instead, use TR.HQAddressPostalCode
    "TR.ImmediateParentCountryHQ",  # instead, use TR.ImmediateParentISOCountryHQ
    "TR.UltimateParentCountryHQ",  # instead, use TR.HQCountryCode
    "TR.UltimateParentISOCountryHQ",  # instead, use TR.HQCountryCode
    "TR.RelatedOrgCountry",  # instead, use TR.HQCountryCode
    "TR.HeadquartersCountry",  # instead, use TR.HQCountryCode
    # --- NAICS classification ---
    "TR.NAICSSectorAll",  # instead, use TR.NAICSSector
    "TR.NAICSIndustryGroupAll",  # instead, use TR.NAICSIndustryGroup
    "TR.NAICSNationalIndustryAll",  # instead, use TR.NAICSNationalIndustry
    "TR.NAICSIndustryGroupAllCode",  # instead, use TR.NAICSIndustryGroupCode
    "TR.NAICSNationalIndustryAllCode",  # instead, use TR.NAICSNationalIndustryCode
    "TR.NAICSSubsectorAll",  # instead, use TR.NAICSSubsector
    "TR.NAICSNationalIndustry",  # instead, use TR.NAICSInternationalIndustry
    "TR.NAICSInternationalIndustryAll",  # instead, use TR.NAICSInternationalIndustry
    "TR.NAICSInternationalIndustry",  # instead, use TR.NAICSIndustryGroup
    "TR.NAICSInternationalIndustryAllCode",  # instead, use TR.NAICSInternationalIndustryCode
    "TR.NAICSSubsectorAllCode",  # instead, use TR.NAICSSubsectorCode
    # --- miscellaneous ---
    "TR.QuoteMarketCapitalization",  # instead, use TR.QuoteMarketCap
    "TR.UltimateParentId",  # instead, use TR.UltimateParent
    "TR.PriceMoPriceCurrency",  # instead, use Currency Code
]
# Date columns (dates of reporting, trading, founding and segment estimates).
# A trailing comment names the corresponding TR.* field of a plain-text column name.
dates: list[str] = [
    # --- company level ---
    "Number Employees Date",  # TR.CompanyNumEmployDate
    "Number of Holders Date",  # TR.CompanyShldrNumDate
    "Shares O/S Date",  # TR.CompanySharesOutDate
    "Fiscal Year End Date",  # TR.CompanyFYearEnd
    "Source Filing Date",  # TR.CompanySrcFileDate
    "TR.CompanyPublicSinceDate",
    "TR.Price52WeekLowDate",
    "TR.DataThroughDate",
    "TR.FirstTradeDate",
    "TR.MemberIndexDate",
    "TR.CompanyIncorpDate",
    "TR.DataThroughDateValuation",
    "TR.IPODate",
    "TR.LowDate1W",
    "TR.HighDate1W",
    "TR.LowDate1M",
    "TR.HighDate1M",
    "TR.RetireDate",
    "TR.OrgFoundedDay",
    "TR.Price52WeekHighDate",
    # --- segment estimates ---
    "TR.SegmentRevenueProdNoteDate",
    "TR.SegmentEBITDAEstDate",
    "TR.SegmentEBITEstDate",
    "TR.SegmentEBITDAProdNoteDate",
    "TR.SegmentEBITEstConfirmDate",
    "TR.SegmentEBITEstStopDate",
    "TR.SegmentRevenueEstDate",
    "TR.SegmentRevenueEstConfirmDate",
    "TR.SegmentOpProfitEstDate",
    "TR.SegmentOpProfitProdNoteDate",
    "TR.SegmentEBITProdNoteDate",
    "TR.SegmentOpProfitEstConfirmDate",
    "TR.SegmentEBITDAEstConfirmDate",
    "TR.SegmentOrganicSalesGrowthEstConfirmDate",
    "TR.SegmentOrganicSalesGrowthEstDate",
    "TR.SegmentOrganicSalesGrowthProdNoteDate",
    "TR.SegmentNumofStoresByTotalProdNoteDate",
    "TR.SegmentNumofStoresByTotalEstConfirmDate",
    "TR.SegmentNumofStoresByTotalEstDate",
    "TR.SegmentSubscribersEstConfirmDate",
    "TR.SegmentSubscribersEstDate",
    "TR.SegmentSubscribersProdNoteDate",
    "TR.SegmentNetSubscriberAddsEstDate",
    "TR.SegmentNetSubscriberAddsProdNoteDate",
    "TR.SegmentNetSubscriberAddsEstConfirmDate",
    "TR.SegmentDailyActiveUsersEstConfirmDate",
    "TR.SegmentDailyActiveUsersProdNoteDate",
    "TR.SegmentDailyActiveUsersEstDate",
    "TR.SegmentNumofStoresOpenedByTotalProdNoteDate",
    "TR.SegmentRevenueEstStopDate",
    "TR.SegmentEBITDAReportedProdNoteDate",
    "TR.SegmentNumofStoresOpenedByTotalEstDate",
    "TR.SegmentEBITDAEstStopDate",
    "TR.SegmentNumofStoresClosedProdNoteDate",
    "TR.SegmentNumofStoresClosedEstDate",
    "TR.SegmentNumofStoresOpenedByTotalEstConfirmDate",
]
# Miscellaneous columns that are removed. The comment next to (or directly above) an entry gives
# the field description or the reason for removing it.
# Every name is listed exactly once ('TR.TaxAuthorityName' used to appear twice).
other_data: list[str] = [
    # Other
    'TR.SegmentRevenueEstBrokerName',  #The name of the broker forecasting the estimate.
    'TR.MICName',  #MIC name
    'TR.TaxAuthorityName',  #The authority to which the respective Organization pays taxes.
    #The main Index RIC applicable to the entity in question and used as the benchmark in values like Dividend Yield Relative to Primary Index.
    'TR.PriceMainIndexRIC',
    'TR.OrgProviderTypeCode',  #Organization Provider type code.
    'TR.CUSIPCode',
    #Committee on Uniform Securities Identification Procedures Identifier for Non US & Canadian companies.
    'TR.CinCUSIPCode',
    'TR.ExchangeCountry',  #Country where the instrument trades.
    'TR.ExchangeCountryCode',  #ISO2 country code where the instrument trades.
    'TR.OrganizationStatusCode',  #Indicates whether the Organization is active in the real world.
    'TR.EquityLocalCode',  #Local code
    'TR.AssetIDCode',  #The Refinitiv Fixed Income identifier.
    'TR.TaxAuthority',  # I don't think that knowing where the taxes are paid is relevant.
    'TR.OrganizationVerified',  # only same values
    'TR.IsRule144aRegistered',  # only same values
    'TR.IsCompositeQuote',  # only same values
    'TR.IsPrimaryQuote',  # don't know what this is and only 5 are not
    'TR.OrgFoundedMonth',
    'TR.IsPrimaryInstrument',  # only three are not
    'TR.PrimListFunExist',  # only same values
    'TR.HasFundamentalCoverage',  # only same values
    'TR.HasESGCoverage',  # only same values
]
# Codes and classifiers the author was unsure about. They are nevertheless part of the
# columns that get removed (the list is concatenated into `remove_cols`).
unsure: list[str] = [
    # --- classification / type codes ---
    "TR.OrgSubtypeCode",
    "TR.TRBCBusinessSectorAllCode",
    "TR.ICBSectorCode",
    "TR.TRBCActivityAllCode",
    "TR.ICBSupersectorCode",
    "TR.TRBCEconSectorAllCode",
    "TR.ICBIndustryCode",
    "TR.TRBCIndustryCode",
    "TR.TRBCActivityCode",
    "TR.TRBCEconSectorCode",
    "TR.TRBCIndustryGroupAllCode",
    "TR.InstrumentListingStatusCode",
    "TR.TRBCIndustryGroupCode",
    "TR.ICBSubsectorCode",
    "TR.TRBCIndustryAllCode",
    "TR.OrgTypeCode",
    "TR.AssetCategoryCode",
    "TR.TRBCBusinessSectorCode",
    "TR.SICIndustryCode",
    "TR.SICDivisionCode",
    "TR.SICIndustryGroupCode",
    "TR.SICMajorGroupCode",
    "TR.CriticalCountry1",
    "TR.IsCountryPrimaryQuote",
    "TR.ImmediateParentISOCountryHQ",
    # has around 200 values that have another parent company / could encode company=parent to 0
    "TR.ImmediateParent",
    # is the standard European industry classification system
    # NOTE(review): the markdown above says this feature should be kept, yet listing it here
    # removes it - confirm which one is intended.
    "TR.NACEClassification",
    # flag if a quote is delisted (only 1% are)
    "TR.IsDelistedQuote",
    # represents the market value - has to be reloaded with USD as currency
    "TR.QuoteMarketCap",
    # --- NAICS: TR.NAICSSectorCode is more generalized ---
    "TR.NAICSInternationalIndustryCode",
    "TR.NAICSNationalIndustryCode",
    "TR.NAICSIndustryGroupCode",
    "TR.NAICSSubsectorCode",
    "TR.NAICSSectorAllCode",
    # --- remaining NAICS / TRBC / ICB classifiers: focus on GICSCodes instead ---
    "TR.NAICSIndustryGroup",
    "TR.NAICSSectorCode",
    "TR.NAICSSector",
    "TR.TRBCBusinessSectorAll",
    "TR.TRBCIndustryGroupAll",
    "TR.TRBCBusinessSector",
    "TR.TRBCEconomicSector",
    "TR.TRBCEconSectorAll",
    "TR.TRBCIndustryGroup",
    "TR.TRBCActivityAll",
    "TR.TRBCIndustryAll",
    "TR.TRBCIndustry",
    "TR.TRBCActivity",
    "TR.ICBSupersector",
    "TR.ICBSubsector",
    "TR.ICBIndustry",
    "TR.ICBSector",
]
# All columns that the manual review marks for removal (may contain duplicates, because some
# names are listed in more than one category).
remove_cols: list[str] = list(representative_info + identifier_info + contact_info +
                              summed_data + too_precise_data + redundant_data + dates + other_data + unsure)

# Split the (de-duplicated, order-preserving) removal list by whether the column still exists.
# Deriving this from the frame replaces the former hand-written list, which contained fused
# entries (missing quotes/commas) that could never match a column and had to be kept in sync
# manually with the result of the <90% coverage filter.
present_columns = set(all_static_frame.columns)
unique_remove_cols: list[str] = list(dict.fromkeys(remove_cols))
# Columns marked for removal that are no longer in the frame, e.g. because the coverage filter
# already dropped them (after a re-run of this cell: every column in `remove_cols`).
already_filtered_under_90: list[str] = [col for col in unique_remove_cols if col not in present_columns]
# Columns that still have to be dropped.
filter_cols: list[str] = [col for col in unique_remove_cols if col in present_columns]
# Only existing columns are passed, so the cell can be re-run without a KeyError.
all_static_frame = all_static_frame.drop(columns=filter_cols)

### Save as csv

In [None]:
# Let pandas pick the best-fitting (nullable) dtype for every column before persisting.
all_static_frame = all_static_frame.convert_dtypes()
all_static_frame.to_csv(config.filtered_dir / "eda_filtered_static.csv")
# Save the dtypes to a second file so they are known when the data file is read back.
# The file holds one dtype name per row, in column order, without the column names.
# (`to_csv` returns None when a path is given, so its result is not bound to a variable.)
all_static_frame.dtypes.to_csv(config.filtered_dir / "eda_filtered_static_dtypes.csv", index=False)

# Work with filtered dataset

In [None]:
import pandas as pd
from core import Config

config = Config()
filtered_static_path = config.filtered_dir / "eda_filtered_static.csv"

# The dtype file holds one dtype name per data column, in column order and without column names.
dlist = pd.read_csv(config.filtered_dir / "eda_filtered_static_dtypes.csv")
# Read only the header of the data file to learn the column names (the first CSV column is the index).
data_columns = pd.read_csv(filtered_static_path, index_col=0, nrows=0).columns
if len(dlist) != len(data_columns):
    raise ValueError(
        f"dtype file lists {len(dlist)} dtypes but the data file has {len(data_columns)} columns"
    )
# `read_csv` expects a single dtype or a column -> dtype mapping; the raw array `dlist.values`
# would not assign the saved dtypes to their columns, so pair them up by position.
column_dtypes = dict(zip(data_columns, dlist.iloc[:, 0]))
filtered_static_frame: pd.DataFrame = pd.read_csv(filtered_static_path, index_col=0,
                                                  dtype=column_dtypes)

## Filter companies with many NaN values (over 10%)

In [None]:
# Work on the frame loaded in this section (`filtered_static_frame`) rather than on
# `all_static_frame`, which only exists if the earlier part of the notebook ran in the same kernel.
# Number of missing values per row (company), largest first.
nan_counts = filtered_static_frame.isnull().sum(axis=1).sort_values(ascending=False)
# Rows in which more than 10% of the features are missing.
nan_threshold = filtered_static_frame.shape[1] * 0.1
companies_with_nan_values_over_10 = nan_counts[nan_counts > nan_threshold]

In [None]:
# Count NaN in the whole static frame and filter
nan_count: pd.Series = all_static_frame.isna().sum()
filtered_static_feature_list: list[str] = all_static_frame.columns.to_list()
filtered_static_feature_list.sort()
# This is a dataframe without NaN values
filtered: pd.DataFrame = all_static_frame.filter(nan_count[nan_count == 0].index)
not_none_filtered: pd.DataFrame = all_static_frame.filter(nan_count[nan_count != 0].index)

## Write list of features

In [None]:
# Write the feature names to the configured file, one name per line (no trailing newline).
# The encoding is fixed to UTF-8 so the result does not depend on the platform default.
with open(config.static_features_file, "w", encoding="utf-8") as file:
    file.write("\n".join(str(i) for i in filtered_static_feature_list))