In [2020]:
import pandas as pd
import numpy as np

import os
from pathlib import Path

import json

from datetime import timedelta

In [1233]:
# Useful for checking the number of unique labels and for finding tickers with no matches
def unique_label_list(statement,m_dict):
    """Collect the distinct keys stored under ``statement`` across all tickers.

    Every ticker that has no entry for ``statement`` (or stores ``None`` for
    it) is printed on its own line, and the number of distinct keys found is
    printed at the end.

    Parameters
    ----------
    statement : str
        Key looked up inside each ticker's entry of ``m_dict``.
    m_dict : dict
        Mapping ``ticker -> {statement -> dict}``; only the keys of the inner
        dict are read here.

    Returns
    -------
    numpy.ndarray
        Sorted, de-duplicated string array of every key seen. The array is
        empty (string dtype) when no ticker has the statement.
    """
    collected = []
    for ticker in m_dict:
        entry = m_dict[ticker]
        if statement in entry and entry[statement] is not None:
            # Accumulate in a plain list; a single np.unique call at the end
            # replaces re-concatenating the whole array for every ticker.
            collected.extend(entry[statement].keys())
        else:
            print(ticker)

    # dtype=str keeps the result a string array even when nothing was collected.
    gaap_list = np.unique(np.array(collected, dtype=str))
    print(f"Number of unique labels {len(gaap_list)}")
    return gaap_list

def gaap_dict_with_tag_as_key(statement, m_dict):
    """Count, for every tag, how many times each label is listed for it.

    Walks all tickers in ``m_dict``; tickers without ``statement`` (or with
    ``None`` stored for it) are skipped, as are tags whose value is ``None``.

    Parameters
    ----------
    statement : str
        Key looked up inside each ticker's entry.
    m_dict : dict
        Mapping ``ticker -> {statement -> {tag -> iterable of labels}}``.

    Returns
    -------
    dict
        ``{tag: {label: occurrence_count}}``. A tag whose label iterable is
        empty still appears, mapped to an empty dict.
    """
    tag_counts = {}
    for ticker in m_dict:
        if statement not in m_dict[ticker]:
            continue
        tag_to_labels = m_dict[ticker][statement]
        if tag_to_labels is None:
            continue

        for tag in tag_to_labels.keys():
            labels = tag_to_labels[tag]
            if labels is None:
                continue
            # Register the tag before counting so it exists even with no labels.
            per_label = tag_counts.setdefault(tag, dict())
            for label in labels:
                per_label[label] = per_label.get(label, 0) + 1

    return tag_counts

def gaap_dict_with_label_as_key(gaap_dict, label_max=None):
    """Invert ``{tag: {label: count}}`` into ``{label: {tag: count}}``, dropping rare pairs.

    A (tag, label) pair is kept only when its count is strictly greater than
    ``max(2, label_max)``.

    Parameters
    ----------
    gaap_dict : dict
        ``{tag: {label: count}}``.
    label_max : number, optional
        Minimum-count knob for the filter. When omitted, a module-level
        ``label_max`` is used if one exists (the behaviour before this
        parameter was added); otherwise the floor of 2 applies on its own.

    Returns
    -------
    dict
        ``{label: {tag: count}}`` containing only the pairs that passed the
        filter; labels with no surviving tag are absent.
    """
    if label_max is None:
        # Previously this name was read straight from the enclosing namespace,
        # which raised NameError whenever the defining cell had not been run.
        label_max = globals().get("label_max", 2)
    threshold = max(2, label_max)

    inverted_gaap_dict = {}
    for tag, label_counts in gaap_dict.items():
        for label, count in label_counts.items():
            if count > threshold:
                inverted_gaap_dict.setdefault(label, {})[tag] = count
    return inverted_gaap_dict

def formated_gaap_dict(inverted_gaap_dict):
    """Turn ``{label: {tag: count}}`` into ``{label: [tag, ...]}`` ranked by count.

    Tags are ordered by descending count; tags with equal counts keep the order
    in which they appear in the input dict.

    Parameters
    ----------
    inverted_gaap_dict : dict
        ``{label: {tag: count}}``.

    Returns
    -------
    dict
        ``{label: list of str}``. Every value is a plain Python list of plain
        ``str`` (an empty list when the label has no tags), so the result can
        be passed to ``json.dump`` directly.
    """
    formated_inverted_gaap_dict = {}
    for label, tag_counts in inverted_gaap_dict.items():
        # sorted() is stable even with reverse=True, so ties stay in input order.
        ranked = sorted(tag_counts.items(), key=lambda item: item[1], reverse=True)
        formated_inverted_gaap_dict[label] = [str(tag) for tag, _ in ranked]

    return formated_inverted_gaap_dict


def create_formated_gaap_dict(data_path):
    """Build and persist the label -> tag lists for each financial statement.

    Reads ``<data_path>/mappings/ticker_tag_label_mapping.json``, prints the
    number of top-level entries, and then, for each of 'Income Statement',
    'Cash Flow' and 'Balance Sheet':

    * derives the ranked ``{label: [tag, ...]}`` mapping,
    * writes it to
      ``<data_path>/mappings/placeholder_label_tag_mapping_<statement>.json``,
    * prints every tag that ended up under more than one label.

    Parameters
    ----------
    data_path : str or path-like
        Directory containing the ``mappings`` folder. A trailing slash is
        optional.

    Returns
    -------
    dict
        ``{statement: {label: [tag, ...]}}`` for the three statements.
    """
    # Join path components instead of concatenating strings, so that
    # '../data' and '../data/' resolve to the same folder.
    mappings_dir = Path(data_path) / "mappings"

    with open(mappings_dir / "ticker_tag_label_mapping.json") as json_file:
        m_dict = json.load(json_file)

    print(f"Number of companies: {len(m_dict.keys())}")

    mapping_dict = {}
    for statement in ['Income Statement','Cash Flow','Balance Sheet']:

        gaap_dict = gaap_dict_with_tag_as_key(statement, m_dict)
        inverted_gaap_dict = gaap_dict_with_label_as_key(gaap_dict)
        formated_inverted_gaap_dict = formated_gaap_dict(inverted_gaap_dict)

        output_file = mappings_dir / f"placeholder_label_tag_mapping_{statement}.json"
        with open(output_file,"w") as json_file:
            json.dump(formated_inverted_gaap_dict,json_file)

        mapping_dict[statement] = formated_inverted_gaap_dict

        print_duplicates(mapping_dict[statement],statement)

    return mapping_dict
    
def print_duplicates(mapping_dict,statement):
    """Print every tag that is listed under two or more labels.

    One line is printed per such tag, in the form
    ``"<statement> <tag>: <label>, <label>, ..."``, with the labels
    de-duplicated and sorted. Tags are reported in the order they are first
    encountered.

    Parameters
    ----------
    mapping_dict : dict
        ``{label: iterable of tags}``.
    statement : str
        Prefix printed at the start of each line.
    """
    labels_by_tag = {}
    for label, tags in mapping_dict.items():
        for tag in tags:
            # A set per tag de-duplicates labels for *every* tag, and a label
            # with no tags no longer touches a loop variable that may be unset.
            labels_by_tag.setdefault(tag, set()).add(label)

    for tag, labels in labels_by_tag.items():
        if len(labels) >= 2:
            print(f"{statement} {tag}: {', '.join(sorted(labels))}")


In [1234]:
# Root data directory, given relative to the notebook's working directory.
data_path = '../data/'
# Build the per-statement label -> tag mapping; this call also reads and
# writes JSON files under the `mappings` folder of data_path and prints a report.
mapping_dict = create_formated_gaap_dict(data_path)

Number of companies: 149
Income Statement us-gaap_revenues: Gross Profit, Revenue
Income Statement us-gaap_interestexpense: Cost of Revenue, Interest Expense (Operating), Non-operating Interest Expenses
Income Statement us-gaap_incomelossfromcontinuingoperationsbeforeincometaxesextraordinaryitemsnoncontrollinginterest: EBT, Operating Income
Income Statement us-gaap_incomelossfromcontinuingoperationsbeforeincometaxesminorityinterestandincomelossfromequitymethodinvestments: EBT, Operating Income
Income Statement us-gaap_operatingincomeloss: EBIT, Operating Income
Income Statement us-gaap_netincomeloss: Income after Tax, Income from Continuous Operations, Consolidated Net Income/Loss, Net Income Common
Income Statement us-gaap_profitloss: Consolidated Net Income/Loss, Income after Tax, Income from Continuous Operations, Net Income Common
Income Statement us-gaap_incomelossfromcontinuingoperationsincludingportionattributabletononcontrollinginterest: Income after Tax, Income from Continuous