In [2020]:
import json
import os
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd

In [1233]:
#useful to look at the number of unqiue labels and find tickers with no matches
def unique_label_list(statement,m_dict):
    gaap_list = []
    for ticker in m_dict:
        if statement in m_dict[ticker] and m_dict[ticker][statement] is not None:
            current_statement = list(m_dict[ticker][statement].keys())
            gaap_list = np.concatenate((gaap_list,list(m_dict[ticker][statement].keys())))
            #print(f'{ticker}: {len(current_statement)}')
        else:
            print(ticker)

    gaap_list = np.unique(np.array(gaap_list).flatten())
    print(f"Number of unique labels {len(gaap_list)}")
    return gaap_list

def gaap_dict_with_tag_as_key(statement, m_dict):
    gaap_dict = {}
    for ticker in m_dict:

        if (statement in m_dict[ticker]) and (m_dict[ticker][statement] is not None):
            for tag in m_dict[ticker][statement].keys():
                if (tag in m_dict[ticker][statement]) and (m_dict[ticker][statement][tag] is not None):
                    if tag not in gaap_dict:
                        gaap_dict[tag] = dict()

                    for label in m_dict[ticker][statement][tag]:
                        if label not in gaap_dict[tag]:
                            gaap_dict[tag][label] = 1
                        else:
                            gaap_dict[tag][label] += 1


    return gaap_dict

def gaap_dict_with_label_as_key(gaap_dict):
    inverted_gaap_dict = {}
    for tag in gaap_dict:
        for label in gaap_dict[tag]:
            if gaap_dict[tag][label] > max(2,label_max):

                if label not in inverted_gaap_dict:

                    inverted_gaap_dict[label] = {}

                inverted_gaap_dict[label][tag] = gaap_dict[tag][label] 
    return inverted_gaap_dict

def formated_gaap_dict(inverted_gaap_dict):
    formated_inverted_gaap_dict = {}
    for label in inverted_gaap_dict:
        formated_inverted_gaap_dict[label] = np.array([])
        tag_list = []
        to_dict = {}
        for tag in inverted_gaap_dict[label]:
            matches = inverted_gaap_dict[label][tag]
            if matches not in to_dict:
                to_dict[matches] = np.array([])
            to_dict[matches] = np.append(to_dict[matches],tag)
            
        for matches in (sorted(to_dict.keys(),reverse=True)):
            formated_inverted_gaap_dict[label] = np.append(formated_inverted_gaap_dict[label],to_dict[matches])
            formated_inverted_gaap_dict[label] = list(formated_inverted_gaap_dict[label])

    return formated_inverted_gaap_dict


def create_formated_gaap_dict(data_path):
    
    mapping_dict = {}
    
    with open(f"{data_path}mappings/ticker_tag_label_mapping.json") as json_file:
        m_dict = json.load(json_file)
    
    print(f"Number of companies: {len(m_dict.keys())}")
    
    for statement in ['Income Statement','Cash Flow','Balance Sheet']:

        gaap_dict = gaap_dict_with_tag_as_key(statement, m_dict)
        inverted_gaap_dict = gaap_dict_with_label_as_key(gaap_dict)
        formated_inverted_gaap_dict = formated_gaap_dict(inverted_gaap_dict)
        
        
        with open(f"{data_path}mappings/placeholder_label_tag_mapping_{statement}.json","w") as json_file:
            json.dump(formated_inverted_gaap_dict,json_file)

        
        mapping_dict[statement] = formated_inverted_gaap_dict
        
        print_duplicates(mapping_dict[statement],statement)
    
    return mapping_dict
    
def print_duplicates(mapping_dict,statement):
    duplicate_dict = {}
    for label in mapping_dict:
        for tag in mapping_dict[label]:
            if tag not in duplicate_dict:
                duplicate_dict[tag] = []

            duplicate_dict[tag].append(label)
        duplicate_dict[tag] = list(np.unique(duplicate_dict[tag]))

    for tag in duplicate_dict:
        if len(duplicate_dict[tag]) >= 2:
            print(f"{statement} {tag}: {', '.join(duplicate_dict[tag])}")


In [1234]:
# Root of the data directory. Other cells build file paths from it by plain
# string concatenation (f"{data_path}timeseries/..."), so keep the trailing slash.
data_path = '../data/'
# Builds the label -> tags mapping for every statement type; the call also writes
# JSON files under <data_path>mappings/ and prints tags mapped to several labels.
mapping_dict = create_formated_gaap_dict(data_path)

Number of companies: 149
Income Statement us-gaap_revenues: Gross Profit, Revenue
Income Statement us-gaap_interestexpense: Cost of Revenue, Interest Expense (Operating), Non-operating Interest Expenses
Income Statement us-gaap_incomelossfromcontinuingoperationsbeforeincometaxesextraordinaryitemsnoncontrollinginterest: EBT, Operating Income
Income Statement us-gaap_incomelossfromcontinuingoperationsbeforeincometaxesminorityinterestandincomelossfromequitymethodinvestments: EBT, Operating Income
Income Statement us-gaap_operatingincomeloss: EBIT, Operating Income
Income Statement us-gaap_netincomeloss: Income after Tax, Income from Continuous Operations, Consolidated Net Income/Loss, Net Income Common
Income Statement us-gaap_profitloss: Consolidated Net Income/Loss, Income after Tax, Income from Continuous Operations, Net Income Common
Income Statement us-gaap_incomelossfromcontinuingoperationsincludingportionattributabletononcontrollinginterest: Income after Tax, Income from Continuous

In [1369]:
# Load canonical_label_tag_mapping.json into the notebook-level name m_dict.
# NOTE(review): presumably the curated counterpart of the placeholder mappings
# written above — confirm.
with open(f"{data_path}mappings/canonical_label_tag_mapping.json") as json_file:
    m_dict = json.load(json_file)


In [1631]:
def days_between(d1, d2):
    """Return the absolute number of whole days between two 'YYYY-MM-DD' date strings.

    The result is independent of argument order; ``ValueError`` from
    ``strptime`` propagates when a string does not match the format.
    """
    date_format = "%Y-%m-%d"
    first, second = (datetime.strptime(text, date_format) for text in (d1, d2))
    elapsed = second - first
    return abs(elapsed.days)



In [1665]:
def count_sequential_quarters(quarter_list,log_missing=False,timeseries_logger=None):
    """Count adjacent date pairs that lie one quarter (86-94 days) apart.

    Parameters
    ----------
    quarter_list : sequence of str
        'YYYY-MM-DD' dates ordered newest first: each entry is expected to be
        86 to 94 days later than the entry that follows it.
    log_missing : bool, default False
        If False, counting stops at the first pair outside that window.
        If True, every pair is examined, the out-of-window pairs are collected
        and reported in a single warning.
    timeseries_logger : logging.Logger-like, optional
        Object with a ``warning(msg)`` method used for the report. When
        omitted, the standard-library logger for this module is used, so the
        call no longer fails on ``None``.

    Returns
    -------
    int
        Number of pairs inside the window (up to the first gap when
        ``log_missing`` is False).
    """
    date_format = '%Y-%m-%d'
    sequential_quarters = 0
    missing_quarters = []

    for newer, older in zip(quarter_list[:-1], quarter_list[1:]):
        gap_days = (datetime.strptime(newer, date_format) - datetime.strptime(older, date_format)).days

        if 85 < gap_days < 95:
            sequential_quarters += 1
        elif log_missing:
            missing_quarters.append(f"{newer} and {older}")
        else:
            break

    if missing_quarters:
        if timeseries_logger is None:
            import logging
            timeseries_logger = logging.getLogger(__name__)
        timeseries_logger.warning(f"Missing quarter between: {', '.join(missing_quarters)}")

    return sequential_quarters

In [2041]:
def drop_extraneous_columns(df_timeseries):
    """Drop date columns that do not fit the frame's quarterly spacing.

    Column labels are parsed as 'YYYY-MM-DD' strings.

    * Regular schedule (per ``uses_regular_quarter_schedule``): every column
      whose date is not on the 3-month grid spanned by the earliest and latest
      calendar quarter-end column is dropped.
    * Otherwise: for each adjacent pair whose left column is fewer than 85
      days after the right one, the column of the pair with more NaN values is
      dropped (``idxmax`` picks the left one on a tie).

    Returns a new frame; the input frame is not modified.
    """
    columns = df_timeseries.columns

    if not uses_regular_quarter_schedule(df_timeseries):
        to_drop = []
        for position in range(1, len(columns)):
            left, right = columns[position - 1], columns[position]
            gap = datetime.strptime(left, '%Y-%m-%d') - datetime.strptime(right, '%Y-%m-%d')
            if gap.days < 85:
                nan_counts = df_timeseries.loc[:, [left, right]].isna().sum()
                to_drop.append(nan_counts.idxmax())
        # np.unique both de-duplicates and sorts the collected labels.
        to_drop = list(np.unique(to_drop))
    else:
        quarter_ends = [label for label in columns if label[-5:] in ['03-31','06-30','09-30','12-31']]
        column_dates = pd.to_datetime(df_timeseries.columns)
        expected_dates = pd.date_range(start=min(quarter_ends), end=max(quarter_ends), freq="3M")
        to_drop = df_timeseries.columns[~column_dates.isin(expected_dates)]

    return df_timeseries.drop(to_drop, axis=1)


def uses_regular_quarter_schedule(df_timeseries, enabled=False, threshold=0.7):
    """Tell whether the frame's date columns follow calendar quarter ends.

    Parameters
    ----------
    df_timeseries : pandas.DataFrame
        Frame whose column labels are 'YYYY-MM-DD' strings.
    enabled : bool, default False
        The check is switched off by default: the function then returns
        ``False`` unconditionally, exactly as before. Pass ``True`` to
        actually run the detection.
    threshold : float, default 0.7
        Share of columns that must end in 03-31, 06-30, 09-30 or 12-31
        (strictly greater than) for the schedule to count as regular.

    Returns
    -------
    bool
    """
    if not enabled:
        return False

    # Use the frame's own columns rather than a notebook-level variable.
    quarter_list = list(df_timeseries.columns)
    if not quarter_list:
        return False

    regular_quarter_dates = [date_col for date_col in quarter_list if date_col[-5:] in ['03-31','06-30','09-30','12-31']]
    return len(regular_quarter_dates)/len(quarter_list) > threshold

def add_missing_columns(df_timeseries):
    """Insert empty (all-NaN) columns for quarters missing from the frame.

    Column labels are 'YYYY-MM-DD' strings.

    * Regular schedule (per ``uses_regular_quarter_schedule``): the missing
      dates are the points of the 3-month grid, spanned by the earliest and
      latest calendar quarter-end column, that are not yet columns.
    * Otherwise: for each adjacent pair whose left column is more than 103
      days after the right one, dates are synthesised in 91-day steps starting
      from the right (older) column; each such gap is also printed.

    The gap scan uses the columns of ``df_timeseries`` itself, not a
    notebook-level ``date_col_list`` that may belong to another ticker.

    Returns a new frame with the added columns, sorted by label in descending
    order.
    """
    date_format = '%Y-%m-%d'

    if uses_regular_quarter_schedule(df_timeseries):
        regular_quarter_dates = [date_col for date_col in df_timeseries.columns if date_col[-5:] in ['03-31','06-30','09-30','12-31']]
        datetime_range = pd.date_range(start=min(regular_quarter_dates),end=max(regular_quarter_dates),freq="3M")
        missing_dates = datetime_range[~datetime_range.isin(pd.to_datetime(df_timeseries.columns))].astype(str)
    else:
        date_col_list = list(df_timeseries.columns)
        missing_dates = []
        for i in range(1,len(date_col_list)):
            newer = datetime.strptime(date_col_list[i-1], date_format)
            older = datetime.strptime(date_col_list[i], date_format)
            diff = newer - older
            if diff.days > 103:
                print(f"{date_col_list[i]}, {date_col_list[i-1]}: {diff.days}")
                # A gap of about k quarters is missing k-1 columns.
                for j in range(1,int(round(diff.days/91,0))):
                    missing_dates.append(datetime.strftime(older + timedelta(days=91*j), date_format))

    df_timeseries = df_timeseries.merge(pd.DataFrame(index=df_timeseries.index,columns=missing_dates),left_index=True,right_index=True)
    df_timeseries = df_timeseries.reindex(sorted(df_timeseries.columns,reverse=True),axis=1)

    return df_timeseries

    

In [2044]:
# Scratch cell: load the canonical income statement of a single ticker and display it.
# The first three CSV columns are used as the row MultiIndex.
ticker = 'ADS'
df_timeseries= pd.read_csv(f"{data_path}timeseries/{ticker}/Canonical Statement/Income Statement.csv",index_col=[0,1,2])
# Disabled step: drop columns that are entirely NaN.
#df_timeseries = df_timeseries.drop(df_timeseries.loc[:,df_timeseries.isna().all()].columns,axis=1)

# Disabled step: remove off-schedule date columns.
#df_timeseries = drop_extraneous_columns(df_timeseries)

#date_col_list = df_timeseries.columns



    #df_timeseries = df_timeseries.drop(df_timeseries.loc[:,df_timeseries.isna().all()].columns,axis=1)


#missing_dates
df_timeseries

FileNotFoundError: [Errno 2] File ../data/timeseries/ADS/Canonical Statement/Income Statement.csv does not exist: '../data/timeseries/ADS/Canonical Statement/Income Statement.csv'

In [1961]:
# FIXME: unfinished scratch expression. strptime() is missing its date-string argument
# and the parenthesis of timedelta(...) is never closed, so this cell does not parse.
datetime.strptime(,'%Y-%m-%d') + timedelta(days=91

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,2019-11-24,2018-11-25,2017-08-27,2016-08-28,2015-02-22,2014-08-24,2013-08-25,2012-11-25,2011-08-28,2010-08-29,2009-08-30,2008-08-24
filing_label,xbrl_tag,standard_label,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
"Net income attributable to Conagra Brands, Inc. common stockholders (in dollars per share)",us-gaap_earningspersharediluted,EPS (diluted),0.53,0.31,0.36,0.42,-2.23,1.12,0.34,0.51,0.2,0.33,0.37,0.94
"Income (Loss) from Discontinued Operations and Disposal of Discontinued Operations, Net of Tax, Per Diluted Share",us-gaap_incomelossfromdiscontinuedoperationsnetoftaxperdilutedshare,,0.0,-0.01,0.0,0.0,0.0,0.87,0.01,-0.01,,0.01,0.0,0.71
"Income (Loss) from Continuing Operations, Per Diluted Share",us-gaap_incomelossfromcontinuingoperationsperdilutedshare,"EPS (diluted, from continous ops)",0.53,0.32,0.36,0.42,-2.23,0.25,0.33,0.51,0.2,0.32,0.37,0.23
"Net income attributable to Conagra Brands, Inc. common stockholders (in dollars per share)",us-gaap_earningspersharebasic,EPS (basic),0.53,0.31,0.37,0.42,-2.23,1.14,0.34,0.52,0.21,0.33,0.37,0.95
"Income (Loss) from Discontinued Operations and Disposal of Discontinued Operations, Net of Tax, Per Basic Share",us-gaap_incomelossfromdiscontinuedoperationsnetoftaxperbasicshare,,0.0,-0.01,0.0,0.0,0.0,0.88,0.0,0.0,,0.01,-0.01,0.72
"Income (Loss) from Continuing Operations, Per Basic Share",us-gaap_incomelossfromcontinuingoperationsperbasicshare,"EPS (basic, from continuous ops)",0.53,0.32,0.37,0.42,-2.23,0.26,0.34,0.52,0.21,0.32,0.38,0.23
"Net income attributable to Conagra Brands, Inc.",us-gaap_netincomeloss,,260500000.0,131600000.0,152500000.0,186200000.0,-954100000.0,482300000.0,144300000.0,211600000.0,85300000.0,146400000.0,165900000.0,442400000.0
Net Income (Loss) Attributable to Noncontrolling Interest,us-gaap_netincomelossattributabletononcontrollinginterest,Net income from non-controlling interests,1000000.0,800000.0,800000.0,3800000.0,1400000.0,2200000.0,2900000.0,4900000.0,300000.0,-100000.0,-700000.0,0.0
"Net Income (Loss), Including Portion Attributable to Noncontrolling Interest",us-gaap_profitloss,Net income,261500000.0,132400000.0,153300000.0,190000000.0,-952700000.0,484500000.0,147200000.0,216500000.0,85600000.0,146300000.0,165200000.0,442400000.0
Income (loss) from discontinued operations,us-gaap_incomelossfromdiscontinuedoperationsnetoftax,,0.0,-1900000.0,-300000.0,1500000.0,-600000.0,373300000.0,1100000.0,-1000000.0,100000.0,2600000.0,-1300000.0,334800000.0


In [1916]:
# NOTE(review): quarter_list is not assigned in any cell shown here; it comes from
# earlier kernel state, so this cell fails after a kernel restart.
quarter_list

Index(['2020-03-31', '2019-12-31', '2019-09-30', '2019-06-30', '2019-03-31',
       '2018-12-31', '2018-09-30', '2018-06-30', '2018-03-31', '2017-12-31',
       '2017-09-30', '2017-06-30', '2017-03-31', '2016-12-31', '2016-09-30',
       '2016-06-30', '2016-03-31', '2015-12-31', '2015-09-30', '2015-06-30',
       '2015-03-31', '2014-12-31', '2014-09-30', '2014-06-30', '2014-03-31',
       '2013-12-31', '2013-09-30', '2013-06-30', '2013-03-31', '2012-12-31',
       '2012-09-30', '2012-06-30', '2012-03-31', '2011-12-31', '2011-09-30',
       '2011-06-30', '2011-03-31', '2010-12-31', '2010-09-30', '2010-06-30',
       '2010-03-31', '2009-12-31', '2009-09-30', '2009-06-30'],
      dtype='object')

In [1659]:
# For every ticker directory that holds a canonical income statement, print the
# number of days between each pair of adjacent date columns.
#['CTAS', 'AAPL', 'GRMN', 'ZM', 'PFE', 'ADI', 'CAG', 'NVDA']
for ticker in os.listdir(f"{data_path}timeseries/"):
    income_statement_csv = f"{data_path}timeseries/{ticker}/Canonical Statement/Income Statement.csv"
    if not os.path.exists(income_statement_csv):
        continue

    df_income = pd.read_csv(income_statement_csv, index_col=[0,1,2])
    print(f"____{ticker}____")

    date_col_list = list(df_income.columns)
    for i in range(1, len(date_col_list)):
        left_col, right_col = date_col_list[i-1], date_col_list[i]
        gap_in_days = days_between(right_col, left_col)
        print(f"{left_col}, {right_col}: {gap_in_days}")

____CTAS____
2020-02-29, 2019-11-30: 91
2019-11-30, 2019-08-31: 91
2019-08-31, 2019-05-31: 92
2019-05-31, 2019-02-28: 92
2019-02-28, 2018-11-30: 90
2018-11-30, 2018-08-31: 91
2018-08-31, 2018-05-31: 92
2018-05-31, 2018-02-28: 92
2018-02-28, 2017-11-30: 90
2017-11-30, 2017-08-31: 91
2017-08-31, 2017-05-31: 92
2017-05-31, 2017-02-28: 92
2017-02-28, 2016-11-30: 90
2016-11-30, 2016-08-31: 91
2016-08-31, 2016-05-31: 92
2016-05-31, 2016-02-29: 92
2016-02-29, 2015-11-30: 91
2015-11-30, 2015-08-31: 91
2015-08-31, 2015-05-31: 92
2015-05-31, 2015-02-28: 92
2015-02-28, 2014-11-30: 90
2014-11-30, 2014-08-31: 91
2014-08-31, 2014-05-31: 92
2014-05-31, 2014-02-28: 92
2014-02-28, 2013-11-30: 90
2013-11-30, 2013-08-31: 91
2013-08-31, 2013-05-31: 92
2013-05-31, 2013-02-28: 92
2013-02-28, 2012-11-30: 90
2012-11-30, 2012-08-31: 91
2012-08-31, 2012-05-31: 92
2012-05-31, 2012-02-29: 92
2012-02-29, 2011-11-30: 91
2011-11-30, 2011-08-31: 91
2011-08-31, 2011-05-31: 92
2011-05-31, 2010-05-31: 365
____VZ____
202

In [1645]:
# Scratch arithmetic: four 91-day steps cover 364 days.
91*4

364