In [1]:
import sys

# Pick the checkout location of the project for the current platform and make
# it importable from this notebook.
if sys.platform == "linux":
    project_root = "/media/andrew/E4AB-09ED/Dev/py/cayce"
else:
    project_root = "F:/Dev/py/cayce"

# Only append once so that re-running the cell does not grow sys.path.
already_on_path = project_root in sys.path
if not already_on_path:
    sys.path.append(project_root)

In [2]:
from bs4 import BeautifulSoup
from bs4.element import Tag
import re
import datetime as dt
import pandas as pd
from typing import Dict, Any, List, Union, Tuple

In [3]:
# EDGAR 10-Q filing (XML) that the following cells read and parse.
# NOTE(review): hard-coded Windows drive path; unlike `project_root` in the
# first cell there is no Linux alternative here.
file_name = 'f:/data/edgar/TAKE_TWO_INTERACTIVE_SOFTWARE_INC_10-Q_20200804.xml'

In [4]:
# start reading and parsing this file
# NOTE(review): no explicit encoding is passed to open(), so the platform
# default encoding is used when decoding the file.
with open(file_name, mode='r') as fin:
    contents = fin.read()
# Parsed with the 'lxml' parser; the cells below look tags and attributes up by
# lower-case names (e.g. 'startdate', 'contextref').
soup = BeautifulSoup(contents, 'lxml')

In [7]:
re_date_strip=re.compile('[^\d]+')
def _parse_date(date_str:str)->dt.date:
    stripped_date_str=re_date_strip.sub('',date_str)[:8]
    return dt.datetime.strptime(stripped_date_str,'%Y%m%d').date()

# Build a lookup from context id to the period it covers. Instant contexts map
# to a single date, duration contexts to a (start_date, end_date) tuple.
# Contexts that carry a <segment> inside their <entity> are left out.
relevant_contexts = {}
context_elements = soup.find_all(name=re.compile('context', re.IGNORECASE|re.MULTILINE))
for context_element in context_elements:
    context_id = context_element.attrs['id']

    # don't care about contexts that apply to a given segment
    entity_element = context_element.find('entity')
    applies_to_segment = (
        entity_element is not None and entity_element.find('segment') is not None
    )
    if applies_to_segment:
        continue

    period_element = context_element.find('period')
    if period_element is None:
        # no period information: nothing to record for this context
        continue

    instant_element = period_element.find('instant')
    if instant_element is not None:
        relevant_contexts[context_id] = _parse_date(instant_element.text)
        continue

    # duration context: either boundary may be missing and is then None
    start_date_element = period_element.find('startdate')
    if start_date_element is None:
        start_date = None
    else:
        start_date = _parse_date(start_date_element.text)

    end_date_element = period_element.find('enddate')
    if end_date_element is None:
        end_date = None
    else:
        end_date = _parse_date(end_date_element.text)

    relevant_contexts[context_id] = (start_date, end_date)

relevant_contexts


{'i26f3b0b908aa44f384c66fa4f0f7584d_D20200401-20200630': (datetime.date(2020, 4, 1),
  datetime.date(2020, 6, 30)),
 'ic86e7fcbd43348a799169a1b3a60399a_I20200723': datetime.date(2020, 7, 23),
 'i7882f3d565104f6abf829111db999509_I20200630': datetime.date(2020, 6, 30),
 'ie3773fdd302b4c7bb69aeb58ae61a245_I20200331': datetime.date(2020, 3, 31),
 'i04c41c540c0d4222a54765222ef85672_D20190401-20190630': (datetime.date(2019, 4, 1),
  datetime.date(2019, 6, 30)),
 'i5aed14e2b37a436ba8fa1d5b88a07f63_I20190331': datetime.date(2019, 3, 31),
 'iadfb3f2af10448b9a9d5bdd01d5cf8ab_I20190630': datetime.date(2019, 6, 30)}

In [25]:
ticker = 'ttwo'
company_specific_tags = soup.find_all(name=re.compile(f"{ticker}:.*", re.IGNORECASE))

# A plain signed decimal number such as "42", "-1500" or "0.78".
# str.isnumeric() is not used because it rejects signs and decimal points, so
# negative and fractional values would be silently dropped.
_NUMERIC_VALUE_RE = re.compile(r"[-+]?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)")

# build a DataFrame, ticker, period start, period end, attribute name, attribute value, currency
rows = []
for tag in company_specific_tags:
    text = tag.text.strip()
    if not _NUMERIC_VALUE_RE.fullmatch(text):
        # not bothering to track any attributes that are free-form text for now
        continue

    context_ref = tag.attrs.get('contextref')
    if context_ref is None or context_ref not in relevant_contexts:
        # no context, or a context that was filtered out (e.g. segment-specific)
        continue

    # get context period: a bare date is an instant, otherwise (start, end)
    period = relevant_contexts[context_ref]
    start_date, end_date = (None, period) if isinstance(period, dt.date) else period

    # get currency, if found
    unit_ref = tag.attrs.get('unitref')
    currency = unit_ref.upper() if unit_ref is not None else None

    # drop the "<ticker>:" prefix from the tag name
    rows.append([ticker, start_date, end_date, tag.name[len(ticker)+1:], float(text), currency])
company_specific_attributes = pd.DataFrame(rows, columns=['Ticker', 'PeriodStart', 'PeriodEnd', 'AttributeName', 'AttributeValue', 'Currency'])

In [33]:
from collections import namedtuple

ParsedTag = namedtuple(
    "ParsedTag", ["period_start", "period_end", "attribute", "value", "currency"]
)


def _parse_tag(
    tag: Tag, ticker: str, relevant_contexts: Dict[str, Tuple[dt.date, dt.date]],
) -> ParsedTag:
    """
    Take a tag and pull out the context, value, and currency (if applicable)
    """
    if "contextref" not in tag.attrs or tag["contextref"] not in relevant_contexts:
        return None

    # get context period
    period = relevant_contexts[tag["contextref"]]
    start_date, end_date = (None, period) if type(period) == dt.date else period
    # get currency, if available
    currency = tag["unitref"].upper() if "unitref" in tag.attrs else None

    value = float(tag.text) if tag.text.isnumeric() else tag.text
    return ParsedTag(
        period_start=start_date,
        period_end=end_date,
        attribute=tag.name[len(ticker) + 1 :],
        value=value,
        currency=currency,
    )

In [35]:
# extract some header data
header_tag_labels = [
    'dei:DocumentType',
    'dei:DocumentPeriodEndDate',
    'dei:EntityRegistrantName',
    'dei:TradingSymbol',
    'dei:SecurityExchangeName',
    'dei:EntityCommonStockSharesOutstanding',
]

# header maps (period_start, period_end) -> {attribute name: value}
header = {}
for tag_label in header_tag_labels:
    tags = soup.find_all(name=re.compile(tag_label, re.IGNORECASE))

    for tag in tags:
        parsed_tag = _parse_tag(tag, 'dei', relevant_contexts)
        if parsed_tag is None:
            # _parse_tag returns None for tags without a usable context;
            # skip them instead of failing on the attribute access below
            continue

        period_key = (parsed_tag.period_start, parsed_tag.period_end)
        header.setdefault(period_key, {})[parsed_tag.attribute] = parsed_tag.value

header


{(datetime.date(2020, 4, 1),
  datetime.date(2020, 6, 30)): {'documenttype': '10-Q', 'documentperiodenddate': '2020-06-30', 'entityregistrantname': 'TAKE-TWO INTERACTIVE SOFTWARE,\xa0INC.', 'tradingsymbol': 'TTWO', 'securityexchangename': 'NASDAQ'},
 (None,
  datetime.date(2020, 7, 23)): {'entitycommonstocksharesoutstanding': 114338838.0}}

In [46]:
def _get_attribute_values_df(soup:BeautifulSoup,tag_labels:str,ticker:str):
    rows = []
    processed_elements = set()
    for tag_label in tag_labels:
        tags = soup.find_all(name=re.compile(tag_label, re.IGNORECASE))

        for tag in tags:
            if tag.text.isnumeric():
                # not bothering to track any attributes that are free-form text for now
                if 'contextref' in tag.attrs and tag['contextref'] in relevant_contexts:
                    element_id = (tag_label, tag['contextref'])
                    if element_id in processed_elements:
                        continue
                    processed_elements.add(element_id)

                    # get context period
                    period = relevant_contexts[tag['contextref']]
                    start_date, end_date = (None, period) if type(period) == dt.date else period
                    # get currency, if found
                    currency = tag['unitref'].upper() if 'unitref' in tag.attrs else None
                    
                    value = float(tag.text)
                    rows.append([ticker, start_date, end_date, tag.name.split(':')[1], value, currency])
    return pd.DataFrame(rows, columns=['Ticker', 'PeriodStart', 'PeriodEnd', 'AttributeName', 'AttributeValue', 'Currency'])

In [47]:
# revenue_component_tags = soup.find_all(name=re.compile(f"{ticker}:Revenue.*", re.IGNORECASE))
# Tag-name patterns noted while exploring (presumably for company-specific cost tags):
#     <ticker>:<category>Cost
#          - OR -
#     <ticker>:CostOfSales<category>


# us-gaap tags to pull for the income statement, in presentation order.
income_statement_attributes = [
    # revenue down to net income
    'us-gaap:Revenues',
    'us-gaap:CostOfGoodsAndServicesSoldAmortization',
    'us-gaap:CostOfGoodsAndServicesSold',
    'us-gaap:GrossProfit',
    'us-gaap:ResearchAndDevelopmentExpenseExcludingAcquiredInProcessCost',
    'us-gaap:SellingAndMarketingExpense',
    'us-gaap:GeneralAndAdministrativeExpense',
    'us-gaap:DepreciationAndAmortization',
    'us-gaap:BusinessCombinationAcquisitionRelatedCosts',
    'us-gaap:RestructuringAndRelatedCostIncurredCost',
    'us-gaap:OperatingExpenses',
    'us-gaap:OperatingIncomeLoss',
    'us-gaap:InvestmentIncomeInterest',
    'us-gaap:InterestExpense',
    'us-gaap:OtherNonoperatingIncomeExpense',
    'us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxesMinorityInterestAndIncomeLossFromEquityMethodInvestments',
    'us-gaap:IncomeTaxExpenseBenefit',
    'us-gaap:IncomeLossFromContinuingOperations',
    'us-gaap:IncomeLossFromDiscontinuedOperationsNetOfTaxAttributableToReportingEntity',
    'us-gaap:NetIncomeLoss',
    # per-share amounts, basic
    'us-gaap:IncomeLossFromContinuingOperationsPerBasicShare',
    'us-gaap:DiscontinuedOperationIncomeLossFromDiscontinuedOperationNetOfTaxPerBasicShare',
    'us-gaap:EarningsPerShareBasic',
    # per-share amounts, diluted
    'us-gaap:IncomeLossFromContinuingOperationsPerDilutedShare',
    'us-gaap:DiscontinuedOperationIncomeLossFromDiscontinuedOperationNetOfTaxPerDilutedShare',
    'us-gaap:EarningsPerShareDiluted',
    # weighted-average share counts
    'us-gaap:WeightedAverageNumberOfSharesOutstandingBasic',
    'us-gaap:WeightedAverageNumberOfDilutedSharesOutstanding',
]
# Pull the numeric values for the tags in income_statement_attributes out of
# the filing; relies on `soup` and `ticker` defined in earlier cells.
income_statement_df = _get_attribute_values_df(soup,income_statement_attributes,ticker)
income_statement_df

Unnamed: 0,Ticker,PeriodStart,PeriodEnd,AttributeName,AttributeValue,Currency
0,ttwo,2020-04-01,2020-06-30,costofgoodsandservicessold,476689000.0,USD
1,ttwo,2019-04-01,2019-06-30,costofgoodsandservicessold,241469000.0,USD
2,ttwo,2020-04-01,2020-06-30,grossprofit,354621000.0,USD
3,ttwo,2019-04-01,2019-06-30,grossprofit,298990000.0,USD
4,ttwo,2020-04-01,2020-06-30,sellingandmarketingexpense,84779000.0,USD
5,ttwo,2019-04-01,2019-06-30,sellingandmarketingexpense,91821000.0,USD
6,ttwo,2020-04-01,2020-06-30,generalandadministrativeexpense,102173000.0,USD
7,ttwo,2019-04-01,2019-06-30,generalandadministrativeexpense,74833000.0,USD
8,ttwo,2020-04-01,2020-06-30,operatingexpenses,272478000.0,USD
9,ttwo,2019-04-01,2019-06-30,operatingexpenses,247260000.0,USD


In [3]:
# Form 4
# NOTE(review): hard-coded Windows drive path, and no explicit encoding is
# passed to open(), so the platform default encoding is used.
# Rebinding `file_name`, `contents` and `soup` here replaces the 10-Q objects
# created by the earlier cells.
file_name = 'F:/data/edgar/cache/xbrl/AquaBounty_Technologies_Inc__4_20191031.xml'
with open(file_name, mode='r') as fin:
    contents = fin.read()
# The cells below look elements up by lower-case names (e.g. 'periodofreport').
soup = BeautifulSoup(contents, 'lxml')

In [4]:
# Date the filing reports on, read from the <periodofreport> element (YYYY-MM-DD).
period_of_report_text = soup.find('periodofreport').text
report_date = dt.datetime.strptime(period_of_report_text, '%Y-%m-%d').date()

In [5]:
# Get info on the party that this filing pertains to:
# <reportingowner> -> <reportingownerid> -> <rptownername>
owner_element = soup.find('reportingowner')
owner_id_element = owner_element.find('reportingownerid')
owner_name = owner_id_element.find('rptownername').text

'KIRK RANDAL J'

In [6]:
# Describe how the reporting owner is related to the issuer: one boolean per
# relationship flag, plus the officer title / free-form "other" text if given.
relationship_element = owner_element.find('reportingownerrelationship')
relationship = {}
if relationship_element is not None:
    for relationship_type in ['director', 'officer', 'tenpercentowner', 'other']:
        flag_element = relationship_element.find(f'is{relationship_type}')
        # a flag element that is absent is treated as "not set" instead of
        # failing on `.text`; XML booleans may be written as "1" or "true"
        flag = flag_element.text.strip().lower() if flag_element is not None else ''
        relationship[relationship_type] = flag in ('1', 'true')

    officer_title_element = relationship_element.find('officertitle')
    if officer_title_element is not None:
        officer_title = officer_title_element.text.strip()
        if officer_title.lower() == 'see remarks':
            # the actual title lives in the filing-level <remarks> element;
            # keep the placeholder text if that element is missing
            remarks_element = soup.find('remarks')
            if remarks_element is not None:
                officer_title = remarks_element.text.strip()
        if officer_title:
            relationship['officer_title'] = officer_title

    other_text_element = relationship_element.find('othertext')
    if other_text_element is not None:
        other_text = other_text_element.text.strip()
        if other_text:
            relationship['other_text'] = other_text

relationship


{'director': False, 'officer': False, 'tenpercentowner': True, 'other': False}

In [None]:
# Get the detail of all transactions