In [1]:
import sys

# Pick the checkout location for the current OS and make the project's
# packages importable from this notebook.
if sys.platform == "linux":
    project_root = "/media/andrew/E4AB-09ED/Dev/py/cayce"
else:
    project_root = "F:/Dev/py/cayce"

# Append only once so re-running the cell does not duplicate the entry.
if project_root not in sys.path:
    sys.path.append(project_root)

In [6]:
from bs4 import BeautifulSoup
import re
import datetime as dt

In [3]:
# Filing to parse in the cells below. Absolute f: drive path, so this only
# resolves on the Windows machine.
# NOTE(review): the name suggests a Take-Two 10-Q with 20200804 as the filing date — confirm.
file_name = 'f:/data/edgar/TAKE_TWO_INTERACTIVE_SOFTWARE_INC_10-Q_20200804.xml'

In [4]:
# Read the filing as raw bytes and let BeautifulSoup work out the encoding from
# the document itself. Text mode without an explicit encoding falls back to the
# platform default, which differs between the Linux and Windows machines this
# notebook targets and can mangle non-ASCII characters.
with open(file_name, mode='rb') as fin:
    contents = fin.read()
# 'lxml' is the HTML parser, which lower-cases tag and attribute names; the
# lookups in the following cells ('startdate', 'contextref', ...) rely on that.
soup = BeautifulSoup(contents, 'lxml')

In [7]:
re_date_strip=re.compile('[^\d]+')
def _parse_date(date_str:str)->dt.date:
    stripped_date_str=re_date_strip.sub('',date_str)[:8]
    return dt.datetime.strptime(stripped_date_str,'%Y%m%d').date()

# Map each context id to its reporting period: a single date for an instant,
# or a (start, end) tuple for a duration.
context_elements = soup.find_all(name=re.compile('context', re.IGNORECASE|re.MULTILINE))
relevant_contexts = {}
for context_element in context_elements:
    context_id = context_element.attrs['id']

    # Contexts that apply to a given segment are skipped entirely.
    entity_node = context_element.find('entity')
    if entity_node is not None and entity_node.find('segment') is not None:
        continue

    period_node = context_element.find('period')
    if period_node is None:
        continue

    instant_node = period_node.find('instant')
    if instant_node is not None:
        relevant_contexts[context_id] = _parse_date(instant_node.text)
        continue

    # Duration context: a missing boundary is recorded as None.
    bounds = []
    for boundary_tag in ('startdate', 'enddate'):
        boundary_node = period_node.find(boundary_tag)
        bounds.append(None if boundary_node is None else _parse_date(boundary_node.text))
    relevant_contexts[context_id] = tuple(bounds)

relevant_contexts


{'i26f3b0b908aa44f384c66fa4f0f7584d_D20200401-20200630': (datetime.date(2020, 4, 1),
  datetime.date(2020, 6, 30)),
 'ic86e7fcbd43348a799169a1b3a60399a_I20200723': datetime.date(2020, 7, 23),
 'i7882f3d565104f6abf829111db999509_I20200630': datetime.date(2020, 6, 30),
 'ie3773fdd302b4c7bb69aeb58ae61a245_I20200331': datetime.date(2020, 3, 31),
 'i04c41c540c0d4222a54765222ef85672_D20190401-20190630': (datetime.date(2019, 4, 1),
  datetime.date(2019, 6, 30)),
 'i5aed14e2b37a436ba8fa1d5b88a07f63_I20190331': datetime.date(2019, 3, 31),
 'iadfb3f2af10448b9a9d5bdd01d5cf8ab_I20190630': datetime.date(2019, 6, 30)}

In [18]:
ticker = 'ttwo'
# Anchor and escape the prefix: BeautifulSoup applies a compiled pattern with
# search(), and the name slice below assumes the tag name starts with '<ticker>:'.
company_specific_tags = soup.find_all(name=re.compile(rf"^{re.escape(ticker)}:", re.IGNORECASE))

# Plain decimal number: optional sign, digits with an optional fraction, optional exponent.
re_numeric = re.compile(r'[+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?')

# build one row per numeric fact:
# [ticker, period start, period end, attribute name, attribute value, unit]
rows = []
for tag in company_specific_tags:
    text = tag.text.strip()
    # not bothering to track any attributes that are free-form text for now.
    # str.isnumeric() is not used for this check: it rejects decimals, negative
    # numbers and padded values, and accepts characters float() cannot parse.
    if not re_numeric.fullmatch(text):
        continue

    # only keep facts whose context was retained when the contexts were collected
    context_ref = tag.attrs.get('contextref')
    if context_ref not in relevant_contexts:
        continue

    # get context period: an instant is stored as a single date, a duration as (start, end)
    period = relevant_contexts[context_ref]
    if isinstance(period, dt.date):
        start_date = end_date = period
    else:
        start_date, end_date = period

    # get unit reference (e.g. a currency), if found
    currency = tag.attrs.get('unitref')

    value = float(text)
    # drop the '<ticker>:' prefix from the tag name to get the attribute name
    rows.append([ticker, start_date, end_date, tag.name[len(ticker)+1:], value, currency])
rows

5, 0.0);text-align:left;vertical-align:bottom;border-top:1pt solid #000000;border-bottom:3pt double #000000;padding-left:1pt;&quot;&gt;&lt;span style=&quot;font-size:10pt;font-weight:700;font-family:\&#39;Times New Roman\&#39;,sans-serif;color:#000000;background-color:rgb(255,255,255, 0.0);&quot;&gt;$&lt;/span&gt;&lt;/td&gt;&lt;td style=&quot;padding-top:2px;padding-bottom:2px;background-color:rgb(255,255,255, 0.0);text-align:right;vertical-align:bottom;border-top:1pt solid #000000;border-bottom:3pt double #000000;padding-right:0%;&quot;&gt;&lt;span style=&quot;font-size:10pt;font-weight:700;font-family:\&#39;Times New Roman\&#39;,sans-serif;color:#000000;background-color:rgb(255,255,255, 0.0);&quot;&gt;409,962\xa0&lt;/span&gt;&lt;/td&gt;&lt;td style=&quot;padding-top:2px;padding-bottom:2px;background-color:rgb(255,255,255, 0.0);text-align:right;vertical-align:bottom;border-top:1pt solid #000000;border-bottom:3pt double #000000;padding-right:1pt;&quot;&gt;&lt;span style=&quot;font-size

In [66]:
# Scratch check of the digit-stripping pattern: removes every non-digit character.
d = '2020-09-25'
# Raw string so '\d' reaches the regex engine instead of being an invalid string escape.
re.compile(r'[^\d]+').sub('', d)

re.compile(r'[^\d]+', re.UNICODE)