In [3]:
from bs4 import BeautifulSoup
import re
import os
import json
import pprint

In [4]:
class Scraper:
    def __init__(self, year, quarter, code):
        self.year = year
        self.quarter = quarter
        self.path = '/Users/basnugroho/Downloads/FinancialStatements/' + str(year) + '/Q' + str(quarter) + '/' + code + '/inlineXBRL/'

        # run on create object
        self.inline_xbrl_files = []
        self.set_xbrl_list_files()

    def set_xbrl_list_files(self):
        for file in os.listdir(self.path):
            if file.endswith(".html"):
                # print(os.path.join("Scrapers/2018/Q3/AALI/inlineXBRL/", file))
                self.inline_xbrl_files.append(file)
                self.inline_xbrl_files.sort()

    def read_file(self,file):
        file = open(self.path+file)
        data = file.read()
        file.close()
        return data

    def get_general_info(self):
        soup = BeautifulSoup(self.read_file(file=self.inline_xbrl_files[0]), 'lxml')
        columns = self.get_general_info_en_columns()
        values = self.get_general_info_values()
        data = dict(zip(columns, values))
        return data

    def get_balance_sheet(self):
        soup = BeautifulSoup(scraper.read_file(file=scraper.inline_xbrl_files[1]), 'lxml')
        trows = soup.findAll('tr', {'style': ''})

        columns = []
        values = []
        for row in trows:
            cols = row.findAll('td', {'class': 'rowHeaderEN01'})
            for col in cols:
                columns.append(col.contents[0].replace(' ', '_').lower())
            vals = row.findAll('ix:nonfraction', {'contextref': 'CurrentYearInstant'})
            if len(vals) > 0:
                numb_str = re.sub('  +', '', vals[0].contents[0].replace('\n', ''))
                try:
                    values.append(float(numb_str.replace(',', '')))
                except Exception as e:
                    print(e)
            else:
                values.append('')

        return dict(zip(columns, values[2:]))

    def get_income_statement(self):
        soup = BeautifulSoup(scraper.read_file(file=scraper.inline_xbrl_files[2]), 'lxml')
        trows = soup.findAll('tr', {'style': ''})

        columns = []
        values = []
        for row in trows:
            cols = row.findAll('td', {'class': 'rowHeaderEN01'})
            for col in cols:
                columns.append(col.contents[0].replace(' ', '_').lower())
            vals = row.findAll('ix:nonfraction', {'contextref': 'CurrentYearDuration'})
            if len(vals) > 0:
                numb_str = re.sub('  +', '', vals[0].contents[0].replace('\n', ''))
                try:
                    values.append(float(numb_str.replace(',', '')))
                except Exception as e:
                    print(e)
            else:
                values.append('')

        return dict(zip(columns, values[2:]))

    def get_cash_flow(self):
        pass

    def get_all(self, type='json'):
        # data = self.get_general_info().copy()
        # data.update({'income_statement': self.get_income_statement()})
        data = {'general_info': self.get_general_info(),
                'balance_sheet': self.get_balance_sheet(),
                'income_statement': self.get_income_statement(),
                'cash_flow': self.get_cash_flow()
                }
        # if(type=='json'):
        #     return json.dumps(data)
        # else:
        #     return json.dumps(data)
        return data

    def write_json(self):
        with open(self.path+'data.json', 'w') as outfile:
            json.dump(self.get_all(), outfile, indent=4)
            outfile.close()

    def get_general_info_en_columns(self, file_number=0):
        soup = BeautifulSoup(self.read_file(file=self.inline_xbrl_files[file_number]), 'lxml')
        columns = soup.findAll(attrs={'class': 'rowHeaderEN01'})
        return [col.text.replace(' ', '_').lower() for col in columns]

    def get_general_info_values(self, file_number=0):
        soup = BeautifulSoup(self.read_file(file=self.inline_xbrl_files[file_number]), 'lxml')
        columns = soup.findAll(attrs={'class': 'valueCell'})
        return [re.sub('  +', '', col.text.replace('\n', '')) for col in columns]

    def get_visible_balance_sheet_columns(self):
        soup = BeautifulSoup(self.read_file(file=self.inline_xbrl_files[1]), 'lxml')
        trows = soup.findAll('tr', {'style': ''})
        visible_columns = []
        for row in trows:
            cols = row.findAll('td', {'class': 'rowHeaderEN01'})
            for col in cols:
                visible_columns.append(col.contents[0])
        data = [column.replace(' ', '_').lower() for column in visible_columns]
        return data

In [5]:
scraper = Scraper(year=2018, quarter=3, code='TLKM')

In [8]:
def get_general_info_en_columns():
    soup = BeautifulSoup(scraper.read_file(file=scraper.inline_xbrl_files[0]), 'lxml')
    columns = soup.findAll(attrs={'class': 'rowHeaderEN01'})
    return [col.text.replace(' ','_').lower() for col in columns]

In [9]:
print(get_general_info_en_columns()[1])

explanation_of_change_in_name_from_the_end_of_the_preceding_reporting_period


In [373]:
# get visible columns and values (including negativity) for income statements
soup = BeautifulSoup(scraper.read_file(file=scraper.inline_xbrl_files[2]), 'lxml')
trows = soup.findAll('tr', {'style': ''})

columns = []
values = []

for row in trows:
    cols = row.findAll('td', {'class': 'rowHeaderEN01'})
    for col in cols:
        columns.append(col.contents[0].replace(' ', '_').lower())
    
    vals = row.findAll('ix:nonfraction', {'contextref': 'CurrentYearDuration'})
    minus_sign = row.findAll('td', {'class': 'valueCell'})
    if len(vals) > 0:
        numb_str  =  re.sub('  +', '', vals[0].contents[0].replace('\n',''))
        try:
            values.append(float(numb_str.replace(',','')))
        except Exception as e:
            print(e)
    else:
        values.append('')

# negativize
shifted_values = values[2:]
minus_signs = find_minus_signs()
for i in range(0, len(minus_signs)):
    if minus_signs[i]:
        shifted_values[i] = -1 * shifted_values[i]
            
data = dict(zip(columns, shifted_values))

In [374]:
# find minus signs
def find_minus_signs():
    min_signs = []
    signs = []
    for row in trows:
        minus_sign = row.findAll('td', {'class': 'valueCell'})
        min_signs.append(minus_sign)

    for i in range(1, len(min_signs)):
        if len(min_signs[i]) > 0:
            if str(min_signs[i][0]).find('(') is not -1:
                signs.append(True)
            else:
                signs.append(False)
    return signs

In [375]:
# negativize
#shifted_values = values[2:]
minus_signs = find_minus_signs()
for i in range(0, len(minus_signs)):
    if minus_signs[i]:
        shifted_values[i] = -1 * shifted_values[i]

In [376]:
print(len(minus_signs))
print(shifted_values)

31
[99203.0, 848.0, 3074.0, 2983.0, 33432.0, 10299.0, 15873.0, 5024.0, 76.0, 804.0, 2619.0, 45.0, 27672.0, 6985.0, 20687.0, 20687.0, '', '', 194.0, 194.0, 194.0, 20881.0, '', 14232.0, 6455.0, '', 14426.0, 6455.0, '', '', 143.67]


In [377]:
min_signs[4][0]

<td class="valueCell" style="text-align:right;"> ( 
         <ix:nonfraction contextref="CurrentYearDuration" decimals="-9" format="ixt:numcommadot" id="IX02_0697_003_01_02" name="idx-cor:InterconnectionExpenses" scale="9" unitref="IDR">
           3,074 
         </ix:nonfraction>) </td>

In [378]:
regex = re.compile('[<(]')
index = 4
#print(regex.match(str(min_signs[index][0])))
#print('x' in str(min_signs[index][0]))
if str(min_signs[index][0]).find('(') is not -1:
    print('found')
min_signs[index][0]

found


<td class="valueCell" style="text-align:right;"> ( 
         <ix:nonfraction contextref="CurrentYearDuration" decimals="-9" format="ixt:numcommadot" id="IX02_0697_003_01_02" name="idx-cor:InterconnectionExpenses" scale="9" unitref="IDR">
           3,074 
         </ix:nonfraction>) </td>

In [379]:
str(min_signs[4][0])

'<td class="valueCell" style="text-align:right;"> ( \n         <ix:nonfraction contextref="CurrentYearDuration" decimals="-9" format="ixt:numcommadot" id="IX02_0697_003_01_02" name="idx-cor:InterconnectionExpenses" scale="9" unitref="IDR">\n           3,074 \n         </ix:nonfraction>) </td>'

In [380]:
# cash flow

In [387]:
soup = BeautifulSoup(scraper.read_file(file=scraper.inline_xbrl_files[-1]), 'lxml')
trows = soup.findAll('tr', {'style': ''})
columns = []
values = []
for row in trows:
    cols = row.findAll('td', {'class': 'rowHeaderEN01'})
#     for col in cols:
#         columns.append(col.contents[0].replace(' ', '_').lower())
#     vals = row.findAll('ix:nonfraction', {'contextref': 'CurrentYearDuration'})
#     if len(vals) > 0:
#         numb_str = re.sub('  +', '', vals[0].contents[0].replace('\n', ''))
#         try:
#             values.append(float(numb_str.replace(',', '')))
#         except Exception as e:
#             print(e)
#     else:
#         values.append('')

#cash_flow = dict(zip(columns, values[4:]))

In [383]:
print(len(columns))
print(len(values[4:]))

29
26


In [384]:
columns[0:6]

['cash_flows_from_operating_activities_',
 'cash_receipts_from_operating_activities_',
 'receipts_from_customers',
 'cash_payments_from_operating_activities_',
 'payments_to_suppliers_for_goods_and_services',
 'payments_for_salaries_and_allowances']

In [385]:
values[-1]

17302.0