In [None]:
# Extract the table / philosopher names from:
# https://de.wikipedia.org/wiki/Zeittafel_zur_Philosophiegeschichte
#
# Jupyter Shortcuts: 'H'
#
# https://www.tutorialspoint.com/python_web_scraping/python_modules_for_web_scraping.htm
# https://www.freecodecamp.org/news/how-to-scrape-websites-with-python-2/
# https://oxylabs.io/blog/python-web-scraping
# https://www.freecodecamp.org/news/web-scraping-python-tutorial-how-to-scrape-data-from-a-website/
# https://www.dataquest.io/blog/web-scraping-python-using-beautiful-soup/
#
# https://www.geeksforgeeks.org/parsing-tables-and-xml-with-beautifulsoup/
# https://towardsdatascience.com/a-guide-to-scraping-html-tables-with-pandas-and-beautifulsoup-7fc24c331cf7
#
# https://stackoverflow.com/questions/52730839/how-do-i-change-notebookapp-iopub-data-rate-limit-for-jupyter
#
# https://stackoverflow.com/questions/45605033/convert-ipython-notebook-to-directly-executable-python-script
#

In [None]:
import csv
import re
import regex
import requests
import os
from bs4 import BeautifulSoup

# https://stackoverflow.com/questions/287871/how-to-print-colored-text-to-the-terminal/287944#287944
class bcolors:
    """ANSI escape sequences for coloured / styled terminal output.

    Usage: ``bcolors.WARNING + text + bcolors.ENDC``.
    """
    # colour codes
    HEADER, OKBLUE, OKCYAN = '\033[95m', '\033[94m', '\033[96m'
    OKGREEN, WARNING, FAIL = '\033[92m', '\033[93m', '\033[91m'
    # reset sequence, to be emitted after coloured text
    ENDC = '\033[0m'
    # text attributes
    BOLD, UNDERLINE = '\033[1m', '\033[4m'

In [None]:
web_page_address = "https://de.wikipedia.org/wiki/Zeittafel_zur_Philosophiegeschichte"

# Fail fast instead of parsing garbage: the timeout keeps the cell from
# hanging indefinitely, raise_for_status() turns an HTTP error response
# into an exception right here.
web_page = requests.get(web_page_address, timeout=30)
web_page.raise_for_status()

soup = BeautifulSoup(web_page.content, 'html.parser')
bs4_table = soup.find('table', class_='wikitable toptextcells')
if bs4_table is None:
    # soup.find() returns None when nothing matches; without this check the
    # failure would only show up later as an AttributeError on None.
    raise RuntimeError(
        "No <table class='wikitable toptextcells'> found at " + web_page_address
    )

In [None]:
# Event fields, taken from >timeline.xsd<
event_tags = [
    "start", "end", "text", "progress", "fuzzy", "locked", "ends_today",
    "category", "description", "hyperlink", "alert", "icon",
    "default_color", "milestone",
]
# >add_*< keys are used internally
additional_event_tags = [
    "add_lifespan_org", "add_start_vCh", "add_end_vCh",
    "add_start_end_ok", "add_mods", "add_ok",
]

# All field names in one immutable sequence: schema fields first,
# internal >add_*< fields after them.
extended_event_tags = tuple(event_tags + additional_event_tags)

# https://stackoverflow.com/questions/16730339/python-add-item-to-the-tuple
def csv_event_file_header_get():
    """Return all entries of ``extended_event_tags`` as a tuple (CSV header)."""
    return tuple(extended_event_tags)


In [None]:
def write_csv_file_with_header(fn, lo_d_event_philosopher):
    """Write a list of event dicts to a UTF-8 CSV file with a header row.

    Args:
        fn: path of the CSV file; an existing file is overwritten.
        lo_d_event_philosopher: non-empty list of dicts. The keys of the
            first dict define the columns and their order.

    Raises:
        IndexError: if ``lo_d_event_philosopher`` is empty (there is no
            first dict to take the column names from).
    """
    fieldnames = list(lo_d_event_philosopher[0].keys())
    # newline='' is required by the csv module: the writer emits its own
    # '\r\n' terminators, and without it text-mode newline translation
    # turns them into '\r\r\n' on Windows (blank line after every row).
    with open(fn, 'w', encoding='utf8', newline='') as f:
        w = csv.DictWriter(f, fieldnames)
        w.writeheader()
        w.writerows(lo_d_event_philosopher)

In [None]:
def new_d_event_philosopher():
    """Create a blank event dict.

    Every name in ``extended_event_tags`` becomes a key with value ``''``;
    'start' and 'end' are then set to ``'?'``.
    """
    blank_event = dict.fromkeys(extended_event_tags, '')
    blank_event["start"] = '?'
    blank_event["end"] = '?'
    return blank_event

In [None]:
def philosopher_name_trim(td_item):
    """Build a "Surname, Given names" label from the first link in a cell.

    Args:
        td_item: object whose ``.a`` attribute is the first link of the
            table cell (or None); the link text is read from ``.a.string``.

    Returns:
        - ``'?'`` if the cell has no link or the link text is empty,
        - the full name unchanged if it contains ' von ' or ' der ',
        - the single word for one-word names,
        - otherwise ``"<last word>, <remaining words>"`` without the
          trailing blank the previous implementation appended.
    """
    link = td_item.a
    if not link:
        return '?'

    full_name = link.string
    if full_name is None:
        # Beautiful Soup sets .string to None when the tag has more than one
        # child; fall back to the concatenated text instead of failing on
        # the `in` tests below.
        full_name = link.get_text()
    full_name = full_name.strip()
    if not full_name:
        return '?'

    if (' von ' in full_name) or (' der ' in full_name):
        return full_name

    # split() without argument also copes with repeated blanks
    *given_names, surname = full_name.split()
    if not given_names:
        return surname
    return surname + ', ' + ' '.join(given_names)

In [None]:
def lifespan_set_christian_era(philosoph_lifespan, d_event_philosopher):
    """Detect and strip the era markers 'v. Chr.' / 'n. Chr.' from a lifespan.

    Args:
        philosoph_lifespan: lifespan text.
        d_event_philosopher: event dict; 'add_mods' must already be a string.

    Returns:
        ``(lifespan, d_event_philosopher)``: the lifespan with all era
        markers removed and stripped of surrounding whitespace, and the same
        dict, updated as follows:

        - both markers present: 'add_start_vCh' = 'True',
          'add_end_vCh' = 'False', 'add_mods' += 'vor_Chr and nach_Chr'
        - only 'v. Chr.': both flags = 'True', 'add_mods' += 'vor_Chr'
        - only 'n. Chr.': flags untouched, 'add_mods' += 'nach_Chr'
        - no marker: text and dict are returned unchanged
    """
    # Raw strings: '\.' is not a valid escape sequence in a normal literal.
    vor_Chr_str  = r'v\. Chr\.'
    nach_Chr_str = r'n\. Chr\.'
    vor_Chr  = re.search(vor_Chr_str, philosoph_lifespan)
    nach_Chr = re.search(nach_Chr_str, philosoph_lifespan)

    if vor_Chr and nach_Chr:
        d_event_philosopher['add_start_vCh'] = 'True'
        d_event_philosopher['add_end_vCh']   = 'False'
        philosoph_lifespan = re.sub(vor_Chr_str, "", philosoph_lifespan).strip()
        philosoph_lifespan = re.sub(nach_Chr_str, "", philosoph_lifespan).strip()
        d_event_philosopher['add_mods'] += 'vor_Chr and nach_Chr'
    elif vor_Chr:
        d_event_philosopher['add_start_vCh'] = 'True'
        d_event_philosopher['add_end_vCh']   = 'True'
        # re.sub removes every occurrence, so one call is enough
        philosoph_lifespan = re.sub(vor_Chr_str, "", philosoph_lifespan).strip()
        d_event_philosopher['add_mods'] += 'vor_Chr'
    elif nach_Chr:
        philosoph_lifespan = re.sub(nach_Chr_str, "", philosoph_lifespan).strip()
        d_event_philosopher['add_mods'] += 'nach_Chr'
    return philosoph_lifespan, d_event_philosopher

In [None]:
def lifespan_delete_substring(substring, philosopher_lifespan, d_event_philosopher, comment):
    """Remove every match of ``substring`` from the lifespan text.

    ``substring`` is used as a regular-expression pattern (e.g. ``um``,
    ``Jh\\.``). When it matches at least once, the matches are deleted, the
    result is stripped of surrounding whitespace, and ``', ' + substring``
    (the pattern text itself) is appended both to
    ``d_event_philosopher['add_mods']`` and to ``comment``. When nothing
    matches, all three values are returned untouched.

    Returns:
        ``(philosopher_lifespan, d_event_philosopher, comment)``
    """
    cleaned, hits = re.subn(substring, "", philosopher_lifespan)
    if not hits:
        return philosopher_lifespan, d_event_philosopher, comment

    marker = ', ' + substring
    d_event_philosopher['add_mods'] += marker
    return cleaned.strip(), d_event_philosopher, comment + marker

def lifespan_transform_slash(philosopher_lifespan, d_event_philosopher, comment):
    """Collapse "YYYY/YYYY" alternatives into a single approximate year.

    Each ``<digits>/<digits>`` pair is replaced by ``'um' + <one of the two
    numbers>``:

    - exactly one pair: the number after the slash is kept,
    - exactly two pairs: the number after the slash is kept for the first
      pair and the number before the slash for the second pair
      (``'428/427–348/347'`` -> ``'um427–um348'``),
    - any other count: the text is returned unchanged.

    When a replacement happened, the text is stripped and ``', /'`` is
    appended to ``d_event_philosopher['add_mods']``. ``comment`` is passed
    through untouched.

    Returns:
        ``(philosopher_lifespan, d_event_philosopher, comment)``
    """
    pairs = list(re.finditer(r"(\d{1,4})/(\d{1,4})", philosopher_lifespan))

    if len(pairs) == 1:
        kept_years = [pairs[0].group(2)]
    elif len(pairs) == 2:
        # Second pair: keep the digits BEFORE the slash. The previous code
        # sliced "348/"[1:] here and produced "48/" (first digit lost, slash
        # kept), so such lifespans could never be parsed afterwards.
        kept_years = [pairs[0].group(2), pairs[1].group(1)]
    else:
        return philosopher_lifespan, d_event_philosopher, comment

    # NOTE(review): an abbreviated second number such as "1225/26" is taken
    # literally ("26"); confirm whether the source table uses that notation.

    # Splice from right to left so that earlier match positions stay valid.
    for pair, year in reversed(list(zip(pairs, kept_years))):
        philosopher_lifespan = (
            philosopher_lifespan[:pair.start()] + 'um' + year + philosopher_lifespan[pair.end():]
        )
    philosopher_lifespan = philosopher_lifespan.strip()
    d_event_philosopher['add_mods'] = d_event_philosopher['add_mods'] + ', ' + '/'
    return philosopher_lifespan, d_event_philosopher, comment


In [None]:
def vor_Chr_set_negativ(d_event_philosopher):
    """Prefix BC years with '-'.

    'start' is negated when 'add_start_vCh' equals 'True', 'end' when
    'add_end_vCh' equals 'True'; a year of '?' is never touched.

    The flags are strings ('', 'True' or 'False'), so they are compared
    against 'True' explicitly: a plain truth test would also treat the
    non-empty string 'False' as set and negate an AD end year.

    Returns:
        the same dict, modified in place.
    """
    for year_key, flag_key in (('start', 'add_start_vCh'), ('end', 'add_end_vCh')):
        if (d_event_philosopher[year_key] != '?') and (d_event_philosopher[flag_key] == 'True'):
            d_event_philosopher[year_key] = '-' + d_event_philosopher[year_key]
    return d_event_philosopher

In [None]:
d_event_philosoph_problem_cnt = 1
def philosopher_lifespan_trim (philosopher_lifespan, d_event_philosopher):
    """Parse a raw lifespan text into 'start' / 'end' years of the event dict.

    Processing order:
      1. store the stripped original text in 'add_lifespan_org',
      2. strip era markers via lifespan_set_christian_era(),
      3. drop blanks, turn "bis" into "-", resolve "YYYY/YYYY" pairs via
         lifespan_transform_slash(),
      4. delete qualifier words (um, vor, †, *, Jh., ...) via
         lifespan_delete_substring(); each deletion is logged in 'add_mods',
      5. read a century, a year range or a single year from the remainder,
      6. estimate a missing birth or death year,
      7. negate BC years via vor_Chr_set_negativ(),
      8. for fuzzy single-year entries treat the year as akme
         (start = akme - 50, end = akme + 15),
      9. clear 'add_ok' again when start > end.

    Args:
        philosopher_lifespan: raw lifespan text.
        d_event_philosopher: event dict as created by
            new_d_event_philosopher(); modified in place.

    Returns:
        the same dict with 'add_lifespan_org', 'start', 'end', 'fuzzy',
        'add_ok' and 'add_mods' updated (plus the era flags set by
        lifespan_set_christian_era()).
    """
    comment = ''   # only threaded through the helper calls; not returned

    philosopher_lifespan                    = philosopher_lifespan.strip()
    d_event_philosopher['add_lifespan_org'] = philosopher_lifespan

    philosopher_lifespan, d_event_philosopher = lifespan_set_christian_era(philosopher_lifespan, d_event_philosopher)

    philosopher_lifespan = philosopher_lifespan.replace(" ", "")
    if "bis" in philosopher_lifespan:
        philosopher_lifespan = philosopher_lifespan.replace("bis", "-")
        d_event_philosopher['add_mods'] = d_event_philosopher['add_mods'] + ', bis'

    if "/" in philosopher_lifespan:
        philosopher_lifespan, d_event_philosopher, comment = \
            lifespan_transform_slash(philosopher_lifespan, d_event_philosopher, comment)

    # Qualifiers to delete; each entry is used as a regex pattern AND, as
    # literal text, as the marker that ends up in 'add_mods'.
    # NOTE(review): blanks are removed before matching, so the pattern
    # "† nach" (which contains a blank) cannot match; "†" alone handles it.
    substrings = ["um", "vor", "† nach", "†", "unsicher", r"\*", r"Jh\.", "Jahrhundert"]
    for substring in substrings:
        philosopher_lifespan = philosopher_lifespan.replace(" ", "")
        philosopher_lifespan, d_event_philosopher, comment = \
            lifespan_delete_substring(substring, philosopher_lifespan, d_event_philosopher, comment)

    add_mods   = d_event_philosopher['add_mods']
    is_century = (r"Jh\." in add_mods) or ("Jahrhundert" in add_mods)

    # NOTE(review): "vor" is also a substring of the era marker 'vor_Chr', so
    # every BC entry is flagged fuzzy here; confirm whether that is intended.
    if ("um" in add_mods) or ("vor" in add_mods):
        d_event_philosopher['fuzzy'] = 'True'

    philosopher_lifespan = philosopher_lifespan.strip()

    # Century only ("5." left over from "5. Jh."): start of that century
    if is_century and regex.match(r"^\d{1,2}\.", philosopher_lifespan):
        century = regex.search(r"^\d{1,4}", philosopher_lifespan)
        if century:
            try:
                # The n-th century begins at (n - 1) * 100; the former n * 100
                # shifted every century-only entry one century too far
                # (e.g. "13. Jh." became 1311-1388 instead of 1211-1288).
                d_event_philosopher['start'] = str((int(century[0]) - 1) * 100)
                d_event_philosopher['fuzzy'] = 'True'
            except ValueError:
                pass

    # The character class accepts en dash, '|' and hyphen as separator.
    # Both pattern texts double as markers in 'add_mods' (see akme step).
    search_substring_from_Y_to_Y = r"\d{1,4}[–|-]\d{1,4}$"
    search_substring_Y           = r"\d{1,4}$"

    if regex.match(search_substring_from_Y_to_Y, philosopher_lifespan):
        years = regex.findall(r"\d{1,4}", philosopher_lifespan)
        d_event_philosopher['start']    = years[0]
        d_event_philosopher['end']      = years[1]
        d_event_philosopher['add_ok']   = 'ok'
        d_event_philosopher['add_mods'] += ', ' + search_substring_from_Y_to_Y
    elif regex.search(search_substring_Y, philosopher_lifespan):
        years = regex.findall(r"\d{1,4}", philosopher_lifespan)
        if '†' in d_event_philosopher['add_mods']:
            # only a year of death is known
            d_event_philosopher['end']    = years[0]
            d_event_philosopher['add_ok'] = 'ok'
        else:
            d_event_philosopher['start'] = years[0]
            if '*' in d_event_philosopher['add_mods']:
                # birth marker without a year of death
                d_event_philosopher['end']    = 'alive'
                d_event_philosopher['add_ok'] = 'ok'
        d_event_philosopher['add_mods'] += ', ' + search_substring_Y

    # Only year of death => year of birth ~ year of death - 65
    if (d_event_philosopher['start'] == '?') and (d_event_philosopher['end'] != '?'):
        if "†" in d_event_philosopher['add_mods']:
            # NOTE(review): computed on the unsigned year; for BC entries the
            # sign flip below then yields start > end - confirm intended.
            try:
                d_event_philosopher['start'] = str(int(d_event_philosopher['end']) - 65)
                d_event_philosopher['add_mods'] += ', ' + 'start == end - 65'
                d_event_philosopher['add_ok']   = 'ok'
            except ValueError:
                pass
    # Only century => birth ~ century + 11, death ~ century + 88
    elif (d_event_philosopher['start'] != '?') and (d_event_philosopher['end'] == '?'):
        if is_century:
            try:
                start = int(d_event_philosopher['start'])
                d_event_philosopher['start'] = str(start + 11)
                d_event_philosopher['end']   = str(start + 88)
                if 'vor_Chr' in d_event_philosopher['add_mods']:
                    # BC years count down: swap so start is the larger
                    # (earlier) year before both get negated
                    d_event_philosopher['start'], d_event_philosopher['end'] = \
                        d_event_philosopher['end'], d_event_philosopher['start']
                d_event_philosopher['add_mods'] += ', ' + 'end == start + 88 & start == start +11'
                d_event_philosopher['add_ok']   = 'ok'
            except ValueError:
                d_event_philosopher['add_mods'] += ', ' + 'ERROR: end == start + 88 & start == start +11'

    # BC => "YYYY" -> "-YYYY"
    d_event_philosopher = vor_Chr_set_negativ(d_event_philosopher)

    # Fuzzy entry with a single year (no range): treat that year as akme
    mods_now = d_event_philosopher['add_mods']
    if (d_event_philosopher['fuzzy'] == 'True'
            and search_substring_from_Y_to_Y not in mods_now
            and search_substring_Y in mods_now):
        try:
            akme = int(d_event_philosopher['start'])
            d_event_philosopher['start']  = str(akme - 50)
            d_event_philosopher['end']    = str(akme + 15)
            d_event_philosopher['add_ok'] = 'ok'
            d_event_philosopher['add_mods'] = d_event_philosopher['add_mods'] + ', ' + 'fuzzy+akme: ' + search_substring_Y + 'in add_mods'
        except ValueError:
            # 'start' is still '?'
            pass

    # Sanity check: a start year after the end year (e.g. because 'v. Chr.'
    # is missing in the source) withdraws the 'ok' mark.
    year_pattern = r"\d{1,4}"
    if regex.search(year_pattern, d_event_philosopher["start"]) and regex.search(year_pattern, d_event_philosopher["end"]):
        if int(d_event_philosopher["start"]) - int(d_event_philosopher["end"]) > 0:
            print ('numbers: ', d_event_philosopher["start"], d_event_philosopher["end"])
            d_event_philosopher['add_ok'] = ''

    return d_event_philosopher


In [None]:
row_cnt = 0

act_h2  = ''   # epoch
act_h3  = ''   # sub-epoch
act_h4  = ''   # sub-sub-epoch

old_h2  = ''
old_h3  = ''
old_h4  = ''

# lo == list of : event == philosopher
lo_d_event_philosopher  = []
d_event_philosopher     = new_d_event_philosopher()
d_event_philosoph_cnt   = 0
d_event_philosoph_problem_cnt = 0

# cycle rows in table:
category = None
for bs4_row in bs4_table.find_all('tr'):
    row_cnt += 1
    td_cells   = bs4_row.find_all('td')
    td_columns = len(td_cells)

    # 1 column in row == epoch / sub-epoch / sub-sub-epoch headline
    if td_columns == 1:
        headlines = bs4_row.find_all('span', class_='mw-headline')
        headline  = headlines[0].string if headlines else None
        # Rows without a usable headline text leave the current category
        # untouched (explicit check instead of a bare `except: pass`).
        if headline is not None:
            if bs4_row.find_all('h2'):
                act_h2   = headline
                category = act_h2
            elif bs4_row.find_all('h3'):
                act_h3   = headline
                category = act_h2 + ' # ' + act_h3
            elif bs4_row.find_all('h4'):
                act_h4   = headline
                category = act_h2 + ' # ' + act_h3 + ' # ' + act_h4

    elif td_columns == 4:
        d_event_philosopher     = new_d_event_philosopher()
        philosoph_lifespan      = None

        # cycle 4 columns in row:  'Periode' / 'Philosoph' (name) / 'Philosophie' (theses) / 'Allgemeine Geschichte'
        for td_cnt, td_item in enumerate(td_cells):
            if   (td_cnt == 0) and (td_item.string):                               #  'Periode' == lifespan
                d_event_philosopher = philosopher_lifespan_trim (td_item.string, d_event_philosopher)
            elif (td_cnt == 1) and (td_item.a is not None) and (td_item.a.string): #  'Philosoph' (name)
                # td_item.a is None for a name cell without a link; checking
                # it first avoids an AttributeError on `.string`.
                philosopher_name            = philosopher_name_trim(td_item)
                d_event_philosopher["text"] = philosopher_name
                d_event_philosoph_cnt += 1
            elif (td_cnt == 2) and (td_item):                                      #  'Philosophie' (theses)
                philosopher_lo_theories = [li_item.text for li_item in td_item.find_all('li')]
            elif (td_cnt == 3) and (td_item):                                      #  'Allgemeine Geschichte'
                philosopher_lo_history = [li_item.text for li_item in td_item.find_all('li')]

        d_event_philosopher["category"] = category
        # Entries without both years or without the 'ok' mark are reported
        # and left out of the result list.
        if (d_event_philosopher["start"] == '?') or (d_event_philosopher["end"] == '?') or (d_event_philosopher["add_ok"] == ''):
            d_event_philosoph_problem_cnt += 1
            print (d_event_philosoph_problem_cnt, f"{bcolors.WARNING} ", d_event_philosopher["add_lifespan_org"] + f'{bcolors.ENDC}: ', d_event_philosopher, end ='\n\n')
        else:
            lo_d_event_philosopher.append(d_event_philosopher)

# Summary line: rows with a philosopher name / rows rejected as problematic
cnt_str = f"{d_event_philosoph_cnt} / {d_event_philosoph_problem_cnt}"
print(bcolors.WARNING + cnt_str + bcolors.ENDC)

# Target file (alternative name for test runs: 'csv_tst.csv')
fn = 'WP-de_Zeittafel_zur_Philosophiegeschichte.csv'
write_csv_file_with_header(fn, lo_d_event_philosopher)

