In [6]:
!pip install beautifulsoup4
!pip install requests



In [2]:
# Address of the Wikipedia article that the scraping cell downloads with `requests.get(url)`.
url ='https://en.wikipedia.org/wiki/List_of_countries_by_GNI_(nominal)_per_capita'

In [42]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np


# Download the article. A timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the pipeline on an HTTP
# error instead of silently parsing an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the raw response bytes with the standard-library HTML parser.
soup = BeautifulSoup(page.content, 'html.parser')

def define_title(soup, tag, class_type, class_value):
    """Collect every element of `soup` matching a tag name and one attribute filter.

    Args:
        soup: Object exposing ``find_all(name, attrs)``.
        tag: Tag name handed to ``find_all``.
        class_type: Attribute name used as the single key of the filter dict.
        class_value: Value that attribute is matched against.

    Returns:
        A plain list with the matches, in the order ``find_all`` produced them.
    """
    attribute_filter = {class_type: class_value}
    return list(soup.find_all(tag, attribute_filter))
    
td_class_titles = define_title(soup, 'tr', "", "")
# print(td_class_titles)

def removed_tag_titles(soups=None, tag= '', regex = ''):
    """Extract the text of nodes found under each element and strip regex matches from it.

    Args:
        soups: Iterable of elements exposing ``find_all(tag)``. When omitted, the
            module-level ``soup`` is used; it is looked up at call time rather
            than captured when the function is defined.
        tag: Tag name searched for inside every element. With the empty string
            the elements of ``soups`` themselves are used as the nodes.
        regex: A pattern string, or a list/tuple of pattern strings that are
            combined with ``|``. Every match is removed from the node text.

    Returns:
        A list with one cleaned text string per node, in document order.

    Raises:
        ValueError: If ``regex`` is neither a string nor a list/tuple of strings.
    """
    import re

    if soups is None:
        soups = soup

    # Validate and build the pattern once, before any element is visited, so an
    # invalid `regex` is reported even when `soups` is empty.
    if isinstance(regex, str):
        regex_in = regex
    elif isinstance(regex, (list, tuple)):
        regex_in = '|'.join(regex)
    else:
        raise ValueError('Use only a regex string or list of regex strings')
    cleaner = re.compile(regex_in)

    if tag == '':
        # Each element is its own node; visit it exactly once instead of once
        # per outer iteration, which duplicated every entry len(soups) times.
        nodes = list(soups)
    else:
        nodes = [node for element in soups for node in element.find_all(tag)]

    return [cleaner.sub('', node.get_text()) for node in nodes]

# Strip bracketed footnote markers such as "[1]" and newlines from each header cell.
# The bracket pattern is a raw string: "\[" and "\]" are invalid escape sequences
# in a normal string literal and trigger a warning on recent Python versions.
filtered_titles = removed_tag_titles(td_class_titles, 'th' , [r"\[[^\]]*\]", '\n'])
#print(filtered_titles)

def make_titles(arr):
    """Number the entries of `arr`, skipping any entry equal to a lone newline.

    Args:
        arr: Iterable of title values.

    Returns:
        Dict mapping ``'Table_1'``, ``'Table_2'``, ... to the kept entries in
        their original order.
    """
    kept_values = (value for value in arr if value != '\n')
    return {
        f'Table_{number}': value
        for number, value in enumerate(kept_values, start=1)
    }

final_titles = make_titles(filtered_titles)



def locate_tables(soup=soup):
    """Find every ``<table>`` with class ``wikitable`` and label them by position.

    Args:
        soup: Object exposing ``find_all``; defaults to the module-level ``soup``.

    Returns:
        Dict mapping ``'Table_1'``, ``'Table_2'``, ... to the matched tables in
        the order ``find_all`` returned them.
    """
    wiki_tables = soup.find_all("table", {"class": "wikitable"})
    return {
        f'Table_{position}': wiki_table
        for position, wiki_table in enumerate(wiki_tables, start=1)
    }

tbody_class_tables_dict = locate_tables(soup)
#print(tbody_class_tables_dict)

def removed_tag_tables(soup=soup, tag= '', regex = ''):
    """Return the text of every `tag` node under `soup` with regex matches removed.

    Args:
        soup: Object exposing ``find_all``; defaults to the module-level ``soup``.
        tag: Tag name passed to ``find_all``.
        regex: A pattern string, or a list of pattern strings joined with ``|``.

    Returns:
        List of cleaned text strings, one per matched node.

    Raises:
        ValueError: If ``regex`` is neither a ``str`` nor a ``list``.
    """
    import re
    matched_nodes = soup.find_all(tag)

    # A list of patterns collapses into a single alternation.
    if type(regex) == list:
        pattern = '|'.join(regex)
    elif type(regex) == str:
        pattern = regex
    else:
        raise ValueError('Use only a regex string or list of regex strings')

    return [re.sub(pattern, '', node.get_text()) for node in matched_nodes]


def filter_tables(tables_dict):
    """Convert each located table into a list of cleaned per-row text strings.

    Args:
        tables_dict: Dict mapping a table key to a table element.

    Returns:
        Dict with the same keys; each value is the list returned by
        ``removed_tag_tables`` for the table's ``tr`` nodes.
    """
    # Removed from every row's text: bracketed footnote markers, non-breaking
    # spaces, commas, one trailing and one leading newline, soft hyphens and
    # zero-width spaces. The bracket pattern is a raw string because "\[" and
    # "\]" are invalid escape sequences in a normal string literal.
    cleanup_patterns = [r"\[[^\]]*\]", '\xa0', ',', '\n$', '^\n', '\xad', '\u200b']
    return {
        key: removed_tag_tables(val, 'tr', cleanup_patterns)
        for key, val in tables_dict.items()
    }

filter_tables_dict = filter_tables(tbody_class_tables_dict)

def fill_nulls(tables_dict):
    """Mark an empty first or last cell of each newline-separated row with 'NaN'.

    A row that starts with a newline gets 'NaN' prepended, and a row that ends
    with a newline gets 'NaN' appended; both can apply to the same row.
    Previously the prefix was always overwritten by the later branch, and the
    suffix test looked at the last two characters, so a row whose final cell
    was a single character had 'NaN' glued onto that cell.

    Args:
        tables_dict: Dict mapping a table key to a list of row strings.

    Returns:
        Dict with the same keys, each mapped to a new list of row strings.
    """
    filled_filter_tables_dict = {}
    for key, table in tables_dict.items():
        filled_nulls_table = []
        for row in table:
            row_filled = row
            if row.startswith('\n'):
                row_filled = 'NaN' + row_filled
            if row.endswith('\n'):
                row_filled = row_filled + 'NaN'
            filled_nulls_table.append(row_filled)
        filled_filter_tables_dict[key] = filled_nulls_table
    return filled_filter_tables_dict
    
filtered_tables_rows = fill_nulls(filter_tables_dict)
print(filtered_tables_rows)

def make_tables(tables_dict):
    """Split each row string on newlines and drop the empty pieces.

    Args:
        tables_dict: Dict mapping a table key to a list of row strings.

    Returns:
        Dict mapping each table key to a dict of ``'Row_1'``, ``'Row_2'``, ...
        whose values are the lists of non-empty cells.
    """
    final_tables_dict = {}
    for table_name, row_strings in tables_dict.items():
        final_tables_dict[table_name] = {
            f'Row_{number}': [cell for cell in row_text.split("\n") if cell != ""]
            for number, row_text in enumerate(row_strings, start=1)
        }
    return final_tables_dict
    
final_rows = make_tables(filtered_tables_rows)
print(final_rows)

def make_dataframe(tables_dict, keep_rows=None):
    """Build one DataFrame per table, using the first row as the column header.

    A table whose rows all have the same number of cells is converted as is.
    For a table with rows of differing length the caller chooses whether to
    keep it: if so, only rows whose length equals the most common row length
    are kept and the others are reported separately.

    Args:
        tables_dict: Dict mapping a table key to a dict of row key -> list of
            cell strings.
        keep_rows: Optional answer ("y"/"yes"/"n"/"no", case-insensitive) applied
            to every ragged table. When None (the default) the user is asked
            interactively through ``input``, as before. An unrecognised value
            falls back to the interactive prompt.

    Returns:
        Tuple ``(final_tables, removed_tables)``. ``final_tables`` maps a table
        key to its DataFrame; ``removed_tables`` maps a table key to the dict of
        rows that were dropped from it.
    """
    tables_to_df = {}
    removed_tables = {}
    for table_key, table in tables_dict.items():
        # Measure each row once, including the last one, which the previous
        # pairwise loop left out of the mode computation.
        row_lengths = [len(row) for row in table.values()]
        if len(set(row_lengths)) <= 1:
            tables_to_df[table_key] = table
            continue

        answer = keep_rows
        if answer is None:
            answer = input(f'Table {table_key} could not be added, would you like to add the table with missing values?(y/n) \n')
        while answer.lower() not in ["yes", "y", "no", "n"]:
            answer = input(f'Sorry, "{answer}" is not a valid input.\nTable {table_key} could not be added, would you like to add the table with missing values?(y/n)')
        if answer.lower() in ("no", "n"):
            print(f'{table_key} was not added, the length of the rows are not the same')
            continue

        # The most frequent row length decides which rows survive.
        mode = max(set(row_lengths), key=row_lengths.count)
        kept = {}
        dropped = {}
        for row_key, row in table.items():
            if len(row) == mode:
                kept[row_key] = row
            else:
                print(f'Popped key:{row_key} and pop value: {row}')
                dropped[row_key] = row
        tables_to_df[table_key] = kept
        removed_tables[table_key] = dropped

    final_tables = {}
    for key, table in tables_to_df.items():
        if not table:
            # A table without rows has no header row to promote.
            continue
        frame = pd.DataFrame.from_dict(table).transpose().reset_index(drop = True)
        # Promote the first row to the column header, then drop it from the data.
        frame.columns = frame.iloc[0]
        final_tables[key] = frame[1:]
    return final_tables, removed_tables

final_tables_tup = make_dataframe(final_rows)
final_tables = final_tables_tup[0]
removed_tables = final_tables_tup[1]

def name_tables(titles, tables):
    """Re-key `tables` by the title stored under the same key in `titles`.

    Args:
        titles: Mapping from table key to title.
        tables: Mapping from table key to table object.

    Returns:
        Dict mapping each title to the corresponding table object.
    """
    final_df = {}
    for table_key, table_value in tables.items():
        final_df[titles[table_key]] = table_value
    return final_df


final_df = name_tables(final_titles, final_tables)


# Output directory prefix (ends with a backslash) and the file-name stem that
# the export functions below concatenate as path + doc_title + <suffix>.
# NOTE(review): this is an absolute, machine-specific Windows path; the export
# only works where this directory exists.
path = r'C:\Users\alebe\Documents\coding-temple-jan2021\capstone\csv_tables'+'\\'
doc_title= 'GNI_(nominal)_per_capita'
def finaldf_to_csv(final_df):
    """Write every table in `final_df` to its own CSV file.

    The file name is the module-level ``path`` + ``doc_title`` + the table's
    key + ``'.csv'``. A confirmation numbered by position is printed per table.

    Args:
        final_df: Dict mapping a table key to an object exposing ``to_csv``.
    """
    for position, (table_name, frame) in enumerate(final_df.items(), start=1):
        target = path + doc_title + str(table_name) + '.csv'
        frame.to_csv(target)
        print(f'Table_{position} was saved in CSV format!')
finaldf_to_csv(final_df)

def removeddf_to_csv(removed_tables):
    """Write the rows dropped from each table to a numbered text file.

    One file is written per entry of `removed_tables`, named from the
    module-level ``path`` + ``doc_title`` + ``'table'`` + position + ``'.txt'``.
    Rows are read from the entry being iterated; the previous lookup by
    ``f'Table_{i}'`` raised KeyError whenever the removed tables were not
    exactly Table_1..Table_n.

    Args:
        removed_tables: Dict mapping a table key to a dict of row key -> row.
    """
    for i, (table, rows) in enumerate(removed_tables.items(), 1):
        # Explicit UTF-8 so non-ASCII cell text does not depend on the
        # platform's default encoding.
        with open(path+doc_title+'table'+str(i)+'.txt', 'w', encoding='utf-8') as output:
            for key, row in rows.items():
                output.write(f"Table:{str(table)}, row number:{str(key)}, row content: {str(row)}" + '\n')
        print(doc_title+f' was saved in a TXT format! table = {i}')

removeddf_to_csv(removed_tables)


{'Table_1': ['Rank\n\nCountry\n\nGNI per capita (US$)\n\nYear', '1\nLiechtenstein\n116430\n2009', '—\nBermuda (UK)\n106140\n2013', '2\nSwitzerland\n85500\n2019', '3\nNorway\n82500\n2019', '—\nMacau (China)\n78640\n2018', '—\nIsle of Man (UK)\n75340\n2017', '4\nLuxembourg\n73910\n2019', '5\nIceland\n72850\n2019', '—\n  Channel Islands (UK)\n66230\n2007', '6\nUnited States\n65760\n2019', '7\nQatar\n63410\n2019', '8\nDenmark\n63240\n2019', '9\nIreland\n62210\n2019', '10\nSingapore\n59590\n2019', '11\nSweden\n55840\n2019', '12\nAustralia\n54910\n2019', '13\nNetherlands\n53200\n2019', '14\nAustria\n51300\n2019', '—\nHong Kong (China)\n50840\n2019', '15\nFinland\n49580\n2019', '16\nGermany\n48520\n2019', '17\nBelgium\n47350\n2019', '—\nCayman Islands (UK)\n47140\n2017', '18\nCanada\n46370\n2019', '19\nUnited Arab Emirates\n43470\n2019', '20\nIsrael\n43290\n2019', '21\nNew Zealand\n42670\n2019', '22\nFrance\n42400\n2019', '23\nUnited Kingdom\n42370\n2019', '24\nJapan\n41690\n2019', '—\nGreenl

In [12]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np


# Download the article. A timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the pipeline on an HTTP
# error instead of silently parsing an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the raw response bytes with the standard-library HTML parser.
soup = BeautifulSoup(page.content, 'html.parser')

def define_title(soup, tag, class_type, class_value):
    """Collect every element of `soup` matching a tag name and one attribute filter.

    Args:
        soup: Object exposing ``find_all(name, attrs)``.
        tag: Tag name handed to ``find_all``.
        class_type: Attribute name used as the single key of the filter dict.
        class_value: Value that attribute is matched against.

    Returns:
        A plain list with the matches, in the order ``find_all`` produced them.
    """
    attribute_filter = {class_type: class_value}
    return list(soup.find_all(tag, attribute_filter))
    
td_class_titles = define_title(soup, 'tr', "", "")
# print(td_class_titles)

def removed_tag_titles(soups=None, tag= '', regex = ''):
    """Extract the text of nodes found under each element and strip regex matches from it.

    Args:
        soups: Iterable of elements exposing ``find_all(tag)``. When omitted, the
            module-level ``soup`` is used; it is looked up at call time rather
            than captured when the function is defined.
        tag: Tag name searched for inside every element. With the empty string
            the elements of ``soups`` themselves are used as the nodes.
        regex: A pattern string, or a list/tuple of pattern strings that are
            combined with ``|``. Every match is removed from the node text.

    Returns:
        A list with one cleaned text string per node, in document order.

    Raises:
        ValueError: If ``regex`` is neither a string nor a list/tuple of strings.
    """
    import re

    if soups is None:
        soups = soup

    # Validate and build the pattern once, before any element is visited, so an
    # invalid `regex` is reported even when `soups` is empty.
    if isinstance(regex, str):
        regex_in = regex
    elif isinstance(regex, (list, tuple)):
        regex_in = '|'.join(regex)
    else:
        raise ValueError('Use only a regex string or list of regex strings')
    cleaner = re.compile(regex_in)

    if tag == '':
        # Each element is its own node; visit it exactly once instead of once
        # per outer iteration, which duplicated every entry len(soups) times.
        nodes = list(soups)
    else:
        nodes = [node for element in soups for node in element.find_all(tag)]

    return [cleaner.sub('', node.get_text()) for node in nodes]

# Strip bracketed footnote markers such as "[1]" and newlines from each header cell.
# The bracket pattern is a raw string: "\[" and "\]" are invalid escape sequences
# in a normal string literal and trigger a warning on recent Python versions.
filtered_titles = removed_tag_titles(td_class_titles, 'th' , [r"\[[^\]]*\]", '\n'])
#print(filtered_titles)

def make_titles(arr):
    """Number the entries of `arr`, skipping any entry equal to a lone newline.

    Args:
        arr: Iterable of title values.

    Returns:
        Dict mapping ``'Table_1'``, ``'Table_2'``, ... to the kept entries in
        their original order.
    """
    kept_values = (value for value in arr if value != '\n')
    return {
        f'Table_{number}': value
        for number, value in enumerate(kept_values, start=1)
    }

final_titles = make_titles(filtered_titles)



def locate_tables(soup=soup):
    """Find every ``<table>`` with class ``wikitable`` and label them by position.

    Args:
        soup: Object exposing ``find_all``; defaults to the module-level ``soup``.

    Returns:
        Dict mapping ``'Table_1'``, ``'Table_2'``, ... to the matched tables in
        the order ``find_all`` returned them.
    """
    wiki_tables = soup.find_all("table", {"class": "wikitable"})
    return {
        f'Table_{position}': wiki_table
        for position, wiki_table in enumerate(wiki_tables, start=1)
    }

tbody_class_tables_dict = locate_tables(soup)
#print(tbody_class_tables_dict)

def removed_tag_tables(soup=soup, tag= '', regex = ''):
    """Return the text of every `tag` node under `soup` with regex matches removed.

    Args:
        soup: Object exposing ``find_all``; defaults to the module-level ``soup``.
        tag: Tag name passed to ``find_all``.
        regex: A pattern string, or a list of pattern strings joined with ``|``.

    Returns:
        List of cleaned text strings, one per matched node.

    Raises:
        ValueError: If ``regex`` is neither a ``str`` nor a ``list``.
    """
    import re
    matched_nodes = soup.find_all(tag)

    # A list of patterns collapses into a single alternation.
    if type(regex) == list:
        pattern = '|'.join(regex)
    elif type(regex) == str:
        pattern = regex
    else:
        raise ValueError('Use only a regex string or list of regex strings')

    return [re.sub(pattern, '', node.get_text()) for node in matched_nodes]


def filter_tables(tables_dict):
    """Convert each located table into a list of cleaned per-row text strings.

    Args:
        tables_dict: Dict mapping a table key to a table element.

    Returns:
        Dict with the same keys; each value is the list returned by
        ``removed_tag_tables`` for the table's ``tr`` nodes.
    """
    # Removed from every row's text: bracketed footnote markers, non-breaking
    # spaces, commas, one trailing and one leading newline, soft hyphens and
    # zero-width spaces. The bracket pattern is a raw string because "\[" and
    # "\]" are invalid escape sequences in a normal string literal.
    cleanup_patterns = [r"\[[^\]]*\]", '\xa0', ',', '\n$', '^\n', '\xad', '\u200b']
    return {
        key: removed_tag_tables(val, 'tr', cleanup_patterns)
        for key, val in tables_dict.items()
    }

filter_tables_dict = filter_tables(tbody_class_tables_dict)

def fill_nulls(tables_dict):
    """Mark an empty first or last cell of each newline-separated row with 'NaN'.

    A row that starts with a newline gets 'NaN' prepended, and a row that ends
    with a newline gets 'NaN' appended; both can apply to the same row.
    Previously the prefix was always overwritten by the later branch, and the
    suffix test looked at the last two characters, so a row whose final cell
    was a single character had 'NaN' glued onto that cell.

    Args:
        tables_dict: Dict mapping a table key to a list of row strings.

    Returns:
        Dict with the same keys, each mapped to a new list of row strings.
    """
    filled_filter_tables_dict = {}
    for key, table in tables_dict.items():
        filled_nulls_table = []
        for row in table:
            row_filled = row
            if row.startswith('\n'):
                row_filled = 'NaN' + row_filled
            if row.endswith('\n'):
                row_filled = row_filled + 'NaN'
            filled_nulls_table.append(row_filled)
        filled_filter_tables_dict[key] = filled_nulls_table
    return filled_filter_tables_dict
    
filtered_tables_rows = fill_nulls(filter_tables_dict)
#print(filtered_tables_rows)

def make_tables(tables_dict):
    """Split each row string on newlines and drop the empty pieces.

    Args:
        tables_dict: Dict mapping a table key to a list of row strings.

    Returns:
        Dict mapping each table key to a dict of ``'Row_1'``, ``'Row_2'``, ...
        whose values are the lists of non-empty cells.
    """
    final_tables_dict = {}
    for table_name, row_strings in tables_dict.items():
        final_tables_dict[table_name] = {
            f'Row_{number}': [cell for cell in row_text.split("\n") if cell != ""]
            for number, row_text in enumerate(row_strings, start=1)
        }
    return final_tables_dict
    
final_rows = make_tables(filtered_tables_rows)
#print(final_rows)

def make_dataframe(tables_dict, keep_rows=None):
    """Build one DataFrame per table, using the first row as the column header.

    A table whose rows all have the same number of cells is converted as is.
    For a table with rows of differing length the caller chooses whether to
    keep it: if so, only rows whose length equals the most common row length
    are kept and the others are reported separately.

    Args:
        tables_dict: Dict mapping a table key to a dict of row key -> list of
            cell strings.
        keep_rows: Optional answer ("y"/"yes"/"n"/"no", case-insensitive) applied
            to every ragged table. When None (the default) the user is asked
            interactively through ``input``, as before. An unrecognised value
            falls back to the interactive prompt.

    Returns:
        Tuple ``(final_tables, removed_tables)``. ``final_tables`` maps a table
        key to its DataFrame; ``removed_tables`` maps a table key to the dict of
        rows that were dropped from it.
    """
    tables_to_df = {}
    removed_tables = {}
    for table_key, table in tables_dict.items():
        # Measure each row once, including the last one, which the previous
        # pairwise loop left out of the mode computation.
        row_lengths = [len(row) for row in table.values()]
        if len(set(row_lengths)) <= 1:
            tables_to_df[table_key] = table
            continue

        answer = keep_rows
        if answer is None:
            answer = input(f'Table {table_key} could not be added, would you like to add the table with missing values?(y/n) \n')
        while answer.lower() not in ["yes", "y", "no", "n"]:
            answer = input(f'Sorry, "{answer}" is not a valid input.\nTable {table_key} could not be added, would you like to add the table with missing values?(y/n)')
        if answer.lower() in ("no", "n"):
            print(f'{table_key} was not added, the length of the rows are not the same')
            continue

        # The most frequent row length decides which rows survive.
        mode = max(set(row_lengths), key=row_lengths.count)
        kept = {}
        dropped = {}
        for row_key, row in table.items():
            if len(row) == mode:
                kept[row_key] = row
            else:
                print(f'Popped key:{row_key} and pop value: {row}')
                dropped[row_key] = row
        tables_to_df[table_key] = kept
        removed_tables[table_key] = dropped

    final_tables = {}
    for key, table in tables_to_df.items():
        if not table:
            # A table without rows has no header row to promote.
            continue
        frame = pd.DataFrame.from_dict(table).transpose().reset_index(drop = True)
        # Promote the first row to the column header, then drop it from the data.
        frame.columns = frame.iloc[0]
        final_tables[key] = frame[1:]
    return final_tables, removed_tables

final_tables_tup = make_dataframe(final_rows)
#print(final_tables_tup)
final_tables = final_tables_tup[0]
removed_tables = final_tables_tup[1]

# def name_tables(titles, tables):

#     final_df = {titles[key] : value for key, value in tables.items()} 
#     return final_df


# final_df = name_tables(final_titles, final_tables)
# print(final_df)


# Output directory prefix (ends with a backslash) and the file-name stem that
# the export functions below concatenate as path + doc_title + <suffix>.
# NOTE(review): this is an absolute, machine-specific Windows path; the export
# only works where this directory exists.
path = r'C:\Users\alebe\Documents\coding-temple-jan2021\capstone\csv_tables'+'\\'
doc_title= 'GNI_(nominal)_per_capita'
def finaldf_to_csv(final_df):
    """Write every table in `final_df` to its own CSV file.

    The file name is the module-level ``path`` + ``doc_title`` + the table's
    key + ``'.csv'``. A confirmation numbered by position is printed per table.

    Args:
        final_df: Dict mapping a table key to an object exposing ``to_csv``.
    """
    for i, table in enumerate(final_df):
        final_df[table].to_csv(path+doc_title+str(table)+'.csv')
        # The f-string is kept on one line: a single-quoted f-string broken
        # across lines is a syntax error before Python 3.12.
        print(f'Table_{i + 1} was saved in CSV format!')
finaldf_to_csv(final_tables)

def removeddf_to_csv(removed_tables):
    """Write the rows dropped from each table to a numbered text file.

    One file is written per entry of `removed_tables`, named from the
    module-level ``path`` + ``doc_title`` + ``'table'`` + position + ``'.txt'``.
    Rows are read from the entry being iterated; the previous lookup by
    ``f'Table_{i}'`` raised KeyError whenever the removed tables were not
    exactly Table_1..Table_n.

    Args:
        removed_tables: Dict mapping a table key to a dict of row key -> row.
    """
    for i, (table, rows) in enumerate(removed_tables.items(), 1):
        # Explicit UTF-8 so non-ASCII cell text does not depend on the
        # platform's default encoding.
        with open(path+doc_title+'table'+str(i)+'.txt', 'w', encoding='utf-8') as output:
            for key, row in rows.items():
                output.write(f"Table:{str(table)}, row number:{str(key)}, row content: {str(row)}" + '\n')
        print(doc_title+f' was saved in a TXT format! table = {i}')

removeddf_to_csv(removed_tables)


({'Table_1': 0  Rank                 Country GNI per capita (US$)  Year
1     1           Liechtenstein               116430  2009
2     —            Bermuda (UK)               106140  2013
3     2             Switzerland                85500  2019
4     3                  Norway                82500  2019
5     —           Macau (China)                78640  2018
6     —        Isle of Man (UK)                75340  2017
7     4              Luxembourg                73910  2019
8     5                 Iceland                72850  2019
9     —    Channel Islands (UK)                66230  2007
10    6           United States                65760  2019
11    7                   Qatar                63410  2019
12    8                 Denmark                63240  2019
13    9                 Ireland                62210  2019
14   10               Singapore                59590  2019
15   11                  Sweden                55840  2019
16   12               Australia            