In [1]:
!pip install beautifulsoup4
!pip install requests



In [1]:
url ='https://en.wikipedia.org/wiki/World_Happiness_Report'

In [91]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np


# Fetch the article. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops an HTTP error page from
# being parsed as if it were the article.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the raw HTML into the tree that every later step searches.
soup = BeautifulSoup(page.content, 'html.parser')


def define_title(soup, tag, class_type, class_value):
    """Collect every element matching a tag name and one attribute value.

    Args:
        soup: Object exposing ``find_all(name, attrs)``.
        tag: Tag name to search for.
        class_type: Name of the attribute to filter on.
        class_value: Value that attribute must have.

    Returns:
        A new list with the matched elements, in the order ``find_all``
        produced them.
    """
    matches = soup.find_all(tag, {class_type: class_value})
    return list(matches)
    
# Every <span class="mw-headline"> element of the parsed page; their text is
# cleaned below and used as table titles.
td_class_titles = define_title(soup, 'span', "class", "mw-headline")

def removed_tag_titles(soups=soup, tag='', regex=''):
    """Extract cleaned text from each element of ``soups``.

    Args:
        soups: Iterable of elements exposing ``find_all`` and ``get_text``.
        tag: Tag name to search for inside each element. When empty, the
            text of each element itself is used instead of its descendants.
        regex: A regex string, or a list of regex strings that are joined
            with ``|``; every match is deleted from the extracted text.

    Returns:
        A list of cleaned strings. With an empty ``tag`` there is exactly
        one string per element of ``soups``.

    Raises:
        ValueError: If ``regex`` is neither a string nor a list.
    """
    import re

    # The pattern does not depend on the element, so build it once up front.
    if isinstance(regex, str):
        regex_in = regex
    elif isinstance(regex, list):
        regex_in = '|'.join(regex)
    else:
        raise ValueError('Use only a regex string or list of regex strings')
    pattern = re.compile(regex_in)

    title_list = []
    for element in soups:
        # With no tag, clean only the current element. Re-walking the whole
        # ``soups`` collection here would emit each title once per element.
        nodes = [element] if tag == '' else element.find_all(tag)
        for node in nodes:
            title_list.append(pattern.sub('', node.get_text()))

    return title_list

# Strip bracketed markers such as "[1]" and newlines from every title. The
# bracket pattern is a raw string so "\[" reaches the regex engine untouched
# instead of being read as an (invalid) string escape.
filtered_titles = removed_tag_titles(td_class_titles, '', [r"\[[^\]]*\]", '\n'])


def make_titles(arr):
    """Number the usable titles as ``Table_1``, ``Table_2``, ...

    Entries equal to a lone newline are skipped and do not consume a number.

    Args:
        arr: Iterable of title strings.

    Returns:
        Dict mapping ``'Table_<n>'`` to the n-th kept title.
    """
    kept = [title for title in arr if title != '\n']
    return {f'Table_{number}': title for number, title in enumerate(kept, start=1)}

# Map 'Table_<n>' keys to the cleaned titles so they can be joined with the
# tables, which use the same key scheme.
final_titles = make_titles(filtered_titles)



def locate_tables(soup=soup):
    """Find every ``<table class="wikitable sortable">`` element.

    Args:
        soup: Parsed document exposing ``find_all``; defaults to the
            module-level ``soup``.

    Returns:
        Dict mapping ``'Table_<n>'`` (1-based, in search order) to the
        matching table element.
    """
    found = soup.find_all("table", {"class": "wikitable sortable"})
    return {f'Table_{position}': element for position, element in enumerate(found, start=1)}

# 'Table_<n>' -> table element, for every sortable wikitable on the page.
tbody_class_tables_dict = locate_tables(soup)

def removed_tag_tables(soup=soup, tag='', regex=''):
    """Return the cleaned text of every ``tag`` element found under ``soup``.

    Args:
        soup: Element exposing ``find_all``; defaults to the module-level
            ``soup``.
        tag: Tag name passed to ``find_all``.
        regex: A regex string, or a list of regex strings that are joined
            with ``|``; every match is deleted from each element's text.

    Returns:
        List with one cleaned string per matched element, in search order.

    Raises:
        ValueError: If ``regex`` is neither a string nor a list.
    """
    import re

    # Validate and compile before touching the tree: a bad argument fails
    # fast and the pattern is built once rather than once per node.
    if isinstance(regex, str):
        regex_in = regex
    elif isinstance(regex, list):
        regex_in = '|'.join(regex)
    else:
        raise ValueError('Use only a regex string or list of regex strings')
    pattern = re.compile(regex_in)

    return [pattern.sub('', node.get_text()) for node in soup.find_all(tag)]

def filter_tables(tables_dict):
    """Convert each table element into a list of cleaned row strings.

    For every table, the text of each ``<tr>`` is extracted and bracketed
    markers (``[...]``), non-breaking spaces, commas and a leading/trailing
    newline are removed.

    Args:
        tables_dict: Dict mapping a table key to a table element.

    Returns:
        Dict with the same keys, each mapped to a list of row strings whose
        cells are still separated by newlines.
    """
    # The bracket pattern is a raw string so "\[" is not read as an invalid
    # string escape. The other entries rely on Python expanding "\xa0" and
    # "\n", so they stay ordinary literals.
    patterns = [r"\[[^\]]*\]", '\xa0', ',', '\n$', '^\n']
    return {
        key: removed_tag_tables(table, 'tr', patterns)
        for key, table in tables_dict.items()
    }

# 'Table_<n>' -> list of cleaned row strings (cells separated by newlines).
filter_tables_dict = filter_tables(tbody_class_tables_dict)


def fill_nulls(tables_dict):
    """Mark an empty leading cell in each row with the text ``'NaN'``.

    A row string that starts with a newline has nothing before its first
    cell separator, so ``'NaN'`` is prepended to it. All other rows are
    passed through unchanged.

    Args:
        tables_dict: Dict mapping a table key to a list of row strings.

    Returns:
        New dict with the same keys and the adjusted row lists.
    """
    return {
        key: ['NaN' + row if '\n' in row[0:1] else row for row in table]
        for key, table in tables_dict.items()
    }
    
# Same structure as filter_tables_dict, with 'NaN' prepended to rows that
# start with a newline.
filtered_tables_rows = fill_nulls(filter_tables_dict)

def make_tables(tables_dict):
    """Split every row string into its non-empty cells.

    Args:
        tables_dict: Dict mapping a table key to a list of row strings whose
            cells are separated by newlines.

    Returns:
        Dict mapping each table key to a dict of ``'Row_<n>'`` (1-based)
        -> list of cell strings, with empty cells dropped.
    """
    parsed = {}
    for table_key, row_strings in tables_dict.items():
        parsed[table_key] = {
            f'Row_{number}': [cell for cell in text.split("\n") if cell != ""]
            for number, text in enumerate(row_strings, start=1)
        }
    return parsed
    
# 'Table_<n>' -> {'Row_<n>' -> list of cell strings}.
final_rows = make_tables(filtered_tables_rows)

#tables_dict = {key: tables{key: rows[values]}}

def make_dataframe(tables_dict):
    """Build one DataFrame per table, asking the user about ragged tables.

    A table whose rows all have the same number of cells is converted
    directly. Otherwise the user is prompted: answering no skips the table;
    answering yes keeps only the rows whose length equals the most common
    row length and records the others as removed.

    The first remaining row of every table becomes the column header.

    Args:
        tables_dict: Dict mapping a table key to a dict of
            ``'Row_<n>'`` -> list of cell strings.

    Returns:
        A ``(final_tables, removed_values)`` tuple. ``final_tables`` maps
        table keys to DataFrames. ``removed_values`` maps a row key to the
        dropped row; it is flat, so rows from different tables that share a
        row key overwrite each other.
    """
    tables_to_df = {}
    removed_values = {}
    for table_key, table in tables_dict.items():
        # Materialise the rows once instead of rebuilding the list per index.
        lengths = [len(row) for row in table.values()]

        if len(set(lengths)) <= 1:
            tables_to_df[table_key] = table
            continue

        keep_rows = input(f'Trable {table_key} could not be added, would you like to add the table with missing values?(y/n) \n')
        while keep_rows.lower() not in ["yes", "y", "no", "n"]:
            keep_rows = input(f'Sorry, "{keep_rows}" is not a valid input.\nTable {table_key} could not be added, would you like to add the table with missing values?(y/n)')

        if keep_rows.lower() in ("no", "n"):
            print(f'{table_key} was not be added, the length of the rows are not the same')
            continue

        # Most common row length, counted over every row (the last row is
        # included so it can influence the result like any other).
        mode = max(set(lengths), key=lengths.count)

        kept_rows = {}
        for row_key, row in table.items():
            if len(row) == mode:
                kept_rows[row_key] = row
            else:
                print(f'Popped key:{row_key} and pop value: {row}')
                # Accumulate across tables rather than starting over for
                # each ragged table, which discarded earlier removals.
                removed_values[row_key] = row
        tables_to_df[table_key] = kept_rows

    final_tables = {}
    for key, table in tables_to_df.items():
        frame = pd.DataFrame.from_dict(table).transpose().reset_index(drop=True)
        # A table without rows has no header to promote; leave it empty
        # instead of failing on iloc[0].
        if len(frame):
            frame.columns = frame.iloc[0]
            frame = frame[1:]
        final_tables[key] = frame
    return final_tables, removed_values

# make_dataframe returns a (DataFrames, removed rows) pair; unpack by index.
final_tables_tup = make_dataframe(final_rows)
final_tables = final_tables_tup[0]
removed_values = final_tables_tup[1]

def name_tables(titles, tables):
    """Re-key the tables by their title.

    Args:
        titles: Dict mapping a table key to its title.
        tables: Dict mapping a table key to its table.

    Returns:
        Dict mapping each table's title to the table. Every key of
        ``tables`` must be present in ``titles``.
    """
    final_df = {}
    for table_key, frame in tables.items():
        final_df[titles[table_key]] = frame
    return final_df


# Title -> DataFrame, for the tables that were actually converted.
final_df = name_tables(final_titles, final_tables)


# Output location: a hard-coded, machine-specific Windows directory (with a
# trailing backslash appended) and the prefix given to every output file.
path = r'C:\Users\alebe\Documents\coding-temple-jan2021\capstone\csv_tables'+'\\'
doc_title= 'World_Happiness_Report'
def finaldf_to_csv(final_df):
    """Write every table to ``<path><doc_title><table name>.csv``.

    Args:
        final_df: Dict mapping a table name to an object with ``to_csv``.

    Prints one confirmation line per table, numbered in iteration order.
    """
    for number, name in enumerate(final_df, start=1):
        target = path+doc_title+str(name)+'.csv'
        final_df[name].to_csv(target)
        print(f'Table_{number} was saved in CSV format!')
# Save every converted table as a CSV file.
finaldf_to_csv(final_df)

def removeddf_to_csv(removed_values):
    """Write the dropped rows to ``<path><doc_title>.txt``, one per line.

    Nothing is written or printed when ``removed_values`` is an empty dict.

    Args:
        removed_values: Dict mapping a row key to the removed row.
    """
    if removed_values == {}:
        return
    # Explicit UTF-8: the platform default encoding may be unable to
    # represent non-ASCII characters in the scraped rows.
    with open(path+doc_title+'.txt', 'w', encoding='utf-8') as output:
        for key, row in removed_values.items():
            output.write(f"{str(key)}: {str(row)}" + '\n')
    print(doc_title+' was saved in a TXT format!')

# Record any rows that were dropped from ragged tables.
removeddf_to_csv(removed_values)


Trable Table_4 could not be added, would you like to add the table with missing values?(y/n) 
n
Table_4 was not be added, the length of the rows are not the same
Trable Table_5 could not be added, would you like to add the table with missing values?(y/n) 
n
Table_5 was not be added, the length of the rows are not the same
Table_1 was saved in CSV format!
Table_2 was saved in CSV format!
Table_3 was saved in CSV format!


In [7]:
import os
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np



# Fetch the article. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops an HTTP error page from
# being parsed as if it were the article.
page = requests.get(url, timeout=30)
page.raise_for_status()


### FILL OUT BELOW ###
soup = BeautifulSoup(page.content, 'html.parser')

doc_title= 'World_Happiness_Report'

path_dir = r'C:\Users\alebe\Documents\coding-temple-jan2021\capstone\csv_tables'

### FILL OUT ABOVE ###
# All output files go into <path_dir>/<doc_title>.
path_join = os.path.join(path_dir, doc_title)
try:
    # exist_ok=True makes re-running the cell a no-op instead of an error.
    os.makedirs(path_join, exist_ok=True)
except OSError as error:
    # Best effort: report the problem (e.g. missing permissions) and go on.
    print(error)
# os.makedirs() returns None, so store the directory path itself in `path`.
path = path_join
    

    
def locate_tables(soup=soup, tag="table", class_type="class", class_value="wikitable sortable"):
    """Find every element matching a tag name and one attribute value.

    Args:
        soup: Parsed document exposing ``find_all``; defaults to the
            module-level ``soup``.
        tag: Tag name to search for.
        class_type: Name of the attribute to filter on.
        class_value: Value that attribute must have.

    Returns:
        Dict mapping ``'Table_<n>'`` (1-based, in search order) to the
        matching element.
    """
    found = soup.find_all(tag, {class_type: class_value})
    return {f'Table_{position}': element for position, element in enumerate(found, start=1)}

# Search on class "wikitable" instead of the function's default
# "wikitable sortable".
tbody_class_tables_dict = locate_tables(soup, "table", "class", "wikitable")
#print(tbody_class_tables_dict)

def removed_tag_tables(soup=soup, tag='', regex=''):
    """Return the cleaned text of every ``tag`` element found under ``soup``.

    Args:
        soup: Element exposing ``find_all``; defaults to the module-level
            ``soup``.
        tag: Tag name passed to ``find_all``.
        regex: A regex string, or a list of regex strings that are joined
            with ``|``; every match is deleted from each element's text.

    Returns:
        List with one cleaned string per matched element, in search order.

    Raises:
        ValueError: If ``regex`` is neither a string nor a list.
    """
    import re

    # Validate and compile before touching the tree: a bad argument fails
    # fast and the pattern is built once rather than once per node.
    if isinstance(regex, str):
        regex_in = regex
    elif isinstance(regex, list):
        regex_in = '|'.join(regex)
    else:
        raise ValueError('Use only a regex string or list of regex strings')
    pattern = re.compile(regex_in)

    return [pattern.sub('', node.get_text()) for node in soup.find_all(tag)]


def filter_tables(tables_dict):
    """Convert each table element into a list of cleaned row strings.

    For every table, the text of each ``<tr>`` is extracted and the
    following are removed: bracketed markers (``[...]``), non-breaking
    spaces, commas, a leading/trailing newline, soft hyphens, zero-width
    spaces and the Greek capital delta character.

    Args:
        tables_dict: Dict mapping a table key to a table element.

    Returns:
        Dict with the same keys, each mapped to a list of row strings whose
        cells are still separated by newlines.
    """
    # The bracket pattern is a raw string so "\[" is not read as an invalid
    # string escape. The other entries rely on Python expanding "\x..",
    # "\u...." and "\n", so they stay ordinary literals.
    patterns = [r"\[[^\]]*\]", '\xa0', ',', '\n$', '^\n', '\xad', '\u200b', '\u0394']
    return {
        key: removed_tag_tables(table, 'tr', patterns)
        for key, table in tables_dict.items()
    }

# 'Table_<n>' -> list of cleaned row strings (cells separated by newlines).
filter_tables_dict = filter_tables(tbody_class_tables_dict)

def fill_nulls(tables_dict):
    """Append ``'NaN'`` to rows that have a newline near their end.

    A row string gets ``'NaN'`` appended when a newline occurs anywhere in
    its last two characters; all other rows are passed through unchanged.

    NOTE(review): because two characters are inspected, a row whose final
    cell is a single character (ending in e.g. ``'\\n5'``) also gets
    ``'NaN'`` glued onto that cell. Confirm this is intended.

    Args:
        tables_dict: Dict mapping a table key to a list of row strings.

    Returns:
        New dict with the same keys and the adjusted row lists.
    """
    return {
        key: [row + 'NaN' if '\n' in row[-2:] else row for row in table]
        for key, table in tables_dict.items()
    }
    
# Same structure as filter_tables_dict, with 'NaN' appended to rows that
# have a newline in their last two characters.
filtered_tables_rows = fill_nulls(filter_tables_dict)
#print(filtered_tables_rows)

def make_tables(tables_dict):
    """Split every row string into its non-empty cells.

    Args:
        tables_dict: Dict mapping a table key to a list of row strings whose
            cells are separated by newlines.

    Returns:
        Dict mapping each table key to a dict of ``'Row_<n>'`` (1-based)
        -> list of cell strings, with empty cells dropped.
    """
    parsed = {}
    for table_key, row_strings in tables_dict.items():
        parsed[table_key] = {
            f'Row_{number}': [cell for cell in text.split("\n") if cell != ""]
            for number, text in enumerate(row_strings, start=1)
        }
    return parsed
    
# 'Table_<n>' -> {'Row_<n>' -> list of cell strings}.
final_rows = make_tables(filtered_tables_rows)
#print(final_rows)

def make_dataframe(tables_dict):
    """Build one DataFrame per table, asking the user about ragged tables.

    A table whose rows all have the same number of cells is converted
    directly. Otherwise the user is prompted: answering no skips the table;
    answering yes keeps only the rows whose length equals the most common
    row length and records the others as removed.

    The first remaining row of every table becomes the column header.

    Args:
        tables_dict: Dict mapping a table key to a dict of
            ``'Row_<n>'`` -> list of cell strings.

    Returns:
        A ``(final_tables, removed_tables)`` tuple. ``final_tables`` maps
        table keys to DataFrames. ``removed_tables`` maps the key of each
        ragged table the user chose to keep to a dict of
        row key -> dropped row.
    """
    tables_to_df = {}
    removed_tables = {}
    for table_key, table in tables_dict.items():
        # Materialise the rows once instead of rebuilding the list per index.
        lengths = [len(row) for row in table.values()]

        if len(set(lengths)) <= 1:
            tables_to_df[table_key] = table
            continue

        keep_rows = input(f'{table_key} could not be added, would you like to add the table with missing values?(y/n) \n')
        while keep_rows.lower() not in ["yes", "y", "no", "n"]:
            keep_rows = input(f'Sorry, "{keep_rows}" is not a valid input.\nTable {table_key} could not be added, would you like to add the table with missing values?(y/n)')

        if keep_rows.lower() in ("no", "n"):
            print(f'{table_key} was not be added, the length of the rows are not the same')
            continue

        # Most common row length, counted over every row (the last row is
        # included so it can influence the result like any other).
        mode = max(set(lengths), key=lengths.count)

        kept_rows = {}
        dropped_rows = {}
        for row_key, row in table.items():
            if len(row) == mode:
                kept_rows[row_key] = row
            else:
                print(f'Popped key:{row_key} and pop value: {row}')
                dropped_rows[row_key] = row
        tables_to_df[table_key] = kept_rows
        removed_tables[table_key] = dropped_rows

    final_tables = {}
    for key, table in tables_to_df.items():
        frame = pd.DataFrame.from_dict(table).transpose().reset_index(drop=True)
        # A table without rows has no header to promote; leave it empty
        # instead of failing on iloc[0].
        if len(frame):
            frame.columns = frame.iloc[0]
            frame = frame[1:]
        final_tables[key] = frame
    return final_tables, removed_tables

# make_dataframe returns a (DataFrames, removed rows per table) pair; unpack
# it by index.
final_tables_tup = make_dataframe(final_rows)
#print(final_tables_tup)
final_tables = final_tables_tup[0]
removed_tables = final_tables_tup[1]



def finaldf_to_csv(final_df):
    """Write every table to ``<path_dir>/<doc_title>/<doc_title>_<key>.csv``.

    Args:
        final_df: Dict mapping a table key to an object with ``to_csv``.

    Prints one confirmation line per table, naming the table by its key.
    """
    for table, frame in final_df.items():
        frame.to_csv(path_dir+'/'+doc_title+'/'+doc_title+'_'+str(table)+'.csv')
        # Report the real key: a running counter names the wrong table as
        # soon as an earlier table was skipped.
        print(f'{table} was saved in CSV format!')
# Save every converted table as a CSV file; tables are keyed 'Table_<n>'.
finaldf_to_csv(final_tables)


def removeddf_to_csv(removed_tables):
    """Write the dropped rows of each table to its own text file.

    One file ``<path_dir>/<doc_title>/<doc_title>_<table key>.txt`` is
    written per entry, with one line per removed row. Nothing is written or
    printed when ``removed_tables`` is an empty dict.

    Args:
        removed_tables: Dict mapping a table key to a dict of
            row key -> removed row.
    """
    if removed_tables == {}:
        return
    for table, rows in removed_tables.items():
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent non-ASCII characters in the scraped rows.
        with open(path_dir+'/'+doc_title+'/'+doc_title+'_'+str(table)+'.txt', 'w', encoding='utf-8') as output:
            for key, row in rows.items():
                output.write(f"table:{str(table)}, row number:{str(key)}, row content: {str(row)}" + '\n')
            print(doc_title+f' was saved in a TXT format! {table}')


# Record the rows that were dropped from ragged tables, one file per table.
removeddf_to_csv(removed_tables)




Table_4 could not be added, would you like to add the table with missing values?(y/n) 
y
Popped key:Row_4 and pop value: ['3', 'Iceland', '7.504', ' 0.003', '1.481', '1.611', '0.834', '0.627', '0.476', '0.154', '2.323']
Popped key:Row_6 and pop value: ['5', 'Finland', '7.469', ' 0.056', '1.444', '1.540', '0.809', '0.618', '0.245', '0.383', '2.430']
Popped key:Row_9 and pop value: ['8', 'New Zealand', '7.314', ' 0.020', '1.406', '1.548', '0.817', '0.614', '0.500', '0.383', '2.046']
Popped key:Row_10 and pop value: ['9', 'Australia', '7.284', ' 0.029', '1.484', '1.510', '0.844', '0.602', '0.478', '0.301', '2.065']
Popped key:Row_11 and pop value: ['10', 'Sweden', '7.284', ' 0.007', '1.494', '1.478', '0.831', '0.613', '0.385', '0.384', '2.098']
Popped key:Row_12 and pop value: ['11', 'Israel', '7.213', ' 0.054', '1.375', '1.376', '0.838', '0.406', '0.330', '0.085', '2.802']
Popped key:Row_17 and pop value: ['16', 'Germany', '6.951', ' 0.043', '1.488', '1.473', '0.799', '0.563', '0.336', '