Sydney Greenspun

In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import re
from urllib.request import urlopen
from urllib.error import HTTPError
all_languages = {}
# Source: the Wikipedia list of languages with 50 million or more speakers,
# which also gives each language's family and branch. Some rows carry
# dialect-qualified names; those are collapsed to the plain language name
# below. This table is the base of the dataset.

# Dialect-qualified names as they appear in the table -> plain name used as
# the dataset key.
LANGUAGE_ALIASES = {
    "Modern Standard Arabic": "Arabic",
    "Mandarin Chinese": "Chinese",
    "Standard German": "German",
    "Western Punjabi": "Punjabi",
    "Iranian Persian": "Persian",
}

try:
    # The context manager closes the HTTP response once the body is read.
    with urlopen('https://en.wikipedia.org/wiki/List_of_languages_by_total_number_of_speakers') as html:
        languages_page = html.read()
except HTTPError as e:
    print(e)
else:
    bs = BeautifulSoup(languages_page, 'html.parser')
    table = bs.find('table')
    # The first two rows are skipped (presumably header rows -- confirm
    # against the live page if the layout changes).
    for row in table.find_all('tr')[2:]:
        cols = row.find_all(['th', 'td'])
        # Columns 0, 1, 2 and 5 are read below; skip rows too short to hold them.
        if len(cols) < 6:
            continue
        # Remove parenthetical qualifiers and footnote markers such as "[b]",
        # then normalise whitespace AFTER the removal so that
        # "Name (qualifier)" does not leave a trailing space in the key.
        lang = re.sub(r"\(.*?\)|\[.*?\]", "", cols[0].text)
        lang = " ".join(lang.split())
        lang = LANGUAGE_ALIASES.get(lang, lang)
        if not lang:
            continue
        all_languages[lang] = {
            "Family": cols[1].getText(strip=True),
            "Branch": cols[2].getText(strip=True),
            "Speakers (in millions)": cols[5].getText(strip=True),
        }


In [3]:
spoken_list = {}
# Source: WorldData's language overview, a table of languages and the number
# of countries in which each one is spoken.
try:
    # The context manager closes the HTTP response once the body is read.
    with urlopen('https://www.worlddata.info/languages/') as html2:
        worlddata_page = html2.read()
except HTTPError as e:
    print(e)
else:
    bs2 = BeautifulSoup(worlddata_page, 'html.parser')
    table = bs2.find('table')
    # The first row is skipped (presumably the header row).
    for row in table.find_all('tr')[1:]:
        cols = row.find_all('td')
        # Rows without two data cells (e.g. header-only rows) carry no data.
        if len(cols) < 2:
            continue
        # `.text` is kept here: the original author noted that getText did
        # not work on this page. Strip it so the key joins cleanly later.
        name = cols[0].text.strip()
        # The count is the first whitespace-separated token of the second
        # cell; split() with no argument tolerates leading spaces/newlines.
        count_tokens = cols[1].text.split()
        if not name or not count_tokens or not count_tokens[0].isdigit():
            continue
        spoken_list[name] = {"Countries": int(count_tokens[0])}



In [5]:
iso_codes = {}
# Source: the Wikipedia list of ISO 639 language codes, a table pairing each
# language name with its ISO code.
try:
    # The context manager closes the HTTP response once the body is read.
    with urlopen('https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes') as html4:
        iso_page = html4.read()
except HTTPError as e:
    print(e)
else:
    bs4 = BeautifulSoup(iso_page, 'html.parser')
    table = bs4.find('table')
    # The first two rows are skipped (presumably header rows).
    for row in table.find_all("tr")[2:]:
        cols = row.find_all(["th", "td"])
        # Both the name cell and the code cell are required.
        if len(cols) < 2:
            continue
        full_name = cols[0].getText(strip=True)
        # Names written as "X, Y" are keyed by the part before the first comma.
        language = full_name.split(",", 1)[0]
        # Strip the code so whitespace/newlines inside the cell are not
        # carried into the dataset.
        iso_codes[language] = {"ISO-Code": cols[1].getText(strip=True)}


In [29]:
new_scripts = {}
# Source: the Simple English Wikipedia list of languages by writing system.
# Each script is a section heading and the languages written in it are
# listed below that heading; every language is mapped to the heading(s) it
# appears under.
HEADING_CLASSES = ['mw-heading2', 'mw-heading3']
try:
    # The context manager closes the HTTP response once the body is read.
    with urlopen('https://simple.wikipedia.org/wiki/List_of_languages_by_writing_system') as html5:
        scripts_page = html5.read()
except HTTPError as e:
    print(e)
else:
    bs5 = BeautifulSoup(scripts_page, 'html.parser')
    for heading in bs5.find_all("div", class_=HEADING_CLASSES):
        script = heading.getText(strip=True)
        # Drop bracketed suffixes such as edit-link text.
        script = re.sub(r'\[.*\]', '', script)
        # NOTE(review): only the first word of the heading is kept, so
        # multi-word headings beginning with "Old" all collapse to "Old".
        script = script.split(" ")[0]
        if script == "ArabicScript":
            script = "Arabic"
        # Find the list that belongs to THIS heading: stop at the next
        # section heading so a heading without its own list does not pick up
        # the list of a later section.
        ul = None
        for sibling in heading.find_next_siblings():
            sibling_classes = sibling.get("class") or []
            if sibling.name == "div" and any(c in HEADING_CLASSES for c in sibling_classes):
                break
            if sibling.name == "ul":
                ul = sibling
                break
        if not ul:
            continue
        for li in ul.find_all('li'):
            # Remove parenthetical notes, then strip so no trailing space is
            # left in the language key.
            lang = re.sub(r"\(.*?\)", "", li.getText(strip=True)).strip()
            if lang:
                new_scripts.setdefault(lang, {}).setdefault("script", []).append(script)







Arabic
Armenian
Borama
Brahmic
Devanagari
Assamese/Bengali
Balinese
Baybayin
Buhid
Burmese
Gujarati
Gurmukhi
Hanunó'o
Javanese
Kannada
Khmer
Lao
Lepcha
Limbu
Lontara
Malayalam
Oriya
'Phags-pa
Sinhala
Tagbanwa
Tamil
Telugu
Thaana
Thai
Tibetan
Canadian
Cherokee
Constructed
Aiha
Argpal
Cirth
Ewellic
Klingon
Tengwar
Tolianem
Coptic
Cyrillic
Bosnian
Saba
Georgian
Glagolitic
Gothic
Greek
Chinese
Hangul
Hebrew
Jurchen
Kaddare
Kana
Khitan
Latin
Mayan
Mongolian
Old
Mongolian
Manchu
Munda
Sorang
Ol
Warang
N'Ko
Naxi
Nsibidi
Ogham
Osmanya
Pahawh
Runic
Old
Orkhon
Syriac
Tifinagh
Yi
References


In [32]:
def _language_records(mapping):
    """Return one row dict per language: its name under 'language' plus its fields."""
    rows = []
    for name, fields in mapping.items():
        row = {'language': name}
        row.update(fields)
        rows.append(row)
    return rows


# Flatten each scraped {language: {field: value}} mapping into row records.
iso = _language_records(iso_codes)
language_facts = _language_records(all_languages)
spoken = _language_records(spoken_list)
scripts_list = _language_records(new_scripts)

iso_df = pd.DataFrame(iso)
language_facts_df = pd.DataFrame(language_facts)
spoken_df = pd.DataFrame(spoken)
scripts_df = pd.DataFrame(scripts_list)

# The right join keeps every language from the speakers table even when it
# has no ISO row; the two left joins then attach the optional columns.
merged = iso_df.merge(language_facts_df, on='language', how='right')
merged2 = merged.merge(spoken_df, on='language', how='left')
final = merged2.merge(scripts_df, on='language', how='left')

# Write the combined table without the DataFrame index.
final.to_csv('/Users/sbg/PycharmProjects/language-wiki/data/scraped_data.csv', index=False)
