Sydney Greenspun

In [1]:
import re
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup
# Source: Wikipedia's list of languages by total number of speakers. Per the
# original note, it covers languages with 50 million or more speakers together
# with their language families, and it is the base of this dataset. Some
# entries are listed under dialect-specific names, which are collapsed to the
# plain language name below so they line up with the other sources.
all_languages = {}

# Name as it appears in the Wikipedia table -> name used for merging.
LANGUAGE_ALIASES = {
    "Modern Standard Arabic": "Arabic",
    "Mandarin Chinese": "Chinese",
    "Standard German": "German",
    "Western Punjabi": "Punjabi",
    "Iranian Persian": "Persian",
}

try:
    # The context manager closes the HTTP response once the body is read.
    with urlopen('https://en.wikipedia.org/wiki/List_of_languages_by_total_number_of_speakers') as html:
        page = html.read()
except URLError as e:
    # URLError also covers HTTPError (its subclass), so connection/DNS
    # failures are reported the same way as HTTP status errors.
    print(e)
else:
    bs = BeautifulSoup(page, 'html.parser')
    table = bs.find('table')
    # The first two rows are skipped (header rows in the source table).
    for row in table.find_all('tr')[2:]:
        cols = row.find_all(['th', 'td'])
        if len(cols) < 6:
            # Not a full data row; the cells indexed below would be missing.
            continue
        # Drop parenthetical notes and footnote markers such as "[b]", then
        # strip. Stripping last matters: removing "(...)" from "Name (note)"
        # would otherwise leave a trailing space that defeats the alias lookup.
        lang = re.sub(r"\(.*?\)|\[.*?\]", "", cols[0].text).strip()
        lang = LANGUAGE_ALIASES.get(lang, lang)
        all_languages[lang] = {
            "Family": cols[1].getText(strip=True),
            "Branch": cols[2].getText(strip=True),
            "Speakers (in millions)": cols[5].getText(strip=True),
        }


In [2]:
# Source: WorldData.info. The table lists languages and the number of
# countries in which each one is spoken.
spoken_list = {}

try:
    # The context manager closes the HTTP response once the body is read.
    with urlopen('https://www.worlddata.info/languages/') as html2:
        page2 = html2.read()
except URLError as e:
    # URLError also covers HTTPError (its subclass).
    print(e)
else:
    bs2 = BeautifulSoup(page2, 'html.parser')
    table = bs2.find('table')
    # The first row is skipped (header row).
    for row in table.find_all('tr')[1:]:
        cols = row.find_all('td')
        if len(cols) < 2:
            # Row without both a language cell and a count cell.
            continue
        # The count is the first whitespace-separated token of the second
        # cell. split() with no argument also tolerates leading whitespace
        # and non-breaking spaces.
        count_tokens = cols[1].text.split()
        if not count_tokens:
            continue
        # Strip the key so it matches the language names from other sources.
        language = cols[0].text.strip()
        spoken_list[language] = {"Countries": int(count_tokens[0])}



In [5]:
# Source: the Unicode CLDR supplemental chart of languages and their script(s).
#
# Author's notes: a language cell can span several rows when the language has
# more than one script, so the follow-on rows have fewer cells and carry only
# the script information. ChatGPT was used to help debug this; it was given a
# general description of the situation (not "extract the languages and
# scripts from this site"), and its suggestion only captured the first script,
# so the logic was adapted by hand to collect every script and to drop
# extinct ones.
#
# Known limitation (from the author): when several scripts of one language are
# marked "N", only one of them is removed because of another row-span issue.
language_scripts = {}

try:
    # The context manager closes the HTTP response once the body is read.
    with urlopen('https://www.unicode.org/cldr/charts/45/supplemental/languages_and_scripts.html') as html3:
        page3 = html3.read()
except URLError as e:
    # URLError also covers HTTPError (its subclass).
    print(e)
else:
    bs3 = BeautifulSoup(page3, 'html.parser')
    table = bs3.find_all('table')
    # The scripts chart is taken to be the fifth table on the page.
    scripts_table = table[4]
    current_language = None
    for row in scripts_table.find_all("tr")[1:]:
        cols = row.find_all(["th", "td"])
        if not cols:
            continue
        first_cell = cols[0].getText(strip=True)
        # Rows with at least six cells are treated as full rows that start
        # with the language name; shorter rows are continuation rows.
        is_full_row = len(cols) >= 6
        if is_full_row and first_cell:
            current_language = first_cell
        # "N" in the last cell marks a script to leave out (extinct, per the
        # author's note). The language above is still updated for such rows.
        if cols[-1].getText(strip=True) == "N":
            continue
        if current_language is None:
            # Continuation row seen before any language row: nothing to
            # attach the script to.
            continue
        # Full rows hold the script name in the fifth cell; continuation rows
        # hold it in their first cell.
        script = cols[4].getText(strip=True) if is_full_row else first_cell
        language_scripts.setdefault(current_language, {}).setdefault("script", []).append(script)





In [6]:
# Source: Wikipedia's list of ISO 639 language codes, used to map each
# language name to its ISO code.
iso_codes = {}

try:
    # The context manager closes the HTTP response once the body is read.
    with urlopen('https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes') as html4:
        page4 = html4.read()
except URLError as e:
    # URLError also covers HTTPError (its subclass).
    print(e)
else:
    bs4 = BeautifulSoup(page4, 'html.parser')
    table = bs4.find('table')
    # The first two rows are skipped (header rows in the source table).
    for row in table.find_all("tr")[2:]:
        cols = row.find_all(["th", "td"])
        if len(cols) < 2:
            # Row without both a name cell and a code cell.
            continue
        # Keep only the part of the name before the first comma.
        language = cols[0].getText(strip=True).split(",", 1)[0]
        # Strip the code so it carries no stray whitespace from the cell.
        iso_codes[language] = {"ISO-Code": cols[1].getText(strip=True)}


In [8]:
def _as_records(table):
    """Turn ``{language: {field: value}}`` into a list of row dicts.

    Each row dict carries the outer key under 'language' followed by the
    inner fields, which is the shape ``pd.DataFrame`` expects for records.
    """
    rows = []
    for name, fields in table.items():
        rows.append({'language': name, **fields})
    return rows


# One record list per scraped source.
iso = _as_records(iso_codes)
language_facts = _as_records(all_languages)
spoken = _as_records(spoken_list)
scripts_list = _as_records(language_scripts)

# One DataFrame per source, all keyed by the 'language' column.
iso_df = pd.DataFrame(iso)
language_facts_df = pd.DataFrame(language_facts)
spoken_df = pd.DataFrame(spoken)
scripts_df = pd.DataFrame(scripts_list)

# The Wikipedia speaker list is the base: the right join keeps every one of
# its languages, and the left joins then attach countries and scripts where a
# matching name exists.
merged = iso_df.merge(language_facts_df, on='language', how='right')
merged2 = merged.merge(spoken_df, on='language', how='left')
final = merged2.merge(scripts_df, on='language', how='left')
final


Unnamed: 0,language,ISO-Code,Family,Branch,Speakers (in millions),Countries,script
0,English,en,Indo-European,Germanic,1528,74.0,"[Latin, Shavian]"
1,Chinese,zh,Sino-Tibetan,Sinitic,1184,27.0,"[Bopomofo, Simplified, Traditional]"
2,Hindi,hi,Indo-European,Indo-Aryan,609,9.0,"[Devanagari, Latin]"
3,Spanish,es,Indo-European,Romance,558,36.0,[Latin]
4,Arabic,ar,Afro-Asiatic,Semitic,335,35.0,"[Arabic, Syriac]"
5,French,fr,Indo-European,Romance,312,51.0,[Latin]
6,Bengali,bn,Indo-European,Indo-Aryan,284,4.0,
7,Portuguese,pt,Indo-European,Romance,267,18.0,[Latin]
8,Russian,ru,Indo-European,Balto-Slavic,253,22.0,[Cyrillic]
9,Indonesian,id,Austronesian,Malayo-Polynesian,252,,"[Arabic, Latin]"
