# Intro and maps

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import geopandas as gpd
import zipfile
from io import BytesIO

In [None]:
# IPython shell escape: download glottolog_languoid.csv.zip into the current
# working directory. The next cell opens the archive by that file name.
# NOTE(review): re-running this cell downloads the archive again; wget then
# typically saves it under a numbered name rather than overwriting — confirm.
! wget https://cdstar.eva.mpg.de/bitstreams/EAEA0-7EA2-D308-CD6E-0/glottolog_languoid.csv.zip

In [None]:
# Load the language sample and the downloaded languoid table, then attach a
# top-level family name to every languoid that appears in the sample.
langs = pd.read_csv('language_data.csv')

# Read the first .csv member of the archive. Fail loudly if there is none
# instead of hitting a NameError further down.
lang_geo = None
with zipfile.ZipFile("glottolog_languoid.csv.zip", "r") as archive:
    for member in archive.namelist():
        if member.endswith('.csv'):
            with archive.open(member) as handle:
                lang_geo = pd.read_csv(handle)
            break
if lang_geo is None:
    raise FileNotFoundError(
        "no .csv member found in glottolog_languoid.csv.zip"
    )

# Top-level families: rows whose level is 'family' and that have no parent.
top_families = lang_geo[
    (lang_geo['level'] == 'family') & (lang_geo['parent_id'].isna())
]
fam_code_to_name = dict(zip(top_families['id'], top_families['name']))

# Keep only languoids whose ISO 639-3 code occurs in the sample. The explicit
# .copy() makes the result an independent frame, so the column assignments
# below are real writes rather than chained assignments on a slice.
lang_geo = lang_geo[lang_geo.iso639P3code.isin(langs.SILCode)].copy()

# Rows without a family id are labelled 'isolate'. Plain assignment is used
# because `df[col].fillna(..., inplace=True)` does not reliably update the
# frame (it is a no-op under pandas Copy-on-Write).
lang_geo['family_id'] = lang_geo['family_id'].fillna('isolate')
lang_geo['family_name'] = [
    'isolate' if fam_id == 'isolate' else fam_code_to_name[fam_id]
    for fam_id in lang_geo['family_id']
]

# ISO code -> family name. When a code occurs on several rows the first row
# wins, as before, but without re-filtering the frame once per row.
SIL_to_family = (
    lang_geo.drop_duplicates('iso639P3code')
    .set_index('iso639P3code')['family_name']
    .to_dict()
)

In [None]:
# Draw every sampled language on a world map, one colour per language family.
# NOTE(review): geopandas.datasets was deprecated in geopandas 0.14 and removed
# in 1.0; on newer versions this line needs another source for the Natural
# Earth low-resolution countries layer.
worldmap = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))

# Families ordered by number of sampled languages (value_counts order).
fam_data = lang_geo['family_name'].value_counts().to_dict()

# Seeded random palette. It is sized from the data so that a sample with more
# than 16 families no longer runs off the end of the list, and it never has
# fewer than the original 16 entries because `colors` is reused by later cells.
np.random.seed(28)
n_colors = max(16, len(fam_data))
color_picks = np.random.randint(0, 0xFFFFFF, size=n_colors)
colors = [f"#{int(pick):06x}" for pick in color_picks]

fig, ax = plt.subplots(figsize=(12, 6))
worldmap.plot(color="lightgrey", ax=ax)

for fam_color, fam in zip(colors, fam_data):
    # Select the family's rows once and reuse them for both coordinates.
    members = lang_geo[lang_geo.family_name == fam]
    ax.scatter(
        members['longitude'],
        members['latitude'],
        s=150,
        c=fam_color,
        label=fam,
        alpha=0.7,
        edgecolors='none',
    )

ax.legend(loc="lower center", ncol=5)

# Fix the view to the full longitude/latitude range.
ax.set_xlim([-180, 180])
ax.set_ylim([-90, 90])

ax.set_title("Language sample, color-coded by language family")
plt.show()

In [None]:
# Pie chart of how the sampled languages are distributed over families.
family_counts = lang_geo['family_name'].value_counts()
fam_data = family_counts.to_dict()
labels, sizes = fam_data.keys(), fam_data.values()

# Sets the default figure size globally, so later figures are affected too.
plt.rcParams["figure.figsize"] = (7, 7)

pie_options = dict(
    labels=labels,
    autopct='%1.1f%%',
    startangle=100,
    colors=colors,
    wedgeprops={'alpha': 0.6},
)
plt.pie(sizes, **pie_options)
plt.show()

In [None]:
# Collect the language names of each family, in the order given by `labels`,
# and print one "FAMILY: name, name, ..." line per family.
lang_per_fam = {}
for fam_label in labels:
    in_family = lang_geo['family_name'] == fam_label
    lang_per_fam[fam_label] = lang_geo.loc[in_family, 'name']

for family, members in lang_per_fam.items():
    print(f"{family.upper()}:", ', '.join(members))