In [2]:
import numpy as np
import pandas as pd
import re
import json
import random

# Source.bib preprocessing

In [19]:
# download source.bib from https://github.com/cldf-datasets/glottolog-cldf/blob/master/cldf/sources.bib

In [2]:
with open('./source.bib', encoding='utf-8') as f:
    text = f.read()

re_type_id = re.compile(r'@(\w+?)\{(\d+?)\,')
re_dict = re.compile(r'(\w+?-*\w+?)\s+?=\s+?(.*?)"')

r1 = text.replace('"', "'").replace("\\", "/").replace("},\n", """ ", """).replace("}\n}", """ "$ """)
r1 = re_type_id.sub(r'$"bibtype": "\1", "id": "\2", ', r1)
r1 = r1.replace("{", '').replace("}", '').replace('$ $', '}, {')
r2 = re_dict.sub(r""" "\1": "\2" """, r1)
r3 = "[{" + r2[1:-2] + "}]"

json_text = json.loads(r3)
df = pd.json_normalize(json_text) 
df.shape

(408600, 223)

In [3]:
df_gr = df[df['hhtype'].isin(['grammar_sketch ', 'grammar '])]
df_gr = df_gr.dropna(axis=1, how='all')
df_gr = df_gr[df_gr['lgcode'].notna()]
df_gr.shape

(15919, 156)

In [4]:
re_lng = re.compile(r'\[\w{3}\]')
def lgcode_count(l):
    lngs = re_lng.findall(l)
    return len(lngs)

df_gr['count'] = df_gr['lgcode'].apply(lgcode_count)
df_gr['count'].value_counts()

1     15206
0       392
2       215
3        60
4        21
5        10
6         9
7         3
15        1
9         1
8         1
Name: count, dtype: int64

In [5]:
cols = [
        'title', 'author', 'year', 'hhtype', 'pages', 'inlg', 'lgcode', 'macro_area', 'src',
        'isbn', 'glottolog_ref', 'howpublished', 'url', 'doi', 'shelf_location', 
        'guldemann_location', 'goba_namedlocation'
       ]
df_gr1 = df_gr[df_gr['count'] == 1][cols]

re_lng = re.compile(r'\[\w{3}\]')
def get_lgcode(l):
    try:
        lngs = re_lng.findall(l)
        return lngs[0][1:-1]
    except:
        return ''
    
df_gr1['lgcode_iso3'] = df_gr1['lgcode'].apply(get_lgcode)
df_gr1['inlg_iso3'] = df_gr1['inlg'].apply(get_lgcode)

In [6]:
rom_val = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}

def to_int(list_of_s):
    list_of_int = []
    for s in list_of_s:
        if s.isalpha():
            s = s.upper()
            int_val = 0
            for i in range(len(s)):
                if i > 0 and rom_val[s[i]] > rom_val[s[i - 1]]:
                    int_val += rom_val[s[i]] - 2 * rom_val[s[i - 1]]
                else:
                    int_val += rom_val[s[i]]
            list_of_int.append(int_val)
        else:
            list_of_int.append(int(s))
    return list_of_int

def pages_minus(s):
    p_minus = to_int(s.split('-'))
    pm = p_minus[1] - p_minus[0] + 1
    return pm

def pages_to_int(s):
    try:
        s = s.strip().replace('–', '-')
        parts = s.split(', ')
        pages_total = 0
        for p in parts:
            if '+' in p:
                s_plus = s.split('+')
                for sp in s_plus:
                    if '-' in sp:
                        pages_total += pages_minus(sp)
                    else:
                        pages_total += to_int([sp])[0]
            elif '-' in p:
                pages_total += pages_minus(p)
            else:
                pages_total += to_int([p])[0]
        return pages_total
    except:
        return 0
    

df_gr1['pages'] = df_gr1['pages'].fillna('0')
df_gr1['pages_sum'] = df_gr1['pages'].apply(pages_to_int)
df_gr1 = df_gr1[df_gr1['pages_sum'] != 0]

In [7]:
inlgs = ['eng', 'rus', 'fra']
df_gr1_in = df_gr1[df_gr1['inlg_iso3'].isin(inlgs)]

In [8]:
groups_lg = df_gr1_in.groupby(by=['lgcode_iso3'], as_index = False)
only_max = groups_lg.apply(lambda g: g[g['pages_sum'] == g['pages_sum'].max()])
only_max = only_max.drop_duplicates(subset=['lgcode_iso3'], keep='last')
cols = list(set(only_max.columns) - set(['glottolog_ref', 'howpublished', 'shelf_location', 'goba_namedlocation']))
only_max = only_max[cols]
only_max.shape

(3657, 16)

In [437]:
# only_max.to_csv('source_only_max.csv', index=False)

# WALS (?+ Glottolog)

In [422]:
# current version
wals_lngs_url = 'https://raw.githubusercontent.com/cldf-datasets/wals/master/cldf/languages.csv'
#glottolog_lngs_url = 'https://raw.githubusercontent.com/cldf-datasets/glottolog-cldf/master/cldf/languages.csv'

In [423]:
df_wals_lngs = pd.read_csv(wals_lngs_url).loc[:2661, :]
#df_glot_lngs = pd.read_csv(glottolog_lngs_url)

In [424]:
df_wals_lngs.shape

(2662, 17)

In [425]:
# not considering sign languages, creoles&pigins
df_wals_lngs = df_wals_lngs[df_wals_lngs['Family'] != 'other']

In [426]:
df_wals_lngs

Unnamed: 0,ID,Name,Macroarea,Latitude,Longitude,Glottocode,ISO639P3code,Family,Subfamily,Genus,GenusIcon,ISO_codes,Samples_100,Samples_200,Country_ID,Source,Parent_ID
0,aab,Arapesh (Abu),Papunesia,-3.450000,142.950000,abua1245,aah,Torricelli,,Kombio-Arapesh,,aah,False,False,PG,Nekitel-1985,genus-kombioarapesh
1,aar,Aari,Africa,6.000000,36.583333,aari1239,aiw,Afro-Asiatic,Omotic,South Omotic,,aiw,False,False,ET,Hayward-1990a,genus-southomotic
2,aba,Abau,Papunesia,-4.000000,141.250000,abau1245,aau,Sepik,,Abau,,aau,False,False,PG,Bailey-1975,genus-abau
3,abb,Arabic (Chadian),Africa,13.833333,20.833333,chad1249,shu,Afro-Asiatic,,Semitic,,shu,False,False,TD,Abu-Absi-1995,genus-semitic
4,abd,Abidji,Africa,5.666667,-4.583333,abid1235,abi,Niger-Congo,Kwa,Agneby,,abi,False,False,CI,,genus-agneby
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2657,zte,Zapotec (Texmelucan),North America,16.500000,-97.166667,texm1235,zpz,Oto-Manguean,,Zapotecan,,zpz,False,False,MX,,genus-zapotecan
2658,zul,Zulu,Africa,-30.000000,30.000000,zulu1248,zul,Niger-Congo,Benue-Congo,Bantu,,zul,True,True,ZA,Canonici-1989 Canonici-1995 Cope-1982 Dahl-198...,genus-bantu
2659,zun,Zuni,North America,35.083333,-108.833333,zuni1245,zun,Zuni,,Zuni,,zun,False,False,US,Bunzel-1933-1938 Bybee-et-al-1994 Cook-1975 Mi...,genus-zuni
2660,zya,Zapotec (Yatzachi),North America,17.200000,-96.200000,yatz1235,zav,Oto-Manguean,,Zapotecan,,zav,False,False,MX,Butler-1980,genus-zapotecan


In [427]:
df_wals_lngs.shape

(2590, 17)

In [428]:
# Genus Sample (by Miestamo, 2015)
# considering ISO_codes with comma and without any value (will be dropped when merged with sources)
gs = df_wals_lngs.groupby('Macroarea')['Genus'].nunique()
gs

Macroarea
Africa           124
Australia         38
Eurasia           87
North America    106
Papunesia        165
South America    110
Name: Genus, dtype: int64

In [429]:
df_wals_gr_area_fam = df_wals_lngs.groupby(['Macroarea', 'Family'])['Genus'].count()

In [145]:
df_wals_gr_area_fam['Australia']

Family
Bunuban               2
Darwin Region         2
Eastern Daly          2
Gaagudju              1
Garrwan               1
Gunwinyguan           9
Iwaidjan              2
Jarrakan              2
Mangarrayi-Maran      4
Mangrida              4
Mirndi                3
Northern Daly         2
Nyulnyulan            5
Pama-Nyungan        120
Southern Daly         3
Tangkic               3
Tasmanian             2
Tiwian                1
Wagiman               1
Wandjiginy            2
Western Daly          4
Worrorran             3
Yangmanic             1
Name: Genus, dtype: int64

In [146]:
sum(gs)

630

In [151]:
df_wals_lngs.columns

Index(['ID', 'Name', 'Macroarea', 'Latitude', 'Longitude', 'Glottocode',
       'ISO639P3code', 'Family', 'Subfamily', 'Genus', 'GenusIcon',
       'ISO_codes', 'Samples_100', 'Samples_200', 'Country_ID', 'Source',
       'Parent_ID'],
      dtype='object')

In [152]:
df_wals_lngs[df_wals_lngs['ISO639P3code'] != df_wals_lngs['ISO_codes']].shape

(958, 17)

In [153]:
df_wals_lngs[df_wals_lngs['ISO639P3code'].notna()][df_wals_lngs['ISO_codes'].isna()]

  df_wg[df_wg['ISO639P3code'].notna()][df_wg['ISO_codes'].isna()]


Unnamed: 0,ID,Name,Macroarea,Latitude,Longitude,Glottocode,ISO639P3code,Family,Subfamily,Genus,GenusIcon,ISO_codes,Samples_100,Samples_200,Country_ID,Source,Parent_ID


In [154]:
df_wals_lngs = df_wals_lngs.drop(['GenusIcon'], axis=1)

In [155]:
df_wals_lngs.shape

(3501, 16)

In [156]:
df_wals_lngs.rename(columns={'ISO639P3code': 'lgcode_iso3'}, inplace=True)

# WALS (+ Glot) + Source

In [400]:
df_wals_maxsource = df_wals_lngs.merge(only_max, on='lgcode_iso3', how='inner')

In [158]:
cs = df_wals_maxsource.groupby('Macroarea')['Genus'].nunique()
cs

Macroarea
Africa           116
Australia         36
Eurasia           85
North America     97
Papunesia        140
South America     82
Name: Genus, dtype: int64

In [159]:
sum(cs)

556

In [160]:
cs / gs

Macroarea
Africa           0.935484
Australia        0.947368
Eurasia          0.977011
North America    0.915094
Papunesia        0.848485
South America    0.745455
Name: Genus, dtype: float64

In [161]:
max_per = min(cs/gs)
max_lgs = np.floor(gs * max_per)

In [162]:
print(sum(max_lgs))
max_lgs

468.0


Macroarea
Africa            92.0
Australia         28.0
Eurasia           64.0
North America     79.0
Papunesia        123.0
South America     82.0
Name: Genus, dtype: float64

In [165]:
sample_size = 300
macroarea_numlg300 = dict(np.floor(gs * sample_size / sum(gs)))
macroarea_numlg300

{'Africa': 59.0,
 'Australia': 18.0,
 'Eurasia': 41.0,
 'North America': 50.0,
 'Papunesia': 78.0,
 'South America': 52.0}

In [166]:
sum(marcoarea_numlg300.values())

298

# Sample 300 (family balanced)

In [401]:
def get_families_lists(one_from_genus):
    families = one_from_genus.sort_values(by=['pages_sum'], ascending=False)['Family'].drop_duplicates().values
    families_list = []
    for f in families:
        f_df = one_from_genus[one_from_genus['Family'] == f].sort_values(by=['pages_sum'], ascending=False)
        families_list.append(f_df.values.tolist())
    fanilies_list = sorted(families_list, key=lambda x: x[0][19], reverse=False)
    return families_list

In [402]:
one_from_genus = df_wals_maxsource.groupby(by='Genus').apply(lambda g: g[g['pages_sum'] == g['pages_sum'].max()])
one_from_genus = one_from_genus.drop_duplicates(subset=['lgcode_iso3'], keep='last')

lngs_all = {}
for macroarea in macroarea_numlg300:
    max_lgs = int(macroarea_numlg300[macroarea])
    macroarea_df = one_from_genus[one_from_genus['Macroarea'] == macroarea]
    lngs = []
    families_list = get_families_lists(macroarea_df)
    while len(lngs) != max_lgs:
        for family in families_list:
            if family:
                lng = family.pop(0)
                lngs.append(lng)
                if len(lngs) == max_lgs:
                    break
    lngs_all[macroarea] = lngs

In [405]:
len([l for f in lngs_all.values() for l in f])

298

In [406]:
df_sample_fixed = pd.DataFrame([l for f in lngs_all.values() for l in f], columns=df_wals_maxsource.columns)

In [267]:
cols_order = ['lgcode_iso3', 'Name', 'Family', 'Genus', 'Macroarea', 
              'title', 'author', 'year', 'pages_sum', 'inlg_iso3', 'hhtype', 'src',
              'isbn',  'url', 'guldemann_location',  'doi', 
              'Latitude', 'Longitude', 'Glottocode', 'ID', 
              'Country_ID', 'Source', 'Subfamily', 'Parent_ID',
           ]

In [407]:
df_sample_fixed = df_sample_fixed[cols_order]

In [408]:
df_sample_fixed_gr = df_sample_fixed.groupby(['Macroarea', 'Family'])['Genus'].count()

In [410]:
df_sample_fixed_gr['Australia']

Family
Bunuban             1
Gaagudju            1
Garrwan             1
Gunwinyguan         1
Iwaidjan            1
Jarrakan            1
Mangarrayi-Maran    1
Mangrida            1
Mirndi              1
Nyulnyulan          1
Pama-Nyungan        1
Southern Daly       1
Tangkic             1
Tiwian              1
Wagiman             1
Western Daly        1
Worrorran           1
Yangmanic           1
Name: Genus, dtype: int64

In [411]:
df_sample_fixed_gr['Africa']

Family
Afro-Asiatic       8
Austronesian       1
Bangime            1
Berta              1
Central Sudanic    6
Dogon              1
Eastern Sudanic    8
Fur                1
Gumuz              1
Hadza              1
Ijoid              1
Kadu               1
Khoe-Kwadi         1
Koman              1
Kordofanian        3
Kunama             1
Kxa                2
Laal               1
Maban              1
Mande              2
Niger-Congo        9
Saharan            2
Sandawe            1
Shabo              1
Siamou             1
Songhay            1
Tu                 1
Name: Genus, dtype: int64

In [412]:
df_sample_fixed_gr['South America']

Family
Alacalufan          1
Andoke              1
Arauan              1
Araucanian          1
Arawakan            1
Aymaran             1
Barbacoan           1
Boran               1
Bororoan            1
Cacua-Nukak         1
Cahuapanan          1
Camsá               1
Cariban             1
Cayuvava            1
Chapacura-Wanham    1
Chibchan            1
Choco               1
Chonan              1
Fulniô              1
Guahiban            1
Guaicuruan          1
Hobitu-Cholon       1
Itonama             1
Jivaroan            1
Kwaza               1
Macro-Ge            1
Mascoian            1
Mochica             1
Mosetenan           1
Movima              1
Mura                1
Nadahup             1
Nambikuaran         1
Pano-Tacanan        1
Peba-Yaguan         1
Puquina             1
Páezan              1
Quechuan            1
Sáliban             1
Ticuna              1
Trumai              1
Tucanoan            1
Tupian              1
Urarina             1
Uru-Chipaya         1
Wao

In [413]:
df_sample_fixed_gr['North America']

Family
Algic           2
Atakapa         1
Caddoan         1
Chibchan        1
Chimakuan       1
Chitimacha      1
Chumash         1
Coahuiltecan    1
Eskimo-Aleut    2
Esselen         1
Haida           1
Hokan           1
Iroquoian       1
Kalapuyan       1
Karankawa       1
Keresan         1
Kiowa-Tanoan    1
Kutenai         1
Mayan           1
Misumalpan      1
Mixe-Zoque      1
Muskogean       1
Na-Dene         2
Natchez         1
Oregon Coast    1
Oto-Manguean    2
Penutian        1
Salishan        2
Siouan          2
Takelma         1
Tarascan        1
Timucua         1
Tol             1
Tonkawa         1
Totonacan       1
Tsimshianic     1
Tunica          1
Uto-Aztecan     2
Wakashan        1
Wappo-Yukian    1
Washo           1
Yuchi           1
Zuni            1
Name: Genus, dtype: int64

In [414]:
df_sample_fixed_gr['Papunesia']

Family
Abun                     1
Asmat-Kamrau Bay         1
Austronesian             6
Baibai-Fas               1
Baining                  1
Border                   1
Duna-Bogaya              1
East Bird's Head         1
Eastern Trans-Fly        1
Eleman                   1
Geelvink Bay             1
Greater West Bomberai    3
Hatim-Mansim             1
Kamula-Elevala           1
Keram                    1
Kolopom                  1
Kuot                     1
Kwomtari                 1
Lakes Plain              2
Left May                 1
Mairasic                 1
Maybrat                  1
Moraori                  1
Mpur                     1
Nimboran                 1
North Bougainville       1
North Halmaheran         1
Pauwasi                  1
Ramu-Lower Sepik         4
Senagi                   1
Sentanic                 1
Sepik                    5
Skou                     3
Solomons East Papuan     3
South Bird's Head        1
South Bougainville       1
Sulka                

In [370]:
df_sample_fixed_gr['Eurasia']

Family
Afro-Asiatic           1
Ainu                   1
Altaic                 3
Austro-Asiatic         2
Austronesian           1
Basque                 1
Burushaski             1
Chukotko-Kamchatkan    2
Dravidian              1
Eskimo-Aleut           1
Great Andamanese       1
Hmong-Mien             2
Indo-European          3
Japanese               1
Kartvelian             1
Korean                 1
Kusunda                1
Nahali                 1
Nakh-Daghestanian      3
Nivkh                  1
Northwest Caucasian    1
Sino-Tibetan           3
South Andamanese       1
Tai-Kadai              2
Uralic                 3
Yeniseian              1
Yukaghir               1
Name: Genus, dtype: int64

In [416]:
df_sample_fixed['Genus'].nunique()#.value_counts()

298

In [434]:
df_sample_fixed['Name'].nunique()

298

In [417]:
#df_sample_fixed.to_csv('./sample300_3inlg_fixed.csv', index=False)

# Adding info from WALS Chapters concerning gender

In [419]:
maps = [('./Maps/30A_number_of_genders.tsv', 'WALS_Ngen'),
       ('./Maps/31A_sex_based_non_sex_based.tsv', 'WALS_sex_based'), 
       ('./Maps/32A_systems_of_gender_assignment.tsv', 'WALS_assignment'),
       ('./Maps/44A_gender_disctinc_in_indep_pers_pron.tsv', 'WALS_pronouns')]

In [430]:
df_wals_iso = pd.read_csv(wals_lngs_url).loc[:2661, :]
df_wals_iso = df_wals_iso[['ID', 'ISO639P3code']]
df_wals_iso.rename(columns={'ISO639P3code': 'lgcode_iso3', 'ID': 'wals code'}, inplace=True)

In [431]:
for m in maps:
    df_map = pd.read_csv(m[0], sep='\t')
    df_map = df_map[['wals code', 'description']]
    df_map.rename(columns={'description': m[1]}, inplace=True)
    df_map = df_map.merge(df_wals_iso, on='wals code', how='left').drop(['wals code'], axis=1)
    df_sample_fixed = df_sample_fixed.merge(df_map, on='lgcode_iso3', how='left')

In [435]:
#df_sample_fixed.to_csv('./sample300_3inlg_fixed_wals_features.csv', index=False)

# Adding info from Grambank

For parsing features  use `grambank_parser.py`. from https://github.com/ancheveleva/grambank_parser

Usage example:

`python3 grambank_parser.py GB314,GB315 save_file_path values_path languages_path parameters_path`

In [97]:
sample = pd.read_csv('./sample300_3inlg_fixed_wals_features.csv')

In [98]:
gb = pd.read_csv('./grambank/gender_gb.csv')

In [99]:
sample.columns

Index(['lgcode_iso3', 'Name', 'Family', 'Genus', 'Macroarea', 'title',
       'author', 'year', 'pages_sum', 'inlg_iso3', 'hhtype', 'src', 'isbn',
       'url', 'guldemann_location', 'doi', 'Latitude', 'Longitude',
       'Glottocode', 'ID', 'Country_ID', 'Source', 'Subfamily', 'Parent_ID',
       'WALS_Ngen', 'WALS_sex_based', 'WALS_assignment', 'WALS_pronouns'],
      dtype='object')

In [100]:
sum(gb['ISO639P3code'].notna())

0

In [101]:
sum(gb['Glottocode'].isna())

0

In [102]:
features = {'GB030': '3rd_pron',
 'GB051': 'sex',
 'GB053': 'animacy',
 'GB170': 'prop_words',
 'GB171': 'demonstratives',
 'GB172': 'articles',
 'GB192': 'phonetics',
 'GB196': '2nd_pron',
 'GB197': '1st_pron',
 'GB198': 'numerals',
 'GB321': 'nor_sem_nor_phon'}

In [103]:
gb.columns

Index(['ID', 'Language_ID', 'Parameter_ID', 'Value', 'Code_ID', 'Comment',
       'Source', 'Source_comment', 'Coders', 'Language_Name', 'Macroarea',
       'Latitude', 'Longitude', 'Glottocode', 'ISO639P3code', 'provenance',
       'Family_name', 'Family_level_ID', 'Language_level_ID', 'level',
       'lineage', 'Parameter_Name', 'Description', 'ColumnSpec', 'Patrons',
       'Grambank_ID_desc', 'Boundness', 'Flexivity', 'Gender_or_Noun_Class',
       'Locus_of_Marking', 'Word_Order', 'Informativity'],
      dtype='object')

In [107]:
gb[['Language_ID', 'Value', 'Comment', 'Source', 'Source_comment']]

Unnamed: 0,Language_ID,Value,Comment,Source,Source_comment
0,abad1241,0,,s_OaPaul_Gabadi[8],Oa & Paul 2013:8
1,abad1241,0,Passim,s_OaPaul_Gabadi[1-39],Oa & Paul 2013:1-39
2,abad1241,0,Passim,s_OaPaul_Gabadi[1-39],Oa & Paul 2013:1-39
3,abad1241,0,,s_OaPaul_Gabadi[7-20],Oa & Paul 2013:7-20
4,abad1241,0,,s_OaPaul_Gabadi[7-20],Oa & Paul 2013:7-20
...,...,...,...,...,...
24931,zuni1245,1,Zuni has 3 noun classes - The first is overwhe...,s_Newman_Zuni_1965[55-59],Newman (1965: 55-59)
24932,zuni1245,0,,g_Nichols_Zuni;s_Newman_Zuni_1965,"Nichols (1997), Newman (1965)"
24933,zuni1245,0,,g_Nichols_Zuni;s_Newman_Zuni_1965,"Nichols (1997), Newman (1965)"
24934,zuni1245,0,"Not mentioned, nor expressed in any example",g_Nichols_Zuni;s_Newman_Zuni_1965,"Nichols (1997), Newman (1965)"


In [110]:
gb['Gender_or_Noun_Class'].value_counts() 
# this column indicates whether the feature concerns topic of gender
# not whether there is gender in a language

1.0    24936
Name: Gender_or_Noun_Class, dtype: int64

In [111]:
gb.shape

(24936, 32)

In [112]:
for f in features:
    f_db = gb[gb['Parameter_ID'] == f][['Value', 'Glottocode', 'Comment', 'Source_comment']]
    f_db = f_db.rename(columns={
        'Value': f'{features[f]}_{f}',
        'Comment': f'{features[f]}_{f}_comment', 
        'Source_comment': f'{features[f]}_{f}_source',
                               })
    sample = sample.merge(f_db, on='Glottocode', how='left')

In [114]:
#sample.to_csv('./sample300_3inlg_fixed_wals_gb_features.csv', index=False)

In [3]:
sample = pd.read_csv('./sample300_3inlg_fixed_wals_gb_features.csv')

In [4]:
sample['inlg_iso3'].value_counts()

eng    260
fra     26
rus     12
Name: inlg_iso3, dtype: int64

In [9]:
sample.columns

Index(['lgcode_iso3', 'Name', 'Family', 'Genus', 'Macroarea', 'title',
       'author', 'year', 'pages_sum', 'inlg_iso3', 'hhtype', 'src', 'isbn',
       'url', 'guldemann_location', 'doi', 'Latitude', 'Longitude',
       'Glottocode', 'ID', 'Country_ID', 'Source', 'Subfamily', 'Parent_ID',
       'WALS_Ngen', 'WALS_sex_based', 'WALS_assignment', 'WALS_pronouns',
       '3rd_pron_GB030', '3rd_pron_GB030_comment', '3rd_pron_GB030_source',
       'sex_GB051', 'sex_GB051_comment', 'sex_GB051_source', 'animacy_GB053',
       'animacy_GB053_comment', 'animacy_GB053_source', 'prop_words_GB170',
       'prop_words_GB170_comment', 'prop_words_GB170_source',
       'demonstratives_GB171', 'demonstratives_GB171_comment',
       'demonstratives_GB171_source', 'articles_GB172',
       'articles_GB172_comment', 'articles_GB172_source', 'phonetics_GB192',
       'phonetics_GB192_comment', 'phonetics_GB192_source', '2nd_pron_GB196',
       '2nd_pron_GB196_comment', '2nd_pron_GB196_source', '1st_pron_

In [13]:
cols_pron = ['3rd_pron_GB030', '2nd_pron_GB196', '1st_pron_GB197']
cols_assignment = ['sex_GB051', 'animacy_GB053', 'phonetics_GB192', 'nor_sem_nor_phon_GB321']
cols_agreement = ['prop_words_GB170', 'demonstratives_GB171', 'articles_GB172', 'numerals_GB198']

In [37]:
sample[
    (sample['sex_GB051'].isin(['0', '?'])) &
    (sample['animacy_GB053'].isin(['0', '?'])) &
    (sample['phonetics_GB192'].isin(['0', '?'])) &
    (sample['nor_sem_nor_phon_GB321'].isin(['0', '?'])) &
#     ~(sample['prop_words_GB170'].isin(['0', '?'])) &
#     ~(sample['prop_words_GB170'].isna())
    (sample['prop_words_GB170'] == '1')
      ].groupby(['Macroarea']).count()['Genus']

Macroarea
Africa           1
South America    2
Name: Genus, dtype: int64

In [38]:
sample[
    ((sample['sex_GB051'] == '1') |
    (sample['animacy_GB053'] == '1') |
    (sample['phonetics_GB192'] == '1') |
    (sample['nor_sem_nor_phon_GB321'] == '1')) &
    (sample['prop_words_GB170'].isin(['0', '?'])) 
#     (sample['prop_words_GB170'].isna())
#     (sample['prop_words_GB170'] == '1')
      ].groupby(['Macroarea']).count()['Genus']

Macroarea
Africa           8
Australia        1
Eurasia          2
North America    7
Papunesia        7
South America    8
Name: Genus, dtype: int64

In [39]:
sample[
    (sample['sex_GB051'].isin(['0', '?'])) &
    (sample['animacy_GB053'].isin(['0', '?'])) &
    (sample['phonetics_GB192'].isin(['0', '?'])) &
    (sample['nor_sem_nor_phon_GB321'].isin(['0', '?'])) &
#     ~(sample['prop_words_GB170'].isin(['0', '?'])) &
#     ~(sample['prop_words_GB170'].isna())
    (sample['3rd_pron_GB030'] == '1')
      ].groupby(['Macroarea']).count()['Genus']

Macroarea
Africa           1
Eurasia          2
North America    1
Papunesia        4
South America    5
Name: Genus, dtype: int64

In [44]:
sample[
    (sample['sex_GB051'].isin(['0', '?']) | sample['sex_GB051'].isna()) &
    (sample['animacy_GB053'].isin(['0', '?']) | sample['animacy_GB053'].isna()) &
    (sample['phonetics_GB192'].isin(['0', '?']) | sample['phonetics_GB192'].isna()) &
    (sample['nor_sem_nor_phon_GB321'].isin(['0', '?']) | sample['nor_sem_nor_phon_GB321'].isna())
#     ~(sample['prop_words_GB170'].isin(['0', '?'])) &
#     ~(sample['prop_words_GB170'].isna())
#     (sample['3rd_pron_GB030'] == '1')
      ].groupby(['Macroarea']).count()['Genus']

Macroarea
Africa           42
Australia         9
Eurasia          34
North America    41
Papunesia        51
South America    36
Name: Genus, dtype: int64

In [45]:
sample[
    (sample['sex_GB051'].isna()) &
    (sample['animacy_GB053'].isna()) &
    (sample['phonetics_GB192'].isna()) &
    (sample['nor_sem_nor_phon_GB321'].isna())
#     ~(sample['prop_words_GB170'].isin(['0', '?'])) &
#     ~(sample['prop_words_GB170'].isna())
#     (sample['3rd_pron_GB030'] == '1')
      ].groupby(['Macroarea']).count()['Genus']

Macroarea
Africa           15
Australia         3
Eurasia           7
North America    11
Papunesia        16
South America     8
Name: Genus, dtype: int64

In [42]:
sample[
    (sample['numerals_GB198'].isin(['0', '?']))
      ].groupby(['Macroarea']).count()['Genus']

Macroarea
Africa           33
Australia         5
Eurasia          28
North America    38
Papunesia        36
South America    35
Name: Genus, dtype: int64

In [40]:
sample[
    (sample['demonstratives_GB171'].isna())
      ].groupby(['Macroarea']).count()['Genus']

Macroarea
Africa           17
Australia        10
Eurasia          10
North America    11
Papunesia        38
South America     8
Name: Genus, dtype: int64

In [12]:
sample.columns

Index(['lgcode_iso3', 'Name', 'Family', 'Genus', 'Macroarea', 'title',
       'author', 'year', 'pages_sum', 'inlg_iso3', 'hhtype', 'src', 'isbn',
       'url', 'guldemann_location', 'doi', 'Latitude', 'Longitude',
       'Glottocode', 'ID', 'Country_ID', 'Source', 'Subfamily', 'Parent_ID',
       'WALS_Ngen', 'WALS_sex_based', 'WALS_assignment', 'WALS_pronouns',
       '3rd_pron_GB030', '3rd_pron_GB030_comment', '3rd_pron_GB030_source',
       'sex_GB051', 'sex_GB051_comment', 'sex_GB051_source', 'animacy_GB053',
       'animacy_GB053_comment', 'animacy_GB053_source', 'prop_words_GB170',
       'prop_words_GB170_comment', 'prop_words_GB170_source',
       'demonstratives_GB171', 'demonstratives_GB171_comment',
       'demonstratives_GB171_source', 'articles_GB172',
       'articles_GB172_comment', 'articles_GB172_source', 'phonetics_GB192',
       'phonetics_GB192_comment', 'phonetics_GB192_source', '2nd_pron_GB196',
       '2nd_pron_GB196_comment', '2nd_pron_GB196_source', '1st_pron_

# Archive

## Sample 300 (family unbalanced)

In [36]:
#random_seed = 13

df_sample = pd.DataFrame()
for macroarea in macroarea_numlg300:
    max_lgs = int(macroarea_numlg300[macroarea])
    macroarea_df = df_wals_maxsource[df_wals_maxsource['Macroarea'] == macroarea]
    one_from_genus = macroarea_df.groupby(by='Genus').apply(lambda g: g[g['pages_sum'] == g['pages_sum'].max()])
    one_from_genus = one_from_genus.drop_duplicates(subset=['lgcode_iso3'], keep='last')
    sample = one_from_genus.sort_values('pages_sum', ascending=False).head(max_lgs)
    #sample = one_from_genus.sample(max_lgs, random_state=random_seed) #-- random
    
    df_sample = pd.concat([df_sample, sample])

In [37]:
df_sample.reset_index(drop=True, inplace=True)

In [39]:
df_sample = df_sample[cols_order]

In [154]:
#df_sample.to_csv('./sample300_3inlg.csv', index=False)

In [67]:
df = pd.read_csv('./sample300_3inlg.csv')

In [68]:
df_count_genus = df.groupby('Genus')['Name'].count()

In [69]:
df_count_genus[df_count_genus != 1]

Genus
Antillean Arawakan    2
Barito                2
Eskimo                2
Germanic              2
Semitic               2
Name: Name, dtype: int64

In [70]:
df_gr_area_fam = df.groupby(['Macroarea', 'Family'])['Genus'].count()

In [71]:
df_gr_area_fam['Africa']

Family
Afro-Asiatic       10
Austronesian        1
Bangime             1
Central Sudanic     3
Dogon               1
Eastern Sudanic     5
Gumuz               1
Indo-European       1
Koman               1
Maban               1
Mande               2
Niger-Congo        29
Shabo               1
Siamou              1
Songhay             1
Name: Genus, dtype: int64

In [72]:
df_gr_area_fam['Australia']

Family
Bunuban         1
Gaagudju        1
Gunwinyguan     4
Mirndi          3
Nyulnyulan      1
Pama-Nyungan    4
Tangkic         1
Tiwian          1
Worrorran       1
Yangmanic       1
Name: Genus, dtype: int64

In [73]:
df_gr_area_fam['South America']

Family
Alacalufan          1
Andoke              1
Arauan              1
Araucanian          1
Arawakan            5
Aymaran             1
Barbacoan           1
Boran               1
Cacua-Nukak         1
Camsá               1
Cariban             1
Chapacura-Wanham    1
Chibchan            1
Choco               1
Chonan              1
Guahiban            1
Guaicuruan          2
Hobitu-Cholon       1
Jivaroan            1
Kwaza               1
Macro-Ge            4
Mascoian            1
Mosetenan           1
Movima              1
Nadahup             1
Pano-Tacanan        2
Peba-Yaguan         1
Quechuan            1
Sáliban             1
Ticuna              1
Trumai              1
Tucanoan            1
Tupian              5
Urarina             1
Uru-Chipaya         1
Witotoan            1
Yanomam             1
Yuracare            1
Zaparoan            1
Name: Genus, dtype: int64

In [74]:
df_gr_area_fam['North America']

Family
Algic           2
Arawakan        1
Caddoan         1
Chumash         1
Eskimo-Aleut    2
Hokan           5
Iroquoian       2
Keresan         1
Kutenai         1
Mayan           1
Mixe-Zoque      1
Muskogean       1
Na-Dene         3
Oregon Coast    1
Oto-Manguean    5
Penutian        6
Salishan        2
Siouan          2
Tarascan        1
Totonacan       1
Tsimshianic     1
Uto-Aztecan     6
Wakashan        1
Washo           1
Yuchi           1
Name: Genus, dtype: int64

In [75]:
df_gr_area_fam['Eurasia']

Family
Afro-Asiatic            1
Altaic                  3
Austro-Asiatic          2
Basque                  1
Burushaski              1
Chukotko-Kamchatkan     1
Dravidian               1
Eskimo-Aleut            1
Hmong-Mien              1
Indo-European           9
Japanese                1
Kartvelian              1
Korean                  1
Nakh-Daghestanian       4
Sino-Tibetan           10
Tai-Kadai               1
Uralic                  1
Yukaghir                1
Name: Genus, dtype: int64

In [76]:
df_gr_area_fam['Papunesia']

Family
Austronesian             24
Baibai-Fas                1
Baining                   1
Border                    1
Duna-Bogaya               1
East Bird's Head          1
Greater West Bomberai     2
Keram                     1
Kuot                      1
Mairasic                  1
Maybrat                   1
Nimboran                  1
North Bougainville        1
North Halmaheran          1
Ramu-Lower Sepik          1
Senagi                    1
Sepik                     5
Skou                      2
Solomons East Papuan      3
South Bougainville        1
Taulil                    1
Torricelli                3
Trans-New Guinea         22
Yam                       1
Name: Genus, dtype: int64

## Soeren

In [38]:
df_bib = pd.read_csv('./hh_bib_table.tab', sep='\t', encoding='utf-8')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [42]:
df_bib.head(10)

Unnamed: 0,author,title,publisher,address,pages,year,glottolog_ref_id,hhtype,inlg,lgcode,...,copies,eisbn,ethiopia,translator,edition_note,title_german,title_french,weball_lgs,jmnnote,title_emglish
0,'Abd-al-'Ali KÄrang,"TÄti va Harzani, do lahja az zabÄn-i bÄstÄ...",Tabriz: Tabriz University Press,Tabriz,6+160,1334 [1953],41999.0,grammar_sketch,Farsi [pes],"Tati, Harzani [hrz]",...,,,,,,,,,,
1,'Abd-al-'Ali KÄrang,TÄti va KeringÄni,Tabriz: Tabriz University Press,Tabriz,,1330 [1952],549845.0,grammar_sketch,Farsi [pes],"Tati, Karingani [kgn]",...,,,,,,,,,,
2,'AbÅ« á¸¤ayyÄn al-'AndalusÄ«,KitÄb al-'idrÄk li-lisÄn al-'atrÄk,Istanbul: Evkaf MatbaasÄ±,,xv+186+158,1931 [1313],319108.0,grammar_sketch;dictionary,Arabic [arb],Mixture of Oguz and Qipchaq close to Western Q...,...,,,,,,,,,,
3,'AlÄ« HasÅ«rÄ«,GozÄresh-e gÅ«yeshhÄ-ye LorÄ«,TehrÄn: KetÄbkhÄne-ye TahÅ«rÄ«,,75,1342 AHS [1963],587478.0,overview;comparative;wordlist,Persian [pes],Lori = Luri-Northern [lrc],...,,,,,,,,,,
4,'AmoÌœn ThawÄ«sak,PhÄsÄ malÄyÅ« thin nai prathÄ“t thai,[Bangkok]: Institute of Language and Culture f...,,70,2530 [1987],557658.0,grammar_sketch,Thai [tha],Malay-Pattani [mfa],...,,,,,,,,,,
5,'Demola Lewis,Deciphering Aborigines and Migrants from Cogna...,,,29-51,2015,580870.0,overview;comparative;minimal,English [eng],"Etuno (Ebira in Igarra), Enwa, Akuku = Akuku [...",...,,,,,,,,,,
6,'Olam Sing and Shengan Lin,A mei yu jian ming ci dian = O Citing no pangcah,Taipei: Tai wan zu qun mu yu tui xing wei yuan...,,303,2011,574188.0,dictionary,Mandarin Chinese [cmn],Amis [ami],...,,,,,,,,,,
7,'Olam Sing,A mei zu yu shi yong yu fa : zeng qiang yu fa ...,Taipei: PCT tai wan zu qun mu yu tui xing wei ...,,223,2007,577286.0,grammar_sketch,Mandarin Chinese [cmn],Amis [ami],...,,,,,,,,,,
8,'Olam Sing,A mei yu yi yi suo yu yan = O ni isop a sapati...,Taipei: Totoy cheng xiang bu luo yu wen gong z...,,6+183,2006,577285.0,text,Mandarin Chinese [cmn],Amis [ami],...,,,,,,,,,,
9,3Umar atÌ£-TÌ£aiyib as-SaÌ„siÌ„,SprichwÃ¶rter und andere volkskundliche Texte ...,,,156,1972,,text,German [deu],Mekka = Hijazi Arabic [acw],...,,,,,,,,,,


In [23]:
"'AmoÌœn ThawÄ«sak".encode(encoding='Windows-1252').decode("utf-8")
"'AmoÌœn ThawÄ«sak".encode(encoding='ISO-8859-1').decode("utf-8")

UnicodeEncodeError: 'latin-1' codec can't encode character '\u0153' in position 5: ordinal not in range(256)

## WALS 2014

In [2]:
wals14_area = 'https://raw.githubusercontent.com/cldf-datasets/wals/ca710163dbc5d4155d13631b6757bf62564e4127/raw/walslanguage.csv'
wals14_fam = 'https://raw.githubusercontent.com/cldf-datasets/wals/ca710163dbc5d4155d13631b6757bf62564e4127/cldf/languages.csv'

In [3]:
w14_a = pd.read_csv(wals14_area)
w14_f = pd.read_csv(wals14_fam)

In [4]:
w14_a.shape, w14_f.shape

((2679, 7), (2679, 13))

In [5]:
w14_a.rename(columns={'iso_codes': 'ISO_codes'}, inplace=True)

In [6]:
w14 = w14_a.merge(w14_f, on='ISO_codes', how='inner', suffixes=('_a', '_f'))

In [7]:
w14 = w14[['macroarea', 'Family', 'Genus', 'ID']]

In [8]:
w14.drop_duplicates(inplace=True)

In [9]:
w14.shape

(2981, 4)

In [12]:
w14[(w14['macroarea'] == 'Africa') & (w14['Family'] == 'Indo-European')]

Unnamed: 0,macroarea,Family,Genus,ID
589,Africa,Indo-European,Indic,rse
6287,Africa,Indo-European,Germanic,afr


In [10]:
w14_gr = w14.groupby(['macroarea', 'Family'])['Genus'].count()

In [11]:
w14_gr['Africa']

Family
Afro-Asiatic      116
Altaic              1
Arawakan            2
Australian         15
Austro-Asiatic      1
Austronesian        3
Cariban             1
Chimúan             1
Huarpe              1
Indo-European       2
Jirajaran           1
Kadugli             2
Kapixana            1
Khoisan            12
Kwaza               1
Mixe-Zoque          1
Na-Dene             1
Niger-Congo       362
Nilo-Saharan       88
Sino-Tibetan        6
Skou                1
Tacame              1
Tasmanian           1
Timote-Cuica        1
Torricelli          1
Tupian              1
Uto-Aztecan         3
Yeniseian           1
other              16
Name: Genus, dtype: int64

## WALS 2011

In [92]:
wals11_country_lngs = 'https://raw.githubusercontent.com/cldf-datasets/wals/ef7134b87c14ac79e6d7db952f356f59afa4e237/raw/country_language.csv'
wals11_country = 'https://raw.githubusercontent.com/cldf-datasets/wals/ef7134b87c14ac79e6d7db952f356f59afa4e237/raw/country.csv'
wals11_lngs = 'https://raw.githubusercontent.com/cldf-datasets/wals/ef7134b87c14ac79e6d7db952f356f59afa4e237/cldf/languages.csv'

In [93]:
w11_c = pd.read_csv(wals11_country).dropna()
w11_cl = pd.read_csv(wals11_country_lngs)
w11_l = pd.read_csv(wals11_lngs)

In [96]:
w11_c.rename(columns={'id': 'country_id'}, inplace=True)

In [108]:
w11_cl = w11_cl[w11_cl['language_id'].notna()]

In [109]:
w11 = w11_cl.merge(w11_c, on='country_id', how='inner')

In [114]:
w11 = w11[['language_id', 'continent']]

In [116]:
w11.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [117]:
w11.shape

(2696, 2)

In [119]:
w11_l.rename(columns={'ID': 'language_id'}, inplace=True)

In [120]:
w11_l = w11_l[w11_l['language_id'].notna()]

In [121]:
w11 = w11.merge(w11_l, on='language_id', how='inner')

In [123]:
w11

Unnamed: 0,language_id,continent,Name,Macroarea,Latitude,Longitude,Glottocode,ISO639P3code,Family,Subfamily,Genus,ISO_codes,Samples_100,Samples_200
0,arr,Australia & Oceania,Arrernte,,-24.000000,134.000000,,,Australian,,Pama-Nyungan,aer are,False,False
1,dda,Australia & Oceania,Dhuwal (Dätiwuy),,-12.166667,136.250000,dhuw1249,duj,Australian,,Pama-Nyungan,duj,False,False
2,gml,Australia & Oceania,Gamilaraay,,-29.833333,149.500000,gami1243,kld,Australian,,Pama-Nyungan,kld,False,False
3,gmt,Australia & Oceania,Gumatj,,-12.500000,135.500000,guma1253,gnn,Australian,,Pama-Nyungan,gnn,False,False
4,kij,Australia & Oceania,Kitja,,-17.500000,127.750000,kitj1240,gia,Australian,,Djeragan,gia,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2691,poa,Australia & Oceania,Po-Ai,,-20.666667,164.833333,fwai1237,fwa,Austronesian,Eastern Malayo-Polynesian,Oceanic,fwa,False,False
2692,cem,Australia & Oceania,Cèmuhî,,-20.833333,165.166667,cemu1238,cam,Austronesian,Eastern Malayo-Polynesian,Oceanic,cam,False,False
2693,tin,Australia & Oceania,Tinrin,,-21.666667,165.750000,tiri1258,cir,Austronesian,Eastern Malayo-Polynesian,Oceanic,cir,False,False
2694,iaa,Australia & Oceania,Iaai,,-20.416667,166.583333,iaai1238,iai,Austronesian,Eastern Malayo-Polynesian,Oceanic,iai,False,False


In [124]:
w11_gr = w11.groupby(['continent', 'Family'])['Genus'].count()

In [125]:
w11_gr['Africa']

Family
Afro-Asiatic     118
Austronesian       1
Indo-European      1
Kadugli            2
Khoisan           12
Niger-Congo      369
Nilo-Saharan      88
other             12
Name: Genus, dtype: int64