In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

### 0. Basic CLDF manipulation

The lexibank data can be downloaded from [here](https://zenodo.org/records/7836668). You then need to unzip it, and the unzip the forms.csv file in the cldf forlder. Here is an automated script that does that for you:

In [2]:
# unzip the lexibank-analysed-v1.0.zip file in the data folder
!unzip -o data/lexibank-analysed-v1.0.zip -d data/
# unzip the forms.csv file in the cldf folder
!unzip -o data/lexibank-lexibank-analysed-a4c0952/cldf/forms.csv.zip -d data/lexibank-lexibank-analysed-a4c0952/cldf/

unzip:  cannot find or open data/lexibank-analysed-v1.0.zip, data/lexibank-analysed-v1.0.zip.zip or data/lexibank-analysed-v1.0.zip.ZIP.


Archive:  data/lexibank-lexibank-analysed-a4c0952/cldf/forms.csv.zip
  inflating: data/lexibank-lexibank-analysed-a4c0952/cldf/forms.csv  


We need tree files; forms, languages and concepts:

In [3]:
forms = pd.read_csv('data/lexibank-lexibank-analysed-a4c0952/cldf/forms.csv')
languages = pd.read_csv('data/lexibank-lexibank-analysed-a4c0952/cldf/languages.csv')
concepts = pd.read_csv('data/lexibank-lexibank-analysed-a4c0952/cldf/concepts.csv')

In [4]:
# match forms with languages by Language_ID and ID
forms = forms.merge(languages, left_on='Language_ID', right_on='ID')
# match concepts by Parameter_ID and ID
forms = forms.merge(concepts, left_on='Parameter_ID', right_on='ID')
forms['Length'] = forms['Segments'].str.strip().str.count(' ') + 1
# convert Glottocode column to string
forms['Glottocode'] = forms['Glottocode'].astype(str)
forms['Number_of_meanings'] = forms.groupby(['Form', 'Language_ID'])['Concepticon_ID'].transform('nunique')

Let's look at the dataset we have:

In [5]:
forms.sort_values(by='Number_of_meanings')

Unnamed: 0,ID_x,Language_ID,Parameter_ID,Form,Segments,Comment,Source,Value,Local_ID,Graphemes,...,ID,Name_y,Description,ColumnSpec,Concepticon_ID,Concepticon_Gloss,Central_Concept,Core_Concept,Length,Number_of_meanings
0,aaleykusunda-KusundaK-above-1,aaleykusunda-KusundaK,above,ɐ̃ː.ʤi,ɐ̃ː + dʒ i,,Aaley2019,nɔ̃ː.ʤi ɐ̃ː.ʤi,aaleykusunda-KusundaK-1_above-1,,...,above,ABOVE,,,1741,ABOVE,HIGH,,4,1.0
453867,abvdoceanic-RapanuiEasterIsland-cutorhack-7,abvdoceanic-RapanuiEasterIsland,cutorhack,hore,h o r e,,Greenhill2008,hore,abvdoceanic-RapanuiEasterIsland-78_tocuthack-7,,...,cutorhack,CUT OR HACK,,,3889,CUT OR HACK,,,4,1.0
453868,abvdoceanic-RapanuiEasterIsland-cutorhack-8,abvdoceanic-RapanuiEasterIsland,cutorhack,tata,t a t a,,Greenhill2008,tata,abvdoceanic-RapanuiEasterIsland-78_tocuthack-8,,...,cutorhack,CUT OR HACK,,,3889,CUT OR HACK,,,4,1.0
453869,abvdoceanic-Lakalai-cutorhack-1,abvdoceanic-Lakalai,cutorhack,vari,v a r i,,Greenhill2008,vari,abvdoceanic-Lakalai-78_tocuthack-1,,...,cutorhack,CUT OR HACK,,,3889,CUT OR HACK,,,4,1.0
453870,abvdoceanic-Lakalai-cutorhack-2,abvdoceanic-Lakalai,cutorhack,tubi,t u b i,,Greenhill2008,tubi,abvdoceanic-Lakalai-78_tocuthack-2,,...,cutorhack,CUT OR HACK,,,3889,CUT OR HACK,,,4,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617305,savelyevturkic-Dolgan-with-1,savelyevturkic-Dolgan,with,,n a n,,Savelyev2020,,savelyevturkic-Dolgan-247_with-1,,...,with,WITH,,,1340,WITH,AND,Swadesh-1952-200,3,
630557,marrisonnaga-Jingpho-self-2,marrisonnaga-Jingpho,self,,n a n,,Marrison1967,,marrisonnaga-Jingpho-640_self-2,,...,self,SELF,,,1993,SELF,,,3,
648333,johanssonsoundsymbolic-Kunza-lowerleg-1,johanssonsoundsymbolic-Kunza,lowerleg,,n a n,,Johansson2020,,johanssonsoundsymbolic-Kunza-123_lowerleg-1,,...,lowerleg,LOWER LEG,,,447,LOWER LEG,LEG,,3,
655641,transnewguineaorg-gaikundi-wetwoexclusive-1,transnewguineaorg-gaikundi,wetwoexclusive,,n a n,,Greenhill2015,,transnewguineaorg-gaikundi-1866_weexclpluralpr...,,...,wetwoexclusive,WE TWO (EXCLUSIVE),,,2636,WE TWO (EXCLUSIVE),WE TWO,,3,


In [6]:
forms.to_csv('data/forms_total.csv', index=False)