In [1]:
from pathlib import Path

import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the tab-separated SED table; the ID column is read as strings
# rather than letting pandas infer its type.
sed = pd.read_csv(
    'Scrapers/sed.tsv',
    sep='\t',
    dtype={'ID': str},
)

In [3]:
# Process the data: expand multi-valued FORM and CONCEPT cells so that
# every row of `sed1` carries exactly one form and one concept.

# Split the FORM column by '/' and create a new row for each value.
sed1 = sed.assign(FORM=sed.FORM.str.split('/')).explode('FORM')
# Copy the (already split) 'FORM' into the 'VALUE' column.
sed1["VALUE"] = sed1["FORM"]

# Split the CONCEPT column by ';' and create a new row for each value.
sed1 = sed1.assign(CONCEPT=sed1.CONCEPT.str.split(';')).explode('CONCEPT')

# Some CONCEPT values start with an enumeration prefix: "1. ", "2. ", "3. "
# or "I. ", "II. ", "III. ", "IV. ". The prefix may be preceded by
# whitespace, hence the leading \s* in both patterns.
sed1['CONCEPT'] = sed1['CONCEPT'].str.replace(r'^\s*\d+\. ', '', regex=True)
sed1['CONCEPT'] = sed1['CONCEPT'].str.replace(r'^\s*[IVX]+\. ', '', regex=True)

# Some CONCEPT values use "(I)", "(II)", ... markers to separate different
# concepts. Split on those markers and create a new row for each fragment.
sed1 = sed1.assign(
    CONCEPT=sed1.CONCEPT.str.split(r'\([IVX]+\)', regex=True)
).explode('CONCEPT')

# The splits above leave whitespace around the fragments and produce an empty
# fragment whenever a value starts (or ends) with a separator, e.g.
# "(I) foo (II) bar" -> ['', ' foo ', ' bar']. Trim the fragments and drop the
# empty ones so no row ends up with a blank concept. Rows whose CONCEPT is
# missing (NaN) are kept: NaN != '' evaluates to True.
sed1['CONCEPT'] = sed1['CONCEPT'].str.strip()
# explode() repeats index labels, so filter with a positional boolean array
# instead of relying on index alignment.
sed1 = sed1[(sed1['CONCEPT'] != '').to_numpy()].copy()

In [4]:
# Write the processed table to 'sed.tsv' as tab-separated text,
# leaving the DataFrame index out of the file.
sed1.to_csv(
    'sed.tsv',
    sep='\t',
    index=False,
)

# Create files according to CLDF specifications

In [13]:
# Create cldf/languages.tsv with exactly these columns, in this order.
LANGUAGE_COLUMNS = [
    'ID',
    'Name',
    'Glottocode',
    'Glottolog_Name',
    'ISO639P3code',
    'Macroarea',
    'Latitude',
    'Longitude',
    'Family',
    'NameInSource',
]

# One row per distinct DOCULECT value of the source table.
langs = sed['DOCULECT'].unique()
langs = pd.DataFrame(langs, columns=['NameInSource'])
# Name and ID are both taken verbatim from the source name.
langs["Name"] = langs["NameInSource"]
langs["ID"] = langs["NameInSource"]

# Emit the full documented header: columns that are not filled in here
# (Glottocode, Latitude, ...) are created empty, and the column order
# follows LANGUAGE_COLUMNS.
langs = langs.reindex(columns=LANGUAGE_COLUMNS)

# Save the data; create the output directory first so the write does not
# fail when 'cldf/' does not exist yet.
Path('cldf').mkdir(parents=True, exist_ok=True)
langs.to_csv('cldf/languages.tsv', sep='\t', index=False)

In [11]:
# Show every source language name as a plain Python list.
langs["NameInSource"].tolist()

['PS',
 'Akkadian',
 'Ugaritic',
 'Phoenician',
 'Hebrew',
 'Official Aramaic',
 'Hatran',
 'Jewish Palestinian Aramaic',
 'Christian Palestinian Aramaic',
 'Samaritan',
 'Jewish Babylonian Aramaic',
 'Syriac',
 'Mandaic',
 'Maalula',
 'Mlaḥso',
 'Turoyo',
 'Arabic',
 'Sabaic',
 'Hadramitic',
 'Geez',
 'Tigre',
 'Tigrinya',
 'Amharic',
 'Mehri',
 'Jibbali',
 'Harsusi',
 'Harari',
 'Soddo',
 'Soqotri',
 'Palmyrean',
 'Qatabanian',
 'Ebla',
 'Minaean',
 'Gurage',
 'Samalian',
 'Old Aramaic',
 'Nabataean',
 'Selti',
 'Wolane',
 'Zway',
 'Ezha',
 'Endegen',
 'Ennemor',
 'Gyeto',
 'Muher',
 'Masqan',
 'Gogot',
 'Argobba',
 'Biblical Aramaic',
 'Hertevin',
 'Gafat',
 'Chaha',
 'PWS',
 'Proto-MSA',
 'Amarna Canaanite',
 'Deir Alla',
 'Demotic Aramaic',
 'Urmi',
 'Moabite',
 'Barwar',
 'PC',
 'PES',
 'PCS',
 'PNWS',
 'PArm',
 'Betanure',
 'Ammonite',
 'Jilu',
 'Qaraqosh',
 'Arbel',
 'Geez (epigraphic)',
 'Areal reconstruction']