In [1]:
import lingpy
from __future__ import unicode_literals, print_function, division
from lingpy import *
import pandas as pd
from lingpy.convert.strings import write_nexus
from lingpy.sequence.sound_classes import ipa2tokens
from segments.tokenizer import Tokenizer
from lingpy.basic.wordlist import Wordlist
from lingpy.sequence.sound_classes import check_tokens
import pandas as pd
from collections import defaultdict


## Tokenization and cognate detection

In [2]:
iranian = get_wordlist("csv_complete_entries.csv", delimiter = ";")

In [3]:
tk = Tokenizer()

In [4]:
iranian.add_entries("TOKENS", "IPA", tk)

In [5]:
iranian.output("tsv", filename = "iranian_tokens", 
               prettify = False, ignore = "all")

2021-09-18 16:08:12,759 [INFO] Data has been written to file <iranian_tokens.tsv>.


In [6]:
iranian_output = pd.read_csv("iranian_tokens.tsv", sep="\t")

In [7]:
iranian_output

Unnamed: 0,ID,DOCULECT,CONCEPT,IPA,VARIANTS,TOKENS
0,1,Bakhtiyari,1SG,mo,,m o
1,2,Balochi,1SG,man,,m a n
2,3,Gilaki,1SG,mʔn,,m ʔ n
3,4,Judeo-Tati,1SG,me,,m e
4,5,Kumzari,1SG,meh,,m e h
...,...,...,...,...,...,...
2222,2010,NK,yesterday,do,,d o
2223,2011,SWK,yesterday,dɨhu,,d ɨ h u
2224,2012,NWK,yesterday,do,,d o
2225,2101,Ishkashimi,yesterday,paruzd,,p a r u z d


In [8]:

errors = defaultdict(int)

for idx, tks in iranian.iter_rows('tokens'):
#     print("index {0}: {1}".format(idx, tks))
    for error in check_tokens(tks):
        errors[error[1]] += 1
print(len(errors))
print(errors)

3
defaultdict(<class 'int'>, {' ': 7099, '̣': 3, '̄': 1})


The errors were of little relevance and should have not interfered with the LingPy cognate detection process

### Cognate detection and alignment

In [20]:
iranian_corr = get_wordlist("iranian_tokens.tsv", delimiter="\t")

In [21]:
lex = LexStat("iranian_tokens.tsv", check = True, segments = "tokens")

2021-07-28 02:20:36,536 [INFO] No obvious errors found in the data.


In [22]:
lex.get_scorer(runs=10000)
lex.output('tsv', filename='iranian_lexstat.bin', ignore=[])
lex.cluster(method='lexstat', threshold=0.55, 
            ref="infomap", cluster_method='infomap', force = True)
lex.output('tsv', filename = "iranian_lexstat_result")

CORRESPONDENCE CALCULATION:   0%|                                                            | 0/288.0 [00:00<?, ?it/s]2021-07-28 02:20:45,619 [INFO] Calculating alignments for pair Bakhtiyari / Bakhtiyari.
2021-07-28 02:20:45,642 [INFO] Calculating alignments for pair Bakhtiyari / Balochi.
2021-07-28 02:20:45,658 [INFO] Calculating alignments for pair Bakhtiyari / Gilaki.
2021-07-28 02:20:45,672 [INFO] Calculating alignments for pair Bakhtiyari / Ishkashimi.
2021-07-28 02:20:45,682 [INFO] Calculating alignments for pair Bakhtiyari / Judeo-Tati.
2021-07-28 02:20:45,699 [INFO] Calculating alignments for pair Bakhtiyari / Kumzari.
CORRESPONDENCE CALCULATION:   2%|█▎                                                  | 7/288.0 [00:00<00:04, 69.16it/s]2021-07-28 02:20:45,724 [INFO] Calculating alignments for pair Bakhtiyari / Luri.
2021-07-28 02:20:45,734 [INFO] Calculating alignments for pair Bakhtiyari / Modern_Persian.
2021-07-28 02:20:45,755 [INFO] Calculating alignments for pair Bakhtiy

2021-07-28 02:20:46,850 [INFO] Calculating alignments for pair Ishkashimi / SK.
2021-07-28 02:20:46,862 [INFO] Calculating alignments for pair Ishkashimi / SWK.
2021-07-28 02:20:46,873 [INFO] Calculating alignments for pair Ishkashimi / Taleshi.
2021-07-28 02:20:46,884 [INFO] Calculating alignments for pair Ishkashimi / Wakhi.
CORRESPONDENCE CALCULATION:  30%|███████████████▍                                   | 87/288.0 [00:01<00:02, 72.47it/s]2021-07-28 02:20:46,896 [INFO] Calculating alignments for pair Ishkashimi / Waziri.
2021-07-28 02:20:46,906 [INFO] Calculating alignments for pair Ishkashimi / Yaghnobi.
2021-07-28 02:20:46,917 [INFO] Calculating alignments for pair Ishkashimi / Zazaki.
2021-07-28 02:20:46,928 [INFO] Calculating alignments for pair Ishkashimi / Zebaki.
2021-07-28 02:20:46,938 [INFO] Calculating alignments for pair Judeo-Tati / Judeo-Tati.
2021-07-28 02:20:46,954 [INFO] Calculating alignments for pair Judeo-Tati / Kumzari.
2021-07-28 02:20:46,970 [INFO] Calculatin

2021-07-28 02:20:48,086 [INFO] Calculating alignments for pair NK / Ormuri.
2021-07-28 02:20:48,099 [INFO] Calculating alignments for pair NK / Ossetic_Iron.
2021-07-28 02:20:48,120 [INFO] Calculating alignments for pair NK / Parachi.
2021-07-28 02:20:48,137 [INFO] Calculating alignments for pair NK / Pashto.
2021-07-28 02:20:48,151 [INFO] Calculating alignments for pair NK / SEK.
2021-07-28 02:20:48,166 [INFO] Calculating alignments for pair NK / Shughni.
CORRESPONDENCE CALCULATION:  60%|██████████████████████████████                    | 173/288.0 [00:02<00:01, 64.30it/s]2021-07-28 02:20:48,182 [INFO] Calculating alignments for pair NK / SK.
2021-07-28 02:20:48,198 [INFO] Calculating alignments for pair NK / SWK.
2021-07-28 02:20:48,216 [INFO] Calculating alignments for pair NK / Taleshi.
2021-07-28 02:20:48,230 [INFO] Calculating alignments for pair NK / Wakhi.
2021-07-28 02:20:48,243 [INFO] Calculating alignments for pair NK / Waziri.
2021-07-28 02:20:48,256 [INFO] Calculating alig

2021-07-28 02:20:49,416 [INFO] Calculating alignments for pair SEK / Zazaki.
CORRESPONDENCE CALCULATION:  89%|████████████████████████████████████████████▎     | 255/288.0 [00:03<00:00, 65.76it/s]2021-07-28 02:20:49,430 [INFO] Calculating alignments for pair SEK / Zebaki.
2021-07-28 02:20:49,437 [INFO] Calculating alignments for pair Shughni / Shughni.
2021-07-28 02:20:49,451 [INFO] Calculating alignments for pair Shughni / SK.
2021-07-28 02:20:49,464 [INFO] Calculating alignments for pair Shughni / SWK.
2021-07-28 02:20:49,487 [INFO] Calculating alignments for pair Shughni / Taleshi.
2021-07-28 02:20:49,503 [INFO] Calculating alignments for pair Shughni / Wakhi.
2021-07-28 02:20:49,520 [INFO] Calculating alignments for pair Shughni / Waziri.
CORRESPONDENCE CALCULATION:  91%|█████████████████████████████████████████████▍    | 262/288.0 [00:03<00:00, 66.68it/s]2021-07-28 02:20:49,533 [INFO] Calculating alignments for pair Shughni / Yaghnobi.
2021-07-28 02:20:49,546 [INFO] Calculating al

RANDOM CORRESPONDENCE CALCULATION:  20%|█████████                                   | 59/288.0 [00:57<03:39,  1.04it/s]2021-07-28 02:21:47,915 [INFO] Calculating random alignmentsfor pair Gilaki/Pashto.
RANDOM CORRESPONDENCE CALCULATION:  21%|█████████▏                                  | 60/288.0 [00:59<03:52,  1.02s/it]2021-07-28 02:21:49,080 [INFO] Calculating random alignmentsfor pair Gilaki/SEK.
RANDOM CORRESPONDENCE CALCULATION:  21%|█████████▎                                  | 61/288.0 [01:00<04:04,  1.08s/it]2021-07-28 02:21:50,284 [INFO] Calculating random alignmentsfor pair Gilaki/Shughni.
RANDOM CORRESPONDENCE CALCULATION:  22%|█████████▍                                  | 62/288.0 [01:01<03:59,  1.06s/it]2021-07-28 02:21:51,306 [INFO] Calculating random alignmentsfor pair Gilaki/SK.
RANDOM CORRESPONDENCE CALCULATION:  22%|█████████▋                                  | 63/288.0 [01:02<04:05,  1.09s/it]2021-07-28 02:21:52,465 [INFO] Calculating random alignmentsfor pair Gilaki

RANDOM CORRESPONDENCE CALCULATION:  34%|███████████████▏                            | 99/288.0 [01:28<03:23,  1.08s/it]2021-07-28 02:22:18,895 [INFO] Calculating random alignmentsfor pair Judeo-Tati/Parachi.
RANDOM CORRESPONDENCE CALCULATION:  35%|██████████████▉                            | 100/288.0 [01:29<03:12,  1.03s/it]2021-07-28 02:22:19,798 [INFO] Calculating random alignmentsfor pair Judeo-Tati/Pashto.
RANDOM CORRESPONDENCE CALCULATION:  35%|███████████████                            | 101/288.0 [01:30<03:19,  1.07s/it]2021-07-28 02:22:20,958 [INFO] Calculating random alignmentsfor pair Judeo-Tati/SEK.
RANDOM CORRESPONDENCE CALCULATION:  35%|███████████████▏                           | 102/288.0 [01:32<03:23,  1.09s/it]2021-07-28 02:22:22,117 [INFO] Calculating random alignmentsfor pair Judeo-Tati/Shughni.
RANDOM CORRESPONDENCE CALCULATION:  36%|███████████████▍                           | 103/288.0 [01:33<03:29,  1.13s/it]2021-07-28 02:22:23,335 [INFO] Calculating random alig

RANDOM CORRESPONDENCE CALCULATION:  62%|██████████████████████████▋                | 179/288.0 [02:40<01:49,  1.01s/it]2021-07-28 02:23:30,327 [INFO] Calculating random alignmentsfor pair NK/Zazaki.
RANDOM CORRESPONDENCE CALCULATION:  62%|██████████████████████████▉                | 180/288.0 [02:41<01:54,  1.06s/it]2021-07-28 02:23:31,504 [INFO] Calculating random alignmentsfor pair NK/Zebaki.
RANDOM CORRESPONDENCE CALCULATION:  63%|███████████████████████████                | 181/288.0 [02:41<01:24,  1.26it/s]2021-07-28 02:23:31,669 [INFO] Calculating random alignmentsfor pair NWK/NWK.
RANDOM CORRESPONDENCE CALCULATION:  63%|███████████████████████████▏               | 182/288.0 [02:42<01:32,  1.15it/s]2021-07-28 02:23:32,724 [INFO] Calculating random alignmentsfor pair NWK/Ormuri.
RANDOM CORRESPONDENCE CALCULATION:  64%|███████████████████████████▎               | 183/288.0 [02:43<01:29,  1.18it/s]2021-07-28 02:23:33,526 [INFO] Calculating random alignmentsfor pair NWK/Ossetic_Iron.

RANDOM CORRESPONDENCE CALCULATION:  90%|██████████████████████████████████████▋    | 259/288.0 [03:55<00:26,  1.09it/s]2021-07-28 02:24:45,605 [INFO] Calculating random alignmentsfor pair Shughni/Taleshi.
RANDOM CORRESPONDENCE CALCULATION:  90%|██████████████████████████████████████▊    | 260/288.0 [03:56<00:26,  1.05it/s]2021-07-28 02:24:46,621 [INFO] Calculating random alignmentsfor pair Shughni/Wakhi.
RANDOM CORRESPONDENCE CALCULATION:  91%|██████████████████████████████████████▉    | 261/288.0 [03:57<00:24,  1.08it/s]2021-07-28 02:24:47,467 [INFO] Calculating random alignmentsfor pair Shughni/Waziri.
RANDOM CORRESPONDENCE CALCULATION:  91%|███████████████████████████████████████    | 262/288.0 [03:58<00:22,  1.15it/s]2021-07-28 02:24:48,209 [INFO] Calculating random alignmentsfor pair Shughni/Yaghnobi.
RANDOM CORRESPONDENCE CALCULATION:  91%|███████████████████████████████████████▎   | 263/288.0 [03:59<00:22,  1.13it/s]2021-07-28 02:24:49,143 [INFO] Calculating random alignmentsfor

2021-07-28 02:25:20,624 [INFO] Analyzing words for concept <ant>.
SEQUENCE CLUSTERING:   4%|██▍                                                          | 4/100 [00:00<00:02, 33.43it/s]2021-07-28 02:25:20,663 [INFO] Analyzing words for concept <ash>.
2021-07-28 02:25:20,711 [INFO] Analyzing words for concept <back>.
2021-07-28 02:25:20,751 [INFO] Analyzing words for concept <big>.
SEQUENCE CLUSTERING:   7%|████▎                                                        | 7/100 [00:00<00:03, 30.67it/s]2021-07-28 02:25:20,771 [INFO] Analyzing words for concept <bird>.
2021-07-28 02:25:20,844 [INFO] Analyzing words for concept <bite>.
2021-07-28 02:25:20,870 [INFO] Analyzing words for concept <bitter>.
SEQUENCE CLUSTERING:  10%|██████                                                      | 10/100 [00:00<00:03, 28.18it/s]2021-07-28 02:25:20,900 [INFO] Analyzing words for concept <black>.
2021-07-28 02:25:20,934 [INFO] Analyzing words for concept <blood>.
2021-07-28 02:25:20,968 [INFO] Analyzin

2021-07-28 02:25:23,617 [INFO] Analyzing words for concept <stone>.
SEQUENCE CLUSTERING:  82%|█████████████████████████████████████████████████▏          | 82/100 [00:03<00:00, 24.61it/s]2021-07-28 02:25:23,657 [INFO] Analyzing words for concept <suck>.
2021-07-28 02:25:23,701 [INFO] Analyzing words for concept <sweet>.
2021-07-28 02:25:23,731 [INFO] Analyzing words for concept <tail>.
SEQUENCE CLUSTERING:  85%|███████████████████████████████████████████████████         | 85/100 [00:03<00:00, 25.86it/s]2021-07-28 02:25:23,762 [INFO] Analyzing words for concept <take>.
2021-07-28 02:25:23,801 [INFO] Analyzing words for concept <thick>.
2021-07-28 02:25:23,846 [INFO] Analyzing words for concept <thigh>.
SEQUENCE CLUSTERING:  88%|████████████████████████████████████████████████████▊       | 88/100 [00:03<00:00, 26.19it/s]2021-07-28 02:25:23,870 [INFO] Analyzing words for concept <this>.
2021-07-28 02:25:23,913 [INFO] Analyzing words for concept <tie>.
2021-07-28 02:25:23,959 [INFO] Analyz

In [23]:
lex = LexStat("iranian_lexstat.bin.tsv")
alm = Alignments("iranian_lexstat_result.tsv", 
                 ref = "infomap", segments = "tokens")
alm.align(method='progressive', scoredict=lex.cscorer)

In [24]:
msa = alm.get_msa('infomap')[7]
for i, idx in enumerate(msa['ID']):
    print(
        '{0:20}'.format(msa['taxa'][i]),  
        '\t',
        alm[idx, 'concept'],
        '\t',
        '\t'.join(msa['alignment'][i])
    )

Bakhtiyari           	 3SG 	 h	o
Balochi              	 3SG 	 ʔ	a
Ormuri               	 3SG 	 ʔ	a


In [25]:
alm.output('tsv', filename='iranian-aligned', ignore='all', prettify=False)

2021-07-28 02:31:17,666 [INFO] Data has been written to file <iranian-aligned.tsv>.


### Generating data to compute delta-score and Q-residuals

In [4]:
from lingpy.compare.partial import Partial
from lingpy.convert.plot import plot_tree

In [6]:

part = Partial('correct_cogid.tsv')
part.add_cognate_ids('INFOMAP', 'cogid', idtype='strict')

part.calculate('distance', ref='cogid')
part.calculate('tree', tree_calc='neighbor')

part.output('dst', filename='correct_cogid'+'distance')
part.output('tre', filename='correct_cogid'+'tree')





  deprecated('With PY3 path components are always `str`')
  deprecated('With PY3 path components are always `str`')
2021-09-16 18:45:15,532 [INFO] Successfully calculated distance.
2021-09-16 18:45:15,686 [INFO] Successfully calculated tree.
  deprecated('With PY3 path components are always `str`')
  deprecated('With PY3 path components are always `str`')
2021-09-16 18:45:15,690 [INFO] Data has been written to file <correct_cogiddistance.dst>.
  deprecated('With PY3 path components are always `str`')
  deprecated('With PY3 path components are always `str`')
2021-09-16 18:45:15,692 [INFO] Data has been written to file <correct_cogidtree.tre>.


Note that in the last section I have used the corrected cognates data, rather than the originally generated cognate sets