# SETUP ENVIRONMENT


In [3]:
# Check for and test tensorflow GPU hardware acceleration

%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


## MNT DRIVE AND CLONE GITHUB REPO

In [4]:
# mnt drive 
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Access token from drive without exposing in text (accsessible in logfiles lol. Better than nothing)
import os
import subprocess
from pathlib import Path

drive_path = Path('/content/drive/MyDrive/bioNLP/')
token = open(Path(drive_path.parent / "token/colab.txt"), "r").read()
tkn = token.split("=")

repo_name = "callebalik/clinical_NLP_SE.git"
cmd_string = 'git clone https://{0}@github.com/{1}'.format(tkn[1], repo_name)

!{cmd_string}
cmd_string = "" # removing the variable
print(cmd_string)

Cloning into 'clinical_NLP_SE'...
remote: Enumerating objects: 490, done.[K
remote: Counting objects: 100% (490/490), done.[K
remote: Compressing objects: 100% (332/332), done.[K
remote: Total 490 (delta 210), reused 356 (delta 110), pack-reused 0[K
Receiving objects: 100% (490/490), 3.38 MiB | 12.25 MiB/s, done.
Resolving deltas: 100% (210/210), done.



## Set paths for the environment and import models

In [6]:
# Set paths for the colab environment 
import os 
import sys
from pathlib import Path

repo_path = Path('/content/clinical_NLP_SE/')

models_path = Path(drive_path / 'models')
data_path = repo_path / 'data/'

sys.path.append(str(repo_path / 'sripts/')) # Let pyhton find scripts from the repo

In [7]:
# Make colab update/reaload modules when cells run
%load_ext autoreload
%autoreload 2

## Install requirements 

In [8]:
%%shell 
#Installed from requirements, but left as a note
# pip install spacy-transformers
# updating spacy in colab 
# pip install spacy==3.0.6  

pip install -r /content/clinical_NLP_SE/requirements.txt
python -m spacy info

Collecting scispacy
[?25l  Downloading https://files.pythonhosted.org/packages/46/d2/456e1f66f7ba65209746aac666b22e0c11e9aee6d9f549a2fdba5d49247b/scispacy-0.4.0-py3-none-any.whl (44kB)
[K     |████████████████████████████████| 51kB 4.1MB/s 
[?25hCollecting spacy-transformers
  Downloading https://files.pythonhosted.org/packages/e8/c5/a156f9c979cc14f5f41cf2e6ecfc55d1128ac0363930ec7cc6fe4d98b4a2/spacy_transformers-1.0.2-py2.py3-none-any.whl
Collecting spacy>=3.0.6
[?25l  Downloading https://files.pythonhosted.org/packages/1b/d8/0361bbaf7a1ff56b44dca04dace54c82d63dad7475b7d25ea1baefafafb2/spacy-3.0.6-cp37-cp37m-manylinux2014_x86_64.whl (12.8MB)
[K     |████████████████████████████████| 12.8MB 228kB/s 
[?25hCollecting python-dotenv>=0.5.1
  Downloading https://files.pythonhosted.org/packages/26/1f/ae3d06ec877df31f49448d24eee7198549edae2af00da60c85dad93e343f/python_dotenv-0.17.1-py2.py3-none-any.whl
Collecting conllu
  Downloading https://files.pythonhosted.org/packages/ae/be/be6959c3



## GET DATA AND MODELS


In [9]:
%%shell
# Get models from Kungbib https://github.com/Kungbib/swedish-spacy

#Transformer models - # N.B. Make sure to install the spacy-transformers extension, or the models won't work.
#wget https://data.kb.se/datasets/2020/10/swedish_nlp/spacy/sv_tagger-0.0.0.tar.gz

pip install "/content/drive/MyDrive/bioNLP/models/sv_pipeline-0.0.0.tar.gz"

# Older models
#wget https://data.kb.se/datasets/2020/10/swedish_nlp/spacy/sv_model_upos.zip
#wget https://data.kb.se/datasets/2020/10/swedish_nlp/spacy/sv_model_xpos.zip


Processing ./drive/MyDrive/bioNLP/models/sv_pipeline-0.0.0.tar.gz
Building wheels for collected packages: sv-pipeline
  Building wheel for sv-pipeline (setup.py) ... [?25l[?25hdone
  Created wheel for sv-pipeline: filename=sv_pipeline-0.0.0-cp37-none-any.whl size=935375398 sha256=b32cda88e7af7e65ebf88c574e3d6ca9b40ff262b26f5eb36059377adea5d33b
  Stored in directory: /root/.cache/pip/wheels/78/6b/1a/3724784df18ac433f7975f29d5cb3c47c3cfa3f40c4a87f60d
Successfully built sv-pipeline
Installing collected packages: sv-pipeline
Successfully installed sv-pipeline-0.0.0




### Getting the Swedish models https://data.kb.se/datasets/2020/10/swedish_nlp/spacy/

https://github.com/Kungbib/swedish-spacy

The models can be downloaded with the commands below. For colab this is slow and they are instead accessed from the mounted from a google drive where they have been downloaded. If first time download and add to the correct folder of your drive. 


### Note from Kungbib

** UPDATE February 2021 **: We are adding two transformer-based models trained in spaCy 3.0. They are available to download at the same link given below.
One model is a complete pipeline with UPOS tagger, parser, sentencer, ner and lemmatizer (sv_pipeline-0.0.0.tar.gz). Unfortunately the lemmatizer is not yet trainable in spaCy, so the performance is as good as the quality of the rules/lookup tables available for Swedish (i.e. not very good). 

If you need a Swedish lemmatizer we advise you for the moment to have a look at Stanza, efselab or lemmy.ww

---


The other model is a XPOS tagger in case you need language-specific part-of-speech tags (sv_tagger-0.0.0.tar.gz).

## INITIALIZE SPACY

In [10]:
import spacy 
nlp = spacy.load("sv_pipeline") # Load the swedish model (can be done this way since it's installed as a pip package)

# NLP Model setup 
* In colab use Ctrl + F8 in this cell to run all environment setup above before starting
* *Time estimate 3 min*

## NER

### ICD-10

In [11]:
''' Processing ICD codes using regex 

pattern and special signs 
LDD   LLLLLL
LDD † LLLLLL
LDD * LLLLLL

Example 
P75	*	Mekoniumileus vid cystisk fibros (E84.1)


'''

'''
Positive Lookbehind (?<=\D\d\d) Assert that the Regex below matches
  \D matches any character that's not a digit (equivalent to [^0-9])
  \d matches a digit (equivalent to [0-9])
  \d matches a digit (equivalent to [0-9])
\s matches any whitespace character (equivalent to [\r\n\t\f\v ])
+? matches the previous token between one and unlimited times, as few times as possible, expanding as needed (lazy)
. matches any character (except for line terminators)
? matches the previous token between zero and one times, as many times as possible, giving back as needed (greedy)
\s matches any whitespace character (equivalent to [\r\n\t\f\v ])

Global pattern flags
g modifier: global. All matches (don't return after first match)
m modifier: multi line. Causes ^ and $ to match the begin/end of each line (not only begin/end of string)
'''

import re

regex_ddd = '/(?<=\D\d\d)\s+?.?\s/gm'
regex_dddR = '/(?<=R\d\d)\s+?.?\s/gm' 
ddd = r'(?<=\D\d\d)\s+?.?\s'
dddd = r'(?<=\D\d\d\d)\s+?.?\s'
ddddd = r'((?<=\D\d\d\d\d)|(?<=\D\d\d\d\D))\s+?.?\s'

'''
A513B	†	Syfilitisk (sekundär) alopeci (L99.8)
Y3498		Ospecificerad skadehändelse, med oklar avsikt-plats, ospecificerad-andra specificerade aktiviteter
Y3499		Ospecificerad skadehändelse, med oklar avsikt-plats, ospecificerad-aktivitet, ospecificerad
Y586A		Komplikation av vaccin mot kikhosta, enbart (P)
Y586B		Komplikation av vaccin mot difteri, kikhosta och stelkramp, kombinerat (DPT)
Y586W		Komplikation av annat kombinerat 
'''

ICD_PATH = data_path / 'raw/codes/icd-10-se-2021-text'
file_path = ICD_PATH / 'digit3.txt'

ICD000 = []
with open(ICD_PATH / 'digit3.txt','r') as codes:
    for line in codes:
        x = re.split(ddd, line)
        ICD000.append(x[1].strip())

ICD0000 = []
with open(ICD_PATH / 'digit4.txt','r') as codes:
    for line in codes:
        x = re.split(dddd, line)
        ICD0000.append(x[1].strip())
        
ICD00000 = []
with open(ICD_PATH / 'digit5.txt','r') as codes:
    for line in codes:
        x = re.split(ddddd, line)
        ICD00000.append(x[1])

ICD = ICD000 + ICD0000 + ICD00000

### Negations

In [12]:
''' Processing negations '''

NEG_PATH = data_path / 'raw/negations'

NEG = []
with open(NEG_PATH / 'negEx2.txt','r') as neg:
    for line in neg:
        x = re.split(r'\s+(?=\[)', line)
        NEG.append(x[0].strip())

### Setup Entity ruler - Recognizing Custom Named entities

In [23]:
icd_entries = [nlp.make_doc(text.lower()) for text in ICD]
neg_phrases = [nlp.make_doc(text.lower()) for text in NEG]

In [41]:
# %%%%%%%%%
# ENTITY RULER
# %%%%%%%%%

# implementation note - Integration with built in named entity recognizer. 
# - - - 
# Ruler -->  Recognizer = 
#   - Recognizer respects existing entity spans
#   - Adjust its predictions around it. 
# Can improve accuracy in some cases. 
# - - -
# Recognizer --> Ruler = 
#   - Ruler only add spans to the doc.ents if they don’t overlap with existing entities predicted by the model 
#   - To overwrite overlapping entities, you can set overwrite_ents=True on initialization.


config = {
      "phrase_matcher_attr": "lower",
      "validate": True, # Whether patterns should be validated (passed to the Matcher and PhraseMatcher). Defaults to False.
      "overwrite_ents": False,
      "ent_id_sep": "||",
    }

try:
  ruler = nlp.add_pipe("entity_ruler", config=config)
  # ruler.phrase_matcher = matcher # Changes the pharsematcher
except (NameError, ValueError) as e:
  ruler = nlp.replace_pipe(name="entity_ruler", factory_name="entity_ruler", config=config)
  #ruler.phrase_matcher = matcher # Changes the phrsematcher
  print("Entity ruler already added, replacing instead of creating")


# ToDO implement test to see that there are no residual patterns 
print("----Printing patterns, should be empty at this point----")    
print(ruler.patterns)



# Add patterns to the entity ruler 
# ToDO implemnt data format and quality check in ICD and Neg files - should be done in data processing
patterns = []

for pattern in icd_entries: 
  p = {"label":"SYM","pattern":[{"LOWER":str(pattern)}]} # adds rule to label lowercase str of icd_code 
  patterns.append(p)

for pattern in neg_phrases:
  p = {"label":"NEG","pattern":[{"LOWER":str(pattern)}]}
  patterns.append(p)

# ToDO Check what function select_pipes(enable="tagger") fills. I think it's just faster as not whole model has to be loaded.
with nlp.select_pipes(enable="tagger"): 
  ruler.add_patterns(patterns)


# nlp.pipeline # pipeline components
# ruler.patterns # ruler patterns 

# --------
# save ruler to disk in jsonl format - can then be loaded to model 
#ruler.to_disk(Path(repo_path / "models/patterns.jsonl"))

Entity ruler already added, replacing instead of creating
----Printing patterns, should be empty at this point----
[]


In [44]:
# Small test to se if the NLP model now finds kolera and neg 
# We do not have to init anything as the ruler is already part of the pipelien 
doc = nlp("Kolera, är inte en hemsk sjukdom")

print([(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents])

[('Kolera', 'SYM', ''), ('inte', 'NEG', '')]


In [28]:
# Analyse the pipeline and store the analysis under 'pipe_analysis'
pipe_analysis = nlp.analyze_pipes(pretty=True)
# print(nlp.pipe_names)  # see all components in the pipeline

[1m

#   Component         Assigns               Requires   Scores             Retokenizes
-   ---------------   -------------------   --------   ----------------   -----------
0   transformer       doc._.trf_data                                      False      
                                                                                     
1   tagger            token.tag                        tag_acc            False      
                                                                                     
2   parser            token.dep                        dep_uas            False      
                      token.head                       dep_las                       
                      token.is_sent_start              dep_las_per_type              
                      doc.sents                        sents_p                       
                                                       sents_r                       
                                                

### Analysing & Exporting model result using pandas and tabulate

In [None]:
from tabulate import tabulate
import pandas as pd

cols = ("space", "text", "lemma", "normalization", "POS", "explain", "stopword", "dep","NE", "sentiment")
rows = []

for t in doc: 
  if t.ent_type_ != "":

    row = [t.is_space, t.text, t.lemma_, t.norm_, t.pos_, spacy.explain(t.pos_), t.is_stop, t.dep_, t.ent_type_, t.sentiment]
    rows.append(row)

df = pd.DataFrame(rows, columns=cols)
print(tabulate(df, headers = cols, tablefmt='psql', showindex=False))


+---------+-------------+-----------+-----------------+-------+------------+------------+-----------+------+-------------+
| space   | text        | lemma     | normalization   | POS   | explain    | stopword   | dep       | NE   |   sentiment |
|---------+-------------+-----------+-----------------+-------+------------+------------+-----------+------+-------------|
| False   | dyspné      | dyspné    | dyspné          | NOUN  | noun       | False      | conj      | SYM  |           0 |
| False   | Tidigare    | Tidigare  | tidigare        | ADJ   | adjective  | True       | amod      | TME  |           0 |
| False   | Inga        | Inga      | inga            | DET   | determiner | True       | det       | NEG  |           0 |
| False   | utan        | utan      | utan            | ADP   | adposition | True       | case      | NEG  |           0 |
| False   | igår        | igår      | igår            | ADV   | adverb     | True       | case      | TME  |           0 |
| False   | kväl

In [None]:
import pandas as pd

'''
    raw text
    lemma – a root form of the word
    part of speech
    a flag for whether the word is a stopword – i.e., a common word that may be filtered out
'''

cols = (
    "NE",
    "IOB", 
    "text", 
    "lemma", 
    "POS", 
    "dep",
    "right edge"
     )

df = pd.DataFrame()

rows = []

for t in doc:
      if t.ent_type_ == "SYM" or "NEG":
        row = [
               t.ent_type_,
               t.ent_iob_, 
               t.text, 
               t.lemma_, 
               spacy.explain(t.pos_), 
               t.dep_,
               t.right_edge
        ] 
              
        rows.append(row)

df = pd.DataFrame(rows, columns=cols)
print(tabulate(df, headers = cols, tablefmt='psql', showindex=False))      

+------+-------+---------------------+---------------------+---------------------------+------------+---------------------+
| NE   | IOB   | text                | lemma               | POS                       | dep        | right edge          |
|------+-------+---------------------+---------------------+---------------------------+------------+---------------------|
|      | O     | Kontaktorsak        | Kontaktorsak        | noun                      | ROOT       | :                   |
|      | O     | :                   | :                   | punctuation               | punct      | :                   |
|      | O     | Medicinlarm         | Medicinlarm         | noun                      | ROOT       |                     |
|      | O     | ,                   | ,                   | punctuation               | punct      | ,                   |
| SYM  | B     | dyspné              | dyspné              | noun                      | conj       | dyspné              |
|      |

### Visualize model results with displacy

In [55]:
from spacy import displacy

# only display sentances with named entities - either SYM or NEG
sentence_spans = list(doc.sents)

sym_sentences = []
neg_sentences = []

for sent in sentence_spans:
  ents = list(sent.ents)
  for ent in ents:
    if ent.label_ == "SYM":
      # print(ent.label_)
      sym_sentences.append(sent) 
      
    if ent.label_ == "NEG":
      # print(ent.label_)
      neg_sentences.append(sent) 

displacy_options = {"compact": True, 
           "bg": "#09a3d5",
           "color": "white",
           "font": "Source Sans Pro",
           "fine_grained": True}

style = "dep"

what_to_render = sym_sentences
displacy.render(what_to_render, style=style, jupyter=True, options=displacy_options)

In [57]:
# Display model prediction as html 
import IPython
# IPython.display.HTML(filename=Path(repo_path / 'models/parses.html'))
IPython.display.HTML(filename=Path(repo_path / 'models/entities.html'))

In [55]:
from spacy import displacy

# only display sentances with named entities - either SYM or NEG
sentence_spans = list(doc.sents)

sym_sentences = []
neg_sentences = []

for sent in sentence_spans:
  ents = list(sent.ents)
  for ent in ents:
    if ent.label_ == "SYM":
      # print(ent.label_)
      sym_sentences.append(sent) 
      
    if ent.label_ == "NEG":
      # print(ent.label_)
      neg_sentences.append(sent) 

displacy_options = {"compact": True, 
           "bg": "#09a3d5",
           "color": "white",
           "font": "Source Sans Pro",
           "fine_grained": True}

style = "dep"

what_to_render = sym_sentences
displacy.render(what_to_render, style=style, jupyter=True, options=displacy_options)

## Acronyms - not production ready 

In [None]:
import scispacy
from scispacy.abbreviation import AbbreviationDetector

# Add the abbreviation pipe to the spacy pipeline.
nlp.add_pipe("abbreviation_detector")

doc = nlp("""Förmaksflimmer (FF) \
          FF är den vanligaste takyarytmin. Förekomsten ökar med åldern, och är ca 1 % vid 50 år och > 10 % vid 80 år. Kan förekomma hos hjärtfriska individer, s k lone atrial fibrillation, men vanligen föreligger organisk hjärtsjukdom. Det är rekommenderat att dessa patienter handläggs på ett strukturerat sätt och med patienten som en viktig del i sitt team, ett personcentrerat arbetssätt, för bästa resultat.""")

doc = nlp("""Ett esofagus-EKG kan vara indicerat för att differentiera mellan FFl och andra regelbundna supraventrikulära takykardier, såsom AV nodal reentry-takykardi (AVNRT), WPW-syndrom eller ektopisk förmakstakykardi (EAT).""")
print("Abbreviation", "\t", "Definition")
for abrv in doc._.abbreviations:
	print(f"{abrv} \t ({abrv.start}, {abrv.end}) {abrv._.long_form}")

ValueError: ignored

In [None]:
text = ("""Ett esofagus-EKG kan vara indicerat för att differentiera mellan FFl och andra regelbundna supraventrikulära takykardier, såsom AV nodal reentry-takykardi (AVNRT), WPW-syndrom eller ektopisk förmakstakykardi (EAT).""")

doc = nlp(text)

print("Abbreviation", "\t", "Definition")
for abrv in doc._.abbreviations:
	print(f"{abrv} \t ({abrv.start}, {abrv.end}) {abrv._.long_form}")
 
text = replace_acronyms(text)
doc = nlp(text)
print(text)

Abbreviation 	 Definition
AVNRT 	 (21, 22) AV nodal reentry-takykardi
EAT 	 (29, 30) ektopisk förmakstakykardi
Ett esofagus-EKG kan vara indicerat för att differentiera mellan FFl och andra regelbundna supraventrikulära takykardier , såsom AV nodal reentry-takykardi ( AV nodal reentry-takykardi ) , WPW-syndrom eller ektopisk förmakstakykardi ( ektopisk förmakstakykardi ) .


In [51]:
from spacy import displacy


# styling output from displacy
colors = {
          "SYM": "linear-gradient(90deg, #99154e, #99154e)",
          "NEG": "linear-gradient(90deg, #ffc93c, #ffc93c)"
          }
 

options = {"compact": False, 
           "bg": "#09a3d5",
           "color": "white",
           "font": "Source Sans Pro",
            "ents": ["SYM", "NEG", "TMN"], 
           "colors": colors}

input = doc # display input with displacy 
displacy.render(input, style="ent", jupyter=True, options=options)

In [None]:
def replace_acronyms(text):
    doc = nlp(text)
    altered_tok = [tok.text for tok in doc]
    for abrv in doc._.abbreviations:
        altered_tok[abrv.start] = str(abrv._.long_form)

    return(" ".join(altered_tok))

# Run & Evaluate model on corpus data 

In [34]:
!# ToDo Make this step work as native python code instead of shell shifting colab adjustment

!cd /content/clinical_NLP_SE/data

!# ToDo SEtup test to check that conversion works after INCEpTION export problem is 

# convert the annotated corpus to spacy example format
# use 100 sentences before splitting a file into a new doc 
!python -m spacy convert /content/clinical_NLP_SE/data/interim/corpus/conll2003_V2 processed/ --converter conll --n-sents 100 

/content/clinical_NLP_SE/data
2021-05-31 07:21:02.046990: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[38;5;4mℹ Grouping every 100 sentences into a document.[0m
[38;5;2m✔ Generated output file (1 documents): processed/chart1.spacy[0m
[38;5;4mℹ Grouping every 100 sentences into a document.[0m
[38;5;2m✔ Generated output file (1 documents): processed/chart2.spacy[0m
[38;5;4mℹ Grouping every 100 sentences into a document.[0m
[38;5;2m✔ Generated output file (1 documents): processed/chart3.spacy[0m
[38;5;4mℹ Grouping every 100 sentences into a document.[0m
[38;5;2m✔ Generated output file (1 documents): processed/chart4.spacy[0m
[38;5;4mℹ Grouping every 100 sentences into a document.[0m
[38;5;2m✔ Generated output file (1 documents): processed/chart5.spacy[0m
[38;5;4mℹ Grouping every 100 sentences into a document.[0m
[38;5;2m✔ Generated output file (1 documents): processed/pat1.spacy[0m
[38;5;4mℹ

In [56]:
# Evaluate model

#!rm -r /content/clinical_NLP_SE/models/ 
#!mkdir /content/clinical_NLP_SE/models/
%cd /content/clinical_NLP_SE/models/
#ToDo Set paths as variables 
nlp.to_disk(repo_path / 'models/test_3/')
!python -m spacy evaluate /c ontent/clinical_NLP_SE/models/test_3/ ../data/processed/ --output models/metrics.json --displacy-path ../models/ --gpu-id 0

/content/clinical_NLP_SE/models
2021-05-31 07:44:32.297238: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[38;5;4mℹ Using GPU: 0[0m
[1m

TOK      -    
TAG      0.00 
POS      -    
LEMMA    -    
UAS      -    
LAS      -    
NER P    7.69 
NER R    26.19
NER F    11.89
SENT P   28.05
SENT R   51.05
SENT F   36.20
SPEED    668  

[1m

                 P        R       F
SYM          13.33     7.41    9.52
TME           0.00     0.00    0.00
NEG          11.69   100.00   20.93
DIS           0.00     0.00    0.00
ORG           0.00     0.00    0.00
null          0.00     0.00    0.00
PRS           0.00     0.00    0.00
LOC           0.00     0.00    0.00
NEGATIONER    0.00     0.00    0.00
MSR           0.00     0.00    0.00

[38;5;2m✔ Generated 25 parses as HTML[0m
../models
Traceback (most recent call last):
  File "/usr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  Fi

# Analysis of Model Structure

### Inspect sentences and chunks

In [None]:
# show how the model finns start-end of sentences 
# Can, and may have to be improved for special language use cases such as in health records, as convential sentence stops and ends are not always used and the model otherwise has a harder time extracting the context. 
# This is in large part done in the tokenizer 

'''
for sent in doc.sents:
    print(">", sent)
'''

# print the models understandning of noun chunks 
# this is interesting because we can see what the model thinks belongs with what 

'''
for chunk in doc.noun_chunks:
    print(chunk.text)
'''

### Analysing the pipeline 

### Alignment

In [None]:
# ToDo    
# Current implementation of the alignment algorithm assumes that both tokenizations add up to the same string. 
# E.g. you’ll be able to align ["I", "'", "m"] and ["I", "'m"], which both add up to "I'm", 
# but not ["I", "'m"] and ["I", "am"]. 

from spacy.training import Alignment

bert_tokens = ["obama", "'", "s", "podcast"]
spacy_tokens = ["obama", "'s", "podcast"]
alignment = Alignment.from_strings(bert_tokens, spacy_tokens)
a2b = alignment.x2y
assert list(a2b.dataXd) == [0, 1, 1, 2]
print(list(a2b.dataXd) == [0, 1, 1, 2])

# If a2b.dataXd[1] == a2b.dataXd[2] == 1, that means that A[1] ("'") and A[2] ("s") both align to B[1] ("'s")

True
