# Selecting relevant segment types for patient similarity categories

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import json
import random
from pprint import pprint
import os
import sys
import pickle
from sklearn.metrics.pairwise import cosine_similarity

# Optional extra import location taken from the environment; it is appended to
# sys.path only when set and not already present, and this must happen before
# the `aicnlp` import below.
AICOPE_PY_LIB = os.environ.get("AICOPE_PY_LIB")
if AICOPE_PY_LIB and AICOPE_PY_LIB not in sys.path: sys.path.append(AICOPE_PY_LIB)
import importlib
import aicnlp
# Re-execute the already imported package so that re-running this cell picks up
# edits without a kernel restart. importlib.reload() reloads only the module it
# is given, not the submodules it has imported.
importlib.reload(aicnlp)

# Turn off jedi-based tab completion in IPython.
%config Completer.use_jedi = False
# Root folder of the patient-similarity data; every path read or written in
# this notebook is built from it.
_scratch_dir = os.environ.get("AICOPE_SCRATCH")
if _scratch_dir is None:
    # Without this guard the concatenation below fails with the opaque
    # "unsupported operand type(s) for +: 'NoneType' and 'str'".
    raise RuntimeError(
        "Environment variable AICOPE_SCRATCH is not set; "
        "it must point to the scratch directory that contains 'pacsim'."
    )
PACSIM_DATA = _scratch_dir + "/pacsim"

## Input

In [2]:
# Mapping title id -> title, stored as a pickle.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted data.
with open(f"{PACSIM_DATA}/parts/tid2t.pickle", "rb") as fh:
    tid2t = pickle.load(fh)

# Reverse lookup title -> title id (the file handle is no longer needed here).
t2tid = {title: tid for tid, title in tid2t.items()}

# Record parts together with their predicted title id (`pred`).
parts = pd.read_feather(f"{PACSIM_DATA}/parts/parts_pred.feather")
parts.head(5)

Unnamed: 0,rid,pid,rord,srord,text,stext,title,stitle,label,tid,pred,ptitle
0,0,0,0,0,different TITLE 8: Dolor adipisci labore modi...,Dolor adipisci labore modi porro consectetur ...,different TITLE 8:,different title #,12,13,13,different title #
1,0,0,0,1,OTHER title : Tempora adipisci ut quaerat nu...,Tempora adipisci ut quaerat numquam velit. Se...,OTHER title :,other title,21,22,22,other title
2,0,0,0,2,Tempora quaerat ut sed. Neque sit sed dolorem....,Tempora quaerat ut sed. Neque sit sed dolorem....,,,-1,-1,2,other title #
3,0,0,0,3,some OTHER title 8:\nEius quiquia quisquam dol...,\nEius quiquia quisquam dolore. Neque sit temp...,some OTHER title 8:,some other title #,16,17,17,some other title #
4,1,0,1,0,A title 8: Porro aliquam velit voluptatem est...,Porro aliquam velit voluptatem est quaerat. A...,A title 8:,a title #,0,1,1,a title #


In [4]:
# One row per title id, labelled by that id.
title_ids = list(tid2t.keys())
categories = pd.DataFrame(
    {"title": [tid2t[tid] for tid in title_ids]},
    index=title_ids,
)

# Number of parts predicted into each category; aligned on the title id, so
# ids that never occur in parts["pred"] end up as NaN.
pred_counts = parts["pred"].value_counts()
categories["pcount"] = pred_counts

# One vector per category, assigned positionally.
# NOTE(review): assumes the rows of the .npy file follow the order of tid2t - confirm.
title_vectors = np.load(f"{PACSIM_DATA}/parts/d2v_titles.npy")
categories["vec"] = list(title_vectors)

categories.head(2)

Unnamed: 0,title,pcount,vec
1,a title #,1284,"[-0.203216, -0.9543775, -1.4384353, 1.1079152,..."
2,other title #,2753,"[-0.23859496, 0.43761224, -0.75037026, 0.21120..."


In [7]:
def nearest_categories(cat):
    """Rank all categories by cosine similarity of their vector to that of ``cat``.

    Reads the notebook-level ``categories`` frame and the ``t2tid`` lookup.

    Parameters
    ----------
    cat : str
        Title of the pivot category, exactly as stored in ``t2tid``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``categories`` (same index labels) with the columns
        ``title``, ``pcount`` and ``sim``, ordered from the most to the least
        similar category.

    Raises
    ------
    KeyError
        If ``cat`` is not a known title.
    """
    if cat not in t2tid:
        raise KeyError(f"unknown category title: {cat!r}")

    # The pivot vector as a 1 x dim matrix against the n x dim matrix of all vectors.
    pivot = categories.loc[t2tid[cat], "vec"].reshape(1, -1)
    all_vectors = np.vstack(categories["vec"])
    sim = cosine_similarity(pivot, all_vectors)[0]

    # argsort is ascending; reverse it to get the most similar category first.
    order = sim.argsort()[::-1]

    # Select the columns by name so the result does not depend on column order.
    near = categories[["title", "pcount"]].iloc[order].copy()
    near["sim"] = sim[order]
    return near
    
# Quick check on a dummy-data title: the pivot itself is expected to come first
# with similarity 1.0. With the real titles try e.g. "medikace" instead.
nearest_categories("a title #").head(5)

Unnamed: 0,title,pcount,sim
1,a title #,1284,1.0
21,a title,41,0.722009
10,a same title #,149,0.563197
8,a other title #,157,0.557951
5,a interestin title #,172,0.548453


### All

In [8]:
# "Fall" is the catch-all flag column: every category is marked as selected.
categories["Fall"] = True

You can skip the remaining category selections below; they make no sense on the dummy data.
Continue at the [**Output**](#output) section.

### K01 - Věk

In [226]:
# Pivot title, output flag column and similarity cut-off for this category.
basename, colname = "objektivne", "Fr01"
cutoff = 0.8
# Manual corrections by category id: ids to remove from / add to the selection.
minus, plus = set(), set()

# Not used below; presumably kept for eyeballing candidates around the cut-off.
base = nearest_categories(basename).query("sim > 0.70")

# Neighbours above the cut-off, without the exclusions, plus the forced ids.
neighbours = nearest_categories(basename).query(f"sim > {cutoff}")
chosen = (set(neighbours.index) - minus) | plus

# Flag the selection and show the summed pcount of the flagged categories.
categories[colname] = False
categories.loc[sorted(chosen), colname] = True
categories.loc[categories[colname], "pcount"].sum()

69496

### K02 - Rodinná anamnéza
- Zárodečná mutace (BRCA1,2, CHECK2, Lynchův syndrom atd.)
- Zvýšený výskyt nádorů v rodině - 2 a více malignit různého typu u přímých příbuzných (rodiče, sourozenci, děti)
- Bez zvýšeného výskytu ZN v rodině

POST: -provedeno

In [115]:
# Pivot title and output flag column. `basename` is set here, as in the other
# selection cells, so the cell no longer carries a stale value from the
# previously executed cell while hard-coding the title twice.
basename = "ra"
colname = "Fr02"
# Not used below; presumably kept for eyeballing candidates around the cut-off.
base = nearest_categories(basename).query("sim > 0.5")

# Manual corrections by category id: ids to remove from / add to the selection.
minus = {619, 1745, 446, 44}
plus = {1247, 219}
cutoff = 0.55

# Neighbours above the cut-off, without the exclusions, plus the forced ids.
chosen = set(nearest_categories(basename).query(f"sim > {cutoff}").index)
chosen.difference_update(minus)
chosen.update(plus)

# Flag the selection and show the summed pcount of the flagged categories.
categories[colname] = False
categories.loc[sorted(chosen), colname] = True
categories[categories[colname]].pcount.sum()

35334

### K03 - Osobní anamnéza
- 1-4 komorbidity
- Bez komorbidit
- 4 a více komorbidit -polymorbidní pacient

In [116]:
# Pivot title, output flag column and similarity cut-off for this category.
basename, colname = "oa", "Fr03"
cutoff = 0.57
# Manual corrections by category id: ids to remove from / add to the selection.
minus = {219, 1745, 154}
plus = {1870, 1017}

# Not used below; presumably kept for eyeballing candidates around the cut-off.
base = nearest_categories(basename).query("0.6 > sim > 0.55")

# Neighbours above the cut-off, without the exclusions, plus the forced ids.
above_cutoff = nearest_categories(basename).query(f"sim > {cutoff}").index
chosen = set(above_cutoff).difference(minus).union(plus)

# Flag the selection and show the summed pcount of the flagged categories.
categories[colname] = False
categories.loc[sorted(chosen), colname] = True
categories.loc[categories[colname], "pcount"].sum()

189689

### K04 - Sociální anamnéza
- Manuálně pracující
- Administrativní činnost
- Starobní důchodce
- Manažer, majitel firmy, podnikatel
- Student/ka


\+ pa - pracovni anamneza

In [117]:
# Pivot title, output flag column and similarity cut-off for this category.
basename, colname = "sa", "Fr04"
cutoff = 0.588
# Manual corrections by category id: ids to remove from / add to the selection.
minus = {1113, 24}
plus = {1313, 1434}

# Not used below; presumably kept for eyeballing candidates around the cut-off.
base = nearest_categories(basename).query("sim > 0.55")

# Neighbours above the cut-off, without the exclusions, plus the forced ids.
neighbours = nearest_categories(basename).query(f"sim > {cutoff}")
chosen = (set(neighbours.index) - minus) | plus

# Flag the selection and show the summed pcount of the flagged categories.
categories[colname] = False
categories.loc[sorted(chosen), colname] = True
categories.loc[categories[colname], "pcount"].sum()

49068

### K05 - Medikace
- Bez medikace
- Do 3 trvale užívaných preparátů
- Nad 3 trvale užívané preparáty

post: -pocatek podani, -nalez

In [218]:
# Pivot title, output flag column and similarity cut-off for this category.
basename, colname = "medikace", "Fr05"
cutoff = 0.583
# Manual corrections by category id: ids to remove from / add to the selection.
minus = {1858}
plus = {11, 15, 209}

# Not used below; presumably kept for eyeballing candidates around the cut-off.
base = nearest_categories(basename).query("0.6 > sim > 0.55")

# Neighbours above the cut-off, without the exclusions, plus the forced ids.
above_cutoff = nearest_categories(basename).query(f"sim > {cutoff}").index
chosen = set(above_cutoff).difference(minus).union(plus)

# Flag the selection and show the summed pcount of the flagged categories.
categories[colname] = False
categories.loc[sorted(chosen), colname] = True
categories.loc[categories[colname], "pcount"].sum()

135888

### K06 - Alergie
- Nejsou
- Jsou


In [125]:
# Pivot title, output flag column and similarity cut-off for this category.
basename, colname = "alergie", "Fr06"
cutoff = 0.71
# No manual corrections for this category (ids to remove / ids to add).
minus, plus = set(), set()

# Not used below; presumably kept for eyeballing candidates around the cut-off.
base = nearest_categories(basename).query("sim > 0.55")

# Neighbours above the cut-off, without the exclusions, plus the forced ids.
neighbours = nearest_categories(basename).query(f"sim > {cutoff}")
chosen = (set(neighbours.index) - minus) | plus

# Flag the selection and show the summed pcount of the flagged categories.
categories[colname] = False
categories.loc[sorted(chosen), colname] = True
categories.loc[categories[colname], "pcount"].sum()

29915

### K07 - Tumor
- Histologický typ
- SR pozitivita
- HER 2 pozitivita
- KI 67 (nízký do 20%, střední 20-50%, vysoký nad 50%)


status localis  
obj  
vysledek  

In [228]:
# Pivot title, output flag column and similarity cut-off for this category.
basename, colname = "status localis", "Fr07"
cutoff = 0.676
# Manual corrections by category id: nothing removed, five ids forced in.
minus = set()
plus = {8, 10, 37, 13, 107}  # klasifikace, vysledek

# Not used below; presumably kept for eyeballing candidates around the cut-off.
base = nearest_categories(basename).query("0.75 > sim > 0.65")

# Neighbours above the cut-off, without the exclusions, plus the forced ids.
above_cutoff = nearest_categories(basename).query(f"sim > {cutoff}").index
chosen = set(above_cutoff).difference(minus).union(plus)

# Flag the selection and show the summed pcount of the flagged categories.
categories[colname] = False
categories.loc[sorted(chosen), colname] = True
categories.loc[categories[colname], "pcount"].sum()

209462

### K08 - Léčba
- Adjuvantní
- Neoadjuvantní
- Paliativní


???  
ae 162


In [179]:
basename = "zaver"
colname = "Fr08"
# Not used below; presumably kept for eyeballing candidates around the cut-off.
base = nearest_categories(basename).query("0.75 >  sim > 0.70")

# Manual corrections by category id (none for this category).
minus = set()
plus = set()
# This category is the union of the neighbourhoods of two pivot titles, each
# with its own cut-off. These are the thresholds that were previously
# hard-coded in the query strings; the former `cutoff = 0.676` assignment was
# never used and has been dropped.
cutoffs = {"zaver": 0.70, "doporuceni": 0.55}

chosen = set()
for pivot, pivot_cutoff in cutoffs.items():
    chosen.update(nearest_categories(pivot).query(f"sim > {pivot_cutoff}").index)
chosen.difference_update(minus)
chosen.update(plus)

# Flag the selection and show the summed pcount of the flagged categories.
categories[colname] = False
categories.loc[sorted(chosen), colname] = True
categories[categories[colname]].pcount.sum()

607199

### K09 - Typ léčby
- Chemoterapie
- Hormonoterapie
- Cílená léčba
- Imunoterapie
- Radioterapie
- Chirurgie

doporuceni  
zaver  
postup  
info pro lekare 70  


In [180]:
basename = "zaver"
colname = "Fr09"
# Not used below; presumably kept for eyeballing candidates around the cut-off.
base = nearest_categories(basename).query("0.75 >  sim > 0.70")

# Manual corrections by category id (none for this category).
minus = set()
plus = set()
# This category is the union of the neighbourhoods of two pivot titles, each
# with its own cut-off. These are the thresholds that were previously
# hard-coded in the query strings; the former `cutoff = 0.676` assignment was
# never used and has been dropped.
# NOTE(review): the pivots and thresholds are the same as for Fr08, so both
# columns flag the same categories. The notes above this cell also mention
# "postup" and "info pro lekare", which are not part of the selection -
# confirm whether that is intended.
cutoffs = {"zaver": 0.70, "doporuceni": 0.55}

chosen = set()
for pivot, pivot_cutoff in cutoffs.items():
    chosen.update(nearest_categories(pivot).query(f"sim > {pivot_cutoff}").index)
chosen.difference_update(minus)
chosen.update(plus)

# Flag the selection and show the summed pcount of the flagged categories.
categories[colname] = False
categories.loc[sorted(chosen), colname] = True
categories[categories[colname]].pcount.sum()

607199

### K10 - Nežádoucí účinky léčby
- Gastrointestinální
- Hepatální
- Neurologické
- Oční
- Kardiální
- Plicní
- Urologické
- Muskuloskeletální
- Gynekologické
- Ušní, nosní, krční
- Kožní


In [223]:
# Pivot title, output flag column and similarity cut-off for this category.
basename, colname = "subjektivni potize", "Fr10"
cutoff = 0.60
# Manual corrections by category id: nothing removed, two ids forced in.
minus = set()
plus = {529, 1633}

# Not used below; presumably kept for eyeballing candidates around the cut-off.
base = nearest_categories(basename).query("sim > 0.60")

# Neighbours above the cut-off, without the exclusions, plus the forced ids.
neighbours = nearest_categories(basename).query(f"sim > {cutoff}")
chosen = (set(neighbours.index) - minus) | plus

# Flag the selection and show the summed pcount of the flagged categories.
categories[colname] = False
categories.loc[sorted(chosen), colname] = True
categories.loc[categories[colname], "pcount"].sum()

123691

<a name="output"></a>
## Output

In [10]:
# Preview of the category table, including the selection flag columns added above.
categories.head(5)

Unnamed: 0,title,pcount,vec,Fall
1,a title #,1284,"[-0.203216, -0.9543775, -1.4384353, 1.1079152,...",True
2,other title #,2753,"[-0.23859496, 0.43761224, -0.75037026, 0.21120...",True
3,title #,255,"[-1.0362962, -0.24237698, -0.4801941, 0.434444...",True
4,some title #,210,"[-0.4671393, 0.278243, 0.70961726, 0.53670365,...",True
5,a interestin title #,172,"[-0.24984509, -0.79904723, -0.32063228, 0.9148...",True


In [14]:
# Move the category ids from the index into a regular "index" column, then
# write the table next to the other `parts` artifacts.
categories_out = categories.reset_index()
categories_out.to_feather(f"{PACSIM_DATA}/parts/categories_pred.feather")

In [37]:
def cattable(categories):
    cat = categories.copy()
    cat["percent"] = (categories["pcount"] / categories["pcount"].sum()).round(4)*100
    cat = cat[["index", "title", "pcount", "percent"]].set_index("index")
    return cat.iloc[:100].style.format("{:.2f}", subset=["percent"]).to_latex()

# print() so the LaTeX source is emitted as plain text instead of a quoted string repr.
print(cattable(categories))

\begin{tabular}{llrr}
 & title & pcount & percent \\
index &  &  &  \\
1 & zaver & 129783 & 4.87 \\
2 & doporuceni & 161882 & 6.07 \\
3 & lab. vysetreni & 52249 & 1.96 \\
4 & vysetreni & 45890 & 1.72 \\
5 & lekar & 37551 & 1.41 \\
6 & evidence & 37309 & 1.40 \\
7 & pristroj & 35171 & 1.32 \\
8 & klasifikace & 34278 & 1.29 \\
9 & exam type & 31512 & 1.18 \\
10 & nalez [ep & 31040 & 1.16 \\
11 & m & 28531 & 1.07 \\
12 & fa & 31065 & 1.17 \\
13 & vysledek & 21739 & 0.82 \\
14 & pocatek podani & 21312 & 0.80 \\
15 & pm & 21531 & 0.81 \\
16 & mg #.cteni [ep & 19848 & 0.74 \\
17 & alergie & 21120 & 0.79 \\
18 & operace & 36614 & 1.37 \\
19 & expozice & 14606 & 0.55 \\
20 & provedl & 14863 & 0.56 \\
21 & prijem & 14151 & 0.53 \\
22 & res & 35115 & 1.32 \\
23 & provedeno & 14067 & 0.53 \\
24 & ra & 16404 & 0.62 \\
25 & interkurence & 22279 & 0.84 \\
26 & vyska & 13113 & 0.49 \\
27 & linie & 12850 & 0.48 \\
28 & rezim & 12610 & 0.47 \\
29 & res. & 21125 & 0.79 \\
30 & tk/puls & 12063 & 0.45 \\
