# B. first automatic tag detection 

## Intro - Importing libraries and datasets

In [45]:
# import of libraries
import pandas as pd
from fuzzywuzzy import fuzz
from tqdm import tqdm

## A - Exemple for fuzz (Levenshtein distance)

In [6]:
# Example of fuzz # 1
input1 = 'New York City'
input2 = 'New York'

print('ratio : ', fuzz.ratio(input1, input2))
print('ratio : ', fuzz.partial_ratio(input1, input2))

ratio :  76
ratio :  100


In [24]:
# Example of fuzz # 2
input1 = 'New York City is a beautiful city'
input2 = 'New York'

print('ratio : ', fuzz.ratio(input1, input2))
print('partial ratio : ', fuzz.partial_ratio(input1, input2))

ratio :  39
partial ratio :  100


In [25]:
len(input2)/len(input1)

0.24242424242424243

In [43]:
# Example of fuzz # 3
input1 = 'New York City is a beautiful city'
input2 = 'New Yokss'

print('ratio : ', fuzz.ratio(input1, input2))
print('partial ratio : ', fuzz.partial_ratio(input1, input2))

ratio :  38
partial ratio :  78


In [19]:
(len(input2)-2)/len(input2)

0.7777777777777778

In [46]:
# import of thesaurus
thesaurus = pd.read_csv('data/thesaurus_key_words.csv', encoding="ISO-8859-1", sep=';')
thesaurus.head()

Unnamed: 0,classification_E,catégorie,symptome-fr,symptome-en,CIM_10,CIM11,Orphanet
0,E1,Période néonatale,Encéphalopathie myoclonique précoce,Benign familial neonatal epilepsy (BFNE),G40.8,8A61.0Y,1935.0
1,E2,Période néonatale,Epilepsie néonatale familiale bénigne (BFNE),Early myoclonic encephalopathy (EME),G40.8,8A61.10,1949.0
2,E3,Période néonatale,Syndrome d'ohtahara,Ohtahara syndrome,G40.8,8A62.Y,1934.0
3,E31,Nourrisons,Encépahlopathie myoclonique des affections non...,Myoclonic encephalopathy in nonprogressive dis...,G40.4,8A62.Y,86913.0
4,E33,Nourrisons,Epilepsie benigne du nourisson,Benign infantile epilepsy,G40.3,8A61.1Z,166302.0


41

In [47]:
# import of classification dataset
classification_dataset = pd.read_csv('data/classification_dataset.csv')
classification_dataset.head()

Unnamed: 0,filepath,report
0,CR_Patients_info_patients-v0_4/edf/dev/01_tcp_...,Description: 2.5 to 5 hz spike/wave and polys...
1,CR_Patients_info_patients-v0_4/edf/dev/01_tcp_...,LENGTH OF THE RECORDING: 22 minutes and 53 s...
2,CR_Patients_info_patients-v0_4/edf/dev/01_tcp_...,"MEDICATIONS: Vimpat, Norvasc, Felbamate, Car..."
3,CR_Patients_info_patients-v0_4/edf/dev/01_tcp_...,CLINICAL HISTORY: 27 year old gentleman with...
4,CR_Patients_info_patients-v0_4/edf/dev/01_tcp_...,"MEDICATIONS: Vimpat, Norvasc, Felbamate, Car..."


In [38]:
classification_dataset['len'] = classification_dataset.report.apply(lambda x: len(x)) 

In [39]:
classification_dataset.sort_values(by='len',)

Unnamed: 0,filepath,report,len
1173,CR_Patients_info_patients-v0_4/edf/train/02_tc...,Not loaded,10
1245,CR_Patients_info_patients-v0_4/edf/train/02_tc...,FC focal slowing SPSP not 20 minutes,39
946,CR_Patients_info_patients-v0_4/edf/train/03_tc...,GPFA but canât find clinical seizures today,47
198,CR_Patients_info_patients-v0_4/edf/dev/02_tcp_...,Xs theta and xs beta and some paroxysmal burs...,62
199,CR_Patients_info_patients-v0_4/edf/dev/02_tcp_...,Definition: Pharmacologic coma but waking up ...,65
...,...,...,...
491,CR_Patients_info_patients-v0_4/edf/train/01_tc...,History: 63 yo left handed man with h/o seizu...,12281
1363,CR_Patients_info_patients-v0_4/edf/train/02_tc...,CLINICAL HISTORY: This is a 16-month-old with...,22601
1330,CR_Patients_info_patients-v0_4/edf/train/02_tc...,CLINICAL HISTORY: This is a 25-year-old woman...,22963
411,CR_Patients_info_patients-v0_4/edf/train/01_tc...,History: The patient is a 54 year old RH man ...,23592


In [42]:
classification_dataset.report[1245]

' FC focal slowing  SPSP  not 20 minutes'

# I - Working with Levenshtein distance on full text

## A - Using partial ratio on full text

In [79]:

# We will calculate the partial_ratio for each thesaury therme and update it in a result dataset

%%time

for i in tqdm(list(thesaurus['symptome-en'])):
    classification_dataset[i] = classification_dataset['report'].apply(lambda x: fuzz.partial_ratio(x, i)) 

df_results = pd.DataFrame(data=classification_dataset.columns[4:], columns=['target'])
df_results['ratio'] = df_results['target'].apply(lambda x: max(classification_dataset[x]))

# What can we predict at best?
df_results.sort_values(by='ratio', ascending=False)

100%|██████████| 46/46 [00:47<00:00,  1.03s/it]CPU times: user 47 s, sys: 46.9 ms, total: 47.1 s
Wall time: 47.3 s



Unnamed: 0,target,ratio
13,Lennox-Gastaut syndrome,100
16,Epilepsy with generalized tonicclonic seizure...,81
7,West syndrome,77
31,temporal epilepsy,76
23,central epilepsy,75
26,frontal epilepsy,75
27,insular epilepsy,75
6,Dravet syndrome,73
2,Benign infantile epilepsy,72
30,parietal epilepsy,71


It looks we have "honest" results, but in reality other than Lneeox-Gastaut syndrome it does not really works... Ex: for temporal epilsepy, ratio is high thanks to "epilepsy" alone. 

In [48]:
# For a target, output the related reports sorted by partial_ratio

def research_similarity(target):
    df = pd.DataFrame(classification_dataset['report'])
    df['partial_ratio'] = df['report'].apply(lambda x: fuzz.partial_ratio(x, target))
    df = df.sort_values(by='partial_ratio', ascending=False)
    return df

## B - Using token_sort_ratio on full text

In [318]:

# We will calculate the token_sort_ratio for each thesaury therme and update it in a result dataset

%time

for i in tqdm(list(thesaurus['symptome-en'])):
    classification_dataset[i] = classification_dataset['report'].apply(lambda x: fuzz.token_sort_ratio(x, i)) 

df_results = pd.DataFrame(data=classification_dataset.columns[4:], columns=['target'])
df_results['ratio'] = df_results['target'].apply(lambda x: (classification_dataset[x]).max())

# What can we predict at best?
df_results.sort_values(by='ratio', ascending=False)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 9.3 µs


Unnamed: 0,target,ratio
37,Other location,50
38,Unkown location,48
8,Epilepsy with myoclonic atonic (previously ast...,46
3,Epilepsy of infancy with migrating focal seizures,44
18,Autosomal dominant epilepsy with auditory feat...,43
33,Mesial temporal lobe epilepsy with hippocampal...,41
34,Mesial temporal lobe epilepsy without hippoca...,40
16,Epilepsy with generalized tonicclonic seizure...,40
20,Gelastic seizures with hypothalamic hamartoma,39
11,Epilepsy with myoclonic absences,37


This method does not really works out.

## C - Example Cases

### 1 - Focus on Lennox-Gastaut

In [137]:
# Looking for the index where the index is high
research_similarity('Lennox-Gastaut')['partial_ratio'].head(30)

838     100
117     100
45      100
46      100
47      100
48      100
49      100
51      100
1258    100
843     100
220     100
1315    100
372     100
44      100
1107    100
276     100
1302    100
227     100
1211    100
547     100
1167    100
684     100
1269    100
597      50
596      50
595      50
594      50
593      50
598      50
809      43
Name: partial_ratio, dtype: int64

In [140]:
# Looking for the text correlated with the report at index 227 
research_similarity('Lennox-Gastaut').report[227]

' CLINICAL HISTORY: This is a 27-year-old male with a history of severe MR, multiple medical problems with multiple brief seizures per month.  Seizures characterized by generalized shaking lasting 20 seconds. MEDICATIONS: Lamictal, Tegretol, Tranxene, and many others. INTRODUCTION: Digital video EEG is performed in the lab using standard 10-20 system of electrode placement with one channel of EKG. The patient is drowsy or somnolent. Photic stimulation is performed. DESCRIPTION OF THE RECORD: The background EEG is markedly abnormal and is primarily a mixture of rhythmic 3 Hz activity with smaller amounts of 2 Hz activity and some 4 to 5 Hz theta.  There are multifocal spike and slow wave complexes identified in the record including bifrontal, high amplitude spike and slow wave complexes with an approximately 2 Hz after going slow wave.  Focal epileptiform activity is also seen in the occipital regions, sometimes maximum at O2 and at other times with a poly spike wave component at O1-O2.

it works with 100 partial ratio

In [None]:
# Looking for the text correlated with the report at index 227 
research_similarity('Lennox-Gastaut').report[597]

It's not working on 50 partial ratio.

### 2 - Focus on temporal epilepsy

In [154]:
research_similarity('temporal epilepsy')

Unnamed: 0,report,partial_ratio
377,EEG REMARKS: 7 L temporal Spikes but seems se...,76
191,CLINICAL HISTORY: 40 year old right handed ma...,71
194,CLINICAL HISTORY: 40 year old right handed ma...,71
1133,CLINICAL HISTORY: \tForty-seven-year-old male...,71
523,HISTORY: A 62-year-old woman with adult-onse...,71
...,...,...
145,"CLINICAL HISTORY: A 25-year-old man, with hi...",0
426,REASON FOR STUDY: Seizures. CLINICAL HISTORY...,0
118,CLINICAL HISTORY: A 35-year-old woman with c...,0
688,REASON FOR STUDY: Change in mental status. C...,0


In [155]:
research_similarity('temporal epilepsy')['report'][191]

' CLINICAL HISTORY: 40 year old right handed male with encephalitis and recurrent seizures. MEDICATIONS: Lacosamide, dilantin, Ativan, Klonopin INTRODUCTION: Continuous digital video EEG monitoring was performed at bedside using standard 10-20 system of electrode placement with 1 channel of EKG. As this section of the records begins, the patient reports "he is feeling great" as if he is not having more seizures. Then subsequently he has 2 events that he describes as auras, which are seizures with impairment of awareness. He does have occasional myoclonic jerks. DESCRIPTION OF THE RECORD: This section of the 24-hour period includes more of the rhythmic repetitive slowing than noted at other times. Isolated high amplitude right hemispheric spike and wave activity is observed. Push button times include 5:20 which includes actually a seizure. Although the patient describes this as an aura, it is really a focal motor seizure with loss of axial tone and stiffening of the right leg. The patie

No trace of temporal epilepsy: it just does not work out!

### 3 - Other research

Let's try to research medication associate with one Lennox-Gastaut syndrome: maybe we can find other occurences? 

In [171]:
research_similarity('Keppra Ativan famotidine Lovenox topiramate Flagyl Depakote').head(30)

Unnamed: 0,report,partial_ratio
44,DURATION OF STUDY: Study date 03/26/2013 thr...,90
51,DURATION OF STUDY: Study date 03/26/2013 thr...,90
48,REASON FOR STUDY: Seizures. CLINICAL HISTORY...,90
47,REASON FOR STUDY: Seizures. CLINICAL HISTORY...,90
46,DURATION OF STUDY: Study date 03/26/2013 thr...,90
45,REASON FOR STUDY: Seizures. CLINICAL HISTORY...,90
52,REASON FOR STUDY: Seizures. CLINICAL HISTORY...,54
652,CLINICAL HISTORY: 60 year old right handed fe...,53
1071,CLINICAL HISTORY: 60 year old right handed fe...,53
1070,CLINICAL HISTORY: 60 year old right handed fe...,53


Analysis show it does not really works

# II - Working with Levenshtein distance on each sentence of a  text

Empiric test have shown that precision can be higher if tested on sentences rather than full text. Let's try the efficiency!

In [49]:
# len > 5 to overcome the small words, which have naturally a high ratio
def partial_ratio_by_sentence(texte, target):
    max = 0
    for i in texte.split('.'):
        if fuzz.partial_ratio(i, target) > max:
            if len(i) > 5:
                max = fuzz.partial_ratio(i, target)
    return max

# For a target, output the related reports sorted by partial_ratio

def research_similarity_by_sentence(target):
    df = pd.DataFrame(classification_dataset['report'])
    df['partial_ratio'] = df['report'].apply(lambda x: partial_ratio_by_sentence(x, target))
    df = df.sort_values(by='partial_ratio', ascending=False)
    return df


In [321]:

# We will calculate the partial_ratio for each thesaury therme and update it in a result dataset
%time

classification_dataset_by_sentence = classification_dataset[['filepath', 'report']]

for i in tqdm(list(thesaurus['symptome-en'])):
    print(i)
    classification_dataset_by_sentence[i] = classification_dataset_by_sentence['report'].apply(lambda x: partial_ratio_by_sentence(x, i)) 

df_results_by_sentence = pd.DataFrame(data=classification_dataset_by_sentence.columns[2:], columns=['target'])
df_results_by_sentence['ratio'] = df_results_by_sentence['target'].apply(lambda x: classification_dataset_by_sentence[x].max())

# What can we predict at best?
df_results_by_sentence.sort_values(by='ratio', ascending=False)

0%|          | 0/46 [00:00<?, ?it/s]CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 8.82 µs
Benign familial neonatal epilepsy (BFNE)
  2%|▏         | 1/46 [00:42<31:54, 42.55s/it]Early myoclonic encephalopathy (EME)
  4%|▍         | 2/46 [01:13<28:32, 38.93s/it]Ohtahara syndrome
  7%|▋         | 3/46 [01:26<22:28, 31.36s/it]Myoclonic encephalopathy in nonprogressive disorders
  9%|▊         | 4/46 [02:23<27:16, 38.96s/it]Benign infantile epilepsy
 11%|█         | 5/46 [02:45<23:10, 33.91s/it]Epilepsy of infancy with migrating focal seizures
 13%|█▎        | 6/46 [03:35<25:46, 38.66s/it]Benign familial infantile epilepsy
 15%|█▌        | 7/46 [04:07<23:57, 36.87s/it]Myoclonic epilepsy in infancy (MEI)
 17%|█▋        | 8/46 [04:38<22:03, 34.84s/it]Dravet syndrome
 20%|█▉        | 9/46 [04:50<17:15, 27.99s/it]West syndrome
 22%|██▏       | 10/46 [05:00<13:37, 22.71s/it]Epilepsy with myoclonic atonic (previously astatic) seizures
 24%|██▍       | 11/46 [06:04<20:29, 35.14s/it]Late 

Unnamed: 0,target,ratio
33,temporal epilepsy,100
15,Lennox-Gastaut syndrome,100
18,Epilepsy with generalized tonicclonic seizure...,91
32,parietal epilepsy,91
30,multifocal epilepsy,89
31,occipital epilepsy,89
10,Epilepsy with myoclonic atonic (previously ast...,88
25,central epilepsy,88
22,Gelastic seizures with hypothalamic hamartoma,88
34,external temporal epilepsy,88


In [323]:
df_results_by_sentence.to_csv('df_results_by_sentence.csv')

# III - Using a simplified Thesaurus

In [50]:
# Loading simplified Thesaurus
thesaurus_simplified = pd.read_csv('data/thesaurus_key_words - simplified.csv', encoding="ISO-8859-1", sep=';')

In [51]:
len(thesaurus_simplified['symptome-en-simple'].unique())

36

In [52]:
thesaurus_simplified.head()

Unnamed: 0,classification_E,catégorie,symptome-fr,symptome-en,symptome-en-simple,Comments,CIM_10,CIM11,Orphanet
0,E1,Période néonatale,Encéphalopathie myoclonique précoce,Benign familial neonatal epilepsy (BFNE),BFNE,"Too much common words, keeping acronym",G40.8,8A61.0Y,1935.0
1,E2,Période néonatale,Epilepsie néonatale familiale bénigne (BFNE),Early myoclonic encephalopathy (EME),EME,"Too much common words, keeping acronym",G40.8,8A61.10,1949.0
2,E3,Période néonatale,Syndrome d'ohtahara,Ohtahara syndrome,Ohtahara,syndrome is too common,G40.8,8A62.Y,1934.0
3,E31,Nourrisons,Encépahlopathie myoclonique des affections non...,Myoclonic encephalopathy in nonprogressive dis...,Myoclonic encephalopathy,simplification,G40.4,8A62.Y,86913.0
4,E33,Nourrisons,Epilepsie benigne du nourisson,Benign infantile epilepsy,infantile,generalisation,G40.3,8A61.1Z,166302.0


In [53]:
classification_dataset.iloc[:,0:2]

Unnamed: 0,filepath,report
0,CR_Patients_info_patients-v0_4/edf/dev/01_tcp_...,Description: 2.5 to 5 hz spike/wave and polys...
1,CR_Patients_info_patients-v0_4/edf/dev/01_tcp_...,LENGTH OF THE RECORDING: 22 minutes and 53 s...
2,CR_Patients_info_patients-v0_4/edf/dev/01_tcp_...,"MEDICATIONS: Vimpat, Norvasc, Felbamate, Car..."
3,CR_Patients_info_patients-v0_4/edf/dev/01_tcp_...,CLINICAL HISTORY: 27 year old gentleman with...
4,CR_Patients_info_patients-v0_4/edf/dev/01_tcp_...,"MEDICATIONS: Vimpat, Norvasc, Felbamate, Car..."
...,...,...
1418,CR_Patients_info_patients-v0_4/edf/train/02_tc...,CLINICAL HISTORY: 49 year old female with alc...
1419,CR_Patients_info_patients-v0_4/edf/train/02_tc...,CLINICAL HISTORY: A25 year old male with a si...
1420,CR_Patients_info_patients-v0_4/edf/train/02_tc...,CLINICAL HISTORY: 69 year old woman with onse...
1421,CR_Patients_info_patients-v0_4/edf/train/02_tc...,CLINICAL HISTORY: 68 year old woman with righ...


In [54]:

# We will calculate the token_sort_ratio for each thesaury therme and update it in a result dataset

%time
classification_dataset_simple = classification_dataset.iloc[:,0:2]

for i in tqdm(list(thesaurus_simplified['symptome-en-simple'].unique())):
    classification_dataset_simple[i] = classification_dataset_simple['report'].apply(lambda x: partial_ratio_by_sentence(x, i)) 

df_results_simple = pd.DataFrame(data=classification_dataset_simple.columns[2:], columns=['target'])
df_results_simple['ratio'] = df_results_simple['target'].apply(lambda x: (classification_dataset_simple[x]).max())

# What can we predict at best?
df_results_simple = df_results_simple.sort_values(by='ratio', ascending=False)
df_results_simple.to_csv('df_results_simple.csv')
df_results_simple

0%|          | 0/36 [00:00<?, ?it/s]CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.48 µs
100%|██████████| 36/36 [04:50<00:00,  8.08s/it]


Unnamed: 0,target,ratio
18,tonic-clonic,100
11,Gastaut type,100
32,parietal,100
31,occipital,100
28,frontal,100
25,central,100
24,Rasmussen,100
1,EME,100
17,temporal lobe,100
15,Lennox-Gastaut,100


More elements are at 100! We now set the threshold score to keep the results. For a word with n caracters with e number of characters to change, the score is (n-e)/n.
For less than 6 characters, we take for principle that there should be no error (ex: acronyms)
For more than 6 characters, we can take 2 errors for reference
For two words search, we can take 4 errors for reference (2 by words)

In [55]:
# defining threshold

df_results_simple['threshold'] = df_results_simple['target'].apply(lambda x: 100 if len(x) <= 6 else 
((len(x)-1)*100/len(x) if x.find(" ") == -1 else
((len(x)-2)*100/len(x)))
)
df_results_simple['correspondance'] = df_results_simple['ratio'] >= df_results_simple['threshold']
df_results_simple = df_results_simple.sort_values(by=['correspondance', 'threshold'], ascending=False)
df_results_simple

Unnamed: 0,target,ratio,threshold,correspondance
1,EME,100,100.0,True
15,Lennox-Gastaut,100,92.857143,True
18,tonic-clonic,100,91.666667,True
30,mutlifocal,90,90.0,True
31,occipital,100,88.888889,True
24,Rasmussen,100,88.888889,True
35,temporal occipital,100,88.888889,True
4,infantile,100,88.888889,True
32,parietal,100,87.5,True
28,frontal,100,85.714286,True


In [56]:
# For each typology, inputing 1 if the local Levenshtein partial ratio is correct

classification_dataset_hashtag = classification_dataset_simple
for i in tqdm(classification_dataset_hashtag.columns[2:]):
    threshold = df_results_simple[df_results_simple['target'] == i]['threshold'].iloc[0]
    print(threshold)
    classification_dataset_hashtag[i] = classification_dataset_hashtag[i].apply(lambda x: 1 if (x >= df_results_simple[df_results_simple['target'] == i]['threshold'].iloc[0])==True else 0)

classification_dataset_hashtag.to_csv('classification_dataset_hashtag.csv')

0%|          | 0/36 [00:00<?, ?it/s]100.0
  3%|▎         | 1/36 [00:00<00:27,  1.29it/s]100.0
  6%|▌         | 2/36 [00:01<00:24,  1.38it/s]87.5
  8%|▊         | 3/36 [00:01<00:22,  1.45it/s]91.66666666666667
 11%|█         | 4/36 [00:02<00:21,  1.46it/s]88.88888888888889
 14%|█▍        | 5/36 [00:03<00:21,  1.44it/s]86.66666666666667
 17%|█▋        | 6/36 [00:04<00:21,  1.42it/s]100.0
 19%|█▉        | 7/36 [00:04<00:21,  1.34it/s]100.0
 22%|██▏       | 8/36 [00:06<00:23,  1.18it/s]100.0
 25%|██▌       | 9/36 [00:07<00:24,  1.09it/s]100.0
 28%|██▊       | 10/36 [00:08<00:24,  1.04it/s]87.5
 31%|███       | 11/36 [00:09<00:24,  1.02it/s]83.33333333333333
 33%|███▎      | 12/36 [00:10<00:22,  1.07it/s]100.0
 36%|███▌      | 13/36 [00:11<00:22,  1.04it/s]88.88888888888889
 39%|███▉      | 14/36 [00:11<00:19,  1.10it/s]93.33333333333333
 42%|████▏     | 15/36 [00:12<00:18,  1.13it/s]92.85714285714286
 44%|████▍     | 16/36 [00:13<00:17,  1.15it/s]93.33333333333333
 47%|████▋     | 17/36 [0

In [57]:
classification_dataset_hashtag.iloc[:,2:].sum().sort_values(ascending=False)

frontal                     558
central                     536
occipital                   252
parietal                    153
tonic-clonic                140
temporal lobe               134
Lennox-Gastaut               23
mutlifocal                   18
Rasmussen                     8
temporal occipital            8
EME                           5
Gastaut type                  4
infantile                     1
supplementary motor area      0
Temporoparietal junction      0
Ohtahara                      0
Myoclonic encephalopathy      0
migrating focal               0
Unkown                        0
MEI                           0
Dravet                        0
West                          0
myoclonic atonic              0
CAE                           0
myoclonic absences            0
Landau-Kleffner               0
external temporal             0
Panayiotopoulos               0
MTLE with HS                  0
insular                       0
jAE                           0
ADEAF   

Example of intersting case: we now value well Rasmussen reports

In [60]:
classification_dataset_hashtag.iloc[0,2:].sum()

0

In [72]:
classification_dataset_hashtag.iloc[i:i+1,2:].iloc[0].sum()

1.0

In [73]:
liste_sum = []
for i in range(classification_dataset_hashtag.shape[0]):
    liste_sum.append(classification_dataset_hashtag.iloc[i:i+1,2:].iloc[0].sum())

In [75]:
classification_dataset_hashtag['sum'] = liste_sum

In [77]:
classification_dataset_hashtag['sum'].value_counts()

1.0    464
2.0    399
0.0    377
3.0    157
4.0     24
5.0      1
6.0      1
Name: sum, dtype: int64

In [58]:
# Let's check of several examples 
target = 'Rasmussen'
for report in a[classification_dataset_hashtag[target]==1]['report']:
    display(report)

" CLINICAL HISTORY:  Rasmussen's encephalitis with breakthrough seizures. MEDICATIONS:  Keppra, IVIG, phenobarbital, Klonopin, others. INTRODUCTION:  Continuous video EEG monitoring is performed in the unit.  During a section of the record, the patient has approximately 40 simple partial seizures, all characterized by involuntary movements on the right.  Other seizures can occur out of sleep, but in this 24-hour section almost all the seizures seem to wake him up and are associated with right-sided shaking. The seizures have variable patterns, but all localize to the left hemisphere.  Some seem to start with a beta buzz in the left central region, others with more higher amplitude spike and wave activity.  The interictal activity includes a pattern with excess beta and theta from the right hemisphere.  The left hemisphere demonstrates __________ delta and the epileptiform activity interictally is more of a polyspike activity in the left posterior temporal region or central parietal reg

' CLINICAL HISTORY:  41 year old right handed male with Rasmussenâ\x80\x99s encephalitis with increasing seizures. MEDICATIONS:  Topiramate, Lacosamide, Phenobarbital, Klonopin, Lipitor, Pantoprazole, Lisinopril INTRODUCTION:  Digital video EEG was performed in lab using standard 10-20 system of electrode placement with channel of EKG.  Photic stimulation was performed. DESCRIPTION OF THE RECORD:  The background EEG is markedly abnormal.  As the record begins, the activity includes a prominent interhemispheric asymmetry.  It is medium amplitude, but slow, primarily theta on the right with some occasional posterior delta.  From there left there is clearly a breach with a high amplitude spike and slow-wave complex at T3 and T5.  It is also picked up at C3/P3. The first seizure occurs within 1 minute with a burst of 14 Hz activity emanating from the left frontal region with frequency evolution.  This is over 4 minutes and 35 seconds into the EEG.  Additional seizure occurs at 4 minutes an

' CLINICAL HISTORY:  Rasmussen encephalitis. MEDICATIONS:  Vimpat, Topamax, phenobarbital, IVIG, and Solu-Medrol. INTRODUCTION:  Continuous video EEG monitoring is performed for this individual.  He has many seizures typically characterized by right-sided shaking. DESCRIPTION OF THE RECORD:  The majority of the seizures occur on the evening of the 26th with multiple, repetitive focal seizures.  Aside from this, he demonstrates stage 2 sleep with vertex waves, K complexes and spindles.  By the later sections of the record on the 27th, the patient has more significant sections where he is awake, doing well and then drifting off to sleep. This piece of EEG concludes at 3:24 on the 27th. IMPRESSION:'

" CLINICAL HISTORY:  Rasmussen's encephalitis. MEDICATIONS:  Topamax, IVIG, Glucosamine, phenobarbital. INTRODUCTION:  Digital video EEG with long term EEG monitoring is performed in the long term monitoring unit using standard 10-20 system of electrode placement with 1 channel EKG.  The patient has a tender scalp and the tech sometimes had to modify the electrode placement. DESCRIPTION OF THE RECORD:  The interictal EEG continues to demonstrate focal slowing from the left hemisphere with left posterior temporal sharp waves.  Multiple seizures are identified in the 24 hour section, including in wakefulness and sleep.  The patient does not seem to wake up for all of them.  Stage II sleep, including the 2:00 a.m. to 3:00 a.m. section are prominent.  The nurses were aware of the seizures in sleep.  These seizures seem to be beginning with a burst of fast activity, almost some 10 to 5 Hz which is picked up very close to the midline.  The activity is really very prominent at CZ where it is 

' CLINICAL HISTORY:  41 year old right handed male with Rasmussenâ\x80\x99s encephalitis with increasing seizures. MEDICATIONS:  Topiramate, Lacosamide, Phenobarbital, Klonopin, Lipitor, Pantoprazole, Lisinopril INTRODUCTION:  Digital video EEG was performed in lab using standard 10-20 system of electrode placement with channel of EKG.  Photic stimulation was performed. DESCRIPTION OF THE RECORD:  The background EEG is markedly abnormal.  As the record begins, the activity includes a prominent interhemispheric asymmetry.  It is medium amplitude, but slow, primarily theta on the right with some occasional posterior delta.  From there left there is clearly a breach with a high amplitude spike and slow-wave complex at T3 and T5.  It is also picked up at C3/P3. The first seizure occurs within 1 minute with a burst of 14 Hz activity emanating from the left frontal region with frequency evolution.  This is over 4 minutes and 35 seconds into the EEG.  Additional seizure occurs at 4 minutes an

' DATES OF STUDY:  February 23-24, 2012. CLINICAL HISTORY:  Rasmussen encephalitis with increase in seizures. MEDICATIONS:  Vimpat, Topamax, phenobarbital, IVIG, others. INTRODUCTION:  Continuous video EEG monitoring is performed in the unit using standard 10-20 system of electrode placement with one channel of EKG.  This is an awake and asleep record. DESCRIPTION OF THE RECORD:  Random wakefulness and sleep, in wakefulness, the background EEG is somewhat slow from the right hemisphere.  The left hemisphere demonstrates arrhythmic delta activity with a high amplitude left posterior temporal spike complex. Clinical seizures are noted reliably with the patient and nurse and there are more than 20 pushbutton events, approximately 23, all 30-60 seconds in duration.  They are characterized by focal motor activity on the right hemibody.  Electrocardiographically, there is a buzz of mixed 5 and 10 Hz activity in the left hemisphere including the central regions.   There are a handful of seizu

' CLINICAL HISTORY:  A 42-year-old gentleman with Rasmussen encephalitis and increasing right-sided weakness as well as 2 tonic-clonic seizures and simple partial seizures. MEDICATIONS:  Vimpat Topamax, phenobarbital, IVIG, and others. INTRODUCTION:  Digital video EEG was performed in the lab using standard 10-20 system of electrode placement with 1-channel EKG.  Hyperventilation was not possible but photic stimulation was completed.  This was an awake and drowsy record. The patient had brief seizures with R jerks just prior to initiation of EEG and had a clinical seizure with eyes closed,  looking left,  and slowed responsiveness DESCRIPTION OF THE RECORD:  In wakefulness, the background EEG demonstrates a marked asymmetry between the 2 hemispheres.  The right hemisphere demonstrates modest background slowing with excess theta.  The left hemisphere demonstrates significant disruption of faster frequency activity.  Frequent sharp waves or spike is noted, high amplitude in the left hemi

" CLINICAL HISTORY:  A 42-year-old male with Rasmussen's encephalitis, status post left craniotomy with recent focal motor seizure followed by right-sided weakness and then epilepsy partialis continua. MEDICATIONS:  Decadron, phenobarbital, lacosamide, Zocor, others. INTRODUCTION:  Digital video EEG is performed in the lab/bedside using standard 10-20 system of electrode placement with one channel of EKG.  Photic stimulation was completed.  The patient was not experiencing involuntary movements during the EEG.  So this is a technically satisfactory EEG with acceptable impedances, but the craniotomy defect was noted. DESCRIPTION OF THE RECORD:  The background EEG is abnormal and demonstrates an asymmetry.  The right hemisphere is moderately slow with primarily a theta frequency background noted in wakefulness.  The left hemisphere demonstrates more significant arrhythmic delta activity particularly in the left posterior quadrant.  A high amplitude epileptiform discharge, high amplitude 

In [21]:
# Let's check of several examples 
target = 'Myoclonic encephalopathy'
for report in classification_dataset_hashtag[classification_dataset_hashtag[target]==1]['report']:
    display(report)
# not working

' CLINICAL HISTORY:  An 82-year-old woman with recent cranial surgery and refractory statue epilepticus. Severe metabolic encephalopathy. MEDICATIONS: \tDilantin, Depakote, phenobarbital, insulin, digoxin, Lopressor. INTRODUCTION:  Continuous video EEG is performed at bedside in ICU using standard 10-20 system of electrode placement with 1 channel EKG. During the record, including on the 26th, the Phenobarbital was turned off. DESCRIPTION OF THE RECORD: In the initial data on the 24th there are intermittent seizures occurring in the contexts of burst suppression. There are prolonged periods of suppression with brief seizures. However, by 16:28 the EEG is completely suppressed. This section of the record was previously dictated. Additional recording after 16:37 for the following subsequent days includes the following: October 24 to October 25: A suppressed EEG. The patient can be visualized in the unit. Technologist and nurses work with her. The background EEG remains markedly suppresse

Next steps:
- better thesaurus
- better threshold
- extracting the "candidates", store them in dictionary for right "mistakes"

Seuil à 100%: quelle est la part taguée
Examens non tagués: quelle proportion?
taguer les médicaments?

In [92]:
print(classification_dataset_hashtag[classification_dataset_hashtag['sum'] == 6])

filepath  \
220  CR_Patients_info_patients-v0_4/edf/dev/02_tcp_...   

                                                report  BFNE  EME  Ohtahara  \
220   CLINICAL HISTORY: This is a 35-year-old woman...     0    0         0   

     Myoclonic encephalopathy  infantile  migrating focal  Unkown  MEI  ...  \
220                         0          0                0       0    0  ...   

     Temporoparietal junction  frontal  insular  mutlifocal  occipital  \
220                         0        1        0           1          1   

     parietal  external temporal  MTLE with HS  temporal occipital  sum  
220         0                  0             0                   0  6.0  

[1 rows x 39 columns]


AttributeError: 'NoneType' object has no attribute 'values'

In [89]:
# top 5 with 0 info
for i in range(5):
    print(classification_dataset_hashtag[classification_dataset_hashtag['sum'] == 0]['report'].iloc[i])
    print('')

Description: 2.5 to 5 hz spike/wave and polyspike wave Genâl slow Abnormal

 MEDICATIONS:  Vimpat, Norvasc, Felbamate, Carnitor, clonidine, Celexa, Lopressor, Topiramate HISTORY:  A 28-year-old male with refractory epilepsy, previously on Seroquel, now with weight loss, failure to thrive, refusing to eat, ambulate or cooperate. INTRODUCTION:  A digital video EEG was performed at the bedside using standard 10-20 system of electrode placement with one channel in EKG.  The patient was poorly cooperative throughout the record.  Occasional muscle twitch artifact was noted. DESCRIPTION OF RECORD:  The background EEG was diffusely slow with a lower voltage beta and theta pattern.  The normal milestones of an awake adult EEG were not identified.

 MEDICATIONS:  Vimpat, Norvasc, Felbamate, Carnitor, clonidine, Celexa, Lopressor, Topiramate HISTORY:  A 28-year-old male with refractory epilepsy, previously on Seroquel, now with weight loss, failure to thrive, refusing to eat, ambulate or cooper

Sortir des rapports "pauvres"

# IV - Rake experiment

In [18]:
from rake_nltk import Rake

r = Rake() # Uses stopwords for english from NLTK, and all puntuation characters.

r.extract_keywords_from_text(research_similarity('temporal epilepsy')['report'][191])

r.get_ranked_phrases() # To get keyword phrases ranked highest to lowest.

['isolated high amplitude right hemispheric spike',
 '40 year old right handed male',
 'characteristic tonic clonic activity obscures',
 'push button times include 5',
 'continuous digital video eeg monitoring',
 'high amplitude sharp waves',
 'particularly parietal occipital region',
 'rhythmic mixed theta delta',
 'bedside using standard 10',
 'prominent sharp waves',
 'additional sharp waves',
 'previous eeg monitoring',
 'occasional myoclonic jerks',
 'localization related mechanism',
 'little bit confused',
 'blood pressure cuff',
 'eeg monitor ends',
 'third seizure type',
 'hour period includes',
 'focal motor seizure',
 'rhythmic repetitive slowing',
 'complex partial seizures',
 'seizure time 13',
 'eeg monitoring',
 'right leg',
 'wave activity',
 'rhythmic slowing',
 'rhythmic discharge',
 'entire eeg',
 'includes actually',
 'recurrent seizures',
 'records begins',
 'movement artifact',
 'left hemisphere',
 'left hand',
 'klonopin introduction',
 'feeling great',
 'electrod