# C. Hashtag extraction

## Intro - Importing libraries and datasets

In [38]:
# import of libraries
import pandas as pd
from fuzzywuzzy import fuzz
from tqdm import tqdm
import re

In [2]:
# Loading simplified Thesaurus
thesaurus_simplified = pd.read_csv('1. Data/thesaurus_key_words - simplified.csv', encoding="ISO-8859-1", sep=';')
thesaurus_simplified.head()

Unnamed: 0,classification_E,catégorie,symptome-fr,symptome-en,symptome-en-simple,type of crisis,Comments,CIM_10,CIM11,Orphanet
0,E1,Période néonatale,Encéphalopathie myoclonique précoce,Benign familial neonatal epilepsy (BFNE),BFNE,,"Too much common words, keeping acronym",G40.8,8A61.0Y,1935.0
1,E2,Période néonatale,Epilepsie néonatale familiale bénigne (BFNE),Early myoclonic encephalopathy (EME),EME,,"Too much common words, keeping acronym",G40.8,8A61.10,1949.0
2,E3,Période néonatale,Syndrome d'ohtahara,Ohtahara syndrome,Ohtahara,,syndrome is too common,G40.8,8A62.Y,1934.0
3,E31,Nourrisons,Encépahlopathie myoclonique des affections non...,Myoclonic encephalopathy in nonprogressive dis...,Myoclonic encephalopathy,,simplification,G40.4,8A62.Y,86913.0
4,E33,Nourrisons,Epilepsie benigne du nourisson,Benign infantile epilepsy,infantile,,generalisation,G40.3,8A61.1Z,166302.0


In [3]:
# import of classification dataset
classification_dataset = pd.read_csv('2. Results/Classification_dataset.csv')
classification_dataset.head()

Unnamed: 0,filepath,report
0,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,Description: 2.5 to 5 hz spike/wave and polys...
1,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,LENGTH OF THE RECORDING: 22 minutes and 53 s...
2,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,"MEDICATIONS: Vimpat, Norvasc, Felbamate, Car..."
3,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,CLINICAL HISTORY: 27 year old gentleman with...
4,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,"MEDICATIONS: Vimpat, Norvasc, Felbamate, Car..."


## A - Extracting the simplified hashtags

In [5]:
# len > 5 to overcome the small words, which have naturally a high ratio
def partial_ratio_by_sentence(texte, target):
    max = 0
    for i in texte.split('.'):
        if fuzz.partial_ratio(i, target) > max:
            if len(i) > 5:
                max = fuzz.partial_ratio(i, target)
    return max

# For a target, output the related reports sorted by partial_ratio

def research_similarity_by_sentence(target):
    df = pd.DataFrame(classification_dataset['report'])
    df['partial_ratio'] = df['report'].apply(lambda x: partial_ratio_by_sentence(x, target))
    df = df.sort_values(by='partial_ratio', ascending=False)
    return df


In [8]:

# We will calculate the token_sort_ratio for each thesaury therme and update it in a result dataset

%time

classification_dataset_hashtag = classification_dataset

for i in tqdm(list(thesaurus_simplified['symptome-en-simple'].unique())):
    classification_dataset_hashtag[i] = classification_dataset_hashtag['report'].apply(lambda x: partial_ratio_by_sentence(x, i)) 

classification_dataset_hashtag.to_csv('2. Results/classification_dataset_hashtag.csv')

df_results_hashtag = pd.DataFrame(data=classification_dataset_hashtag.columns[2:], columns=['target'])
df_results_hashtag['ratio'] = df_results_hashtag['target'].apply(lambda x: (classification_dataset_hashtag[x]).max())

# What can we predict at best?
df_results_hashtag = df_results_hashtag.sort_values(by='ratio', ascending=False)
df_results_hashtag

0%|          | 0/38 [00:00<?, ?it/s]CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.44 µs
100%|██████████| 38/38 [09:22<00:00, 14.79s/it]


Unnamed: 0,target,ratio
37,temporal occipital,100
11,Gastaut type,100
28,frontal,100
25,central,100
24,Rasmussen,100
31,multifocal,100
32,occipital,100
1,EME,100
18,tonic-clonic,100
17,temporal lobe,100


In [9]:
# defining threshold

df_results_hashtag['threshold'] = df_results_hashtag['target'].apply(lambda x: 100 if len(x) <= 6 else 
((len(x)-1)*100/len(x) if x.find(" ") == -1 else
((len(x)-2)*100/len(x)))
)
df_results_hashtag['correspondance'] = df_results_hashtag['ratio'] >= df_results_hashtag['threshold']

# Sorting values
df_results_hashtag = df_results_hashtag.sort_values(by=['correspondance', 'threshold'], ascending=False)
df_results_hashtag.to_csv('2. Results/df_results_hashtag.csv', index=False)
df_results_hashtag

Unnamed: 0,target,ratio,threshold,correspondance
1,EME,100,100.0,True
15,Lennox-Gastaut,100,92.857143,True
18,tonic-clonic,100,91.666667,True
31,multifocal,100,90.0,True
37,temporal occipital,100,88.888889,True
24,Rasmussen,100,88.888889,True
32,occipital,100,88.888889,True
4,infantile,100,88.888889,True
33,parietal,100,87.5,True
36,temporal,100,87.5,True


In [10]:
# For each typology, inputing 1 if the local Levenshtein partial ratio is correct

for i in tqdm(classification_dataset_hashtag.columns[2:]):
    threshold = df_results_hashtag[df_results_hashtag['target'] == i]['threshold'].iloc[0]
    print(threshold)
    classification_dataset_hashtag[i] = classification_dataset_hashtag[i].apply(lambda x: 1 if (x >= df_results_hashtag[df_results_hashtag['target'] == i]['threshold'].iloc[0])==True else 0)
classification_dataset_hashtag.to_csv('2. Results/classification_dataset_hashtag.csv', index=False)
classification_dataset_hashtag

0%|          | 0/38 [00:00<?, ?it/s]100.0
  3%|▎         | 1/38 [00:01<00:56,  1.54s/it]100.0
  5%|▌         | 2/38 [00:03<00:55,  1.55s/it]87.5
  8%|▊         | 3/38 [00:04<00:50,  1.46s/it]91.66666666666667
 11%|█         | 4/38 [00:05<00:47,  1.38s/it]88.88888888888889
 13%|█▎        | 5/38 [00:06<00:43,  1.31s/it]86.66666666666667
 16%|█▌        | 6/38 [00:07<00:40,  1.26s/it]100.0
 18%|█▊        | 7/38 [00:09<00:38,  1.25s/it]100.0
 21%|██        | 8/38 [00:10<00:36,  1.23s/it]100.0
 24%|██▎       | 9/38 [00:11<00:34,  1.20s/it]100.0
 26%|██▋       | 10/38 [00:12<00:33,  1.19s/it]87.5
 29%|██▉       | 11/38 [00:13<00:31,  1.17s/it]83.33333333333333
 32%|███▏      | 12/38 [00:14<00:30,  1.16s/it]100.0
 34%|███▍      | 13/38 [00:15<00:28,  1.15s/it]88.88888888888889
 37%|███▋      | 14/38 [00:17<00:28,  1.17s/it]93.33333333333333
 39%|███▉      | 15/38 [00:18<00:27,  1.21s/it]92.85714285714286
 42%|████▏     | 16/38 [00:19<00:28,  1.28s/it]93.33333333333333
 45%|████▍     | 17/38 [0

Unnamed: 0,filepath,report,BFNE,EME,Ohtahara,Myoclonic encephalopathy,infantile,migrating focal,Unkown,MEI,...,frontal,fontral temporal,insular,multifocal,occipital,parietal,external temporal,MTLE with HS,temporal,temporal occipital
0,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,Description: 2.5 to 5 hz spike/wave and polys...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,LENGTH OF THE RECORDING: 22 minutes and 53 s...,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,"MEDICATIONS: Vimpat, Norvasc, Felbamate, Car...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,CLINICAL HISTORY: 27 year old gentleman with...,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,"MEDICATIONS: Vimpat, Norvasc, Felbamate, Car...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1418,1. Data/CR_Patients_info_patients-v0_4/edf/tra...,CLINICAL HISTORY: 49 year old female with alc...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1419,1. Data/CR_Patients_info_patients-v0_4/edf/tra...,CLINICAL HISTORY: A25 year old male with a si...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1420,1. Data/CR_Patients_info_patients-v0_4/edf/tra...,CLINICAL HISTORY: 69 year old woman with onse...,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
1421,1. Data/CR_Patients_info_patients-v0_4/edf/tra...,CLINICAL HISTORY: 68 year old woman with righ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## B - Making file with hashtags the simplified hashtags

In [46]:
classification_dataset_hashtag = pd.read_csv('2. Results/classification_dataset_hashtag.csv')
classification_dataset_hashtag.head()

Unnamed: 0,filepath,report,BFNE,EME,Ohtahara,Myoclonic encephalopathy,infantile,migrating focal,Unkown,MEI,...,frontal,fontral temporal,insular,multifocal,occipital,parietal,external temporal,MTLE with HS,temporal,temporal occipital
0,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,Description: 2.5 to 5 hz spike/wave and polys...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,LENGTH OF THE RECORDING: 22 minutes and 53 s...,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,"MEDICATIONS: Vimpat, Norvasc, Felbamate, Car...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,CLINICAL HISTORY: 27 year old gentleman with...,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,"MEDICATIONS: Vimpat, Norvasc, Felbamate, Car...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
# Inputing keywords

classification_dataset_hashtag['hashtag'] = ''
classification_dataset_hashtag['temp'] = ''

for i in tqdm(classification_dataset_hashtag.columns[2:]):
    classification_dataset_hashtag ['temp'] = classification_dataset_hashtag[i]
    classification_dataset_hashtag['temp'] = classification_dataset_hashtag['temp'].apply(lambda x: (i + ", ") if x == 1 else "")
    classification_dataset_hashtag['hashtag'] = classification_dataset_hashtag['hashtag'] + classification_dataset_hashtag['temp']

classification_dataset_hashtag['hashtag'] = classification_dataset_hashtag['hashtag'].apply(lambda x: x[:-2] if len(x) >2 else x)
classification_dataset_hashtag_exported = classification_dataset_hashtag[['filepath', 'report', 'hashtag']]
classification_dataset_hashtag_exported.to_csv('2. Results/classification_dataset_hashtag_exported.csv', index=False)
classification_dataset_hashtag_exported ['hashtag'].unique()

100%|██████████| 40/40 [00:00<00:00, 514.43it/s]


array(['', 'central, frontal, temporal', 'frontal',
       'tonic-clonic, temporal', 'temporal lobe, central, temporal',
       'central, temporal', 'temporal lobe, central, frontal, temporal',
       'temporal lobe, temporal', 'frontal, temporal', 'temporal',
       'frontal, fontral temporal, temporal',
       'temporal lobe, frontal, temporal',
       'tonic-clonic, frontal, temporal',
       'central, frontal, parietal, temporal',
       'Lennox-Gastaut, central, frontal, multifocal, temporal',
       'Lennox-Gastaut, occipital', 'central, multifocal, temporal',
       'occipital, temporal', 'central', 'frontal, occipital',
       'central, frontal', 'Rasmussen, central, parietal, temporal',
       'central, fontral temporal, occipital, parietal, temporal',
       'parietal, temporal',
       'temporal lobe, Rasmussen, frontal, temporal', 'Rasmussen',
       'Rasmussen, temporal', 'Rasmussen, central, temporal',
       'tonic-clonic, Rasmussen, parietal, temporal',
       'central,

## C. Extracting keywords and position, and type of crisis

In [48]:
def surline_keyword_info_extractor(text, target, threshold):
    max = 0
    keyword_list = []
    position_list = []
    for i in text.split('.'):
        if fuzz.partial_ratio(i, target) >= threshold:
            for j in re.split(';|,|:| |:',i):
                # CONFIRM PARTIAL RATIO
                if fuzz.partial_ratio(j, target) >= threshold and len(j)>4:
                    keyword_list.append(j)
                    position_list.append(text.find(j,text.find(i)))
    
    return keyword_list, position_list


def crisis_type_correspondance(symptomes, thesaurus_simplified):
    try:
        symptome_list = symptomes.split(', ')
        crisis_type_list = []

        for symptome in symptome_list:
            crisis_type = thesaurus_simplified[thesaurus_simplified['symptome-en-simple'] == symptome]['type of crisis'].iloc[0]
            if crisis_type not in crisis_type_list:
                crisis_type_list.append(crisis_type)
        # return string 
        if len(crisis_type_list) == 2:
            crisis_types = crisis_type_list[0] + ', '+ crisis_type_list[1]
        if len(crisis_type_list) == 1:
            crisis_types = crisis_type_list[0]

        return crisis_types

    except:
        return ''

In [49]:
# Addint crisis type
classification_dataset_hashtag_exported['crisis_type'] = classification_dataset_hashtag_exported['hashtag'].apply(lambda x: crisis_type_correspondance(x, thesaurus_simplified))
classification_dataset_hashtag_exported.head()

Unnamed: 0,filepath,report,hashtag,crisis_type
0,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,Description: 2.5 to 5 hz spike/wave and polys...,,
1,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,LENGTH OF THE RECORDING: 22 minutes and 53 s...,"central, frontal, temporal",focal
2,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,"MEDICATIONS: Vimpat, Norvasc, Felbamate, Car...",,
3,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,CLINICAL HISTORY: 27 year old gentleman with...,frontal,focal
4,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,"MEDICATIONS: Vimpat, Norvasc, Felbamate, Car...",,


In [50]:
classification_dataset_hashtag_exported['surline_data'] = ''

for i in tqdm(range(classification_dataset_hashtag_exported.shape[0])):
    list = []
    targets = classification_dataset_hashtag_exported.iloc[i].hashtag
    text = classification_dataset_hashtag_exported.iloc[i].report

    try:
        for target in targets.split(', '):
            # modify for threshold
            threshold = df_results_hashtag[df_results_hashtag['target'] == target]['threshold'].iloc[0]
            list.append(surline_keyword_info_extractor(text, target, threshold))
        classification_dataset_hashtag_exported['surline_data'].iloc[i] = list
        
    except:
        pass

classification_dataset_hashtag_exported.to_csv('2. Results/classification_dataset_hashtag_exported.csv', index=False)

100%|██████████| 1423/1423 [00:29<00:00, 48.66it/s]


In [52]:
classification_dataset_hashtag_exported

Unnamed: 0,filepath,report,hashtag,crisis_type,surline_data
0,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,Description: 2.5 to 5 hz spike/wave and polys...,,,
1,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,LENGTH OF THE RECORDING: 22 minutes and 53 s...,"central, frontal, temporal",focal,"[([central, frontocentral, frontocentral], [87..."
2,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,"MEDICATIONS: Vimpat, Norvasc, Felbamate, Car...",,,
3,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,CLINICAL HISTORY: 27 year old gentleman with...,frontal,focal,"[([frontally, frontal], [959, 1760])]"
4,1. Data/CR_Patients_info_patients-v0_4/edf/dev...,"MEDICATIONS: Vimpat, Norvasc, Felbamate, Car...",,,
...,...,...,...,...,...
1418,1. Data/CR_Patients_info_patients-v0_4/edf/tra...,CLINICAL HISTORY: 49 year old female with alc...,"parietal, temporal",focal,"[([parietal], [929]), ([temporal], [938])]"
1419,1. Data/CR_Patients_info_patients-v0_4/edf/tra...,CLINICAL HISTORY: A25 year old male with a si...,"tonic-clonic, temporal","Generalized, focal","[([Tonic-clonic], [225]), ([temporal, temporal..."
1420,1. Data/CR_Patients_info_patients-v0_4/edf/tra...,CLINICAL HISTORY: 69 year old woman with onse...,"central, frontal, occipital",focal,"[([central, central], [565, 824]), ([frontal, ..."
1421,1. Data/CR_Patients_info_patients-v0_4/edf/tra...,CLINICAL HISTORY: 68 year old woman with righ...,,,
