In [54]:
!pip install nltk

[33mYou are using pip version 9.0.1, however version 9.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [70]:
import numpy as np 
import pandas as pd
import os
import re
import enchant

from tqdm import tqdm
from keras.preprocessing import sequence, text
from nltk import distance

In [71]:
NB_CATEGORIES = 51
PADDING = 150
MAX_NB_WORDS = 1e6

In [72]:
dataFolder = 'challenge_data'
medicament_list_path = 'challenge_data/medicList.txt' 
xPath = os.path.join(dataFolder, 'input_train.csv')
yPath = os.path.join(dataFolder, 'challenge_output_data_training_file_predict_the_expected_answer.csv')

# Loading, parsing and spliting training and testing data
x = pd.read_csv(xPath, delimiter=';', usecols=[1]).values.ravel()
y = pd.read_csv(yPath, delimiter=';', usecols=[1]).values.ravel()

In [73]:
# keras tokenizer gives all informations about our vocabulary
myTokenizer = text.Tokenizer(
            num_words= MAX_NB_WORDS,
            filters="!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'´’™©®«»",
            split=" "
        )
myTokenizer.fit_on_texts(x)
sequences = myTokenizer.texts_to_sequences(x)
sequences = sequence.pad_sequences(sequences, PADDING)

x_vocab  = list(myTokenizer.word_index.keys())
print('Size of the vocab', len(x_vocab))

Size of the vocab 9983


In [75]:
d = enchant.Dict('fr_FR')

## Try to include knowledge about the medicaments

In [76]:
MEDICAMENTS = []

with open(os.path.join(dataFolder, 'medicList.txt')) as f:
    for line in f:
        MEDICAMENTS.append(line.lower().rstrip())

[d.add(w) for w in MEDICAMENTS] 
print('Liste de médicaments regroupant les libéllés ATC et les'
      'dénominations de spécialité, de taille: {}'.format(len(MEDICAMENTS)))
print('Sample of medicament names: ', MEDICAMENTS[:10])

Liste de médicaments regroupant les libéllés ATC et lesdénominations de spécialité, de taille: 8390
Sample of medicament names:  ['a 313 200  pour cent', 'a 313 50 000 u.i', 'abacavir', 'abacavir/lamivudine', 'abacavir/lamivudine pharma', 'abacavir/lamivudine pharos', 'abamipharm', 'abboticine', 'abelcet', 'abstral']


In [39]:
x_oov = [w for w in x_vocab if d.check(w) == False]

In [8]:
print('nbre de mots erronés', len(x_oov), " soit une proportion de  ", len(x_oov)/len(x_vocab), "mots absents de pyenchant dictionnary.")
print('sample of absent words:', x_oov[:10])

nbre de mots erronés 3814  soit une proportion de   0.3529520636683324 mots absents de pyenchant dictionnary.
sample of absent words: ['aurrais', 'coryzalia', 'acupan', 'havlane', 'beta', '22juillet', 'bromokin', 'climaxol', 'disgestions', 'hepatite']


In [88]:
x_corrected = []
verbose = False

# use enchant to correct spelling errors (include medicaments that we inserted at the initialization of spell_dict)
common_mistakes = ['ème', 'eme', 'éme', 'ans', 'an', 'années', 'année', 'h', 'heure', 'mg', 'g', 'jrs', 'j', 'jours', 'min', 'mn', 'jour', 'kg', 'cm']
nb_re = re.compile('(\d+)(\S+)')
for w in tqdm(x_oov):
    nb = nb_re.search(w)
    if nb!= None:
        if nb.group(2) in common_mistakes:
            if verbose:
                print('catch numerical spelling error' + nb.group())
            w = w.replace(nb.group(1), nb.group(1) + ' ')
    # disgard other numerical expression that we don't want to clean
    if re.search('\d+', w) == None:
        corrected_w = d.suggest(w)[0]
        if distance.edit_distance(w, corrected_w) <= 2:
            if verbose:
                print("corrected " + w + ' in ' + corrected_w)
            w = corrected_w
    x_corrected.append(w)
    
correct_dict = dict(zip(x_oov, x_corrected))

100%|██████████| 3319/3319 [10:16<00:00,  5.38it/s]


In [89]:
correct_dict

{'had': 'ha',
 'repondrai': 'répondrai',
 'raynaud': 'raynaud',
 'démangaisons': 'démangeaisons',
 'vaginose': 'vagin ose',
 '7ième': '7ième',
 'davoir': 'd’avoir',
 'spm': 'spam',
 'minkiété': 'minkiété',
 'antihypertensuers': 'antihypertenseurs',
 'valériana': 'valériane',
 'jasminielle': 'jasminielle',
 'vegan': 'végan',
 'disgestions': 'digestions',
 'voltaréne': 'voltaréne',
 'pevary': 'pevary',
 '16ème': '16 ème',
 'cr': 'ce',
 '16ans': '16 ans',
 '14h30': '14h30',
 'etes': 'étés',
 'lab': 'bal',
 'jui': 'juin',
 'deja': 'déjà',
 'proctolog': 'proctologie',
 'contracetif': 'contraceptif',
 'high': 'high',
 '2ibuprofen': '2ibuprofen',
 'n°5': 'n°5',
 'symptmes': 'symptômes',
 'démengeaison': 'démangeaison',
 'bétaserc': 'bétaserc',
 'exiiste': 'existe',
 'turbuhaler': 'turbuhaler',
 'co': 'quo',
 'enervee': 'énervée',
 'ths': 'tus',
 'bronchokod': 'bronchokod',
 '30min': '30 min',
 'embete': 'embête',
 'cyt': 'cit',
 'etude': 'étude',
 'folliculum': 'follicule',
 'bjr': 'ber',
 'a

In [90]:
import csv
w = csv.writer(open('../posos-data-challenge/challenge_data/corrections.csv', "w"))
for key, val in correct_dict.items():
    w.writerow([key, val])