In [1]:
import numpy as np 
import pandas as pd
import os
import re
import enchant

from tqdm import tqdm
from keras.preprocessing import sequence, text
from nltk import distance

Using TensorFlow backend.


In [2]:
# Global settings for the text-processing pipeline.
# Presumably the number of target classes; not used in the visible cells -- TODO confirm.
NB_CATEGORIES = 51
# Passed to `sequence.pad_sequences` as the sequence length.
PADDING = 150
# Vocabulary cap handed to the Keras tokenizer as `num_words`. Must be an int:
# the previous float literal (1e6) breaks tokenizer code paths that use
# `num_words` as an array dimension.
MAX_NB_WORDS = 1000000

In [3]:
# Locations of the challenge files, all relative to the data folder.
dataFolder = 'challenge_data/'
medicament_list_path = 'challenge_data/medicList.txt'
xPath, yPath, x_testPath = (
    os.path.join(dataFolder, file_name)
    for file_name in ('input_train.csv', 'output_train.csv', 'input_test.csv')
)

# Each file is ';'-separated; only the column at index 1 is read and
# flattened into a 1-D array.
x_train, x_test, y = (
    pd.read_csv(csv_path, delimiter=';', usecols=[1]).values.ravel()
    for csv_path in (xPath, x_testPath, yPath)
)

# Train and test questions stacked together (train first) so that later
# steps see every question.
x = np.concatenate([x_train, x_test])

In [4]:
# Fit a Keras tokenizer on every question; its word index describes the
# vocabulary of the whole corpus.
myTokenizer = text.Tokenizer(num_words=MAX_NB_WORDS,
                             filters="!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'´’™©®«»",
                             split=" ")
myTokenizer.fit_on_texts(x)

# Encode each question as a list of word indices, then pad to PADDING entries.
sequences = sequence.pad_sequences(myTokenizer.texts_to_sequences(x), PADDING)

# Iterating the word index yields its keys, i.e. every distinct word seen.
x_vocab = list(myTokenizer.word_index)
print('Size of the vocab', len(x_vocab))

Size of the vocab 11024


In [5]:
# French (fr_FR) pyenchant dictionary; the following cells use it to detect
# unknown vocabulary words (`d.check`) and to propose corrections (`d.suggest`).
d = enchant.Dict('fr_FR')

## Add the medication names to the spell-checking dictionary

In [6]:
# Load the medication names (one per line, lower-cased, trailing whitespace
# stripped) and register each of them with the spell checker.
with open(os.path.join(dataFolder, 'medicList.txt')) as f:
    MEDICAMENTS = [line.lower().rstrip() for line in f]

# A plain loop: the calls are made for their side effect only, so there is
# no reason to build a throw-away list of return values.
# NOTE(review): pyenchant's `add` targets the personal word list; if the
# additions should not outlive this session, `add_to_session` may be the
# better call -- confirm.
for medicament in MEDICAMENTS:
    d.add(medicament)

# The trailing space after 'les' matters: adjacent string literals are
# concatenated verbatim, and the message used to read "lesdénominations".
print('Liste de médicaments regroupant les libéllés ATC et les '
      'dénominations de spécialité, de taille: {}'.format(len(MEDICAMENTS)))
print('Sample of medicament names: ', MEDICAMENTS[:10])

Liste de médicaments regroupant les libéllés ATC et lesdénominations de spécialité, de taille: 8390
Sample of medicament names:  ['a 313 200  pour cent', 'a 313 50 000 u.i', 'abacavir', 'abacavir/lamivudine', 'abacavir/lamivudine pharma', 'abacavir/lamivudine pharos', 'abamipharm', 'abboticine', 'abelcet', 'abstral']


In [7]:
# Out-of-vocabulary words: every vocabulary entry the spell checker rejects.
# `d.check` returns a boolean, so it is tested directly rather than compared
# against False.
x_oov = [word for word in x_vocab if not d.check(word)]

In [8]:
# Report how many vocabulary words the spell checker rejected, the share of
# the vocabulary they represent, and the first few of them.
print(
    'nbre de mots erronés',
    len(x_oov),
    " soit une proportion de  ",
    len(x_oov)/len(x_vocab),
    "mots absents de pyenchant dictionnary.",
)
print('sample of absent words:', x_oov[:10])

nbre de mots erronés 3815  soit une proportion de   0.3460631349782293 mots absents de pyenchant dictionnary.
sample of absent words: ['severe', 'eruption', '30jours', 'perlodel', 'occidentalis', 'valproate', 'dégout', 'ashme', 'neurosimulateur', 'rehercher']


In [9]:
# Build one correction per out-of-vocabulary word:
#   1. tokens made of a number glued to a known unit/suffix (e.g. '30jours')
#      get a space inserted after the number;
#   2. words without any digit are replaced by enchant's first suggestion,
#      provided it is within an edit distance of 2;
#   3. everything else is kept unchanged.
# The result is `correct_dict`, mapping each original word to its correction.
x_corrected = []
verbose = False

# Units/suffixes that are frequently written without a space after a number.
# The spell checker also knows the medication names added to it earlier.
common_mistakes = ['ème', 'eme', 'éme', 'ans', 'an', 'années', 'année', 'h', 'heure', 'mg', 'g', 'jrs', 'j', 'jours', 'min', 'mn', 'jour', 'kg', 'cm']
# Raw strings: '\d' and '\S' are invalid escape sequences in a normal literal.
nb_re = re.compile(r'(\d+)(\S+)')
# Compiled once instead of being looked up on every iteration.
digit_re = re.compile(r'\d+')
for w in tqdm(x_oov):
    nb = nb_re.search(w)
    if nb is not None and nb.group(2) in common_mistakes:
        if verbose:
            print('catch numerical spelling error' + nb.group())
        w = w.replace(nb.group(1), nb.group(1) + ' ')
    # Leave any other expression containing digits untouched.
    if digit_re.search(w) is None:
        suggestions = d.suggest(w)
        # enchant may return no suggestion at all; indexing [0] blindly
        # would raise IndexError and abort the whole (slow) loop.
        if suggestions:
            corrected_w = suggestions[0]
            if distance.edit_distance(w, corrected_w) <= 2:
                if verbose:
                    print("corrected " + w + ' in ' + corrected_w)
                w = corrected_w
    x_corrected.append(w)

correct_dict = dict(zip(x_oov, x_corrected))

100%|██████████| 3815/3815 [11:55<00:00,  5.33it/s]


In [10]:
# Display the whole mapping from out-of-vocabulary word to its correction.
correct_dict

{'severe': 'sévère',
 'prothombine': 'prothrombine',
 'génant': 'gainant',
 'eruption': 'éruption',
 '30jours': '30 jours',
 '9ch': '9ch',
 'gagnerdu': 'gagner du',
 'perlodel': 'perlodel',
 'nourisson': 'nourrisson',
 'desagreable': 'desagreable',
 'osteoporose': 'ostéoporose',
 '10e': '10e',
 'naturland': 'naturland',
 'eme': 'me',
 'arrété': 'arrête',
 'occidentalis': 'occidentalise',
 'deconseiller': 'déconseiller',
 'valproate': 'valproate',
 'eclatant': 'éclatant',
 'adutles': 'adultes',
 'dégout': 'd’égout',
 'pkoi': 'pkoi',
 'pendnt': 'pendent',
 'délètre': 'délétère',
 'çane': 'cane',
 'déroxat': 'déroxat',
 '25°c': '25°c',
 'oracilline': 'oracilline',
 'necessaire': 'nécessaire',
 'endrocure': 'endrocure',
 'ashme': 'asthme',
 'neurosimulateur': 'neurosimulateur',
 'décolté': 'récolté',
 'dermatolo': 'dermatolo',
 'generique': 'générique',
 'hallucinogéne': 'hallucinogène',
 'k2': 'k2',
 'rehercher': 'rechercher',
 'ocmbien': 'combien',
 'carbonicum': 'carbonique',
 'remeron'

In [11]:
import csv

# Persist the corrections as two-column rows: original word, corrected word.
# The context manager guarantees the file is flushed and closed (the handle
# used to be left open), and newline='' is what the csv module requires so
# that it can control line endings itself.
# NOTE(review): the file is written with the platform's default encoding;
# consider an explicit encoding if the consumer expects a specific one.
with open('../posos-data-challenge/challenge_data/corrections.csv', "w", newline='') as corrections_file:
    corrections_writer = csv.writer(corrections_file)
    corrections_writer.writerows(correct_dict.items())