# Import des données

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
import os
import re
import datetime
import sys
%matplotlib inline 

In [22]:
def displayUniqueCount(series):
    data = np.unique(series, return_counts=True)
    df = pd.DataFrame(
        data = {'Valeur':data[0], 'Nombre':data[1]},
        columns = ['Valeur', 'Nombre'])
    df.sort_values(by ="Nombre", ascending=False, inplace=True)
    return df

In [3]:
ROOT_DIR = os.getcwd()
DATA ='data_challenge'
DATA_DIR = os.path.join(ROOT_DIR, DATA)
TRAIN="train"
TEST="test"
SOURCE="source"
TARGET="prix"

In [4]:
train = pd.read_csv(os.path.join(DATA_DIR, 'boites_medicaments_train.csv'),
                    encoding='utf-8',
                    sep=';')

test = pd.read_csv(os.path.join(DATA_DIR, 'boites_medicaments_test.csv'),
                   encoding='utf-8', 
                   sep=';')

train[SOURCE] = TRAIN
test[SOURCE] = TEST
BIG = pd.concat([train, test], axis=0)

# Preparation des donnees

### Types de features

In [5]:
# features numériques
feat_num = ['libelle_plaquette', 'libelle_ampoule', 'libelle_flacon', 
            'libelle_tube', 'libelle_stylo', 'libelle_seringue',
            'libelle_pilulier', 'libelle_sachet', 'libelle_comprime', 
            'libelle_gelule', 'libelle_film', 'libelle_poche',
            'libelle_capsule'] + ['nb_plaquette', 'nb_ampoule', 
            'nb_flacon', 'nb_tube', 'nb_stylo', 'nb_seringue',
            'nb_pilulier', 'nb_sachet', 'nb_comprime', 'nb_gelule', 
            'nb_film', 'nb_poche', 'nb_capsule', 'nb_ml']
# features date
feat_dates = ['date declar annee', 'date amm annee']
# features catégorielles
feat_cat = ['statut', 'etat commerc', 'agrement col', 'tx rembours',
           'statut admin', 'type proc']
# features texte
feat_text = ['libelle', 'titulaires', 'substances', 'forme pharma', 'voies admin']


featToDel = ['libelle', 'id'] #,'voies admin_NB','substances_NB'
featToDummy = feat_cat
featToDummy.remove('tx rembours')
featToDummy.extend(['forme pharma'])
featToEncode = []
featToTransform = [ "convertTx", "log", "voies_admin", "substance", "titulaire"] #"buildBIG",, "new_libelle"

config = {"featToDel":featToDel,
          "featToDummy":featToDummy,
          "featToEncode":featToEncode,
          "featToTransform":featToTransform}

print "DELETE    :  ", featToDel
print "DUMMY     :  ", featToDummy
print "ENCODE    :  ", featToEncode
print "TRANSFORM :  ", featToTransform

DELETE    :   ['libelle', 'id']
DUMMY     :   ['statut', 'etat commerc', 'agrement col', 'statut admin', 'type proc', 'forme pharma']
ENCODE    :   []
TRANSFORM :   ['convertTx', 'log', 'voies_admin', 'substance', 'titulaire']


### Feature Engineering : LIBELLE

In [6]:
def addNBLibelle(val, X):
    regexp = re.compile("([0-9]{1,5})\s" + val + "*")
    X["nb_" + val] = X.libelle.apply(lambda w : regexp.search(w).group(1) if regexp.search(w) else 0)
    X["libelle_" + val] = X.libelle.apply(lambda w : 1 if regexp.search(w) else 0)
    return X

### Feature Engineering : SUBSTANCES - VOIES ADMIN

Création du Bag of words substances

In [7]:
def create_column(X, name):

    distinctCategs = (X[name]
                      .apply(lambda col_value : col_value.split(','))
                      .apply(pd.Series)
                      .unstack()
                      .dropna()
                      .str.strip()
                      .unique())

    #pd.DataFrame(distinctCategs).to_csv("distinctCateg"+name +".csv", encoding="utf-8", sep=";")
    
    for distinctCateg in distinctCategs:
        X[distinctCateg] = X[name].apply(lambda x : 1 if distinctCateg in x else 0)
        
    X[name + "_NB"] = X[distinctCategs].sum(axis=1)
    
    print "Add " + str(len(distinctCategs)) + " new columns"
    return X

### Feature Engineering : SUBSTANCES

In [8]:
def transfo_substances(X, index):

    X = (X
         .apply(lambda subs: re.sub(r'(\([^)]*\))','', subs))
         .apply(lambda subs: subs.replace("é","e"))
         .apply(lambda subs: subs.replace("es","e"))
         .apply(lambda subs: subs.replace("ee","e"))
         .apply(lambda subs: subs.replace("-",""))
         .apply(lambda subs: subs.replace("doxyxycline", "doxycycline"))
         .apply(lambda subs: subs.replace("alpha", "alfa")))
    
    stop_words={"d", "de", "des","du","à","10a","11","11a","12f","1331","135","13c","14","158","15b","16","165","175","179a",
            "17f","18","181","18c","197","19a","19f","1a","1b","20","2009","2010","2011","2013","2014","22f","23f","27","2a",
            "2b","3073","326f","3350","33f","361","39","4000","4385","55","6a","6b","74xp","7f","88","940","9715293",
            "974p","980","9n","9v", "extrait","allergenique", "humaine", "recombinante", "conjuguee", "la", "extrait",
            "fruit", "proteine", "vectrice", "des","en", "gel","gomme","venin", "proteine", "recombinante", "adsorbee",
            "virus", "humain", "hpv", "type", "proteines", "pre", "purifiee", "souche", "ra", "vivant", "riche", "sec",
            "ab", "et", "polyoside", "serotypes", "conjugues","vectrice","attendue","utilisee", "synthetique", "oxyde", 
            "base", "acide","acides", "actif","active", "adjuvant", "adsorbe", "afrique","ecorce", "mou", "fusion",
            "groupe", "chaine", "agglomere", "cire", "element", "especes", "excipient", "externe", "forme", "glacial", 
            "leger", "lourd", "chlorhydrate", "sodique","hydrochlorothiazide","anhydre"}

    vectorizer = CountVectorizer(min_df=0., max_df=1.0, stop_words=stop_words, strip_accents="ascii")
    SUBS_VECT = vectorizer.fit_transform(X)
    feat_names = vectorizer.get_feature_names()

    return pd.DataFrame(data=SUBS_VECT.A, columns=feat_names, index= index)

### Encodage des features catégorielles

Les algorithmes de machine learning s'attendent à avoir en entrée des nombres, et non pas des chaînes de caractères. C'est pourquoi nous transformons les features catégorielles en nombres, à l'aide de LabelEncoder()

In [9]:
def parse(X, config, silent=False) :
    start = datetime.datetime.today().now()
    if not silent:
        print '__________________________________________ PARSE __________________________________________'
        print '--> ', start.strftime('%d-%m-%Y : %H:%M')
        print 'BEFORE : ', X.shape
    
    if "titulaire" in config["featToTransform"]:
        print "Split titulaires to titulaire and country"
        X["titulaires"] = (X["titulaires"]
                      .apply(lambda col_value : col_value.replace('LABORATOIRES',''))
                      .apply(lambda col_value : col_value.replace('BRISTOL-MYERS',' BRISTOL MYERS'))
                      .apply(lambda col_value : col_value.replace('SANOFI-AVENTIS','SANOFI AVENTIS'))
                      .str.strip()
                      .apply(lambda substance : substance.split(' ')[0] )
                      .apply(pd.Series))
        config["featToEncode"].extend(['titulaires'])

    if "substance" in config["featToTransform"]:
        print "Create a Substance column for each substance"
        X["substances"] = (X["substances"]
                           .apply(lambda subs: re.sub(r'(\([^)]*\))','', subs))
                           .apply(lambda subs: subs.replace("es","e"))
                           .apply(lambda subs: subs.replace("ee","e"))
                           .apply(lambda subs: subs.replace("doxyxycline", "doxycycline"))
                           .apply(lambda subs: subs.replace("alpha", "alfa")))
        X = create_column(X, "substances")
        X.drop("substances", axis =1, inplace=True)
    
    if "voies_admin" in config["featToTransform"]:
        print "Create a Voie Admin column for each voie admin"
        X = create_column(X, "voies admin")
        X.drop("voies admin", axis =1, inplace=True)
        
    if "convertTx" in config["featToTransform"]:
        print "convert tx rembours"
        X["tx rembours"] = [int(x.replace("%", "")) for x in X["tx rembours"]]
    
    if "fillNR" in config["featToTransform"]:
        print "---Fill NR "
        X = fillNR(X)
        
    if "new_libelle" in config["featToTransform"]:
        lib_to_add = ["cartouche","bouteille","film","inhalateur"]
        for lib in lib_to_add:
            X = addNBLibelle(lib, X)
    
    #remove target
    y_train = X.loc[X.source == 'train',TARGET]
    
    if "log" in config["featToTransform"]:
        y_train = y_train.apply(np.log)
                    
    del X[TARGET]
    
    print "---Feature to encode "
    #Feature to encode
    featToEncode = config["featToEncode"]
    for encodeToCat in featToEncode:
        print "encode " + encodeToCat
        le = LabelEncoder()
        le.fit(X[encodeToCat])
        X.loc[:,encodeToCat] = le.transform(X[encodeToCat])
    
    print "---Feature to dummy "
    #Feature to dummy
    featToDummy = config["featToDummy"]
    if featToDummy:
        dums = pd.get_dummies(X[featToDummy]).astype(int)
        X = pd.concat([X, dums], axis=1)
        X.drop(featToDummy, axis =1, inplace=True)
    
    print "---Feature to delete "
    #Feature to delete
    featToDel = config["featToDel"]
    X.drop(featToDel, axis =1, inplace=True)
    
    if "buildBIG" in config["featToTransform"]:
        print "---Build BIG.csv " 
        X.to_csv(DATA_DIR + '/'+ 'BIG.csv', header=True, index=False,sep=';', encoding='utf-8')
    
    X_train = X[X[SOURCE] == TRAIN]
    X_test = X[X[SOURCE] == TEST]    
    del X_test[SOURCE]
    del X_train[SOURCE]
    
    
    if not silent:
        print '%s Elapsed time :%d s' % (datetime.datetime.today().now().strftime('--> %d-%m-%Y : %H:%M'), 
        (datetime.datetime.today().now().now() - start).total_seconds())
        print 'AFTER TRAIN :', X_train.shape
        print 'AFTER TEST :', X_test.shape
        sys.stdout.flush()
    
    return X_train, X_test, y_train

In [10]:
X_train, X_test, y_train = parse(BIG.copy(), config)

__________________________________________ PARSE __________________________________________
-->  27-10-2016 : 17:47
BEFORE :  (12235, 43)
Split titulaires to titulaire and country
Create a Substance column for each substance
Add 1739 new columns
Create a Voie Admin column for each voie admin
Add 45 new columns
convert tx rembours
---Feature to encode 
encode titulaires
---Feature to dummy 
---Feature to delete 
--> 27-10-2016 : 17:47 Elapsed time :21 s
AFTER TRAIN : (8564, 2053)
AFTER TEST : (3671, 2053)


# Creation d'un modele

### Métrique MAPE

In [11]:
# Mean Absolute Percentage Error
def mape_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

### Cross validation

In [12]:
def crossVal (X, y, clf, scorer, n_splits=10, shuffle = False) :
    folds= ms.KFold(n_splits=n_splits, shuffle= shuffle, random_state=None) #shuffle = False, random_state
    # When shuffle=True, pseudo-random number generator state used for shuffling.
    # If None, use default numpy RNG for shuffling
    
    scoreList = []
    i = 0
    for trainIdx, testIdx in folds.split(X):
        i +=1 
        XTrainFolds = None; XTestFolds = None; yTrainFolds= None; yTestFolds= None
        yPred = []
        XTrainFolds, XTestFolds = X.iloc[trainIdx], X.iloc[testIdx]
        yTrainFolds, yTestFolds = y.iloc[trainIdx], y.iloc[testIdx]
        clf = clf.fit(XTrainFolds, yTrainFolds)
        res = clf.predict(XTestFolds)
        
        if "log" in config["featToTransform"]:
            score = scorer(np.exp(yTestFolds), np.exp(res))
        else:
            score = scorer(yTestFolds, res)
        
        scoreList.append(score)
        print "*** MAPE Error : " + str(i) + " - " + str(score)
        sys.stdout.flush()
        
    print "----------------"
    print "  - Mean :" + str(np.mean(scoreList))
    print "  - Ecart Max-Min :" + str(np.max(scoreList) - np.min(scoreList))

In [13]:
clf = ExtraTreesRegressor(n_estimators=11, max_features= 'auto', criterion="mse", max_depth=None, min_samples_split=5, n_jobs=-1)
#If “auto”, then max_features=n_features.
#paramsET = {'n_jobs': -1, 'silent': 1, 'n_estimators': 50, 'max_features': 'auto', 'min_samples_split': 5.0, 'max_depth': 30}
#clf = RandomForestRegressor(n_estimators = int(paramsET['n_estimators']), n_jobs = -1, verbose = 0)

#clf = GradientBoostingRegressor(verbose = 1, max_features='auto')

In [14]:
crossVal(X_train, y_train, clf, mape_error)

*** MAPE Error : 1 - 34.6072342524
*** MAPE Error : 2 - 34.9123103139
*** MAPE Error : 3 - 39.649395899
*** MAPE Error : 4 - 31.7653116503
*** MAPE Error : 5 - 30.4430226844
*** MAPE Error : 6 - 38.5198550212
*** MAPE Error : 7 - 33.9462645939
*** MAPE Error : 8 - 31.651018609
*** MAPE Error : 9 - 32.4795954816
*** MAPE Error : 10 - 32.5675254986
----------------
  - Mean :34.0541534004
  - Ecart Max-Min :9.20637321465


In [15]:
print "Features sorted by their score:"
sorted(zip(map(lambda x: round(x, 4), clf.feature_importances_), X_train.columns), 
             reverse=True)

Features sorted by their score:


[(0.1747, u'type proc_Proc\xe9dure centralis\xe9e'),
 (0.1008, u'tx rembours'),
 (0.0958, u'nb_comprime'),
 (0.0525, u'type proc_Proc\xe9dure nationale'),
 (0.0234, u'libelle_plaquette'),
 (0.0233, u'nb_gelule'),
 (0.02, u'PARAC\xc9TAMOL'),
 (0.0162, u'nb_seringue'),
 (0.0162, u'date amm annee'),
 (0.0145, u'date declar annee'),
 (0.013, u'orale'),
 (0.0117, u'RIBAVIRINE'),
 (0.0117, u'IODE'),
 (0.0094, u'FLUCONAZOLE'),
 (0.0091, u'OLANZAPINE'),
 (0.0087, u'intra-art\xe9rielle'),
 (0.0086, u'nb_ml'),
 (0.0081, u'ophtalmique'),
 (0.0078, u'sous-cutan\xe9e'),
 (0.0077, 'substances_NB'),
 (0.0077, u'TAMOXIF\xc8NE'),
 (0.0071, u'METFORMINE'),
 (0.0068, u'forme pharma_solution pour perfusion'),
 (0.0054, u'forme pharma_comprim\xe9 pellicul\xe9'),
 (0.0053, u'L\xc9VOTHYROXINE SODIQUE'),
 (0.0052, u'libelle_poche'),
 (0.0047, u'RISP\xc9RIDONE'),
 (0.0046, u'FENTANYL'),
 (0.004, u'RIS\xc9DRONATE MONOSODIQUE'),
 (0.004, u'ACICLOVIR'),
 (0.0038, u'DON\xc9P\xc9ZIL'),
 (0.0038, u'ALLOPURINOL'),
 (

# Calcul des predictions et soumission

In [16]:
# On entraine de nouveau le modèle, cette fois sur l'intégralité des données
#clf = ExtraTreesRegressor(n_estimators=11, max_features= 'auto', criterion="mse", max_depth=None, min_samples_split=5, n_jobs=-1)
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)
if "log" in config["featToTransform"]:
    predictions = np.exp(predictions)
            


In [17]:
pd.DataFrame(predictions, index=test['id']).to_csv('submit/soumission.csv',  
                          header=['prix'],
                          sep = ';', 
                          dtypes={'id':'int','prix':'int'})