# Import des données

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
import os
import re
import datetime
import sys
%matplotlib inline 

In [2]:
def displayUniqueCount(series):
    """Tabulate how often each distinct value occurs in ``series``.

    Parameters
    ----------
    series : array-like accepted by ``np.unique``.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'Valeur' (distinct value) and 'Nombre' (occurrence
        count), ordered by decreasing 'Nombre'.
    """
    values, counts = np.unique(series, return_counts=True)
    table = pd.DataFrame({'Valeur': values, 'Nombre': counts},
                         columns=['Valeur', 'Nombre'])
    return table.sort_values(by="Nombre", ascending=False)

In [3]:
# Input and output folders, resolved from the directory the notebook runs in.
ROOT_DIR = os.getcwd()
DATA, RES = 'data_challenge', 'submit'
DATA_DIR, RES_DIR = os.path.join(ROOT_DIR, DATA), os.path.join(ROOT_DIR, RES)

# Labels stored in the SOURCE column to tell train rows from test rows,
# and the name of the target column.
TRAIN, TEST = "train", "test"
SOURCE = "source"
TARGET = "prix"

In [4]:
# Both challenge files are ';'-separated UTF-8 CSVs located in DATA_DIR.
csv_options = dict(encoding='utf-8', sep=';')
train = pd.read_csv(os.path.join(DATA_DIR, 'boites_medicaments_train.csv'), **csv_options)
test = pd.read_csv(os.path.join(DATA_DIR, 'boites_medicaments_test.csv'), **csv_options)

# Tag every row with its origin, then stack both sets into one frame so the
# feature engineering is applied to train and test together.
train[SOURCE] = TRAIN
test[SOURCE] = TEST
BIG = pd.concat([train, test], axis=0)

# Préparation des données

### Types de features

In [5]:
# numeric features
feat_num = ['libelle_plaquette', 'libelle_ampoule', 'libelle_flacon', 
            'libelle_tube', 'libelle_stylo', 'libelle_seringue',
            'libelle_pilulier', 'libelle_sachet', 'libelle_comprime', 
            'libelle_gelule', 'libelle_film', 'libelle_poche',
            'libelle_capsule'] + ['nb_plaquette', 'nb_ampoule', 
            'nb_flacon', 'nb_tube', 'nb_stylo', 'nb_seringue',
            'nb_pilulier', 'nb_sachet', 'nb_comprime', 'nb_gelule', 
            'nb_film', 'nb_poche', 'nb_capsule', 'nb_ml']
# date features
feat_dates = ['date declar annee', 'date amm annee']
# categorical features
feat_cat = ['statut', 'etat commerc', 'agrement col', 'tx rembours',
           'statut admin', 'type proc']
# text features
feat_text = ['libelle', 'titulaires', 'substances', 'forme pharma', 'voies admin']


# Columns removed from the feature matrix by parse().
featToDel = ['libelle', 'id'] #,'voies admin_NB','substances_NB'

# Columns expanded with pd.get_dummies by parse(): every categorical feature
# except 'tx rembours', plus 'forme pharma'.
# Built as a NEW list: the previous `featToDummy = feat_cat` only aliased the
# list, so removing/adding entries silently modified feat_cat as well.
featToDummy = [feat for feat in feat_cat if feat != 'tx rembours'] + ['forme pharma']

# Columns label-encoded by parse().
featToEncode = []

# Optional steps enabled in parse(). Other names parse() recognises:
# "drop_duplicates", "gboost", "ensemble", "buildBIG".
featToTransform = ["convertTx", "log", "voies_admin", "substance", "titulaire", "new_libelle"]

config = {"featToDel": featToDel,
          "featToDummy": featToDummy,
          "featToEncode": featToEncode,
          "featToTransform": featToTransform}

# Single-argument print(...) produces the same text on Python 2 and Python 3.
print("DELETE    :   " + str(featToDel))
print("DUMMY     :   " + str(featToDummy))
print("ENCODE    :   " + str(featToEncode))
print("TRANSFORM :   " + str(featToTransform))

DELETE    :   ['libelle', 'id']
DUMMY     :   ['statut', 'etat commerc', 'agrement col', 'statut admin', 'type proc', 'forme pharma']
ENCODE    :   []
TRANSFORM :   ['convertTx', 'log', 'voies_admin', 'substance', 'titulaire', 'new_libelle']


### Feature Engineering : LIBELLE

In [6]:
def addNBLibelle(val, X):
    """Add packaging-count features for ``val`` extracted from ``X.libelle``.

    Two columns are written into ``X`` (in place):

    * ``"nb_" + val``      -- the 1-5 digit number found just before ``val``
      in the label, as an ``int`` (0 when the pattern is absent);
    * ``"libelle_" + val`` -- 1 when the pattern is present, else 0.

    Parameters
    ----------
    val : str
        Packaging word to look for; it is inserted verbatim in the regular
        expression (not escaped).
    X : pandas.DataFrame
        Frame with a string column named ``libelle``.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object, with the two columns added or overwritten.
    """
    # NOTE(review): the trailing "*" only applies to the LAST character of
    # `val` (e.g. "film*" also matches "fil"); kept as-is to preserve the
    # existing matches -- confirm whether a looser/stricter pattern was meant.
    regexp = re.compile(r"([0-9]{1,5})\s" + val + "*")

    # Search each label once and reuse the match for both columns.
    found = [regexp.search(label) for label in X.libelle]

    # int(...) keeps the column numeric; the captured group is a string and
    # was previously stored next to the integer 0 in the same column.
    X["nb_" + val] = [int(m.group(1)) if m is not None else 0 for m in found]
    X["libelle_" + val] = [1 if m is not None else 0 for m in found]
    return X

### Feature Engineering : SUBSTANCES - VOIES ADMIN

Création d'une colonne indicatrice (bag of words) pour chaque substance et chaque voie d'administration

In [7]:
def create_column(X, name):
    """Expand the comma-separated column ``name`` into indicator columns.

    Each value of ``X[name]`` is split on ',' and every token is stripped.
    One 0/1 column is added to ``X`` (in place) per distinct token, named
    after the token, plus ``name + "_NB"`` holding the row-wise sum of these
    indicator columns.

    Differences with the previous implementation:

    * a row is flagged only when the token is one of ITS OWN tokens; the
      former substring test on the raw string gave false positives (a token
      "fer" was flagged on a row containing only "interferon");
    * empty tokens (trailing or doubled commas) are ignored instead of
      producing an empty-named column that was 1 on every row.

    Parameters
    ----------
    X : pandas.DataFrame
    name : str
        Column of comma-separated strings.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object with the new columns.
    """
    # Tokenise every row once: split on commas, strip, drop empty tokens.
    token_lists = [[tok.strip() for tok in col_value.split(',') if tok.strip()]
                   for col_value in X[name]]

    # Distinct tokens in the same order as before: all first tokens (row by
    # row), then all second tokens, and so on.
    distinctCategs = []
    seen = set()
    longest = max([len(toks) for toks in token_lists] or [0])
    for pos in range(longest):
        for toks in token_lists:
            if pos < len(toks) and toks[pos] not in seen:
                seen.add(toks[pos])
                distinctCategs.append(toks[pos])

    #pd.DataFrame(distinctCategs).to_csv("distinctCateg"+name +".csv", encoding="utf-8", sep=";")

    for distinctCateg in distinctCategs:
        # Lists are assigned positionally, so this is independent of X's index.
        X[distinctCateg] = [1 if distinctCateg in toks else 0 for toks in token_lists]

    X[name + "_NB"] = X[distinctCategs].sum(axis=1)

    print("Add " + str(len(distinctCategs)) + " new columns")
    return X

### Feature Engineering : SUBSTANCES

In [8]:
def transfo_substances(X, index):
    """Build a bag-of-words count matrix from substance strings.

    Every element of ``X`` is normalised (parenthesised parts removed, a few
    spelling variants unified), then the result is vectorised with a
    ``CountVectorizer`` that ignores the stop words listed below.

    Parameters
    ----------
    X : object exposing ``.apply`` -- presumably a pandas Series of substance
        strings (each element is passed to ``re.sub``); confirm with callers.
    index : index given to the returned DataFrame.

    Returns
    -------
    pandas.DataFrame
        One column per vocabulary term, holding its count for each element.
    """
    # Replacements applied, in this order, after parentheses are stripped.
    replacements = (("é", "e"),
                    ("es", "e"),
                    ("ee", "e"),
                    ("-", ""),
                    ("doxyxycline", "doxycycline"),
                    ("alpha", "alfa"))

    def _normalise(subs):
        subs = re.sub(r'(\([^)]*\))', '', subs)
        for old, new in replacements:
            subs = subs.replace(old, new)
        return subs

    cleaned = X.apply(_normalise)

    stop_words={"d", "de", "des","du","à","10a","11","11a","12f","1331","135","13c","14","158","15b","16","165","175","179a",
            "17f","18","181","18c","197","19a","19f","1a","1b","20","2009","2010","2011","2013","2014","22f","23f","27","2a",
            "2b","3073","326f","3350","33f","361","39","4000","4385","55","6a","6b","74xp","7f","88","940","9715293",
            "974p","980","9n","9v", "extrait","allergenique", "humaine", "recombinante", "conjuguee", "la", "extrait",
            "fruit", "proteine", "vectrice", "des","en", "gel","gomme","venin", "proteine", "recombinante", "adsorbee",
            "virus", "humain", "hpv", "type", "proteines", "pre", "purifiee", "souche", "ra", "vivant", "riche", "sec",
            "ab", "et", "polyoside", "serotypes", "conjugues","vectrice","attendue","utilisee", "synthetique", "oxyde", 
            "base", "acide","acides", "actif","active", "adjuvant", "adsorbe", "afrique","ecorce", "mou", "fusion",
            "groupe", "chaine", "agglomere", "cire", "element", "especes", "excipient", "externe", "forme", "glacial", 
            "leger", "lourd", "chlorhydrate", "sodique","hydrochlorothiazide","anhydre"}

    vectorizer = CountVectorizer(min_df=0., max_df=1.0, stop_words=stop_words, strip_accents="ascii")
    counts = vectorizer.fit_transform(cleaned)
    vocabulary = vectorizer.get_feature_names()

    # Densify the sparse count matrix so it can back a regular DataFrame.
    return pd.DataFrame(data=counts.toarray(), columns=vocabulary, index=index)

### Encodage des features catégorielles

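Les algorithmes de machine learning s'attendent à avoir en entrée des nombres, et non pas des chaînes de caractères. C'est pourquoi nous transformons les features catégorielles en nombres : à l'aide de LabelEncoder() pour les features listées dans `featToEncode`, et de variables indicatrices (`pd.get_dummies`) pour celles listées dans `featToDummy`.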

In [9]:
def _read_result_csv(file_name):
    """Load one ';'-separated UTF-8 CSV stored in RES_DIR."""
    return pd.read_csv(os.path.join(RES_DIR, file_name), encoding='utf-8', sep=';')


def parse(X, config, silent=False) :
    """Turn the stacked train+test frame into model-ready matrices.

    The steps named in ``config["featToTransform"]`` are applied in this
    order: "titulaire", "substance", "voies_admin", "convertTx",
    "new_libelle", "drop_duplicates", "gboost", "ensemble". The target is
    then extracted (and passed through ``np.log1p`` when "log" is enabled),
    columns are label-encoded / dummy-encoded / deleted according to
    ``config``, and the frame is split back on the SOURCE column.

    Changes compared with the previous version:

    * ``config["featToEncode"]`` is no longer modified; 'titulaires' used to
      be appended to the caller's list on every call;
    * the "ensemble" step adds its SUB_* columns to ``X``; it used to write
      them to the global ``BIG``, so they never reached the returned frames;
    * progress messages printed here honour ``silent`` (create_column still
      prints its own message);
    * the train/test frames are built with ``drop`` instead of ``del`` on a
      filtered slice.

    Parameters
    ----------
    X : pandas.DataFrame
        Train and test rows stacked together, with the SOURCE and TARGET
        columns. It is modified in place; pass a copy to keep the original.
    config : dict
        Keys "featToDel", "featToDummy", "featToEncode", "featToTransform".
    silent : bool
        When True, the progress messages printed by this function are skipped.

    Returns
    -------
    (X_train, X_test, y_train)
        Feature frames for the rows tagged TRAIN / TEST (SOURCE column
        removed) and the target of the TRAIN rows.
    """
    def say(message):
        # Single-argument print(...) gives the same text on Python 2 and 3.
        if not silent:
            print(message)

    transforms = config["featToTransform"]
    start = datetime.datetime.now()
    say('__________________________________________ PARSE __________________________________________')
    say('-->  ' + start.strftime('%d-%m-%Y : %H:%M'))
    say('BEFORE :  ' + str(X.shape))

    # Private copy: the caller's config must not grow from one call to the next.
    featToEncode = list(config["featToEncode"])

    if "titulaire" in transforms:
        say("Get titulaire first word")
        # Keep only the first word of the holder name after light clean-up.
        X["titulaires"] = (X["titulaires"]
                           .apply(lambda col_value : col_value.replace('LABORATOIRES',''))
                           .apply(lambda col_value : col_value.replace('BRISTOL-MYERS',' BRISTOL MYERS'))
                           .apply(lambda col_value : col_value.replace('SANOFI-AVENTIS','SANOFI AVENTIS'))
                           .str.strip()
                           .apply(lambda titulaire : titulaire.split(' ')[0]))
        if 'titulaires' not in featToEncode:
            featToEncode.append('titulaires')

    if "substance" in transforms:
        say("Create a Substance column for each substance")
        X["substances"] = (X["substances"]
                           .apply(lambda subs: re.sub(r'(\([^)]*\))','', subs))
                           .apply(lambda subs: subs.replace("es","e"))
                           .apply(lambda subs: subs.replace("ee","e"))
                           .apply(lambda subs: subs.replace("doxyxycline", "doxycycline"))
                           .apply(lambda subs: subs.replace("alpha", "alfa")))
        X = create_column(X, "substances")
        X.drop("substances", axis=1, inplace=True)

    if "voies_admin" in transforms:
        say("Create a Voie Admin column for each voie admin")
        X = create_column(X, "voies admin")
        X.drop("voies admin", axis=1, inplace=True)

    if "convertTx" in transforms:
        say("convert tx rembours")
        # "65%" -> 65
        X["tx rembours"] = [int(x.replace("%", "")) for x in X["tx rembours"]]

    if "new_libelle" in transforms:
        lib_to_add = ["cartouche","bouteille","film","inhalateur", "PEHD"]
        for lib in lib_to_add:
            X = addNBLibelle(lib, X)

    if "drop_duplicates" in transforms:
        X.drop_duplicates(inplace=True)

    if "gboost" in transforms:
        RESULTAT = _read_result_csv('predX15.csv')
        SUB_15 = _read_result_csv('soumission15.csv')
        X['SUB_BOOST'] = pd.concat([RESULTAT.prix, SUB_15.prix], axis=0)

    if "ensemble" in transforms:
        RESULTAT = _read_result_csv('resultat.csv')
        sub_names = ['SUB_1', 'SUB_4', 'SUB_9', 'SUB_11']
        sub_columns = []
        for sub_name in sub_names:
            # 'SUB_4' -> 'soumission4.csv'
            sub = _read_result_csv('soumission' + sub_name[len('SUB_'):] + '.csv')
            sub = sub.rename(index=str, columns={"prix": sub_name})
            sub_columns.append(sub[sub_name])
        SUB = pd.concat(sub_columns, axis=1)

        RESULTAT_ENS = pd.concat([RESULTAT, SUB], axis=0)
        # NOTE(review): this assignment aligns on the index, and the
        # submission frames were re-indexed with str above -- confirm the
        # indexes line up with X before enabling this step.
        for sub_name in sub_names:
            X[sub_name] = RESULTAT_ENS[sub_name]

    #remove target
    y_train = X.loc[X[SOURCE] == TRAIN, TARGET]

    if "log" in transforms:
        y_train = y_train.apply(np.log1p)

    del X[TARGET]

    say("---Feature to encode ")
    for encodeToCat in featToEncode:
        say("encode " + encodeToCat)
        X[encodeToCat] = LabelEncoder().fit_transform(X[encodeToCat])

    say("---Feature to dummy ")
    featToDummy = config["featToDummy"]
    say(featToDummy)
    if featToDummy:
        dums = pd.get_dummies(X[featToDummy]).astype(int)
        X = pd.concat([X, dums], axis=1)
        X.drop(featToDummy, axis=1, inplace=True)

    say("---Feature to delete ")
    X.drop(config["featToDel"], axis=1, inplace=True)

    if "buildBIG" in transforms:
        say("---Build BIG.csv ")
        X.to_csv(os.path.join(DATA_DIR, 'BIG.csv'), header=True, index=False, sep=';', encoding='utf-8')

    # Split back on the origin tag; drop() returns new frames, so nothing is
    # deleted from a view of X.
    X_train = X[X[SOURCE] == TRAIN].drop(SOURCE, axis=1)
    X_test = X[X[SOURCE] == TEST].drop(SOURCE, axis=1)

    if not silent:
        end = datetime.datetime.now()
        print('%s Elapsed time :%d s' % (end.strftime('--> %d-%m-%Y : %H:%M'),
                                         (end - start).total_seconds()))
        print('AFTER TRAIN : ' + str(X_train.shape))
        print('AFTER TEST : ' + str(X_test.shape))
        sys.stdout.flush()

    return X_train, X_test, y_train

In [10]:
# Parse a copy so the raw stacked frame BIG stays untouched.
X_train, X_test, y_train = parse(X=BIG.copy(), config=config)

__________________________________________ PARSE __________________________________________
-->  31-10-2016 : 20:21
BEFORE :  (12235, 43)
Get titulaire first word
Create a Substance column for each substance
Add 1739 new columns
Create a Voie Admin column for each voie admin
Add 45 new columns
convert tx rembours
---Feature to encode 
encode titulaires
---Feature to dummy 
['statut', 'etat commerc', 'agrement col', 'statut admin', 'type proc', 'forme pharma']
---Feature to delete 
--> 31-10-2016 : 20:22 Elapsed time :22 s
AFTER TRAIN : (8564, 2061)
AFTER TEST : (3671, 2061)


# Création d'un modèle

### Métrique MAPE

In [11]:
# Mean Absolute Percentage Error
def mape_error(y_true, y_pred): 
    """Return the mean absolute percentage error, in percent.

    Both inputs are converted to float arrays first, so plain lists are
    accepted and two pandas Series are compared position by position rather
    than aligned on their index labels.

    Parameters
    ----------
    y_true : array-like of observed values.
    y_pred : array-like of predicted values, same length as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``. A zero in ``y_true``
        is not special-cased: the division follows numpy's rules (inf/nan).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

### Cross validation

In [12]:
def crossVal(X, y, clf, scorer, n_splits=10, shuffle=False):
    """Run a K-fold cross-validation of ``clf`` and print the fold scores.

    For each fold, ``clf`` is refitted on the training rows and scored on the
    held-out rows with ``scorer(true_values, predictions)``. When "log" is in
    the global ``config["featToTransform"]``, both are passed through
    ``np.expm1`` before scoring. The score of every fold is printed, followed
    by the mean and the max-min spread.

    Parameters
    ----------
    X, y : objects supporting ``.iloc`` (features and target).
    clf : estimator exposing ``fit`` and ``predict``; it is refitted in place.
    scorer : callable taking (true values, predictions).
    n_splits : number of folds.
    shuffle : whether KFold shuffles the rows; random_state is left to None,
        so with shuffle=True the default numpy RNG drives the shuffling.

    Returns
    -------
    None
    """
    splitter = ms.KFold(n_splits=n_splits, shuffle=shuffle, random_state=None)
    fold_scores = []
    for fold_no, (train_rows, test_rows) in enumerate(splitter.split(X), 1):
        clf = clf.fit(X.iloc[train_rows], y.iloc[train_rows])
        predicted = clf.predict(X.iloc[test_rows])
        expected = y.iloc[test_rows]

        # Score on the original scale when the target was log-transformed.
        if "log" in config["featToTransform"]:
            fold_score = scorer(np.expm1(expected), np.expm1(predicted))
        else:
            fold_score = scorer(expected, predicted)

        fold_scores.append(fold_score)
        print("*** MAPE Error : " + str(fold_no) + " - " + str(fold_score))
        sys.stdout.flush()

    print("----------------")
    print("  - Mean :" + str(np.mean(fold_scores)))
    print("  - Ecart Max-Min :" + str(np.max(fold_scores) - np.min(fold_scores)))

In [14]:
# Gradient-boosting regressor used by the cross-validation and the final fit.
clf = GradientBoostingRegressor(
    n_estimators=500,
    loss='huber',
    alpha=0.23,
    max_depth=80,
    max_leaf_nodes=100,
    random_state=3,
    min_samples_split=10,
    verbose=1
)

In [69]:
# 10-fold cross-validation (function defaults), scored with the MAPE.
crossVal(X=X_train, y=y_train, clf=clf, scorer=mape_error)

# Calcul des prédictions et soumission

In [16]:
# Refit on the whole training set; fit() returns the estimator, so the
# in-sample predictions can be chained.
predXtrain = clf.fit(X_train, y_train).predict(X_train)

# Test-set predictions, mapped back with expm1 when the "log" step is enabled.
test_pred = clf.predict(X_test)
predictions = np.expm1(test_pred) if "log" in config["featToTransform"] else test_pred

      Iter       Train Loss   Remaining Time 
         1           0.1830           44.32m
         2           0.1604           44.73m
         3           0.1333           44.81m
         4           0.1164           45.72m
         5           0.1025           45.28m
         6           0.0904           44.89m
         7           0.0783           44.78m
         8           0.0683           44.60m
         9           0.0607           45.25m
        10           0.0541           45.15m
        20           0.0204           46.75m
        30           0.0105           47.18m
        40           0.0068           46.50m
        50           0.0048           45.09m
        60           0.0038           43.21m
        70           0.0031           68.17m
        80           0.0025           62.41m
        90           0.0022           58.14m
       100           0.0019           53.73m
       200           0.0007           30.93m
       300           0.0004           21.69m
       40

In [17]:
# Write the submission as a ';'-separated file with an 'id' index column and
# a 'prix' column, inside RES_DIR.
# The former `dtypes=` keyword is not a DataFrame.to_csv parameter (the call
# raises TypeError on current pandas), so it is dropped; the predicted prices
# are written as they are, without any cast.
submission = pd.DataFrame(predictions, index=test['id'])
submission.to_csv(os.path.join(RES_DIR, 'soumission19.csv'),
                  header=['prix'],
                  sep=';')