In [1]:
from drugs import Drugs
import pandas as pd

In [2]:
# Instantiate the project's Drugs object; a later cell calls drugs.train(df1) on it.
# NOTE(review): this variable has the same name as the `drugs` package imported from
# above, so plain attribute access such as `drugs.core` now hits this instance.
drugs = Drugs()

In [3]:
# Load the drugs table; the path is relative to the notebook's working directory.
# The frame displayed at the end of the notebook shows it carries `drug_id` and `price`.
df = pd.read_csv("../data/drugs_train.csv")

In [10]:
# Load the active-ingredient table that is passed to IngredientsEncoder.fit/transform below.
# NOTE(review): this cell's execution counter (10) is out of sequence with its
# neighbours (3 and 4) -- confirm the notebook still runs top to bottom on a fresh kernel.
df_ingredient= pd.read_csv("../data/active_ingredients.csv")

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as df_test; random_state pins the shuffle so the split is repeatable.
df1, df_test = train_test_split(df, random_state=40, test_size=0.2)

In [5]:
# Train on the 80% split only (df_test is not passed in).
# The RMSE lines shown under this cell are presumably printed by train() -- confirm in the package.
drugs.train(df1)

RMSE SCORE ON TRAIN: 31.14520769045343
RMSE SCORE ON VAL: 70.00058365441745


In [6]:
from drugs.core.transformers.encoders import IngredientsEncoder

In [29]:
from typing import List

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from drugs.constants import (
    ACTIVE_INGREDIENT,
    DRUG_ID,
    HIGH_CARD_COLUMNS,
    ONE_HOT_COLUMNS,
    PRICE,
    REIMBURSEMENT_RATE,
    STRS_TO_CHECK,
)


class IngredientsEncoder(BaseEstimator, TransformerMixin):
    """
    Encode the active ingredients of a drug with an aggregated price value.

    Fitting estimates what each ingredient "costs": the price of a drug is
    divided evenly by the number of distinct ingredients it contains, and the
    resulting per-ingredient shares are aggregated (mean by default) per
    ingredient over all drugs.

    Transforming represents every drug as ``top_k`` numeric columns named
    ``ingredient0_feature`` ... ``ingredient{top_k-1}_feature``: the learnt
    prices of its ingredients sorted in ascending order, truncated to
    ``top_k`` values and right-padded with ``0`` when the drug has fewer
    ingredients.

    Attributes:
        column: name of the ingredient column in the ingredient table.
        ingredient_price_map: ``{ingredient: aggregated price}`` learnt by
            ``fit``; empty until ``fit`` has been called.
    """

    def __init__(self, column: str = ACTIVE_INGREDIENT):
        self.column = column
        self.ingredient_price_map = {}

    @staticmethod
    def _make_ingredient_prices(x: pd.Series, top_k: int = 5) -> List[float]:
        """
        Return the first ``top_k`` values of ``x`` as a list, right-padded with 0.

        Args:
            x: ingredient prices of one drug, already in the desired order.
            top_k: number of values to keep.

        Returns:
            The leading ``top_k`` values of ``x``; if ``x`` is shorter, the
            list is filled with ``0`` up to length ``top_k``.
        """
        prices = x.iloc[:top_k].tolist()
        # A non-positive repeat count yields [], so no padding when x is long enough.
        return prices + [0] * (top_k - len(prices))

    def fit(
        self,
        df: pd.DataFrame,
        df_ingredient: pd.DataFrame = None,
        agg_func: str = "mean",
    ):
        """
        Learn one aggregated price per ingredient.

        Args:
            df: drug table with the ``DRUG_ID`` and ``PRICE`` columns.
            df_ingredient: ingredient table with ``DRUG_ID`` and
                ``self.column``, one row per (drug, ingredient). Required
                despite the ``None`` default kept for signature compatibility.
            agg_func: aggregation applied to the per-drug ingredient prices
                of each ingredient (anything ``GroupBy.agg`` accepts).

        Returns:
            ``self``, with ``ingredient_price_map`` populated.

        Raises:
            ValueError: if ``df_ingredient`` is not provided.
        """
        if df_ingredient is None:
            raise ValueError(
                "df_ingredient is required to fit IngredientsEncoder, got None"
            )

        df_ingredient_copy = df_ingredient.copy()
        # Attach the drug price to each ingredient row; drugs that are absent
        # from `df` get NaN here.
        df_ingredient_copy[PRICE] = df_ingredient_copy[DRUG_ID].map(
            df.set_index(DRUG_ID)[PRICE]
        )
        df_ingredient_copy["nb_ingredients"] = df_ingredient_copy.groupby(DRUG_ID)[
            self.column
        ].transform("nunique")
        # Split the drug price evenly between its distinct ingredients.
        df_ingredient_copy["ingredient_price"] = (
            df_ingredient_copy[PRICE] / df_ingredient_copy["nb_ingredients"]
        )
        self.ingredient_price_map = (
            df_ingredient_copy.groupby(self.column)["ingredient_price"]
            .agg(agg_func)
            .to_dict()
        )
        return self

    def transform(
        self, df: pd.DataFrame, df_ingredients: pd.DataFrame, top_k: int = 5
    ) -> pd.DataFrame:
        """
        Add the ``top_k`` ingredient price features to ``df``.

        Args:
            df: drug table containing ``DRUG_ID``.
            df_ingredients: ingredient table with ``DRUG_ID`` and
                ``self.column``.
            top_k: number of ingredient feature columns to produce.

        Returns:
            ``df`` merged with the feature columns. The merge uses pandas'
            defaults (inner join on the shared column names), so drugs with no
            row in ``df_ingredients`` are not part of the result.
        """
        f_cols = [f"ingredient{i}_feature" for i in range(top_k)]
        df_ingredients_copy = df_ingredients.copy()
        # Ingredients missing from the fitted map become NaN.
        df_ingredients_copy["ingredient_price"] = df_ingredients_copy[self.column].map(
            self.ingredient_price_map
        )
        # Ascending sort (NaN last) before grouping so each drug's list keeps
        # its lowest-priced ingredients first.
        features_df = (
            df_ingredients_copy.sort_values("ingredient_price", ascending=True)
            .groupby(DRUG_ID, sort=False)
            .agg({"ingredient_price": lambda x: self._make_ingredient_prices(x, top_k)})
            .reset_index()
        )
        # Explicit columns/index keep the expansion well-formed even when no
        # drug has ingredient rows (the list of lists is then empty).
        price_columns = pd.DataFrame(
            features_df["ingredient_price"].tolist(),
            columns=f_cols,
            index=features_df.index,
        )
        features_df = pd.concat([features_df[[DRUG_ID]], price_columns], axis=1)
        return df.merge(features_df)



In [30]:
# Uses the IngredientsEncoder class defined in the cell above, which shadows the one
# imported earlier from drugs.core.transformers.encoders. The default `column`
# (ACTIVE_INGREDIENT) is used.
ie = IngredientsEncoder()

In [31]:
# Learn the per-ingredient price map with the default "mean" aggregation.
# NOTE(review): this fits on the full `df`, which still contains the rows held out as
# `df_test`; the encoder is derived from `price`, so for evaluation it should
# presumably be fitted on `df1` only -- confirm intent.
ie.fit(df, df_ingredient=df_ingredient)

IngredientsEncoder()

In [32]:
# Bare last expression, so the notebook renders the returned frame: the columns of `df`
# plus ingredient0_feature ... ingredient4_feature (default top_k=5).
ie.transform(df, df_ingredient)

Unnamed: 0,drug_id,description,administrative_status,marketing_status,approved_for_hospital_use,reimbursement_rate,dosage_form,route_of_administration,marketing_authorization_status,marketing_declaration_date,marketing_authorization_date,marketing_authorization_process,pharmaceutical_companies,price,ingredient0_feature,ingredient1_feature,ingredient2_feature,ingredient3_feature,ingredient4_feature
0,0_train,3 plaquette(s) thermoformée(s) PVC-Aluminium d...,Présentation active,Déclaration de commercialisation,oui,65%,comprimé pelliculé,orale,Autorisation active,20140101,20140101,Procédure décentralisée,MAJORELLE LUXEMBOURG SOPARFI (LUXEMBOURG),2.83,1.985000,0.000000,0.0,0.0,0.0
1,1_train,plaquette(s) thermoformée(s) aluminium de 28 c...,Présentation active,Déclaration de commercialisation,oui,65%,comprimé à croquer,orale,Autorisation active,20130101,20090101,Procédure de reconnaissance mutuelle,TEVA SANTE,14.30,7.708571,8.018889,0.0,0.0,0.0
2,2_train,plaquette(s) thermoformée(s) PVC-aluminium de ...,Présentation active,Déclaration de commercialisation,oui,65%,gélule à libération prolongée,orale,Autorisation active,20000101,19960101,Procédure nationale,MYLAN SAS,5.66,13.363421,0.000000,0.0,0.0,0.0
3,3_train,30 sachet(s)-dose(s) papier aluminium complexe...,Présentation active,Déclaration de commercialisation,oui,65%,granulés à libération prolongée,orale,Autorisation active,20050101,20040101,Procédure nationale,SANOFI AVENTIS FRANCE,24.27,10.711667,10.711667,0.0,0.0,0.0
4,4_train,plaquette(s) thermoformée(s) PVC-Aluminium de ...,Présentation active,Déclaration d'arrêt de commercialisation,non,100%,comprimé pelliculé,orale,Autorisation active,20150101,20100101,Procédure nationale,TEVA SANTE,59.94,64.461538,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8559,8559_train,plaquette(s) thermoformée(s) polyamide alumini...,Présentation active,Déclaration d'arrêt de commercialisation,non,65%,comprimé orodispersible,orale,Autorisation active,20150101,20080101,Procédure décentralisée,RATIOPHARM (ALLEMAGNE),43.50,35.173418,0.000000,0.0,0.0,0.0
8560,8560_train,"12 seringue(s) préremplie(s) en verre de 0,5 ...",Présentation active,Déclaration de commercialisation,oui,65%,solution injectable,sous-cutanée,Autorisation active,19990101,19980101,Procédure centralisée,MERCK SERONO EUROPE (ROYAUME-UNI),781.37,752.595000,0.000000,0.0,0.0,0.0
8561,8561_train,flacon(s) polyéthylène haute densité (PEHD) de...,Présentation active,Déclaration d'arrêt de commercialisation,non,65%,comprimé gastro-résistant(e),orale,Autorisation active,20140101,20080101,Procédure nationale,CRISTERS,3.94,2.987571,2.987571,0.0,0.0,0.0
8562,8562_train,plaquette(s) thermoformée(s) PVC-Aluminium PVD...,Présentation active,Déclaration de commercialisation,oui,65%,comprimé pelliculé sécable,orale,Autorisation active,20090101,20010101,Procédure nationale,ARROW GENERIQUES,14.21,7.222195,0.000000,0.0,0.0,0.0
