In [43]:
from typing import List
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class HighCardTransformer(BaseEstimator, TransformerMixin):
    """
    Encode high cardinality features with an aggregated value of the price.

    Every configured column is replaced by a ``<column>_feature`` column that
    holds the aggregate of ``price`` (mean by default) observed for that
    category while fitting.

    Parameters
    ----------
    columns : list of str, optional
        Names of the categorical columns to encode. ``None`` is treated as
        "no columns", in which case ``transform`` returns an unchanged copy.

    Attributes
    ----------
    columns_dict : dict
        Maps each fitted column name to a Series indexed by category value
        and holding the aggregated price. Rebuilt on every call to ``fit``.

    Notes
    -----
    The second positional argument of ``fit`` is ``agg_func``, not the usual
    scikit-learn ``y``; do not pass a target positionally.
    """

    def __init__(self, columns: List[str] = None):
        self.columns = columns
        self.columns_dict = {}

    def fit(self, df: pd.DataFrame, agg_func: str = "mean"):
        """
        Learn the per-category price aggregate for every configured column.

        Parameters
        ----------
        df : pd.DataFrame
            Training frame; must contain the configured columns and ``price``.
        agg_func : str, default "mean"
            Aggregation forwarded to pandas ``agg`` (e.g. "mean", "median").

        Returns
        -------
        HighCardTransformer
            The fitted transformer itself.
        """
        selected = [] if self.columns is None else list(self.columns)
        # Rebuild from scratch so a re-fit never keeps mappings of columns
        # that are no longer configured.
        self.columns_dict = {
            col: df.groupby(col)["price"].agg(func=agg_func) for col in selected
        }
        return self

    def transform(self, df: pd.DataFrame):
        """
        Replace each configured column by its ``<column>_feature`` encoding.

        Categories that were not seen during ``fit`` have no entry in the
        learned mapping and therefore come out as NaN.

        Parameters
        ----------
        df : pd.DataFrame
            Frame containing the configured columns; it is not modified.

        Returns
        -------
        pd.DataFrame
            A copy of ``df`` with the encoded columns appended and the source
            columns removed.
        """
        selected = [] if self.columns is None else list(self.columns)
        encoded = df.copy()
        for col in selected:
            encoded[col + "_feature"] = encoded[col].map(self.columns_dict[col])
        # Drop all source columns in one non-mutating call instead of an
        # in-place drop per iteration.
        return encoded.drop(columns=selected)

In [44]:
import pandas as pd

In [45]:
# Load the raw training table; the relative path is resolved against the
# kernel's current working directory.
train_df = pd.read_csv("../data/drugs_train.csv")

In [46]:
# Categorical columns that will be replaced by a per-category price aggregate.
high_card_columns = ["dosage_form", "route_of_administration", "pharmaceutical_companies"]

# Unfitted encoder; the per-category mappings are only learned by hct.fit(...).
hct = HighCardTransformer(columns=high_card_columns)

In [47]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation split (and every output derived from it)
# is identical after Restart Kernel -> Run All.
X_train, X_val = train_test_split(train_df, random_state=42)

In [48]:
# Preview the first two rows of the training split.
X_train.head(2)

Unnamed: 0,drug_id,description,administrative_status,marketing_status,approved_for_hospital_use,reimbursement_rate,dosage_form,route_of_administration,marketing_authorization_status,marketing_declaration_date,marketing_authorization_date,marketing_authorization_process,pharmaceutical_companies,price
8101,8101_train,3 plaquette(s) PVC-Aluminium de 21 comprimé(s),Présentation active,Déclaration de commercialisation,oui,65%,comprimé enrobé et comprimé enrobé et compri...,orale,Autorisation active,19860101,19850101,Procédure nationale,PFIZER HOLDING FRANCE,3.79
6698,6698_train,plaquette(s) thermoformée(s) polyamide alumini...,Présentation active,Déclaration d'arrêt de commercialisation,non,65%,comprimé orodispersible,orale,Autorisation active,20130101,20060101,Procédure nationale,RATIOPHARM (ALLEMAGNE),6.24


In [49]:
# Learn the per-category price aggregates from the training split only
# (agg_func is left at its default, "mean").
hct.fit(X_train)

HighCardTransformer(columns=['dosage_form', 'route_of_administration',
                             'pharmaceutical_companies'])

In [54]:
# NOTE(review): despite its name, X_train_tr holds the encoded *validation*
# split (X_val), not the training split — confirm and rename if unintended.
X_train_tr = hct.transform(X_val)

In [55]:
# Preview the encoded frame: the configured source columns are gone and the
# matching *_feature columns are appended at the end.
X_train_tr.head(2)

Unnamed: 0,drug_id,description,administrative_status,marketing_status,approved_for_hospital_use,reimbursement_rate,marketing_authorization_status,marketing_declaration_date,marketing_authorization_date,marketing_authorization_process,price,dosage_form_feature,route_of_administration_feature,pharmaceutical_companies_feature
6173,6173_train,1 flacon(s) en verre de 30 ml avec compte-gou...,Présentation active,Déclaration de commercialisation,oui,65%,Autorisation active,19660101,19940101,Procédure nationale,2.73,27.240366,20.418007,12.24359
1446,1446_train,1 flacon(s) polyéthylène de 90 gélule(s),Présentation active,Déclaration de commercialisation,oui,65%,Autorisation active,20070101,19960101,Procédure nationale,104.94,17.4875,20.418007,33.409216


In [31]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from typing import List
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class HighCardTransformer(BaseEstimator, TransformerMixin):
    """
    Encode high cardinality features with an aggregated value of the price.

    Every configured column is replaced by a ``<column>_feature`` column that
    holds the aggregate of ``price`` (mean by default) observed for that
    category while fitting.

    Parameters
    ----------
    columns : list of str, optional
        Names of the categorical columns to encode. ``None`` is treated as
        "no columns", in which case ``transform`` returns an unchanged copy.

    Attributes
    ----------
    columns_dict : dict
        Maps each fitted column name to a Series indexed by category value
        and holding the aggregated price. Rebuilt on every call to ``fit``.

    Notes
    -----
    The second positional argument of ``fit`` is ``agg_func``, not the usual
    scikit-learn ``y``; do not pass a target positionally.
    """

    def __init__(self, columns: List[str] = None):
        self.columns = columns
        self.columns_dict = {}

    def fit(self, df: pd.DataFrame, agg_func: str = "mean"):
        """
        Learn the per-category price aggregate for every configured column.

        Parameters
        ----------
        df : pd.DataFrame
            Training frame; must contain the configured columns and ``price``.
        agg_func : str, default "mean"
            Aggregation forwarded to pandas ``agg`` (e.g. "mean", "median").

        Returns
        -------
        HighCardTransformer
            The fitted transformer itself.
        """
        selected = [] if self.columns is None else list(self.columns)
        # Rebuild from scratch so a re-fit never keeps mappings of columns
        # that are no longer configured.
        self.columns_dict = {
            col: df.groupby(col)["price"].agg(func=agg_func) for col in selected
        }
        return self

    def transform(self, df: pd.DataFrame):
        """
        Replace each configured column by its ``<column>_feature`` encoding.

        Categories that were not seen during ``fit`` have no entry in the
        learned mapping and therefore come out as NaN.

        Parameters
        ----------
        df : pd.DataFrame
            Frame containing the configured columns; it is not modified.

        Returns
        -------
        pd.DataFrame
            A copy of ``df`` with the encoded columns appended and the source
            columns removed.
        """
        selected = [] if self.columns is None else list(self.columns)
        encoded = df.copy()
        for col in selected:
            encoded[col + "_feature"] = encoded[col].map(self.columns_dict[col])
        # The encoded frame must be returned: without it the method yields
        # None, which breaks fit_transform and any Pipeline step using it.
        return encoded.drop(columns=selected)


# Use Custom Transformer
# NOTE(review): MultiplyColumns is not defined in the cells visible here, while
# the transformer defined above is HighCardTransformer(columns=...) — confirm
# which class this pipeline is meant to exercise.
df = pd.DataFrame({"a": [1, -2, 3], "price": [-4, 5, 6], "c": [-7, -8, 9]})
pipe = Pipeline(
    steps=[
        ("multiply_cols_by_3", MultiplyColumns(high_card_columns=["a", "c"]))
    ]
)
transformed_df = pipe.fit_transform(df)
# NOTE(review): this prints the input frame, not transformed_df — confirm that
# is intended.
print(df)

   a  price  c
0  1     -4 -7
1 -2      5 -8
2  3      6  9
