In [88]:
import pandas as pd
import numpy as np
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

In [89]:
df = pd.read_csv("../data/drugs_train.csv")

In [90]:
df_ing = pd.read_csv("../data/active_ingredients.csv")

In [91]:
df.head(2)

Unnamed: 0,drug_id,description,administrative_status,marketing_status,approved_for_hospital_use,reimbursement_rate,dosage_form,route_of_administration,marketing_authorization_status,marketing_declaration_date,marketing_authorization_date,marketing_authorization_process,pharmaceutical_companies,price
0,0_train,3 plaquette(s) thermoformée(s) PVC-Aluminium d...,Présentation active,Déclaration de commercialisation,oui,65%,comprimé pelliculé,orale,Autorisation active,20140101,20140101,Procédure décentralisée,MAJORELLE LUXEMBOURG SOPARFI (LUXEMBOURG),2.83
1,1_train,plaquette(s) thermoformée(s) aluminium de 28 c...,Présentation active,Déclaration de commercialisation,oui,65%,comprimé à croquer,orale,Autorisation active,20130101,20090101,Procédure de reconnaissance mutuelle,TEVA SANTE,14.3


In [92]:
df_ing.head(2)

Unnamed: 0,drug_id,active_ingredient
0,0_train,DÉSOGESTREL
1,1_train,MONTÉLUKAST ACIDE


In [133]:
"highgh".split(" ")

['highgh']

In [93]:
df_ing = df_ing[df_ing.drug_id.str.contains("train")]

In [94]:
def split_or_keep(s):
    if " " in s:
        return s.split(" ")[0]
    return s

In [95]:
df_ing.active_ingredient = df_ing.active_ingredient.str.strip()
df_ing.active_ingredient = df_ing.active_ingredient.map(split_or_keep)

In [96]:
df_ing.head(2)

Unnamed: 0,drug_id,active_ingredient
0,0_train,DÉSOGESTREL
1,1_train,MONTÉLUKAST


In [97]:
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(df, random_state=40, test_size=0.2)

In [98]:
df_ing_train = df_ing[df_ing.drug_id.isin(X_train.drug_id)]
df_ing_test = df_ing[df_ing.drug_id.isin(X_test.drug_id)]

## Ingredient price estimation

In [99]:
df_ing_train["ing_count"] = df_ing_train.groupby("drug_id").active_ingredient.transform("nunique")
df_ing_train["price"] = df_ing_train.drug_id.map(X_train.set_index("drug_id").price)
df_ing_train["ing_price"] = df_ing_train.price / df_ing_train.ing_count

In [100]:
df_ing_train.head(6)

Unnamed: 0,drug_id,active_ingredient,ing_count,price,ing_price
3,2_train,CHLORHYDRATE,1,5.66,5.66
6,4_train,LÉTROZOLE,1,59.94,59.94
7,5_train,FLUTAMIDE,1,27.17,27.17
8,6_train,BEXAROTÈNE,1,877.83,877.83
9,7_train,GLIMÉPIRIDE,1,6.58,6.58
10,8_train,MALÉATE,1,18.63,18.63


In [101]:
df_ing_train["ing_feature"] = df_ing_train.groupby("active_ingredient").ing_price.transform("mean")

In [102]:
df_ing_train.head(10)

Unnamed: 0,drug_id,active_ingredient,ing_count,price,ing_price,ing_feature
3,2_train,CHLORHYDRATE,1,5.66,5.66,9.379887
6,4_train,LÉTROZOLE,1,59.94,59.94,63.621818
7,5_train,FLUTAMIDE,1,27.17,27.17,27.17
8,6_train,BEXAROTÈNE,1,877.83,877.83,877.83
9,7_train,GLIMÉPIRIDE,1,6.58,6.58,9.66
10,8_train,MALÉATE,1,18.63,18.63,5.070337
11,9_train,CEFPODOXIME,1,6.13,6.13,5.261512
12,9_train,CEFPODOXIME,1,6.13,6.13,5.261512
13,10_train,ESTRADIOL,1,6.25,6.25,4.263617
14,10_train,ESTRADIOL,1,6.25,6.25,4.263617


In [103]:
ing_price_map = df_ing_train.groupby("active_ingredient").ing_price.mean()

In [104]:
df_ing_test["ing_feature"] = df_ing_test.active_ingredient.map(ing_price_map)

In [105]:
df_ing_test.head()

Unnamed: 0,drug_id,active_ingredient,ing_feature
0,0_train,DÉSOGESTREL,2.154
1,1_train,MONTÉLUKAST,15.502439
2,1_train,MONTÉLUKAST,15.502439
4,3_train,ACIDE,17.81879
5,3_train,VALPROATE,7.519167


In [106]:
df_ing_test.fillna(df_ing_test.ing_feature.mean(), inplace=True)

In [107]:
df_ing_test.sort_values('ing_feature',ascending=True).head()

Unnamed: 0,drug_id,active_ingredient,ing_feature
11845,7313_train,FLUOR,0.696333
9905,6162_train,FLUOR,0.696333
587,342_train,FLUOR,0.696333
12899,7911_train,ALGINATE,0.729167
8033,5002_train,DEXAMÉTHASONE,0.738333


## Create features per drug

In [108]:
top_ing = 10
def agg_func(x):
    if len(x) >=top_ing:
        return x.iloc[0:top_ing].tolist()
    else:
        part_one = x.iloc[0:top_ing].tolist()
        s = [0 for _ in range(top_ing - len(part_one))]
        return part_one + s

In [109]:
f_cols = [f"f_{i}" for i in range(top_ing)]

In [110]:
train_f = df_ing_train.sort_values('ing_feature',
                                    ascending=True
                                  ).groupby('drug_id', 
                                            sort=False
                                           ).agg({'ing_feature':lambda x: agg_func(x)}).reset_index()

test_f = df_ing_test.sort_values('ing_feature',
                                  ascending=True
                                  ).groupby('drug_id', 
                                            sort=False
                                           ).agg({'ing_feature':lambda x: agg_func(x)}).reset_index()

In [111]:
train_f[f_cols] = pd.DataFrame(train_f.ing_feature.tolist())
train_f.drop("ing_feature", axis=1, inplace=True)
test_f[f_cols] = pd.DataFrame(test_f.ing_feature.tolist())
test_f.drop("ing_feature", axis=1, inplace=True)

In [112]:
X_train = X_train.merge(train_f)
X_test = X_test.merge(test_f)

In [113]:
X_train.head(2)

Unnamed: 0,drug_id,description,administrative_status,marketing_status,approved_for_hospital_use,reimbursement_rate,dosage_form,route_of_administration,marketing_authorization_status,marketing_declaration_date,...,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9
0,2850_train,plaquette(s) thermoformée(s) aluminium polyami...,Présentation active,Déclaration de commercialisation,oui,65%,comprimé sécable,orale,Autorisation active,20140101,...,5.070337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3258_train,1 seringue(s) préremplie(s) polycyclooléfine -...,Présentation active,Déclaration de commercialisation,oui,100%,poudre et solvant pour solution injectable,sous-cutanée,Autorisation active,20060101,...,20.130466,60.76,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
X_test.head(2)

Unnamed: 0,drug_id,description,administrative_status,marketing_status,approved_for_hospital_use,reimbursement_rate,dosage_form,route_of_administration,marketing_authorization_status,marketing_declaration_date,...,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9
0,3955_train,plaquette(s) polyamide aluminium PVC-Aluminium...,Présentation active,Déclaration de commercialisation,oui,65%,comprimé pelliculé sécable,orale,Autorisation active,19920101,...,16.982795,16.982795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5939_train,plaquette(s) thermoformée(s) PVC-Aluminium de ...,Présentation active,Déclaration de commercialisation,oui,65%,gélule,orale,Autorisation active,20150101,...,9.274242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [122]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [117]:
y_train, y_test = X_train["price"], X_test["price"]

In [126]:
y_mean, y_std = y_train.mean(), y_train.std()

In [127]:
y_train_bis = (y_train - y_mean) / y_std
y_test_bis = (y_test - y_mean) / y_std

In [128]:
model = LinearRegression(n_jobs=-1)
#model = xgb.XGBRegressor(n_estimators=7, max_depth=7, verbosity=1, reg_lambda=0.001, reg_alpha=0.1)

In [129]:
model.fit(X_train[f_cols], y_train_bis)

LinearRegression(n_jobs=-1)

In [132]:
from sklearn.metrics import mean_squared_error
d = {"train": [X_train[f_cols], y_train], "val": [X_test[f_cols], y_test]}
for k in d:
    print("*"*50)
    print(k)
    preds = y_mean + y_std * model.predict(d[k][0])
    truth = d[k][1]
    rmse = mean_squared_error(preds, truth, squared=False)
    print(f"rmse: {rmse}")

**************************************************
train
rmse: 40.835496081100594
**************************************************
val
rmse: 59.104125254953345
