Training of a new model

In [4]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_score

df_drug = pd.read_csv("drug200.csv")

label_encoder = LabelEncoder()

categorical_features = [feature for feature in df_drug.columns if df_drug[feature].dtypes == 'O']
for feature in categorical_features:
    df_drug[feature]=label_encoder.fit_transform(df_drug[feature])
    
X = df_drug.drop("Drug", axis=1)
y = df_drug["Drug"]

model = DecisionTreeClassifier(criterion="entropy")
model.fit(X, y)

kfold = KFold(random_state=42, shuffle=True)
cv_results = cross_val_score(model, X, y, cv=kfold, scoring="accuracy")
print("Accuracy")
print(cv_results.mean())

Accuracy
0.99


Saving Trained Model

In [2]:
import pickle

pickle_file = open('model.pkl', 'ab')
pickle.dump(model, pickle_file)                     
pickle_file.close()

Determine encoded values of features

In [5]:
df_drug = pd.read_csv("drug200.csv")

label_encoder = LabelEncoder()

categorical_features = [feature for feature in df_drug.columns if df_drug[feature].dtypes == 'O']
for feature in categorical_features:
    print(feature, list(df_drug[feature].unique()), list(label_encoder.fit_transform(df_drug[feature].unique())), "\n")

Sex ['F', 'M'] [0, 1] 

BP ['HIGH', 'LOW', 'NORMAL'] [0, 1, 2] 

Cholesterol ['HIGH', 'NORMAL'] [0, 1] 

Drug ['DrugY', 'drugC', 'drugX', 'drugA', 'drugB'] [0, 3, 4, 1, 2] 



Representation of Model as function mapping consumer's inputs to output

In [8]:
gender_map = {"F": 0, "M": 1}
bp_map = {"HIGH": 0, "LOW": 1, "NORMAL": 2}
cholestol_map = {"HIGH": 0, "NORMAL": 1}
drug_map = {0: "DrugY", 3: "drugC", 4: "drugX", 1: "drugA", 2: "drugB"}

def predict_drug(Age, 
                 Sex, 
                 BP, 
                 Cholesterol, 
                 Na_to_K):

    # 1. Read the machine learning model from its saved state ...
    pickle_file = open('model.pkl', 'rb')     
    model = pickle.load(pickle_file)
    
    # 2. Transform the "raw data" passed into the function to the encoded / numerical values using the maps / dictionaries
    Sex = gender_map[Sex]
    BP = bp_map[BP]
    Cholesterol = cholestol_map[Cholesterol]

    # 3. Make an individual prediction for this set of data
    y_predict = model.predict([[Age, Sex, BP, Cholesterol, Na_to_K]])[0]

    # 4. Return the "raw" version of the prediction i.e. the actual name of the drug rather than the numerical encoded version
    return drug_map[y_predict] 

    
print("----------------------------------")
print("----------------------------------")

# Testing of that model function
predict_drug(47, "F", "LOW",  "HIGH", 14)


----------------------------------
----------------------------------


  "X does not have valid feature names, but"


'drugC'