In [85]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, confusion_matrix
%matplotlib inline

In [68]:
# Location of the training set (remote CSV hosted in the dphi-official Datasets repo).
TRAINING_SET_URL = (
    "https://raw.githubusercontent.com/dphi-official/Datasets/"
    "master/pharma_data/Training_set_begs.csv"
)

# Load the raw training data straight from the URL into a DataFrame.
pharma_data = pd.read_csv(TRAINING_SET_URL)

In [69]:
# Dimensions of the freshly loaded frame as (rows, columns).
pharma_data.shape

(23097, 18)

In [70]:
# Preview the first rows to see the column names and some sample values.
pharma_data.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,22374,8,3333,DX6,56,18.479385,YES,URBAN,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
1,18164,5,5740,DX2,36,22.945566,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
2,6283,23,10446,DX6,48,27.510027,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,5339,51,12011,DX1,5,19.130976,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,33012,0,12513,,128,1.3484,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1


In [71]:
# Per-column dtype and non-null count; reveals which columns have missing values.
pharma_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23097 entries, 0 to 23096
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  23097 non-null  int64  
 1   Diagnosed_Condition        23097 non-null  int64  
 2   Patient_ID                 23097 non-null  int64  
 3   Treated_with_drugs         23084 non-null  object 
 4   Patient_Age                23097 non-null  int64  
 5   Patient_Body_Mass_Index    23097 non-null  float64
 6   Patient_Smoker             23097 non-null  object 
 7   Patient_Rural_Urban        23097 non-null  object 
 8   Patient_mental_condition   23097 non-null  object 
 9   A                          21862 non-null  float64
 10  B                          21862 non-null  float64
 11  C                          21862 non-null  float64
 12  D                          21862 non-null  float64
 13  E                          21862 non-null  flo

In [72]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each original column becomes one row of the table.
pharma_data.describe(include = "all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
ID_Patient_Care_Situation,23097,,,,16545.7,9532.26,2.0,8280.0,16597.0,24825.0,33014.0
Diagnosed_Condition,23097,,,,26.4131,15.0309,0.0,13.0,26.0,39.0,52.0
Patient_ID,23097,,,,6261.28,3595.99,1.0,3181.0,6242.0,9363.0,12515.0
Treated_with_drugs,23084,32.0,DX6,8606.0,,,,,,,
Patient_Age,23097,,,,33.2098,19.5499,0.0,16.0,33.0,50.0,149.0
Patient_Body_Mass_Index,23097,,,,23.4548,3.80766,1.0893,20.2055,23.3862,26.7882,29.9996
Patient_Smoker,23097,3.0,NO,13246.0,,,,,,,
Patient_Rural_Urban,23097,2.0,RURAL,16134.0,,,,,,,
Patient_mental_condition,23097,1.0,Stable,23097.0,,,,,,,
A,21862,,,,0.897905,0.30278,0.0,1.0,1.0,1.0,1.0


In [73]:
# Rows with no recorded drug get an explicit "UNKNOWN" category instead of NaN.
drug_column = "Treated_with_drugs"
pharma_data[drug_column] = pharma_data[drug_column].fillna("UNKNOWN")

# Columns A-F, Z and Number_of_prev_cond: missing entries are replaced by 0.
zero_filled_columns = ["A", "B", "C", "D", "E", "F", "Z", "Number_of_prev_cond"]
pharma_data[zero_filled_columns] = pharma_data[zero_filled_columns].fillna(0)

In [74]:
# Remove leading/trailing whitespace from the drug labels, store the cleaned
# values back on the frame, then list the distinct labels that remain.
drugs_stripped = pharma_data["Treated_with_drugs"].str.strip()
pharma_data["Treated_with_drugs"] = drugs_stripped
print(drugs_stripped.unique())

['DX6' 'DX2' 'DX1' 'UNKNOWN' 'DX3' 'DX5' 'DX4' 'DX2 DX5' 'DX2 DX3'
 'DX1 DX3' 'DX1 DX2' 'DX3 DX4' 'DX1 DX4' 'DX1 DX5' 'DX2 DX4 DX5'
 'DX1 DX2 DX3' 'DX3 DX5' 'DX2 DX4' 'DX3 DX4 DX5' 'DX2 DX3 DX5'
 'DX1 DX2 DX3 DX4' 'DX4 DX5' 'DX2 DX3 DX4' 'DX1 DX4 DX5' 'DX1 DX2 DX4'
 'DX1 DX3 DX5' 'DX1 DX2 DX5' 'DX1 DX3 DX4' 'DX1 DX3 DX4 DX5'
 'DX1 DX2 DX4 DX5' 'DX2 DX3 DX4 DX5' 'DX1 DX2 DX3 DX5'
 'DX1 DX2 DX3 DX4 DX5']


In [75]:
def _label_encode_column(column_name):
    """Label-encode one column of ``pharma_data`` in place.

    A fresh ``LabelEncoder`` is fitted on the column's current values and the
    column is overwritten with the encoded result.

    Parameters
    ----------
    column_name : str
        Name of the ``pharma_data`` column to encode.

    Returns
    -------
    LabelEncoder
        The fitted encoder, kept so the mapping can be reused or inverted.
    """
    column_encoder = LabelEncoder()
    pharma_data[column_name] = column_encoder.fit_transform(pharma_data[column_name])
    return column_encoder


# One independent encoder per categorical column.
encoder_drugs = _label_encode_column("Treated_with_drugs")
encoder_smoker = _label_encode_column("Patient_Smoker")
encoder_ruralurban = _label_encode_column("Patient_Rural_Urban")
encoder_mental = _label_encode_column("Patient_mental_condition")


In [76]:
# Separate the label from the predictors: y is the target column, X is every
# other column of the frame.
# NOTE(review): X still contains ID_Patient_Care_Situation and Patient_ID;
# these look like identifiers rather than predictive features - confirm
# whether they should be excluded before modelling.
target_column = "Survived_1_year"
y = pharma_data[target_column]
X = pharma_data.drop(columns=[target_column])

# Hold out 15% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=1,
)

In [77]:
# Seed the estimator so that a fresh "Restart & Run All" rebuilds the same tree.
# scikit-learn randomly permutes the candidate features at every split, so
# without a fixed random_state equally good splits may be chosen differently
# from run to run, and the reported metrics drift with them.
# The seed value matches the one passed to train_test_split in this notebook.
# NOTE(review): the tree is grown with default (unlimited) depth; consider
# tuning max_depth / min_samples_leaf if train and test scores diverge.
tree_classifier_model = DecisionTreeClassifier(random_state=1)

In [78]:
# Fit the decision tree on the training partition only; the test partition
# is not seen during fitting.
tree_classifier_model.fit(X_train, y_train)

DecisionTreeClassifier()

In [81]:
# Predict on the same rows the model was fitted on, to measure how closely
# the tree reproduces its own training labels.
y_pred_train = tree_classifier_model.predict(X_train)

In [84]:
# Per-class precision, recall, F1 and support on the training partition.
report_train = classification_report(y_train, y_pred_train)
print(report_train)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7211
           1       1.00      1.00      1.00     12421

    accuracy                           1.00     19632
   macro avg       1.00      1.00      1.00     19632
weighted avg       1.00      1.00      1.00     19632



In [86]:
# Training-set confusion matrix (scikit-learn convention: rows are the true
# classes, columns are the predicted classes).
print(confusion_matrix(y_train, y_pred_train))

[[ 7211     0]
 [    0 12421]]


In [87]:
# Predict on the held-out partition, which was not used for fitting.
y_pred_test = tree_classifier_model.predict(X_test)

In [89]:
# Per-class precision, recall, F1 and support on the held-out partition.
report_test = classification_report(y_test, y_pred_test)
print(report_test)

              precision    recall  f1-score   support

           0       0.69      0.72      0.70      1283
           1       0.83      0.81      0.82      2182

    accuracy                           0.77      3465
   macro avg       0.76      0.76      0.76      3465
weighted avg       0.78      0.77      0.77      3465



In [90]:
# Held-out confusion matrix (scikit-learn convention: rows are the true
# classes, columns are the predicted classes).
print(confusion_matrix(y_test, y_pred_test))

[[ 918  365]
 [ 422 1760]]


(to be continued ...)