In [1]:
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from IPython.display import Image
from six import StringIO
from sklearn.tree import export_graphviz
import pydotplus

In [2]:
df = pd.read_csv('../../datasets/train.csv', encoding="latin-1")
df.head()

Unnamed: 0,Column1,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Hispanic,Body_Part,Diagnosis,...,Fire_Involvement,Alcohol,Drug,Product_1,Product_2,Product_3,Narrative,Stratum,PSU,Weight
0,900763,180608221.0,5/29/2018,25,1,2.0,,,35.0,71.0,...,0.0,,,3265.0,0.0,0.0,25YOM PAIN TO KNEE WHEN LIFTING WEIGHTS ...,L,42.0,55.4188
1,762342,180109488.0,1/1/2018,70,2,1.0,,,30.0,55.0,...,0.0,,,276.0,0.0,0.0,70 YOF PT STATES SHE FELL IN HER KITCHEN & HIT...,M,16.0,78.3782
2,560150,170718633.0,7/5/2017,18,1,0.0,,,92.0,59.0,...,0.0,,,3286.0,0.0,0.0,18 YO M LAC FINGER-4-WHEELER ACCIDENT,V,17.0,16.1828
3,464771,170429466.0,4/1/2017,17,2,1.0,,,34.0,53.0,...,0.0,,,4056.0,0.0,0.0,17 YOF A SHELF FELL ONTO PT WRIST CAUSING A CO...,C,32.0,4.757
4,866180,180504499.0,4/27/2018,27,1,2.0,,,35.0,64.0,...,0.0,,,1267.0,0.0,0.0,27 YOM HAS KNEE PX WHICH WORSENS WHILE PLAYING...,V,93.0,16.5919


In [3]:
burns = df[(df["Diagnosis"] == 49) | (df["Diagnosis"] == 46)]

In [4]:
inputs = burns.filter(["Diagnosis", "Age", "Sex","Race"]).dropna()
inputs

Unnamed: 0,Diagnosis,Age,Sex,Race
1014,49.0,65,2,0.0
1209,46.0,28,1,1.0
1461,49.0,30,2,1.0
3596,49.0,50,1,0.0
4337,49.0,26,2,0.0
...,...,...,...,...
700505,49.0,42,2,0.0
700680,49.0,223,1,2.0
701549,49.0,66,2,1.0
702092,49.0,51,1,0.0


In [5]:
age = LabelEncoder()
sex = LabelEncoder()
race = LabelEncoder()
inputs["age_n"] = age.fit_transform(inputs["Age"])
inputs["sex_n"] = sex.fit_transform(inputs["Sex"])
inputs["race_n"] = race.fit_transform(inputs["Race"])
inputs

Unnamed: 0,Diagnosis,Age,Sex,Race,age_n,sex_n,race_n
1014,49.0,65,2,0.0,64,1,0
1209,46.0,28,1,1.0,27,0,1
1461,49.0,30,2,1.0,29,1,1
3596,49.0,50,1,0.0,49,0,0
4337,49.0,26,2,0.0,25,1,0
...,...,...,...,...,...,...,...
700505,49.0,42,2,0.0,41,1,0
700680,49.0,223,1,2.0,107,0,2
701549,49.0,66,2,1.0,65,1,1
702092,49.0,51,1,0.0,50,0,0


In [6]:
target = inputs["Diagnosis"]
target

1014      49.0
1209      46.0
1461      49.0
3596      49.0
4337      49.0
          ... 
700505    49.0
700680    49.0
701549    49.0
702092    49.0
702330    49.0
Name: Diagnosis, Length: 1131, dtype: float64

In [7]:
inputs_n = inputs.drop(['Diagnosis', 'Age', 'Sex', 'Race'], axis='columns')
inputs_n

Unnamed: 0,age_n,sex_n,race_n
1014,64,1,0
1209,27,0,1
1461,29,1,1
3596,49,0,0
4337,25,1,0
...,...,...,...
700505,41,1,0
700680,107,0,2
701549,65,1,1
702092,50,0,0


In [8]:
model = tree.DecisionTreeClassifier()

In [9]:
X_train, X_test, y_train, y_test = train_test_split(inputs_n, target, test_size=0.2, random_state=1) # 80% training and 20% test

In [10]:
X_train, xval, y_train, yval = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 60% training and 20% 20% validation

In [11]:
model.fit(X_train,y_train)

DecisionTreeClassifier()

In [12]:
model.score(X_test,y_test)

0.7136563876651982

In [13]:
predict = model.predict(X_test)
print("Accuracy Score: {}\n\nClassification Report:\n{}".format(accuracy_score(y_test, predict), classification_report(y_test, predict)))

Accuracy Score: 0.7136563876651982

Classification Report:
              precision    recall  f1-score   support

        46.0       0.26      0.28      0.27        43
        49.0       0.83      0.82      0.82       184

    accuracy                           0.71       227
   macro avg       0.54      0.55      0.55       227
weighted avg       0.72      0.71      0.72       227



In [14]:
confusion_matrix(y_test, model.predict(X_test))

array([[ 12,  31],
       [ 34, 150]])

In [15]:
# diagnosis = df["Diagnosis"].dropna().unique()
diagnosis = ['71', '55', '59', '53', '64', '57', '56', '51', '72', '48', '58', '62', '52',
       '74', '61', '41', '60', '63', '65', '66', '42', '68', '49', '50', '46', '69',
       '54', '47', '73', '67']
diagnosis = ['46', '49']

In [16]:
feature_cols = ["Age", "Sex", "Race"]
dot_data = StringIO()
export_graphviz(model, out_file=dot_data, filled=True, rounded=True, special_characters=True,feature_names = feature_cols,
    class_names=diagnosis)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('../../outputs/demographic-burninjury.png')
Image(graph.create_png())

KeyboardInterrupt: 