In [1]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [21]:
# Read data
animals = pd.read_csv('horse.csv')
animals.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


Preprocessing

In [22]:
# Initialize target variable
target = animals['outcome']

# Get the unique values of the target variable
target.unique()

array(['died', 'euthanized', 'lived'], dtype=object)

In [23]:
# Extract the features of the dataset by dropping the target variable
animals = animals.drop(['outcome'], axis=1)

In [24]:
# Check if there are categorical data
animals.info()

# We can see that we have 16 categorical variables

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                299 non-null    object 
 1   age                    299 non-null    object 
 2   hospital_number        299 non-null    int64  
 3   rectal_temp            239 non-null    float64
 4   pulse                  275 non-null    float64
 5   respiratory_rate       241 non-null    float64
 6   temp_of_extremities    243 non-null    object 
 7   peripheral_pulse       230 non-null    object 
 8   mucous_membrane        252 non-null    object 
 9   capillary_refill_time  267 non-null    object 
 10  pain                   244 non-null    object 
 11  peristalsis            255 non-null    object 
 12  abdominal_distention   243 non-null    object 
 13  nasogastric_tube       195 non-null    object 
 14  nasogastric_reflux     193 non-null    object 
 15  nasoga

In [25]:
# Convert categorical data into numerical data
animals = pd.get_dummies(animals)
animals.head()

Unnamed: 0,hospital_number,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,...,abdomen_firm,abdomen_normal,abdomen_other,abdomo_appearance_clear,abdomo_appearance_cloudy,abdomo_appearance_serosanguious,surgical_lesion_no,surgical_lesion_yes,cp_data_no,cp_data_yes
0,530101,38.5,66.0,28.0,,45.0,8.4,,11300,0,...,0,0,0,0,0,0,1,0,1,0
1,534817,39.2,88.0,20.0,,50.0,85.0,2.0,2208,0,...,0,0,1,0,1,0,1,0,1,0
2,530334,38.3,40.0,24.0,,33.0,6.7,,0,0,...,0,1,0,0,0,0,1,0,0,1
3,5290409,39.1,164.0,84.0,5.0,48.0,7.2,5.3,2208,0,...,0,0,0,0,0,1,0,1,0,1
4,530255,37.3,104.0,35.0,,74.0,7.4,,4300,0,...,0,0,0,0,0,0,1,0,1,0


Modelling

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Initialize independent and dependent variables
X, y = animals.values, target.values

# Split the dataset into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Check shape of the training set
print("Training set shape:", X_train.shape)

Training set shape: (239, 67)


In [35]:
from sklearn.impute import SimpleImputer
import numpy as np

# Replace NaN values with the most frequent value that appears, since we must be careful with missing values in Decision Trees
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Fit imputer into training and testing data and transform them
X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

In [36]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()

# Fit the decision tree classifier into the training data
classifier.fit(X_train, y_train)

# Predictions for the testing set
y_predict = classifier.predict(X_test)

In [37]:
from sklearn.metrics import accuracy_score

# Get the accuracy score
print("Accuracy:", accuracy_score(y_predict, y_test))

# Very low accuracy

Accuracy: 0.5333333333333333


In [38]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()

# Fit the random forest classifier into the training data
classifier.fit(X_train, y_train)

# Predictions for the testing set
y_predict = classifier.predict(X_test)

# Get the accuracy score
print("Accuracy:", accuracy_score(y_predict, y_test))

# Accuracy improved

Accuracy: 0.6833333333333333
