# Loading dataset

In [1]:
import pandas as pd

In [2]:
# Let's load the dataset and see it's shape and the first 5 entries
horse = pd.read_csv('horse.csv')
print(horse.shape)
horse.head()

(299, 28)


Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


# Data Pre-processing

In [3]:
# Lets take out the target, i.e., 'outcome' column from the dataset
target = horse['outcome']
print(target.shape)
print(target.unique())

(299,)
['died' 'euthanized' 'lived']


In [4]:
target.isnull().any()

False

So, we have no missing values in our target

In [5]:
# Let's remove the 'outcome' column and get out feature dataset
data = horse.drop(['outcome'], axis=1)
print(data.shape)
data.head()

(299, 27)


Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,distend_large,45.0,8.4,,,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,other,50.0,85.0,cloudy,2.0,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,normal,33.0,6.7,,,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,,48.0,7.2,serosanguious,5.3,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,,74.0,7.4,,,no,4300,0,0,no


In [6]:
# Let's see the number of numerical & categorical features we have
data.dtypes.value_counts()

object     16
float64     7
int64       4
dtype: int64

So, we have 16 categorical features and 11 numerical features.

Let's check if these features have missing values.

In [7]:
data.isnull().sum()

surgery                    0
age                        0
hospital_number            0
rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities       56
peripheral_pulse          69
mucous_membrane           47
capillary_refill_time     32
pain                      55
peristalsis               44
abdominal_distention      56
nasogastric_tube         104
nasogastric_reflux       106
nasogastric_reflux_ph    246
rectal_exam_feces        102
abdomen                  118
packed_cell_volume        29
total_protein             33
abdomo_appearance        165
abdomo_protein           198
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64

So, we see that there are many missing values. These need to be dealt with before fitting in any model.

Let's fill the missing values of categorical variables first

In [8]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')   # Our strategy will be to fill the most frequent value

for column in data.columns:
    if data[column].dtype == 'object':
        imputed = imputer.fit_transform(data[column].values.reshape(-1,1))
        data[column] = pd.DataFrame(imputed)

In [9]:
# Now ;et's check the missing values again
data.isnull().sum()

surgery                    0
age                        0
hospital_number            0
rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities        0
peripheral_pulse           0
mucous_membrane            0
capillary_refill_time      0
pain                       0
peristalsis                0
abdominal_distention       0
nasogastric_tube           0
nasogastric_reflux         0
nasogastric_reflux_ph    246
rectal_exam_feces          0
abdomen                    0
packed_cell_volume        29
total_protein             33
abdomo_appearance          0
abdomo_protein           198
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64

We see that there are no missing values in the categorical features now.

Now, let's fill the missing values of numerical variables

In [10]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')   # Our strategy will be to fill the median value

for column in data.columns:
    if data[column].dtype != 'object':
        imputed = imputer.fit_transform(data[column].values.reshape(-1,1))
        data[column] = pd.DataFrame(imputed)

In [11]:
data.isnull().sum()

surgery                  0
age                      0
hospital_number          0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
nasogastric_reflux_ph    0
rectal_exam_feces        0
abdomen                  0
packed_cell_volume       0
total_protein            0
abdomo_appearance        0
abdomo_protein           0
surgical_lesion          0
lesion_1                 0
lesion_2                 0
lesion_3                 0
cp_data                  0
dtype: int64

In [12]:
data.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101.0,38.5,66.0,28.0,cool,reduced,normal_pink,more_3_sec,...,distend_large,45.0,8.4,cloudy,2.3,no,11300.0,0.0,0.0,no
1,yes,adult,534817.0,39.2,88.0,20.0,cool,normal,pale_cyanotic,less_3_sec,...,other,50.0,85.0,cloudy,2.0,no,2208.0,0.0,0.0,no
2,no,adult,530334.0,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,normal,33.0,6.7,cloudy,2.3,no,0.0,0.0,0.0,yes
3,yes,young,5290409.0,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,distend_large,48.0,7.2,serosanguious,5.3,yes,2208.0,0.0,0.0,yes
4,no,adult,530255.0,37.3,104.0,35.0,cool,normal,dark_cyanotic,more_3_sec,...,distend_large,74.0,7.4,cloudy,2.3,no,4300.0,0.0,0.0,no


So, we can see that now, there is no missing values in the data set.

But, we have 16 categorical features. And we know that decision trees don't perform well with categorical features. So, let's 
convert the categorical features to numerical features.

In [13]:
data = pd.get_dummies(data)

In [14]:
print(data.shape)
data.head()

(299, 67)


Unnamed: 0,hospital_number,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,...,abdomen_firm,abdomen_normal,abdomen_other,abdomo_appearance_clear,abdomo_appearance_cloudy,abdomo_appearance_serosanguious,surgical_lesion_no,surgical_lesion_yes,cp_data_no,cp_data_yes
0,530101.0,38.5,66.0,28.0,5.0,45.0,8.4,2.3,11300.0,0.0,...,0,0,0,0,1,0,1,0,1,0
1,534817.0,39.2,88.0,20.0,5.0,50.0,85.0,2.0,2208.0,0.0,...,0,0,1,0,1,0,1,0,1,0
2,530334.0,38.3,40.0,24.0,5.0,33.0,6.7,2.3,0.0,0.0,...,0,1,0,0,1,0,1,0,0,1
3,5290409.0,39.1,164.0,84.0,5.0,48.0,7.2,5.3,2208.0,0.0,...,0,0,0,0,0,1,0,1,0,1
4,530255.0,37.3,104.0,35.0,5.0,74.0,7.4,2.3,4300.0,0.0,...,0,0,0,0,1,0,1,0,1,0


Now, we have all the data in numerical form.

Let's encode our target into numerical values and then, split our dataset into training and testing set in the ratio 80:20

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

x,y = data, target

encoder = LabelEncoder()
y = encoder.fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [16]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(239, 67)
(60, 67)
(239,)
(60,)


# Model building

Now, let's build 2 classifier models, one with Decision Tree and other a Random Forest, and compare the results.

Since Random Forest is an ensamble technique, we should expect it to give better results.

In [31]:
from sklearn.metrics import accuracy_score

In [32]:
# Decision Tree classifier
from sklearn.tree import DecisionTreeClassifier
classifier_1 = DecisionTreeClassifier().fit(x_train,y_train)

y_predict_1 = classifier_1.predict(x_test)

In [33]:
# Let's check model accuracy
accuracy = accuracy_score(y_test, y_predict_1)
print('Decision Tree accuracy: ', accuracy)
print()

Decision Tree accuracy:  0.6333333333333333



Now, let's check the result from a Random Forest classifier for the same data

In [34]:
# Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
classifier_2 = RandomForestClassifier().fit(x_train,y_train)

y_predict_2 = classifier_2.predict(x_test)

In [35]:
# Let's check model accuracy
accuracy = accuracy_score(y_test, y_predict_2)
print('Random Forest accuracy: ', accuracy)
print()

Random Forest accuracy:  0.7



So, we get the result as expected.