## 🐎 Horse Survival Prediction

Given *medical data about horses*, let's try to predict whether a given horse will **survive** or not. 

We will use a decision tree classifier and a random forest classifier to make our predictions.

Data source: https://www.kaggle.com/datasets/uciml/horse-colic?select=horse.csv

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.read_csv('horse.csv')
data

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,yes,adult,533886,,120.0,70.0,cold,,pale_cyanotic,more_3_sec,...,55.0,65.0,,,euthanized,no,3205,0,0,no
295,no,adult,527702,37.2,72.0,24.0,cool,increased,pale_cyanotic,more_3_sec,...,44.0,,serosanguious,3.3,euthanized,yes,2208,0,0,yes
296,yes,adult,529386,37.5,72.0,30.0,cold,reduced,pale_cyanotic,less_3_sec,...,60.0,6.8,,,died,yes,3205,0,0,no
297,yes,adult,530612,36.5,100.0,24.0,cool,reduced,pale_pink,less_3_sec,...,50.0,6.0,serosanguious,3.4,lived,yes,2208,0,0,yes


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                299 non-null    object 
 1   age                    299 non-null    object 
 2   hospital_number        299 non-null    int64  
 3   rectal_temp            239 non-null    float64
 4   pulse                  275 non-null    float64
 5   respiratory_rate       241 non-null    float64
 6   temp_of_extremities    243 non-null    object 
 7   peripheral_pulse       230 non-null    object 
 8   mucous_membrane        252 non-null    object 
 9   capillary_refill_time  267 non-null    object 
 10  pain                   244 non-null    object 
 11  peristalsis            255 non-null    object 
 12  abdominal_distention   243 non-null    object 
 13  nasogastric_tube       195 non-null    object 
 14  nasogastric_reflux     193 non-null    object 
 15  nasoga

### Preprocessing

In [4]:
df = data.copy()

In [5]:
df

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,yes,adult,533886,,120.0,70.0,cold,,pale_cyanotic,more_3_sec,...,55.0,65.0,,,euthanized,no,3205,0,0,no
295,no,adult,527702,37.2,72.0,24.0,cool,increased,pale_cyanotic,more_3_sec,...,44.0,,serosanguious,3.3,euthanized,yes,2208,0,0,yes
296,yes,adult,529386,37.5,72.0,30.0,cold,reduced,pale_cyanotic,less_3_sec,...,60.0,6.8,,,died,yes,3205,0,0,no
297,yes,adult,530612,36.5,100.0,24.0,cool,reduced,pale_pink,less_3_sec,...,50.0,6.0,serosanguious,3.4,lived,yes,2208,0,0,yes


In [6]:
# Dealing with missing values
df.isna().sum()

surgery                    0
age                        0
hospital_number            0
rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities       56
peripheral_pulse          69
mucous_membrane           47
capillary_refill_time     32
pain                      55
peristalsis               44
abdominal_distention      56
nasogastric_tube         104
nasogastric_reflux       106
nasogastric_reflux_ph    246
rectal_exam_feces        102
abdomen                  118
packed_cell_volume        29
total_protein             33
abdomo_appearance        165
abdomo_protein           198
outcome                    0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64

In [7]:
binary_features = [
    'surgery',
    'age',
    'surgical_lesion',
    'cp_data'
]

ordinal_features = [
    'temp_of_extremities',
    'peripheral_pulse',
    'capillary_refill_time',
    'pain',
    'peristalsis',
    'abdominal_distention',
    'nasogastric_tube',
    'nasogastric_reflux',
    'rectal_exam_feces'
]

nominal_features = [
    'hospital_number',
    'mucous_membrane',
    'abdomen',
    'abdomo_appearance'
]

In [8]:
# Missing value imputation
for column in df.columns:
    if column in df.select_dtypes('object').columns:
        if column not in nominal_features:
            df[column] = df[column].fillna(df[column].mode()[0])
    else:
        df[column] = df[column].fillna(df[column].mean())

df

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.500000,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.400000,,3.039604,died,no,11300,0,0,no
1,yes,adult,534817,39.200000,88.0,20.0,cool,normal,pale_cyanotic,less_3_sec,...,50.0,85.000000,cloudy,2.000000,euthanized,no,2208,0,0,no
2,no,adult,530334,38.300000,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.700000,,3.039604,lived,no,0,0,0,yes
3,yes,young,5290409,39.100000,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.200000,serosanguious,5.300000,died,yes,2208,0,0,yes
4,no,adult,530255,37.300000,104.0,35.0,cool,normal,dark_cyanotic,more_3_sec,...,74.0,7.400000,,3.039604,died,no,4300,0,0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,yes,adult,533886,38.168619,120.0,70.0,cold,normal,pale_cyanotic,more_3_sec,...,55.0,65.000000,,3.039604,euthanized,no,3205,0,0,no
295,no,adult,527702,37.200000,72.0,24.0,cool,increased,pale_cyanotic,more_3_sec,...,44.0,24.274436,serosanguious,3.300000,euthanized,yes,2208,0,0,yes
296,yes,adult,529386,37.500000,72.0,30.0,cold,reduced,pale_cyanotic,less_3_sec,...,60.0,6.800000,,3.039604,died,yes,3205,0,0,no
297,yes,adult,530612,36.500000,100.0,24.0,cool,reduced,pale_pink,less_3_sec,...,50.0,6.000000,serosanguious,3.400000,lived,yes,2208,0,0,yes


In [9]:
df.isna().sum()

surgery                    0
age                        0
hospital_number            0
rectal_temp                0
pulse                      0
respiratory_rate           0
temp_of_extremities        0
peripheral_pulse           0
mucous_membrane           47
capillary_refill_time      0
pain                       0
peristalsis                0
abdominal_distention       0
nasogastric_tube           0
nasogastric_reflux         0
nasogastric_reflux_ph      0
rectal_exam_feces          0
abdomen                  118
packed_cell_volume         0
total_protein              0
abdomo_appearance        165
abdomo_protein             0
outcome                    0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64

In [10]:
{column: list(df[column].unique()) for column in df.select_dtypes('object').columns}

{'surgery': ['no', 'yes'],
 'age': ['adult', 'young'],
 'temp_of_extremities': ['cool', 'normal', 'cold', 'warm'],
 'peripheral_pulse': ['reduced', 'normal', 'absent', 'increased'],
 'mucous_membrane': [nan,
  'pale_cyanotic',
  'pale_pink',
  'dark_cyanotic',
  'normal_pink',
  'bright_red',
  'bright_pink'],
 'capillary_refill_time': ['more_3_sec', 'less_3_sec', '3'],
 'pain': ['extreme_pain', 'mild_pain', 'depressed', 'severe_pain', 'alert'],
 'peristalsis': ['absent', 'hypomotile', 'hypermotile', 'normal'],
 'abdominal_distention': ['severe', 'slight', 'none', 'moderate'],
 'nasogastric_tube': ['slight', 'none', 'significant'],
 'nasogastric_reflux': ['none', 'less_1_liter', 'more_1_liter'],
 'rectal_exam_feces': ['decreased', 'absent', 'normal', 'increased'],
 'abdomen': ['distend_large', 'other', 'normal', nan, 'firm', 'distend_small'],
 'abdomo_appearance': [nan, 'cloudy', 'serosanguious', 'clear'],
 'outcome': ['died', 'euthanized', 'lived'],
 'surgical_lesion': ['no', 'yes'],


In [11]:
def binary_encode(df, columns, positive_values):
    df = df.copy()
    for column, positive_value in zip(columns, positive_values):
        df[column] = df[column].apply(lambda x: 1 if x == positive_value else 0)
    return df

def ordinal_encode(df, columns, orderings):
    df = df.copy()
    for column, ordering in zip(columns, orderings):
        df[column] = df[column].apply(lambda x: ordering.index(x))
    return df

def nominal_encode(df, columns, prefixes):
    df = df.copy()
    for column, prefix in zip(columns, prefixes):
        dummies = pd.get_dummies(df[column], prefix=prefix)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
    return df

In [12]:
positive_values = [
    'yes',
    'adult',
    'yes',
    'yes'
]

orderings = [
    ['cold', 'cool', 'normal', 'warm'],
    ['absent', 'reduced', 'normal', 'increased'],
    ['less_3_sec', '3', 'more_3_sec'],
    ['alert', 'depressed', 'mild_pain', 'severe_pain', 'extreme_pain'],
    ['absent', 'hypomotile', 'normal', 'hypermotile'],
    ['none', 'slight', 'moderate', 'severe'],
    ['none', 'slight', 'significant'],
    ['none', 'less_1_liter', 'more_1_liter'],
    ['absent', 'decreased', 'normal', 'increased']
]

prefixes = [
    'HN',
    'MM', 
    'AB',
    'AA'
]

In [13]:
df = binary_encode(df, columns=binary_features, positive_values=positive_values)
df = ordinal_encode(df, columns=ordinal_features, orderings=orderings)
df = nominal_encode(df, columns=nominal_features, prefixes=prefixes)

In [14]:
df

Unnamed: 0,surgery,age,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,capillary_refill_time,pain,peristalsis,...,MM_pale_cyanotic,MM_pale_pink,AB_distend_large,AB_distend_small,AB_firm,AB_normal,AB_other,AA_clear,AA_cloudy,AA_serosanguious
0,0,1,38.500000,66.0,28.0,1,1,2,4,0,...,False,False,True,False,False,False,False,False,False,False
1,1,1,39.200000,88.0,20.0,1,2,0,2,0,...,True,False,False,False,False,False,True,False,True,False
2,0,1,38.300000,40.0,24.0,2,2,0,2,1,...,False,True,False,False,False,True,False,False,False,False
3,1,0,39.100000,164.0,84.0,0,2,2,1,0,...,False,False,False,False,False,False,False,False,False,True
4,0,1,37.300000,104.0,35.0,1,2,2,2,1,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,1,1,38.168619,120.0,70.0,0,2,2,1,0,...,True,False,True,False,False,False,False,False,False,False
295,0,1,37.200000,72.0,24.0,1,3,2,3,1,...,True,False,False,True,False,False,False,False,False,True
296,1,1,37.500000,72.0,30.0,0,1,0,3,0,...,True,False,True,False,False,False,False,False,False,False
297,1,1,36.500000,100.0,24.0,1,1,0,2,1,...,False,True,False,True,False,False,False,False,False,True


In [16]:
# Encode labels
label_mapping = {'lived': 0, 'died':1, 'euthanized': 2}
df['outcome'] = df['outcome'].replace(label_mapping)

  df['outcome'] = df['outcome'].replace(label_mapping)


In [17]:
y = df['outcome'].copy()
X = df.drop('outcome', axis=1).copy()

In [18]:
y.value_counts()

outcome
0    178
1     77
2     44
Name: count, dtype: int64

In [19]:
# Scale X with a standard scaler
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,surgery,age,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,capillary_refill_time,pain,peristalsis,...,MM_pale_cyanotic,MM_pale_pink,AB_distend_large,AB_distend_small,AB_firm,AB_normal,AB_other,AA_clear,AA_cloudy,AA_serosanguious
0,-1.229880,0.295420,0.506209,-0.218798,-0.155463,-0.473504,-1.088914,1.675999,1.731538,-1.185885,...,-0.398641,-0.485322,1.683251,-0.409840,-0.213201,-0.321436,-0.260494,-0.398641,-0.431866,-0.426401
1,0.813087,0.295420,1.575511,0.583463,-0.660914,-0.473504,0.671006,-0.601836,0.036841,-1.185885,...,2.508522,-0.485322,-0.594089,-0.409840,-0.213201,-0.321436,3.838859,-0.398641,2.315535,-0.426401
2,-1.229880,0.295420,0.200694,-1.166925,-0.408189,0.801970,0.671006,-0.601836,0.036841,-0.077824,...,-0.398641,2.060489,-0.594089,-0.409840,-0.213201,3.111040,-0.260494,-0.398641,-0.431866,-0.426401
3,0.813087,-3.385016,1.422753,3.354910,3.382695,-1.748977,0.671006,1.675999,-0.810507,-1.185885,...,-0.398641,-0.485322,-0.594089,-0.409840,-0.213201,-0.321436,-0.260494,-0.398641,-0.431866,2.345208
4,-1.229880,0.295420,-1.326880,1.166925,0.286807,-0.473504,0.671006,1.675999,0.036841,-0.077824,...,-0.398641,-0.485322,-0.594089,-0.409840,-0.213201,-0.321436,-0.260494,-0.398641,-0.431866,-0.426401
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,0.813087,0.295420,0.000000,1.750388,2.498156,-1.748977,0.671006,1.675999,-0.810507,-1.185885,...,2.508522,-0.485322,1.683251,-0.409840,-0.213201,-0.321436,-0.260494,-0.398641,-0.431866,-0.426401
295,-1.229880,0.295420,-1.479638,0.000000,-0.408189,-0.473504,2.430926,1.675999,0.884190,-0.077824,...,2.508522,-0.485322,-0.594089,2.439977,-0.213201,-0.321436,-0.260494,-0.398641,-0.431866,2.345208
296,0.813087,0.295420,-1.021366,0.000000,-0.029100,-1.748977,-1.088914,-0.601836,0.884190,-1.185885,...,2.508522,-0.485322,1.683251,-0.409840,-0.213201,-0.321436,-0.260494,-0.398641,-0.431866,-0.426401
297,0.813087,0.295420,-2.548940,1.021059,-0.408189,-0.473504,-1.088914,-0.601836,0.036841,-0.077824,...,-0.398641,2.060489,-0.594089,2.439977,-0.213201,-0.321436,-0.260494,-0.398641,-0.431866,2.345208


### Training

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)
X_train.shape, X_test.shape

((209, 320), (90, 320))

In [21]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
print("Decision Tree Accuracy: {:.2f}%".format(model.score(X_test, y_test) * 100))

Decision Tree Accuracy: 63.33%


In [22]:
ensemble_model = RandomForestClassifier()
ensemble_model.fit(X_train, y_train)
print("Random Forest Accuracy: {:.2f}%".format(ensemble_model.score(X_test, y_test) * 100))

Random Forest Accuracy: 73.33%
