In [1]:
import pandas as pd
import matplotlib.pyplot as plot
%matplotlib inline

In [2]:
animals = pd.read_csv("horse.csv")

In [3]:
animals.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


In [4]:
animals.outcome

0            died
1      euthanized
2           lived
3            died
4            died
          ...    
294    euthanized
295    euthanized
296          died
297         lived
298    euthanized
Name: outcome, Length: 299, dtype: object

In [5]:
target = animals['outcome']

In [6]:
target.unique()

array(['died', 'euthanized', 'lived'], dtype=object)

In [7]:
animals =animals.drop(['outcome'],axis=1)

In [8]:
animals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                299 non-null    object 
 1   age                    299 non-null    object 
 2   hospital_number        299 non-null    int64  
 3   rectal_temp            239 non-null    float64
 4   pulse                  275 non-null    float64
 5   respiratory_rate       241 non-null    float64
 6   temp_of_extremities    243 non-null    object 
 7   peripheral_pulse       230 non-null    object 
 8   mucous_membrane        252 non-null    object 
 9   capillary_refill_time  267 non-null    object 
 10  pain                   244 non-null    object 
 11  peristalsis            255 non-null    object 
 12  abdominal_distention   243 non-null    object 
 13  nasogastric_tube       195 non-null    object 
 14  nasogastric_reflux     193 non-null    object 
 15  nasoga

In [9]:
category_variables = list(animals.select_dtypes('object').columns)

In [10]:
category_variables

['surgery',
 'age',
 'temp_of_extremities',
 'peripheral_pulse',
 'mucous_membrane',
 'capillary_refill_time',
 'pain',
 'peristalsis',
 'abdominal_distention',
 'nasogastric_tube',
 'nasogastric_reflux',
 'rectal_exam_feces',
 'abdomen',
 'abdomo_appearance',
 'surgical_lesion',
 'cp_data']

In [16]:
animals_encoded = pd.get_dummies(animals, columns=category_variables)

In [19]:
animals_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 43 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   hospital_number          299 non-null    int64  
 1   rectal_temp              239 non-null    float64
 2   pulse                    275 non-null    float64
 3   respiratory_rate         241 non-null    float64
 4   nasogastric_reflux_ph    53 non-null     float64
 5   packed_cell_volume       270 non-null    float64
 6   total_protein            266 non-null    float64
 7   abdomo_protein           101 non-null    float64
 8   lesion_1                 299 non-null    int64  
 9   lesion_2                 299 non-null    int64  
 10  lesion_3                 299 non-null    int64  
 11  surgery_0                299 non-null    uint8  
 12  surgery_1                299 non-null    uint8  
 13  age_0                    299 non-null    uint8  
 14  age_1                    2

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X, y = animals_encoded,target

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [21]:
from sklearn.tree import DecisionTreeClassifier
print(X_train.shape)

(239, 43)


In [22]:
missing_perc = X_train.isnull().sum()/X_train.shape[0]

In [23]:
missing_perc

hospital_number            0.000000
rectal_temp                0.188285
pulse                      0.075314
respiratory_rate           0.196653
nasogastric_reflux_ph      0.820084
packed_cell_volume         0.087866
total_protein              0.092050
abdomo_protein             0.656904
lesion_1                   0.000000
lesion_2                   0.000000
lesion_3                   0.000000
surgery_0                  0.000000
surgery_1                  0.000000
age_0                      0.000000
age_1                      0.000000
temp_of_extremities_0      0.000000
temp_of_extremities_1      0.000000
peripheral_pulse_0         0.000000
peripheral_pulse_1         0.000000
mucous_membrane_0          0.000000
mucous_membrane_1          0.000000
capillary_refill_time_0    0.000000
capillary_refill_time_1    0.000000
pain_0                     0.000000
pain_1                     0.000000
peristalsis_0              0.000000
peristalsis_1              0.000000
abdominal_distention_0     0

In [24]:
columns_to_drop = list(missing_perc[missing_perc > 0.3].index)

In [25]:
columns_to_drop

['nasogastric_reflux_ph', 'abdomo_protein']

In [26]:
X_train.drop(columns_to_drop, axis=1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [27]:
X_test.drop(columns_to_drop, axis=1, inplace=True)

In [28]:
X_train.isnull().sum()/X_train.shape[0]

hospital_number            0.000000
rectal_temp                0.188285
pulse                      0.075314
respiratory_rate           0.196653
packed_cell_volume         0.087866
total_protein              0.092050
lesion_1                   0.000000
lesion_2                   0.000000
lesion_3                   0.000000
surgery_0                  0.000000
surgery_1                  0.000000
age_0                      0.000000
age_1                      0.000000
temp_of_extremities_0      0.000000
temp_of_extremities_1      0.000000
peripheral_pulse_0         0.000000
peripheral_pulse_1         0.000000
mucous_membrane_0          0.000000
mucous_membrane_1          0.000000
capillary_refill_time_0    0.000000
capillary_refill_time_1    0.000000
pain_0                     0.000000
pain_1                     0.000000
peristalsis_0              0.000000
peristalsis_1              0.000000
abdominal_distention_0     0.000000
abdominal_distention_1     0.000000
nasogastric_tube_0         0

In [29]:
for cols in X_train.columns:
    try: 
        X_train[cols].fillna(X_train[cols].mode()[0], inplace=True)
    except:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [30]:
X_train.isnull().sum()

hospital_number            0
rectal_temp                0
pulse                      0
respiratory_rate           0
packed_cell_volume         0
total_protein              0
lesion_1                   0
lesion_2                   0
lesion_3                   0
surgery_0                  0
surgery_1                  0
age_0                      0
age_1                      0
temp_of_extremities_0      0
temp_of_extremities_1      0
peripheral_pulse_0         0
peripheral_pulse_1         0
mucous_membrane_0          0
mucous_membrane_1          0
capillary_refill_time_0    0
capillary_refill_time_1    0
pain_0                     0
pain_1                     0
peristalsis_0              0
peristalsis_1              0
abdominal_distention_0     0
abdominal_distention_1     0
nasogastric_tube_0         0
nasogastric_tube_1         0
nasogastric_reflux_0       0
nasogastric_reflux_1       0
rectal_exam_feces_0        0
rectal_exam_feces_1        0
abdomen_0                  0
abdomen_1     

In [31]:
for cols in X_test.columns:
    try: 
        X_test[cols].fillna(X_test[cols].mode()[0], inplace=True)
    except:
        pass

In [32]:
X_test.isnull().sum()

hospital_number            0
rectal_temp                0
pulse                      0
respiratory_rate           0
packed_cell_volume         0
total_protein              0
lesion_1                   0
lesion_2                   0
lesion_3                   0
surgery_0                  0
surgery_1                  0
age_0                      0
age_1                      0
temp_of_extremities_0      0
temp_of_extremities_1      0
peripheral_pulse_0         0
peripheral_pulse_1         0
mucous_membrane_0          0
mucous_membrane_1          0
capillary_refill_time_0    0
capillary_refill_time_1    0
pain_0                     0
pain_1                     0
peristalsis_0              0
peristalsis_1              0
abdominal_distention_0     0
abdominal_distention_1     0
nasogastric_tube_0         0
nasogastric_tube_1         0
nasogastric_reflux_0       0
nasogastric_reflux_1       0
rectal_exam_feces_0        0
rectal_exam_feces_1        0
abdomen_0                  0
abdomen_1     

In [33]:
classifier = DecisionTreeClassifier()

In [34]:
classifier.fit(X_train,y_train)

DecisionTreeClassifier()

In [35]:
y_predict = classifier.predict(X_test)

In [36]:
from sklearn.metrics import accuracy_score

In [37]:
accuracy = accuracy_score(y_predict,y_test)

In [38]:
print(classifier.score(X_train,y_train))
print(accuracy)

1.0
0.5833333333333334


In [57]:
classifier = DecisionTreeClassifier(max_depth=3, min_samples_split=5, min_samples_leaf=5)

In [58]:
classifier.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=3, min_samples_leaf=5, min_samples_split=5)

In [59]:
y_predict = classifier.predict(X_test)

In [60]:
from sklearn.metrics import accuracy_score

In [61]:
accuracy = accuracy_score(y_predict,y_test)

In [62]:
print(classifier.score(X_train,y_train))
print(accuracy)

0.7071129707112971
0.6


In [63]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()

In [64]:
classifier.fit(X_train,y_train)

y_predict = classifier.predict(X_test)
accuracy = accuracy_score(y_predict,y_test)

print(classifier.score(X_train,y_train))
print(accuracy)

1.0
0.6666666666666666


In [75]:
classifier = RandomForestClassifier(n_estimators=100, max_depth=7)

In [76]:
classifier.fit(X_train,y_train)
y_predict = classifier.predict(X_test)
accuracy = accuracy_score(y_predict,y_test)
print(classifier.score(X_train,y_train))
print(accuracy)

0.9497907949790795
0.7
