Module 7 Case study 2

In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

In [57]:
# Let’s attempt to predict the survival of a horse based on various observed medical conditions. 
# Load the data from ‘horses.csv’ and observe whether it contains missing values.
# [Hint: Pandas dataframe has a method isnull]

df = pd.read_csv('additional_resources_7_xsa_so3fyvt/horse.csv')
print(df.columns, df.shape)
df.head()

Index(['surgery', 'age', 'hospital_number', 'rectal_temp', 'pulse',
       'respiratory_rate', 'temp_of_extremities', 'peripheral_pulse',
       'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
       'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
       'nasogastric_reflux_ph', 'rectal_exam_feces', 'abdomen',
       'packed_cell_volume', 'total_protein', 'abdomo_appearance',
       'abdomo_protein', 'outcome', 'surgical_lesion', 'lesion_1', 'lesion_2',
       'lesion_3', 'cp_data'],
      dtype='object') (299, 28)


Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


In [58]:
df.isnull().sum()

surgery                    0
age                        0
hospital_number            0
rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities       56
peripheral_pulse          69
mucous_membrane           47
capillary_refill_time     32
pain                      55
peristalsis               44
abdominal_distention      56
nasogastric_tube         104
nasogastric_reflux       106
nasogastric_reflux_ph    246
rectal_exam_feces        102
abdomen                  118
packed_cell_volume        29
total_protein             33
abdomo_appearance        165
abdomo_protein           198
outcome                    0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64

In [59]:
df.duplicated().sum()

np.int64(0)

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                299 non-null    object 
 1   age                    299 non-null    object 
 2   hospital_number        299 non-null    int64  
 3   rectal_temp            239 non-null    float64
 4   pulse                  275 non-null    float64
 5   respiratory_rate       241 non-null    float64
 6   temp_of_extremities    243 non-null    object 
 7   peripheral_pulse       230 non-null    object 
 8   mucous_membrane        252 non-null    object 
 9   capillary_refill_time  267 non-null    object 
 10  pain                   244 non-null    object 
 11  peristalsis            255 non-null    object 
 12  abdominal_distention   243 non-null    object 
 13  nasogastric_tube       195 non-null    object 
 14  nasogastric_reflux     193 non-null    object 
 15  nasoga

In [None]:
# Replace the missing values with the most frequent value in each column.
# [Hint: Refer to Imputer class in Scikit learn preprocessing module]

imputer = SimpleImputer(strategy='most_frequent')
df[:] = imputer.fit_transform(df)

# best practice - handle missing values prior to encoding

In [62]:
df.isnull().sum()

surgery                  0
age                      0
hospital_number          0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
nasogastric_reflux_ph    0
rectal_exam_feces        0
abdomen                  0
packed_cell_volume       0
total_protein            0
abdomo_appearance        0
abdomo_protein           0
outcome                  0
surgical_lesion          0
lesion_1                 0
lesion_2                 0
lesion_3                 0
cp_data                  0
dtype: int64

In [None]:
# This dataset contains many categorical features, replace them with label encoding.
# [Hint: Refer to get_dummies methods in pandas dataframe or Label encoder in scikit-learn]

encoder = LabelEncoder()
cols = df.select_dtypes(include='object')        # exclude numeric cols for encoding
for col in cols:
    df[col] = encoder.fit_transform(df[col])     #print(df[col].nunique()) - to check no. of unique values for encoding

df

# binary cols, categorical cols - useful for encoding

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,0,0,530101,38.5,66.0,28.0,1,3,3,2,...,45.0,8.4,1,2.0,0,0,11300,0,0,0
1,1,0,534817,39.2,88.0,20.0,1,2,4,1,...,50.0,85.0,1,2.0,1,0,2208,0,0,0
2,0,0,530334,38.3,40.0,24.0,2,2,5,1,...,33.0,6.7,1,2.0,2,0,0,0,0,1
3,1,1,5290409,39.1,164.0,84.0,0,2,2,2,...,48.0,7.2,2,5.3,0,1,2208,0,0,1
4,0,0,530255,37.3,104.0,35.0,1,2,2,2,...,74.0,7.4,1,2.0,0,0,4300,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,1,0,533886,38.0,120.0,70.0,0,2,4,2,...,55.0,65.0,1,2.0,1,0,3205,0,0,0
295,0,0,527702,37.2,72.0,24.0,1,1,4,2,...,44.0,6.5,2,3.3,1,1,2208,0,0,1
296,1,0,529386,37.5,72.0,30.0,0,3,4,1,...,60.0,6.8,1,2.0,0,1,3205,0,0,0
297,1,0,530612,36.5,100.0,24.0,1,3,5,1,...,50.0,6.0,2,3.4,2,1,2208,0,0,1


In [None]:
# split train and test data

X = df.drop('outcome', axis=1)
y = df['outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(0.25))

In [92]:
# Decision Tree Classifier

# Train the model
dtmodel = DecisionTreeClassifier()
dtmodel.fit(X_train, y_train)

# Predict using the test data 
ypred = dtmodel.predict(X_test)

# Accuracy score
print('Accuracy score of DTC',accuracy_score(y_test,ypred))

# Cross validation
print('Cross validation score', cross_val_score(dtmodel,X, y, cv=5, scoring='accuracy').mean())


Accuracy score of DTC 0.5866666666666667
Cross validation score 0.6754237288135594


In [101]:
# Random Forest Classifier

# Train the model
rfmodel = RandomForestClassifier()
rfmodel.fit(X_train, y_train)

# Predict using the test data 
ypred2 = rfmodel.predict(X_test)

# Accuracy score
print('Accuracy score of RFC',accuracy_score(y_test,ypred2))

# Cross validation
print('Cross validation score', cross_val_score(rfmodel,X, y, cv=5, scoring='accuracy').mean())

Accuracy score of RFC 0.6666666666666666
Cross validation score 0.7191525423728814


In [103]:
print('Comparing the accuracy of Decision Tree and Random Forest Classifiers we see that RFC produces slightly better results.')

Comparing the accuracy of Decision Tree and Random Forest Classifiers we see that RFC produces slightly better results.


In [None]:
# df[[]] retains df structure - includes column name/header

print(df['respiratory_rate'])
df[['respiratory_rate']]    

0      28.0
1      20.0
2      24.0
3      84.0
4      35.0
       ... 
294    70.0
295    24.0
296    30.0
297    24.0
298    20.0
Name: respiratory_rate, Length: 299, dtype: float64


Unnamed: 0,respiratory_rate
0,28.0
1,20.0
2,24.0
3,84.0
4,35.0
...,...
294,70.0
295,24.0
296,30.0
297,24.0
