In [184]:
import os
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


In [185]:
# Pre-processing

# Column names had to be added to the data file.
# The lesion attributes are forced to str so they can be examined as codes:
# parsed as numbers, their leading zeros would disappear.

data = pd.read_csv(
    'horse-colic.data',
    sep=r'\s+',  # whitespace-delimited; replaces the deprecated delim_whitespace=True
    na_values="?",  # the file marks missing values with a question mark
    dtype={'lesion_1': str, 'lesion_2': str, 'lesion_3': str},
)

# Percentage of missing values per column, highest first.
nullRatio = data.isna().sum() / len(data) * 100
nullRatio.sort_values(ascending=False)


nasogastric_reflux_ph    82.333333
abdomo_protein           66.000000
abdomo_appearance        55.000000
abdomen                  39.333333
nasogastric_reflux       35.333333
nasogastric_tube         34.666667
rectal_exam_feces        34.000000
peripheral_pulse         23.000000
rectal_temp              20.000000
respiratory_rate         19.333333
temp_of_extremities      18.666667
abdominal_distention     18.666667
pain                     18.333333
mucous_membrane          15.666667
peristalsis              14.666667
total_protein            11.000000
capillary_refill_time    10.666667
packed_cell_volume        9.666667
pulse                     8.000000
surgery                   0.333333
outcome                   0.333333
lesion_3                  0.000000
surgical_lesion           0.000000
lesion_1                  0.000000
lesion_2                  0.000000
hospital_number           0.000000
age                       0.000000
cp_data                   0.000000
dtype: float64

In [186]:
# Summary statistics for a handful of raw columns, taken before any of them is modified.
columns_to_describe = [
    'rectal_temp',
    'pulse',
    'respiratory_rate',
    'peripheral_pulse',
    'total_protein',
    'packed_cell_volume',
]
dataBefore = data[columns_to_describe]
dataBefore.describe()

Unnamed: 0,rectal_temp,pulse,respiratory_rate,peripheral_pulse,total_protein,packed_cell_volume
count,240.0,276.0,242.0,231.0,267.0,271.0
mean,38.167917,71.913043,30.417355,2.017316,24.456929,46.295203
std,0.732289,28.630557,17.642231,1.042428,27.475009,10.419335
min,35.4,30.0,8.0,1.0,3.3,23.0
25%,37.8,48.0,18.5,1.0,6.5,38.0
50%,38.2,64.0,24.5,2.0,7.5,45.0
75%,38.5,88.0,36.0,3.0,57.0,52.0
max,40.8,184.0,96.0,4.0,89.0,75.0


In [187]:
print('Data dimensions are (samples, features) : ', data.shape)
print('Data types : ')
print(data.dtypes.value_counts())

# Split by the age code (1 is used for adults and 9 for young horses below).
# .copy() gives each subset its own frame, so adding the *_bin columns does not
# write into a slice of `data` (which would raise SettingWithCopyWarning).
adult = data[data.age == 1].copy()
young = data[data.age == 9].copy()

# Discretising the data. There are outliers in the data because young horses have
# a narrower temperature range and a much higher pulse range, so each age group
# gets its own thresholds.
#
# pd.cut builds right-closed intervals: with bins [0, t, upper], values in (0, t]
# get label '0', values in (t, upper] get label '1', and everything else
# (including missing values) stays NaN.
BIN_LABELS = ['0', '1']

# Adults: low-to-normal temperature vs. high temperature.
adult['rectal_temp_bin'] = pd.cut(x=adult['rectal_temp'], bins=[0, 38.3, 999], labels=BIN_LABELS)
# Adults: low-to-normal pulse vs. abnormally high pulse.
adult['pulse_bin'] = pd.cut(x=adult['pulse'], bins=[0, 45, 999], labels=BIN_LABELS)

# Young horses: low-to-normal temperature vs. high temperature.
young['rectal_temp_bin'] = pd.cut(x=young['rectal_temp'], bins=[0, 38.9, 999], labels=BIN_LABELS)
# Young horses: low-to-normal pulse vs. abnormally high pulse.
young['pulse_bin'] = pd.cut(x=young['pulse'], bins=[0, 60, 999], labels=BIN_LABELS)

# Recombine both groups (young rows first); the remaining bins share thresholds.
allHorses = pd.concat([young, adult])

# Codes 1-2 -> '0' (normal), codes 3-4 -> '1' (indicates shock).
allHorses['extremeties'] = pd.cut(x=allHorses['temp_of_extremities'], bins=[0, 2, 4], labels=BIN_LABELS)

# Codes 1-2 -> '0' (normal), codes 3-6 -> '1' (early shock or dangerous).
allHorses['mucous'] = pd.cut(x=allHorses['mucous_membrane'], bins=[0, 2, 6], labels=BIN_LABELS)

# Up to 30 -> '0' (low and normal), above 30 -> '1' (high).
allHorses['packedCell'] = pd.cut(x=allHorses['packed_cell_volume'], bins=[0, 30, 999], labels=BIN_LABELS)

# Up to 7.5 -> '0' (low and normal), above 7.5 -> '1' (high).
allHorses['proteinLevel'] = pd.cut(x=allHorses['total_protein'], bins=[0, 7.5, 999], labels=BIN_LABELS)

# Outcome code 1 -> '0'; outcome codes 2 and 3 -> '1'.
allHorses['death'] = pd.cut(x=allHorses['outcome'], bins=[0, 1, 3], labels=BIN_LABELS)

Data dimensions are (samples, features) :  (300, 28)
Data types : 
float64    21
int64       4
object      3
dtype: int64


<H3> I will remove the following attributes </H3>

1. Hospital numbers, 284 unique values of 300 total. No statistically relevant information to be gained
2. Nasogastric reflux, Abdomo protein and abdomo appearance all have a huge amount of missing data, from 55% - 82% missing values
3. respiratory rate, since it is documented to be doubtful. I lack the domain knowledge, but most values seem to be elevated.
4. cp_data indicates if there was research done. Of no value since the additional research data is not included
5. lesion_1, lesion_2, lesion_3 are descriptors of the types of lesions. I lack the domain knowledge to use this variable so I will use the "surgical_lesion" which indicates if a lesion is present.
6. I will filter out young horses because they represent only 8% of the dataset and their heart rate and temperature readings fluctuate a lot. They have a 50% mortality rate, compared to around 40% for adults.

In [188]:

# Attributes removed for the reasons stated above.
# NOTE(review): 'nasogastric_reflux' is dropped here, while 'nasogastric_reflux_ph'
# (the column with ~82% missing values) is only removed later, right before
# test.csv is written - confirm this is intended.
excluded_attributes = [
    'hospital_number',
    'nasogastric_reflux',
    'respiratory_rate',
    'abdomo_protein',
    'abdomo_appearance',
    'cp_data',
    'lesion_1',
    'lesion_2',
    'lesion_3',
]

# Attributes removed because of the discretisation step.
# NOTE(review): 'peripheral_pulse' is listed here although no binned version of
# it is created in the cells shown - confirm this is intended.
discretised_attributes = [
    'rectal_temp',
    'pulse',
    'temp_of_extremities',
    'peripheral_pulse',
    'mucous_membrane',
    'packed_cell_volume',
]

allHorses = allHorses.drop(columns=excluded_attributes + discretised_attributes)
allHorses.head()

# Adult horses only (age code 1).
adult = allHorses.loc[allHorses['age'] == 1]


In [189]:
# Turn the adult-only frame into boolean indicators. A plain comparison yields the
# same True/False column as wrapping it in np.where(..., True, False); a missing
# value compares as False.
test = adult.drop(columns=['age'])

test['surgery'] = test['surgery'] == 1.0
test['capillary_refill_time'] = test['capillary_refill_time'] == 2.0  # True = long time, adverse effect

# A null pain value is assumed to mean the horse showed no pain (code 1),
# because it was omitted in the data.
test['pain'] = test['pain'].fillna(1) != 1

# Missing peristalsis is filled with code 2; True marks any other (abnormal) value.
test['peristalsis'] = test['peristalsis'].fillna(2.0) != 2

# Missing distention is filled with code 1; True marks any other (abnormal) value.
test['abdominal_distention'] = test['abdominal_distention'].fillna(1.0) != 1.0

test['death'] = test['death'] == '1'
test['surgical_lesion'] = test['surgical_lesion'] == 1

# Binned vitals: a missing bin is filled with '0', True marks the '1' (abnormal) bin.
test['rectal_temp_bin'] = test['rectal_temp_bin'].fillna('0') == '1'
test['pulse_bin'] = test['pulse_bin'].fillna('0') == '1'


In [190]:

# Encode the remaining binned columns as booleans: a missing bin is filled with
# '0' first, then True marks the '1' (abnormal) bin.
#
# 'extremeties' is spelled the way the column was created. The previous code
# assigned `test.extremities = test.proteinLevel.fillna('0')`, which set a plain
# attribute (not a column) from the wrong source column, leaving 'extremeties'
# unfilled.
binned_columns = ['mucous', 'packedCell', 'proteinLevel', 'extremeties']
for binned_column in binned_columns:
    test[binned_column] = test[binned_column].fillna('0')
    test[binned_column] = np.where(test[binned_column] == '1', True, False)  # True for abnormal values

# Export the boolean frame without the columns that were not encoded above.
not_exported = [
    'nasogastric_tube',
    'nasogastric_reflux_ph',
    'rectal_exam_feces',
    'abdomen',
    'total_protein',
    'outcome',
]
test.drop(columns=not_exported).to_csv('test.csv', index=False)


In [191]:
# test[['surgery','capillary_refill_time','pain','peristalsis','abdominal_distention','death']].to_csv('test.csv',index=False)

In [192]:
# Save the adult-only frame; 'age' is left out because every remaining row has age code 1.
adult_without_age = adult.drop(labels='age', axis='columns')
adult_without_age.to_csv('adultHorses.csv', index=False)

In [193]:
# PCA over four attributes, restricted to the rows whose outcome code is 2.
# .copy() makes `test` an independent frame, so filling values below does not
# write into a slice of `allHorses` (avoids SettingWithCopyWarning).
pca_columns = ['surgery', 'age', 'surgical_lesion', 'outcome']
test = allHorses.loc[allHorses.outcome == 2, pca_columns].copy()

# Fill gaps with the number 0 rather than the string '0', so the columns keep
# a numeric dtype instead of being upcast to object.
test = test.fillna(0)

# NOTE(review): after the filter above `outcome` is constant (2), so it carries
# no variance and one of the four components is degenerate - consider dropping
# the column or fitting on all outcomes.
pca = PCA(n_components=4)
pca.fit(test)


PCA(copy=True, iterated_power='auto', n_components=4, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [194]:
# One row per principal component, one column per input feature (in fit order).
pd.DataFrame(data=pca.components_)

Unnamed: 0,0,1,2,3
0,-0.023179,0.999667,0.011332,-0.0
1,0.904357,0.016134,0.426471,0.0
2,-0.426146,-0.020134,0.90443,0.0
3,0.0,0.0,0.0,1.0


In [195]:
# Could be examined more closely. This expression is not the last one in the
# cell, so nothing is displayed for it.
data[['abdominal_distention', 'outcome']]

# Outcome counts for the horses whose abdominal distension is coded 4 (severe).
severe_distention = data['abdominal_distention'] == 4
data.loc[severe_distention, 'outcome'].value_counts()
# Dataset notes for this attribute:
#  13: abdominal distension
#          - An IMPORTANT parameter.
#          - possible values
#               1 = none (61 live, 7 die, 7 euthanized)
#               2 = slight (47 live, 14 die, 4 euthanized)
#               3 = moderate (26 live, 27 die, 12 euthanized)
#               4 = severe (13 live, 15 die, 10 euthanized)
#          - an animal with abdominal distension is likely to be painful and
#            have reduced gut motility.
#          - a horse with severe abdominal distension is likely to require
#            surgery just to relieve the pressure

2.0    15
1.0    13
3.0    10
Name: outcome, dtype: int64