## Echocardiogram dataset

dataset from: https://archive.ics.uci.edu/ml/datasets/Echocardiogram

In [1]:
import pandas as pd

In [2]:
# The raw file has no header row, so the column labels are supplied by hand,
# in the order the attributes appear in the file.
column_names = [
    'survival',
    'still-alive',
    'age-at-heart-attack',
    'pericardial-effusion',
    'fractional-shortening',
    'epss',
    'lvdd',
    'wall-motion-score',
    'wall-motion-index',
    'mult',
    'name',
    'group',
    'alive_at_1',
]
df = pd.read_csv('./echocardiogram.data', header=None)
df.columns = column_names

In [3]:
# Preview the first rows to check that the column labels line up with the values.
df.head()

Unnamed: 0,survival,still-alive,age-at-heart-attack,pericardial-effusion,fractional-shortening,epss,lvdd,wall-motion-score,wall-motion-index,mult,name,group,alive_at_1
0,11,0,71,0,0.26,9.0,4.6,14,1.0,1.0,name,1,0
1,19,0,72,0,0.38,6.0,4.1,14,1.7,0.588,name,1,0
2,16,0,55,0,0.26,4.0,3.42,14,1.0,1.0,name,1,0
3,57,0,60,0,0.253,12.062,4.603,16,1.45,0.788,name,1,0
4,19,1,57,0,0.16,22.0,5.75,18,2.25,0.571,name,1,0


In [4]:
# (rows, columns) of the table as loaded.
df.shape

(132, 13)

In [5]:
# Count NaN/None per column. A zero here does not mean the data is complete:
# this file marks missing values with the string '?', which isnull() does not detect.
df.isnull().sum()

survival                 0
still-alive              0
age-at-heart-attack      0
pericardial-effusion     0
fractional-shortening    0
epss                     0
lvdd                     0
wall-motion-score        0
wall-motion-index        0
mult                     0
name                     0
group                    0
alive_at_1               0
dtype: int64

In [6]:
# Remove columns that carry no usable signal. Per the data description
# (as noted by the original author):
#   mult  -- a derived variable that can be ignored
#   group -- meaningless, can be ignored
# 'name' is dropped as well; in the preview above every row holds the literal
# string "name" -- presumably an anonymised placeholder (confirm against the
# data description).
redundant_columns = ['group', 'name', 'mult']
df = df.drop(redundant_columns, axis=1)

In [7]:
# Inspect the dtype pandas inferred for each remaining column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 10 columns):
survival                 132 non-null object
still-alive              132 non-null object
age-at-heart-attack      132 non-null object
pericardial-effusion     132 non-null object
fractional-shortening    132 non-null object
epss                     132 non-null object
lvdd                     132 non-null object
wall-motion-score        132 non-null object
wall-motion-index        132 non-null object
alive_at_1               132 non-null object
dtypes: object(10)
memory usage: 10.4+ KB


According to the data description, all features are supposed to be numerical except `name`. Since every column was nevertheless read as `object`, we need to examine the missing attribute values, which are denoted by "?".

In [8]:
def missing_value(df, val):
    """Print how often the placeholder ``val`` occurs in each column of ``df``.

    One line of the form ``<column> : <count>`` is printed for every column
    that contains ``val`` at least once; columns without it are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to scan.
    val : scalar
        Placeholder that marks a missing value (e.g. ``'?'``).

    Returns
    -------
    None
        The counts are only printed.
    """
    for column in df.columns:
        # Compare through the Series (not the raw ndarray) so the comparison
        # stays element-wise even when the column dtype differs from the type
        # of ``val``, e.g. a numeric column checked against the string '?'.
        # A single pass gives both the "is it present" answer and the count.
        count = (df[column] == val).sum()
        if count:
            print(column, ':', count)

In [9]:
# '?' is the marker this file uses for a missing attribute value.
missing_value(df, '?')

survival : 2
still-alive : 1
age-at-heart-attack : 5
pericardial-effusion : 1
fractional-shortening : 8
epss : 15
lvdd : 11
wall-motion-score : 4
wall-motion-index : 1
alive_at_1 : 58


Inspecting missing value from features step by step

In [10]:
# Rows whose survival time is '?' are discarded rather than imputed: the number
# of months the patient survived is unknown, and a guess (e.g. the mode) would
# ignore the effect the other predictor variables have on survival.
df = df[df['survival'] != '?']

In [11]:
# Re-run the per-column '?' count now that the rows with a missing survival value are gone.
missing_value(df, '?')

age-at-heart-attack : 5
fractional-shortening : 7
epss : 14
lvdd : 10
wall-motion-score : 3
wall-motion-index : 1
alive_at_1 : 57


In [12]:
# still-alive and pericardial-effusion no longer appear in the report above, so
# their '?' entries sat in the rows that were just dropped for missing survival.
# Whether the missingness of these variables is related is left for the EDA.

# Remove the rows (not the feature) whose age-at-heart-attack is '?'. The value
# is numerical and the author chose not to infer it from summary statistics.
df = df[df['age-at-heart-attack'] != '?']

# Report the remaining '?' counts per feature.
missing_value(df, '?')

fractional-shortening : 6
epss : 14
lvdd : 9
wall-motion-score : 3
wall-motion-index : 1
alive_at_1 : 53


In [13]:
# Comparing the two reports above: dropping the rows with a missing age also
# removed one '?' from fractional-shortening, one from lvdd and four from
# alive_at_1. Possibly related missingness -- to be examined in the EDA.

# Remove the rows whose fractional-shortening is '?'.
df = df[df['fractional-shortening'] != '?']

# Report the remaining '?' counts per feature.
missing_value(df, '?')

epss : 10
lvdd : 4
wall-motion-score : 2
wall-motion-index : 1
alive_at_1 : 51


In [14]:
# Dropping the rows with a missing fractional-shortening also cleared 4 '?' from
# epss and 5 from lvdd (compare the two reports above). Possibly related
# missingness -- to be examined in the EDA.

# Remove the rows whose epss is '?'.
df = df[df['epss'] != '?']

# Report the remaining '?' counts per feature.
missing_value(df, '?')

lvdd : 1
wall-motion-score : 2
wall-motion-index : 1
alive_at_1 : 46


In [15]:
# Dropping the rows with a missing epss also cleared 3 '?' from lvdd (compare
# the two reports above). Possibly related missingness -- to be examined in the EDA.

# Remove the rows whose lvdd is '?'.
df = df[df['lvdd'] != '?']

# Report the remaining '?' counts per feature.
missing_value(df, '?')

wall-motion-score : 2
wall-motion-index : 1
alive_at_1 : 46


In [16]:
# Per the data description (as noted by the original author), wall-motion-index
# should be used instead of wall-motion-score. Both columns are kept for now so
# that the relationship between the two can be examined later.

# Remove the rows whose wall-motion-score is '?'.
df = df[df['wall-motion-score'] != '?']

# Report the remaining '?' counts per feature.
missing_value(df, '?')

alive_at_1 : 45


There are still 45 missing values in the response (`alive_at_1`); these samples are therefore excluded from training and evaluation and set aside, so that their response can be predicted later on.

In [17]:
# Continue on a copy; `df` itself is not modified by the cells below.
df_new = df.copy()

In [18]:
# Set aside every sample whose response (alive_at_1) is '?'.
response_is_missing = df_new['alive_at_1'] == '?'
missing_response = df_new[response_is_missing]

In [19]:
#df.groupby(['survival', 'still-alive'])[['alive_at_1']].count()

In [20]:
# Keep only the samples with a known response for modelling.
df_new = df_new[df_new['alive_at_1'] != '?']

In [21]:
# Shapes of the labelled samples and of the samples set aside with a '?' response.
df_new.shape, missing_response.shape

((61, 10), (45, 10))

In [22]:
from sklearn.model_selection import train_test_split

# Features are every column except the response, which is the last column.
# NOTE(review): 'survival' and 'still-alive' stay among the features. If
# alive_at_1 is derived from them (check the data description), the models
# below can recover the label directly -- confirm before trusting the scores.
X = df_new.drop('alive_at_1', axis=1)
y = df_new['alive_at_1']

# Hold out 30% of the labelled samples; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Model with Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic regression with default settings on the training split.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Predict once per split and reuse the result for both the confusion matrix
# and the accuracy, instead of calling predict() twice.
lr_train_pred = lr.predict(X_train)
lr_test_pred = lr.predict(X_test)

print('\nTrain Confusion Matrix\n\n', pd.crosstab(y_train, lr_train_pred,
           rownames=['Actual'], colnames=['Predicted']), sep='')
# round() without ndigits rounds to a whole number, which would report any
# accuracy >= 0.5 as 1; keep three decimals so the score stays informative.
print('\nTrain accuracy score:', round(accuracy_score(y_train, lr_train_pred), 3))

print('\nTest Confusion Matrix\n\n', pd.crosstab(y_test, lr_test_pred,
           rownames=['Actual'], colnames=['Predicted']), sep='')
print('\nTest accuracy score:', round(accuracy_score(y_test, lr_test_pred), 3))


Train Confusion Matrix

Predicted   0   1
Actual           
0          31   0
1           0  11

Train accuracy score: 1.0

Test Confusion Matrix

Predicted   0  1
Actual          
0          13  0
1           0  6

Test accuracy score: 1.0


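Looks like we got perfect accuracy on train and test using Logistic Regression. A perfect score on such a small dataset deserves suspicion, though: the features still include `survival` and `still-alive`, and if `alive_at_1` is derived from them (check the data description), the model is merely recovering the label from leaked information rather than learning from the echocardiogram measurements.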

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Seed the forest so that re-running the notebook reproduces the same model;
# the seed matches the one used for the train/test split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Predict once per split and reuse the result for both the confusion matrix
# and the accuracy, instead of calling predict() twice.
rf_train_pred = rf.predict(X_train)
rf_test_pred = rf.predict(X_test)

print('\nTrain Confusion Matrix\n\n', pd.crosstab(y_train, rf_train_pred,
           rownames=['Actual'], colnames=['Predicted']), sep='')
# round() without ndigits rounds to a whole number, which would report any
# accuracy >= 0.5 as 1; keep three decimals so the score stays informative.
print('\nTrain accuracy score:', round(accuracy_score(y_train, rf_train_pred), 3))

print('\nTest Confusion Matrix\n\n', pd.crosstab(y_test, rf_test_pred,
           rownames=['Actual'], colnames=['Predicted']), sep='')
print('\nTest accuracy score:', round(accuracy_score(y_test, rf_test_pred), 3))


Train Confusion Matrix

Predicted   0   1
Actual           
0          31   0
1           0  11

Train accuracy score: 1.0

Test Confusion Matrix

Predicted   0  1
Actual          
0          13  0
1           0  6

Test accuracy score: 1.0


Random Forest yielded perfect score as well.

In [24]:
#df.groupby(['survival', 'still-alive'])[['alive_at_1']].count()

Below we predict the response for the samples whose `alive_at_1` value is missing.

In [None]:
# Predict the response for the samples that were set aside because their
# alive_at_1 value is '?'. The last column (the unknown response) is left out
# of the features passed to the model.
missing_features = missing_response.iloc[:, :-1]
prediction = lr.predict(missing_features)