In [19]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import explained_variance_score
from sklearn.decomposition import PCA

In [51]:
heart = pd.read_csv('../mydata/heart.csv')
heart.head(), heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


(   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
 0   63    1   3       145   233    1        0      150      0      2.3      0   
 1   37    1   2       130   250    0        1      187      0      3.5      0   
 2   41    0   1       130   204    0        0      172      0      1.4      2   
 3   56    1   1       120   236    0        1      178      0      0.8      2   
 4   57    0   0       120   354    0        1      163      1      0.6      2   
 
    ca  thal  target  
 0   0     1       1  
 1   0     2       1  
 2   0     2       1  
 3   0     2       1  
 4   0     2       1  ,
 None)

### Using all available variables

In [74]:
X = heart.drop(['target'], axis=1).values
y = heart['target'].values

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = LinearRegression()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Explained Variance Score:', explained_variance_score(y_test, y_pred, multioutput='uniform_average'))

Mean Absolute Error: 0.2981361736043105
Mean Squared Error: 0.1309909385704733
Root Mean Squared Error: 0.3619267033122498
Explained Variance Score: 0.47141230607339435


In [43]:
X = heart.drop(['target'], axis=1)

pca = PCA()
pca.fit(X)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)
print(pca.explained_variance_)

result_df = pd.DataFrame(zip(heart.columns, pca.explained_variance_ratio_,pca.singular_values_,pca.explained_variance_), columns=['Variable', 'Variance Explained Ratio','Singular Value','Variance Explained'])
result_df.sort_values('Variance Explained Ratio', inplace=True, ascending=False)
result_df.head(20)

[7.47564199e-01 1.50370221e-01 8.45968470e-02 1.62159604e-02
 3.84085658e-04 2.81234229e-04 2.28993383e-04 9.98328415e-05
 7.72143117e-05 5.87402295e-05 5.02927817e-05 4.09679747e-05
 3.14113086e-05]
[902.30333787 404.67766479 303.53264388 132.89224968  20.45230975
  17.50098149  15.79209627  10.42713587   9.17016645   7.99827139
   7.40084222   6.67960166   5.84886142]
[2.69586528e+03 5.42264942e+02 3.05073066e+02 5.84779802e+01
 1.38508932e+00 1.01418660e+00 8.25795710e-01 3.60017094e-01
 2.78450175e-01 2.11828958e-01 1.81365780e-01 1.47738670e-01
 1.13275430e-01]


Unnamed: 0,Variable,Variance Explained Ratio,Singular Value,Variance Explained
0,age,0.747564,902.303338,2695.865277
1,sex,0.15037,404.677665,542.264942
2,cp,0.084597,303.532644,305.073066
3,trestbps,0.016216,132.89225,58.47798
4,chol,0.000384,20.45231,1.385089
5,fbs,0.000281,17.500981,1.014187
6,restecg,0.000229,15.792096,0.825796
7,thalach,0.0001,10.427136,0.360017
8,exang,7.7e-05,9.170166,0.27845
9,oldpeak,5.9e-05,7.998271,0.211829


### Selecting only the top 4 variables

In [48]:
X = heart[['age','sex','cp','trestbps']]
y = heart['target']

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = LinearRegression()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Explained Variance Score:', explained_variance_score(y_test, y_pred, multioutput='uniform_average'))

Mean Absolute Error: 0.38162421435059324
Mean Squared Error: 0.18253522437460765
Root Mean Squared Error: 0.4272414122888928
Explained Variance Score: 0.26019533021946895


## Trying out some outlier removal techniques to improve the results

### IsolationForest

In [75]:
from sklearn.ensemble import IsolationForest

X = heart[['age','sex','cp','trestbps']]
y = heart['target']

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
X_train = X_train[yhat!=-1].reset_index(drop=True)
y_train = y_train[yhat!=-1].reset_index(drop=True)

model = LinearRegression()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Explained Variance Score:', explained_variance_score(y_test, y_pred, multioutput='uniform_average'))



Mean Absolute Error: 0.3740375865481862
Mean Squared Error: 0.17966534010946378
Root Mean Squared Error: 0.4238694847585325
Explained Variance Score: 0.27180054865831293


### EllipticEnvelope

In [81]:
from sklearn.covariance import EllipticEnvelope

X = heart[['age','sex','cp','trestbps']]
y = heart['target']

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)

X_train = X_train[yhat!=-1].reset_index(drop=True)
y_train = y_train[yhat!=-1].reset_index(drop=True)

model = LinearRegression()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Explained Variance Score:', explained_variance_score(y_test, y_pred, multioutput='uniform_average'))



Mean Absolute Error: 0.3821790225707704
Mean Squared Error: 0.1826146745626652
Root Mean Squared Error: 0.42733438261233464
Explained Variance Score: 0.25985961719392303


### LocalOutlierFactor

In [84]:
from sklearn.covariance import EllipticEnvelope

X = heart[['age','sex','cp','trestbps']]
y = heart['target']

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)

X_train = X_train[yhat!=-1].reset_index(drop=True)
y_train = y_train[yhat!=-1].reset_index(drop=True)

model = LinearRegression()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Explained Variance Score:', explained_variance_score(y_test, y_pred, multioutput='uniform_average'))



Mean Absolute Error: 0.38243078992934726
Mean Squared Error: 0.1827169037808292
Root Mean Squared Error: 0.4274539785530475
Explained Variance Score: 0.25946419801144327


### OneClassSVM

In [85]:
from sklearn.svm import OneClassSVM

X = heart[['age','sex','cp','trestbps']]
y = heart['target']

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

ocs = OneClassSVM()
yhat = ocs.fit_predict(X_train)

X_train = X_train[yhat!=-1].reset_index(drop=True)
y_train = y_train[yhat!=-1].reset_index(drop=True)

model = LinearRegression()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Explained Variance Score:', explained_variance_score(y_test, y_pred, multioutput='uniform_average'))



Mean Absolute Error: 0.3806347254530158
Mean Squared Error: 0.18710249144570715
Root Mean Squared Error: 0.43255345501533926
Explained Variance Score: 0.2421069898141972
