In [1]:
"""
Created on Wed Mar  8 12:04:48 2023
@author: Ahmad Al Musawi
"""

from sklearn.manifold import SpectralEmbedding
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import numpy as np
from scipy import stats
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score



In [2]:
def split_labels(df, cols):
    '''split the dataframe into predicting table and labels
       df: given dataset
       cols: list of labels
    '''
    return df[[i for i in df if i not in cols]], df[cols]

In [3]:
def LinearSVM(X_train,y_train, X_test):
    clf = LinearSVC(random_state=42)
    clf.fit(X_train, y_train)
    
    # make predictions on the test set
    y_pred = clf.predict(X_test)
    return y_pred

def GaussianSVM(X_train,y_train, X_test):
#     print('implementing SVM...')
    clf = SVC(kernel='rbf', C=1.0) # Gaussian radial basis function (RBF) kernel
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_pred

def PolySVM(X_train,y_train, X_test):
#     print('implementing SVM...')
    clf = SVC(kernel='poly', degree=2, coef0=1, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_pred

def SigmoidSVM(X_train,y_train, X_test):
#     print('implementing SVM...')
    clf = SVC(kernel='sigmoid', gamma='scale', random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_pred

def NaiveBayes(X_train,y_train, X_test):
#     print('implementing Naive Bayes...')
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_pred

def Logistic(X_train,y_train, X_test):
#     print('implementing Logistic Regression...')
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    
    # make predictions on the test set
    y_pred = clf.predict(X_test)    
    return y_pred

   
def CART(X_train,y_train, X_test):
#     print('implementing CART...')
    clf = DecisionTreeClassifier(max_depth=5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_pred

def kNN(X_train,y_train, X_test):
#     print('implementing kNN...')
    from sklearn.neighbors import KNeighborsClassifier

    # assuming X is your data and k is the number of clusters
    clf = KNeighborsClassifier(n_neighbors=10)
    clf.fit(X_train, y_train)
    return clf.predict(X_test)

In [4]:
def one_split(X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test 

def get_classification_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    num_classes = cm.shape[0]
    sensitivity = []
    specificity = []
    for i in range(num_classes):
        tp = cm[i,i]
        fn = np.sum(cm[i,:]) - tp
        fp = np.sum(cm[:,i]) - tp
        tn = np.sum(cm) - tp - fp - fn
        
        sensitivity_i = tp / (tp + fn)
        specificity_i = tn / (tn + fp)
        sensitivity.append(sensitivity_i)
        specificity.append(specificity_i)
    
    macro_sensitivity = np.mean(sensitivity)
    macro_specificity = np.mean(specificity)
    
    return accuracy, macro_sensitivity, macro_specificity

def predict(X,Y):
    X_train, X_test, y_train, y_test = one_split(X, Y)
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    pred_Y = [pred(X_train,y_train, X_test) for pred in predictors]
    return [get_classification_metrics(y_test, p) for p in pred_Y]


# Handling Outliers

In [5]:
def remove_outliers(df):
    # calculate the IQR for each column
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    df_clean = pd.DataFrame()
    for col in df.columns:
        lower = Q1[col] - 1.5 * IQR[col]
        upper = Q3[col] + 1.5 * IQR[col]
        df_clean[col] = df[(df[col] >= lower) & (df[col] <= upper)][col]
    return df_clean

# Data imputation

In [6]:
from sklearn.impute import SimpleImputer
def data_imputation(X):
    imputer = SimpleImputer(strategy='mean')
    imputer.fit(X)
    X_imputed = pd.DataFrame(imputer.transform(X), columns=X.columns)
    return X_imputed


In [7]:
# df1 = remove_outliers(df1)
# df1 = data_imputation(df1)

# df2 = remove_outliers(df2)
# df2 = data_imputation(df2)

# df1 = pd.read_excel('cleveland data.xlsx')
df1 = pd.read_csv('heart.csv')

# Preprocessing
dataset = df1
from pandas import get_dummies
a = pd.get_dummies(dataset['sex'], prefix = "sex")
b = pd.get_dummies(dataset['cp'], prefix = "cp")
c = pd.get_dummies(dataset['fbs'], prefix = "fbs")
d = pd.get_dummies(dataset['restecg'], prefix = "restecg")
e = pd.get_dummies(dataset['exang'], prefix = "exang")
f = pd.get_dummies(dataset['slope'], prefix = "slope")
g = pd.get_dummies(dataset['ca'], prefix = "ca")
h = pd.get_dummies(dataset['thal'], prefix = "thal")

frames = [dataset, a, b, c, d, e, f, g, h]
dataset2 = pd.concat(frames, axis = 1)
dataset2 = dataset2.drop(columns = ['sex','cp', 'fbs', 'restecg','exang','slope','ca','thal'])

df1 = dataset2
X1, Y1 = split_labels(df1, ['target'])

# nX1 = stats.zscore(X1)



In [8]:
# Load the text file into a DataFrame
# df1 = pd.read_csv('processed.cleveland.data', delimiter=',', header=None)
# df1 = pd.read_excel('cleveland data.xlsx')

df2 = pd.read_excel('CTG.xls', sheet_name = 'Raw Data')

df2 = df2[[i for i in df2 if i not in ['FileName','Date','SegFile']]]

X2, Y2 = split_labels(df2, ['NSP'])

nX2 = stats.zscore(X2)

In [9]:


# predictors = [ NaiveBayes, Logistic, CART, kNN]
# predictorsTXT = ['NaiveBayes', 'Logistic', 'CART', 'kNN']
predictors = [LinearSVM, GaussianSVM, PolySVM, SigmoidSVM, NaiveBayes, Logistic, CART, kNN]
predictorsTXT = ['LinearSVM', 'GaussianSVM', 'PolySVM', 'SigmoidSVM', 'NaiveBayes', 'Logistic', 'CART', 'kNN']

results1 = predict(X1, Y1)
results2 = predict(X2, Y2)

print('Heart Disease')
acc, sen, spe  = [],[],[]
for a, s, e in results1:
    acc.append(a)
    sen.append(s)
    spe.append(e)
print(pd.DataFrame({'model': predictorsTXT,'accuracy': acc, 'sensitivity': sen, 'specificity': spe}))


print('Heart Disease2')
acc, sen, spe = [],[],[]
for a, s, e  in results2:
    acc.append(a)
    sen.append(s)
    spe.append(e)
print(pd.DataFrame({'model': predictorsTXT,'accuracy': acc, 'sensitivity': sen, 'specificity': spe}))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Heart Disease
         model  accuracy  sensitivity  specificity
0    LinearSVM  0.814935     0.816323     0.816323
1  GaussianSVM  0.918831     0.919484     0.919484
2      PolySVM  0.870130     0.870837     0.870837
3   SigmoidSVM  0.795455     0.797877     0.797877
4   NaiveBayes  0.733766     0.740239     0.740239
5     Logistic  0.798701     0.799755     0.799755
6         CART  0.879870     0.882593     0.882593
7          kNN  0.863636     0.863704     0.863704
Heart Disease2
         model  accuracy  sensitivity  specificity
0    LinearSVM  0.989028     0.979953     0.990475
1  GaussianSVM  0.985893     0.973352     0.985780
2      PolySVM  0.989028     0.979953     0.990475
3   SigmoidSVM  0.957680     0.933625     0.968573
4   NaiveBayes  0.913793     0.947695     0.960680
5     Logistic  0.989028     0.979953     0.990475
6         CART  0.971787     0.964675     0.978467
7          kNN  0.985893     0.973352     0.985780


In [10]:
def PCA_model(X, y=None, n = 2):
#     print("PCA model")
    pca = PCA(n_components=n)
    pca.fit(X)
    X_pca = pca.transform(X)
    print(pca.explained_variance_ratio_)
    selected_features = pca.components_
    print(f'PCA\tNoF = {len(selected_features)}')
    return X_pca

def Kernel_PCA(X, y=None, n = 2):
    from sklearn.decomposition import KernelPCA
#     print("Kernal PCA model")
    pca = KernelPCA(n_components=n, kernel='rbf')
    pca.fit_transform(X)
    return pca

def CE_Model(X, y=None, n=2):
#     print('CE Model')
    embedding = SpectralEmbedding(n_components=n)
    X_CE = embedding.fit_transform(X)
    print(f'CE\tOld shape = {X.shape}\t\t new shape = {X_CE.shape}\t\t components = {n}')
    return X_CE

def CE2(X, y=None, n=2):
#     print('CE Model: Laplacian Eigenmaps')
    embedding = SpectralEmbedding(n_components=n, affinity='nearest_neighbors', n_neighbors=10, eigen_solver='arpack')
    X_CE = embedding.fit_transform(X)
    print(f'CE2\tOld shape = {X.shape}\t\t new shape = {X_CE.shape}\t\t components = {n}')
    return X_CE

def LLE(X, y=None, n=2):
    from sklearn.manifold import LocallyLinearEmbedding
#     print('CE Model: Locally Linear Embedding')
    embedding = LocallyLinearEmbedding(n_components=n, n_neighbors=10)
    X_CE = embedding.fit_transform(X)
    print(f'LLE\tOld shape = {X.shape}\t\t new shape = {X_CE.shape}\t\t components = {n}')
    return X_CE

def Isomap(X, y=None, n=2):
    from sklearn.manifold import Isomap
#     print('CE Model: Isomap')
    embedding =  Isomap(n_components=n, n_neighbors=10)
    X_CE = embedding.fit_transform(X)
    print(f'ISOMAP\tOld shape = {X.shape}\t\t new shape = {X_CE.shape}\t\t components = {n}')
    return X_CE

def TSNE(X, y=None, n=2):
    from sklearn.manifold import TSNE
#     print('CE Model: TSNE')
    embedding = TSNE(n_components=2, perplexity=30, n_iter=1000)
    X_CE = embedding.fit_transform(X)
    print(f'TSNE\tOld shape = {X.shape}\t\t new shape = {X_CE.shape}\t\t components = {n}')
    return X_CE



def CFS(X, y, n=2):
#     print('CFS Model')
    selector = SelectKBest(score_func=f_regression, k=5)
    X_new = selector.fit_transform(X, y)
    return X_new

def LLCFS(X, y=None,n=2):
#     print('LLCFS Model')
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return X

def ILFS(X, y):
    # create a linear regression model
#     print('ILFS Model')
    model = LinearRegression()
    
    # define the search space
    k_features = np.arange(1, X.shape[1]+1)
    
    # create a sequential feature selector object
    selector = SequentialFeatureSelector(model, k_features=k_features, forward=True, scoring='r2', cv=5)
    
    # perform incremental feature selection
    selector.fit(X, y)
    
    # print the selected feature indices
    print("Indices of selected features:", selector.k_feature_idx_)

from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, average_precision_score

In [12]:
# performing experiment 2    

DR = [ CFS, LLCFS, PCA_model, CE_Model, CE2, LLE, Isomap, TSNE,]
DR_TXT = [ 'CFS', 'LLCFS', 'PCA_model', 'CE_Model', 'CE2', 'LLE', 'Isomap', 'TSNE', ]


NoF = X1.shape[1]-3 # Number of features
X1s = [d(X1, np.ravel(Y1), NoF) for d in DR] # dimension reduction
results1 = [predict(x1, np.ravel(Y1)) for x1 in X1s] # Machine learning models

finals = []
for i in range(len(DR)):
    rd = DR_TXT[i]
    print(rd)
    acc, sen, spe = [],[],[]
    for a, s, e in results1[i]:
        acc.append(a)
        sen.append(s)
        spe.append(e)
    finals.append(pd.DataFrame({'model': predictorsTXT,'accuracy': acc, 'sensitivity': sen, 'specificity': spe}))


finalR = {}
for i in predictorsTXT:
    D = pd.DataFrame() # create an empty DataFrame to hold the filtered rows     
    for df in finals:
        row = df[df['model'] == i]
        D = D.append(row) # filter rows that match a certain condition and append them to D
    D['DR'] = DR_TXT
    D = D.drop('model', axis=1)
    finalR[i] = D

for i in finalR:
    print(i)
    print(finalR[i][['DR', 'accuracy' , 'sensitivity',  'specificity']])
    print()

[7.45250270e-01 1.51989441e-01 8.49212797e-02 1.64983040e-02
 3.79041061e-04 1.68144814e-04 1.31375727e-04 1.07866598e-04
 9.27031645e-05 8.02546887e-05 7.18502447e-05 6.69134612e-05
 6.06049535e-05 4.64435282e-05 3.62096578e-05 2.45265630e-05
 2.29950682e-05 1.94622818e-05 1.90456946e-05 5.57160064e-06
 5.27786510e-06 2.41843846e-06 4.29083350e-33 4.29083350e-33
 4.29083350e-33 4.29083350e-33 4.29083350e-33]
PCA	NoF = 27
CE	Old shape = (1025, 30)		 new shape = (1025, 27)		 components = 27




CE2	Old shape = (1025, 30)		 new shape = (1025, 27)		 components = 27
LLE	Old shape = (1025, 30)		 new shape = (1025, 27)		 components = 27


  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


ISOMAP	Old shape = (1025, 30)		 new shape = (1025, 27)		 components = 27




TSNE	Old shape = (1025, 30)		 new shape = (1025, 2)		 components = 27


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


CFS
LLCFS
PCA_model
CE_Model
CE2
LLE
Isomap
TSNE
LinearSVM
          DR  accuracy  sensitivity  specificity
0        CFS  0.818182     0.819889     0.819889
0      LLCFS  0.814935     0.816323     0.816323
0  PCA_model  0.824675     0.826179     0.826179
0   CE_Model  0.665584     0.664704     0.664704
0        CE2  0.678571     0.679604     0.679604
0        LLE  0.707792     0.706851     0.706851
0     Isomap  0.694805     0.696172     0.696172
0       TSNE  0.665584     0.665126     0.665126

GaussianSVM
          DR  accuracy  sensitivity  specificity
1        CFS  0.824675     0.826390     0.826390
1      LLCFS  0.918831     0.919484     0.919484
1  PCA_model  0.899351     0.899561     0.899561
1   CE_Model  0.801948     0.802267     0.802267
1        CE2  0.724026     0.723629     0.723629
1        LLE  0.717532     0.716496     0.716496
1     Isomap  0.720779     0.722806     0.722806
1       TSNE  0.688312     0.687350     0.687350

PolySVM
          DR  accuracy  sensitivity  

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [13]:
nX2 = stats.zscore(X2)

NoF = X2.shape[1]-3 # Number of features
X2s = [d(X2, np.ravel(Y2), NoF) for d in DR]

results2 = [predict(x2, np.ravel(Y2)) for x2 in X2s]
finals = []
for i in range(len(DR)):
    rd = DR_TXT[i]
    acc, sen, spe = [],[],[]
    for a, s, e in results2[i]:
        acc.append(a)
        sen.append(s)
        spe.append(e)
    finals.append(pd.DataFrame({'model': predictorsTXT,'accuracy': acc, 'sensitivity': sen, 'specificity': spe}))
    
    
finalR = {}
for i in predictorsTXT:
    D = pd.DataFrame() # create an empty DataFrame to hold the filtered rows     
    for df in finals:
        row = df[df['model'] == i]
        D = D.append(row) # filter rows that match a certain condition and append them to D
    D['DR'] = DR_TXT
    D = D.drop('model', axis=1)
    finalR[i] = D

for i in finalR:
    print(i)
    print(finalR[i][['DR', 'accuracy' , 'sensitivity',  'specificity']])
    print()

  correlation_coefficient /= X_norms


[9.68713086e-01 2.74234961e-02 1.81937840e-03 7.64856806e-04
 5.10486842e-04 2.95086213e-04 1.99060233e-04 1.07638969e-04
 8.67742844e-05 3.23837027e-05 1.76585343e-05 1.28489755e-05
 4.04401212e-06 3.54801270e-06 2.88068144e-06 2.62541850e-06
 2.14270955e-06 1.17896369e-06 2.54867404e-07 1.58914087e-07
 1.03668102e-07 7.68200538e-08 6.24574875e-08 5.89534241e-08
 3.44613336e-08 2.36407207e-08 1.76699439e-08 1.57927953e-08
 1.07041155e-08 5.87836588e-09 1.66851486e-09 4.77179627e-33
 4.77179627e-33]
PCA	NoF = 33
CE	Old shape = (2126, 36)		 new shape = (2126, 33)		 components = 33
CE2	Old shape = (2126, 36)		 new shape = (2126, 33)		 components = 33
LLE	Old shape = (2126, 36)		 new shape = (2126, 33)		 components = 33
ISOMAP	Old shape = (2126, 36)		 new shape = (2126, 33)		 components = 33




TSNE	Old shape = (2126, 36)		 new shape = (2126, 2)		 components = 33


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


LinearSVM
          DR  accuracy  sensitivity  specificity
0        CFS  0.942006     0.896713     0.930412
0      LLCFS  0.989028     0.979953     0.990475
0  PCA_model  0.987461     0.976652     0.988127
0   CE_Model  0.780564     0.345191     0.674815
0        CE2  0.769592     0.350999     0.684283
0        LLE  0.789969     0.399228     0.715361
0     Isomap  0.789969     0.370249     0.697533
0       TSNE  0.777429     0.333333     0.666667

GaussianSVM
          DR  accuracy  sensitivity  specificity
1        CFS  0.952978     0.896160     0.931304
1      LLCFS  0.985893     0.973352     0.985780
1  PCA_model  0.987461     0.974024     0.986401
1   CE_Model  0.789969     0.394398     0.697720
1        CE2  0.783699     0.357048     0.688143
1        LLE  0.808777     0.470865     0.739640
1     Isomap  0.794671     0.427100     0.715122
1       TSNE  0.777429     0.333333     0.666667

PolySVM
          DR  accuracy  sensitivity  specificity
2        CFS  0.952978     0.896160  

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
