In [1]:
"""
Created on Wed Mar  8 12:04:48 2023
@author: Ahmad Al Musawi
"""

from sklearn.manifold import SpectralEmbedding
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import numpy as np
from scipy import stats
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score



In [2]:
def preprocessing(df):
    print('preprocessing...')
    return df

def split_labels(df, cols):
    '''split the dataframe into predicting table and labels
       df: given dataset
       cols: list of labels
    '''
    return df[[i for i in df if i not in cols]], df[cols]

In [3]:
def LinearSVM(X_train,y_train, X_test):
    svm = LinearSVC(random_state=42)
    svm.fit(X_train, y_train)
    
    # make predictions on the test set
    y_pred = svm.predict(X_test)
    return y_pred

def GaussianSVM(X_train,y_train, X_test):
#     print('implementing SVM...')
    clf = SVC(kernel='rbf', C=1.0) # Gaussian radial basis function (RBF) kernel
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_pred

def PolySVM(X_train,y_train, X_test):
#     print('implementing SVM...')
    clf = SVC(kernel='poly', degree=2, coef0=1, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_pred

def SigmoidSVM(X_train,y_train, X_test):
#     print('implementing SVM...')
    clf = SVC(kernel='sigmoid', gamma='scale', random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_pred

def NaiveBayes(X_train,y_train, X_test):
#     print('implementing Naive Bayes...')
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    return y_pred

def Logistic(X_train,y_train, X_test):
#     print('implementing Logistic Regression...')
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    
    # make predictions on the test set
    y_pred = lr.predict(X_test)    
    return y_pred

def CART(X_train,y_train, X_test):
#     print('implementing CART...')
    clf = DecisionTreeClassifier(max_depth=5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_pred

def kNN(X_train,y_train, X_test):
#     print('implementing kNN...')
    from sklearn.neighbors import KNeighborsClassifier

    # assuming X is your data and k is the number of clusters
    knn = KNeighborsClassifier(n_neighbors=10)
    knn.fit(X_train, y_train)
    return knn.predict(X_test)

In [4]:
# Load the text file into a DataFrame
# df1 = pd.read_csv('processed.cleveland.data', delimiter=',', header=None)
df1 = pd.read_excel('cleveland data.xlsx')
df2 = pd.read_excel('CTG.xls', sheet_name = 'Raw Data')

df2 = df2[[i for i in df2 if i not in ['FileName','Date','SegFile']]]
print(df1.head())
print()
print(df2.head())

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   1       145   233    1        2      150      0      2.3      3   
1   67    1   4       160   286    0        2      108      1      1.5      2   
2   67    1   4       120   229    0        2      129      1      2.6      2   
3   37    1   3       130   250    0        0      187      0      3.5      3   
4   41    0   2       130   204    0        2      172      0      1.4      1   

   ca  thal  num  
0   0     6    0  
1   3     3    2  
2   2     7    1  
3   0     3    0  
4   0     3    0  

     b     e  LBE   LB  AC  FM  UC  ASTV  MSTV  ALTV  ...  C  D  E  AD  DE  \
0  240   357  120  120   0   0   0    73   0.5    43  ...  0  0  0   0   0   
1    5   632  132  132   4   0   4    17   2.1     0  ...  0  0  0   1   0   
2  177   779  133  133   2   0   5    16   2.1     0  ...  0  0  0   1   0   
3  411  1192  134  134   2   0   6    16   2.4     0  ...  0  0  0   1   0   
4  533  

In [5]:
def one_split(X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test 

def get_classification_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    num_classes = cm.shape[0]
    sensitivity = []
    specificity = []
    for i in range(num_classes):
        tp = cm[i,i]
        fn = np.sum(cm[i,:]) - tp
        fp = np.sum(cm[:,i]) - tp
        tn = np.sum(cm) - tp - fp - fn
        
        sensitivity_i = tp / (tp + fn)
        specificity_i = tn / (tn + fp)
        sensitivity.append(sensitivity_i)
        specificity.append(specificity_i)
    
    macro_sensitivity = np.mean(sensitivity)
    macro_specificity = np.mean(specificity)
    
    return accuracy, macro_sensitivity, macro_specificity

def predict(X,Y):
    X_train, X_test, y_train, y_test = one_split(X, Y)
    pred_Y = [pred(X_train,y_train, X_test) for pred in predictors]
    return [get_classification_metrics(y_test, p) for p in pred_Y]


# Handling Outliers

In [6]:
def remove_outliers(df):
    # calculate the IQR for each column
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    df_clean = pd.DataFrame()
    for col in df.columns:
        lower = Q1[col] - 1.5 * IQR[col]
        upper = Q3[col] + 1.5 * IQR[col]
        df_clean[col] = df[(df[col] >= lower) & (df[col] <= upper)][col]
    return df_clean

# Data imputation

In [13]:
from sklearn.impute import SimpleImputer
def data_imputation(X):
    imputer = SimpleImputer(strategy='mean')
    imputer.fit(X)
    X_imputed = pd.DataFrame(imputer.transform(X), columns=X.columns)
    return X_imputed


In [14]:
X1, Y1 = split_labels(df1, ['num'])
X2, Y2 = split_labels(df2, ['CLASS'])

X1 = remove_outliers(X1)
X1 = data_imputation(X1)
nX1 = stats.zscore(X1)

X2 = remove_outliers(X2)
X2 = data_imputation(X2)
nX2 = stats.zscore(X2)

# predictors = [ NaiveBayes, Logistic, CART, kNN]
# predictorsTXT = ['NaiveBayes', 'Logistic', 'CART', 'kNN']
predictors = [LinearSVM, GaussianSVM, PolySVM, SigmoidSVM, NaiveBayes, Logistic, CART, kNN]
predictorsTXT = ['LinearSVM', 'GaussianSVM', 'PolySVM', 'SigmoidSVM', 'NaiveBayes', 'Logistic', 'CART', 'kNN']

results1 = predict(X1, Y1)
results2 = predict(X2, Y2)

print('Heart Disease')
acc, sen, spe = [],[],[]
for a, s, e in results1:
    acc.append(a)
    sen.append(s)
    spe.append(e)
print(pd.DataFrame({'model': predictorsTXT,'accuracy': acc, 'sensitivity': sen, 'specificity': spe}))


print('Heart Disease2')
acc, sen, spe = [],[],[]
for a, s, e  in results2:
    acc.append(a)
    sen.append(s)
    spe.append(e)
print(pd.DataFrame({'model': predictorsTXT,'accuracy': acc, 'sensitivity': sen, 'specificity': spe}))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return self._fit(X, y)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Heart Disease
         model  accuracy  sensitivity  specificity
0    LinearSVM  0.538462     0.216667     0.813254
1  GaussianSVM  0.527473     0.200000     0.800000
2      PolySVM  0.527473     0.200000     0.800000
3   SigmoidSVM  0.527473     0.200000     0.800000
4   NaiveBayes  0.516484     0.311961     0.872932
5     Logistic  0.582418     0.318824     0.872913
6         CART  0.516484     0.254559     0.862713
7          kNN  0.527473     0.231029     0.830503
Heart Disease2
         model  accuracy  sensitivity  specificity
0    LinearSVM  0.528213     0.436226     0.943564
1  GaussianSVM  0.266458     0.100847     0.900213
2      PolySVM  0.271160     0.104301     0.901031
3   SigmoidSVM  0.233542     0.120081     0.904146
4   NaiveBayes  0.746082     0.759912     0.972915
5     Logistic  0.489028     0.311814     0.936286
6         CART  0.865204     0.694194     0.984264
7          kNN  0.341693     0.218845     0.916834


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return self._fit(X, y)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [15]:
def PCA_model(X, y=None, n = 2):
#     print("PCA model")
    pca = PCA(n_components=n)
    pca.fit(X)
    X_pca = pca.transform(X)
    print(pca.explained_variance_ratio_)
    selected_features = pca.components_
    print(f'PCA\tNoF = {len(selected_features)}')
    return X_pca

def Kernel_PCA(X, y=None, n = 2):
    from sklearn.decomposition import KernelPCA
#     print("Kernal PCA model")
    pca = KernelPCA(n_components=n, kernel='rbf')
    pca.fit_transform(X)
    return pca

def CE_Model(X, y=None, n=2):
#     print('CE Model')
    embedding = SpectralEmbedding(n_components=n)
    X_CE = embedding.fit_transform(X)
    print(f'CE\tOld shape = {X.shape}\t\t new shape = {X_CE.shape}\t\t components = {n}')
    return X_CE

def CE2(X, y=None, n=2):
#     print('CE Model: Laplacian Eigenmaps')
    embedding = SpectralEmbedding(n_components=n, affinity='nearest_neighbors', n_neighbors=10, eigen_solver='arpack')
    X_CE = embedding.fit_transform(X)
    print(f'CE2\tOld shape = {X.shape}\t\t new shape = {X_CE.shape}\t\t components = {n}')
    return X_CE

def LLE(X, y=None, n=2):
    from sklearn.manifold import LocallyLinearEmbedding
#     print('CE Model: Locally Linear Embedding')
    embedding = LocallyLinearEmbedding(n_components=n, n_neighbors=10)
    X_CE = embedding.fit_transform(X)
    print(f'LLE\tOld shape = {X.shape}\t\t new shape = {X_CE.shape}\t\t components = {n}')
    return X_CE

def Isomap(X, y=None, n=2):
    from sklearn.manifold import Isomap
#     print('CE Model: Isomap')
    embedding =  Isomap(n_components=n, n_neighbors=10)
    X_CE = embedding.fit_transform(X)
    print(f'ISOMAP\tOld shape = {X.shape}\t\t new shape = {X_CE.shape}\t\t components = {n}')
    return X_CE

def TSNE(X, y=None, n=2):
    from sklearn.manifold import TSNE
#     print('CE Model: TSNE')
    embedding = TSNE(n_components=2, perplexity=30, n_iter=1000)
    X_CE = embedding.fit_transform(X)
    print(f'TSNE\tOld shape = {X.shape}\t\t new shape = {X_CE.shape}\t\t components = {n}')
    return X_CE



def CFS(X, y, n=2):
#     print('CFS Model')
    selector = SelectKBest(score_func=f_regression, k=5)
    X_new = selector.fit_transform(X, y)
    return X_new

def LLCFS(X, y=None,n=2):
#     print('LLCFS Model')
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return X

def ILFS(X, y):
    # create a linear regression model
#     print('ILFS Model')
    model = LinearRegression()
    
    # define the search space
    k_features = np.arange(1, X.shape[1]+1)
    
    # create a sequential feature selector object
    selector = SequentialFeatureSelector(model, k_features=k_features, forward=True, scoring='r2', cv=5)
    
    # perform incremental feature selection
    selector.fit(X, y)
    
    # print the selected feature indices
    print("Indices of selected features:", selector.k_feature_idx_)

from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, average_precision_score

In [48]:
# performing experiment 2    

DR = [ CFS, LLCFS, PCA_model, CE_Model, CE2, LLE, Isomap, TSNE,]
DR_TXT = [ 'CFS', 'LLCFS', 'PCA_model', 'CE_Model', 'CE2', 'LLE', 'Isomap', 'TSNE', ]

# for lol in range(2, nX1.shape[1]):

NoF = nX1.shape[1]-3 # Number of features
X1s = [d(X1, np.ravel(Y1), NoF) for d in DR] # dimension reduction
results1 = [predict(x1, np.ravel(Y1)) for x1 in X1s] # Machine learning models

finals = []
for i in range(len(DR)):
    rd = DR_TXT[i]
    print(rd)
    acc, sen, spe = [],[],[]
    for a, s, e in results1[i]:
        acc.append(a)
        sen.append(s)
        spe.append(e)
    finals.append(pd.DataFrame({'model': predictorsTXT,'accuracy': acc, 'sensitivity': sen, 'specificity': spe}))


finalR = {}
for i in predictorsTXT:
    D = pd.DataFrame() # create an empty DataFrame to hold the filtered rows     
    for df in finals:
        row = df[df['model'] == i]
        D = D.append(row) # filter rows that match a certain condition and append them to D
    D['DR'] = DR_TXT
    D = D.drop('model', axis=1)
    finalR[i] = D

for i in finalR:
    print(i)
    print(finalR[i][['DR', 'accuracy' , 'sensitivity',  'specificity']])
    print()

  correlation_coefficient /= X_norms


[7.10146162e-01 1.84093080e-01 8.20609547e-02 2.12060503e-02
 1.32595793e-03 3.80166837e-04 3.03337886e-04 1.59616295e-04
 1.33089911e-04 7.78932435e-05]
PCA	NoF = 10
CE	Old shape = (303, 13)		 new shape = (303, 10)		 components = 10
CE2	Old shape = (303, 13)		 new shape = (303, 10)		 components = 10
LLE	Old shape = (303, 13)		 new shape = (303, 10)		 components = 10
ISOMAP	Old shape = (303, 13)		 new shape = (303, 10)		 components = 10
TSNE	Old shape = (303, 13)		 new shape = (303, 2)		 components = 10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


CFS
LLCFS
PCA_model
CE_Model
CE2
LLE
Isomap
TSNE
LinearSVM
          DR  accuracy  sensitivity  specificity
0        CFS  0.571429     0.255294     0.832244
0      LLCFS  0.637363     0.371961     0.880598
0  PCA_model  0.560440     0.331225     0.878966
0   CE_Model  0.527473     0.200000     0.800000
0        CE2  0.527473     0.200000     0.800000
0        LLE  0.538462     0.211765     0.808548
0     Isomap  0.439560     0.301029     0.832505
0       TSNE  0.527473     0.200000     0.800000

GaussianSVM
          DR  accuracy  sensitivity  specificity
1        CFS  0.527473     0.200000     0.800000
1      LLCFS  0.593407     0.307794     0.869491
1  PCA_model  0.538462     0.211765     0.804651
1   CE_Model  0.538462     0.211765     0.818524
1        CE2  0.538462     0.211765     0.818290
1        LLE  0.527473     0.200000     0.804068
1     Isomap  0.527473     0.200000     0.805845
1       TSNE  0.527473     0.200000     0.800000

PolySVM
          DR  accuracy  sensitivity  

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [50]:
nX2 = stats.zscore(X2)

NoF = nX2.shape[1]-3 # Number of features
X2s = [d(X2, np.ravel(Y2), NoF) for d in DR]

results2 = [predict(x2, np.ravel(Y2)) for x2 in X2s]
finals = []
for i in range(len(DR)):
    rd = DR_TXT[i]
    acc, sen, spe = [],[],[]
    for a, s, e in results2[i]:
        acc.append(a)
        sen.append(s)
        spe.append(e)
    finals.append(pd.DataFrame({'model': predictorsTXT,'accuracy': acc, 'sensitivity': sen, 'specificity': spe}))
    
    
finalR = {}
for i in predictorsTXT:
    D = pd.DataFrame() # create an empty DataFrame to hold the filtered rows     
    for df in finals:
        row = df[df['model'] == i]
        D = D.append(row) # filter rows that match a certain condition and append them to D
    D['DR'] = DR_TXT
    D = D.drop('model', axis=1)
    finalR[i] = D

for i in finalR:
    print(i)
    print(finalR[i][['DR', 'accuracy' , 'sensitivity',  'specificity']])
    print()

  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom


[9.70211400e-01 2.74629966e-02 1.48359088e-03 4.46908849e-04
 1.58541891e-04 8.10838008e-05 5.88920834e-05 2.96551435e-05
 1.70482439e-05 1.65405893e-05 9.67363516e-06 7.64448221e-06
 6.60004303e-06 3.00993914e-06 2.52564009e-06 2.05147205e-06
 9.78104687e-07 5.89967452e-07 1.12432538e-07 8.82006134e-08
 6.81284103e-08 4.77926180e-33 4.77926180e-33 4.77926180e-33
 4.77926180e-33 4.77926180e-33 4.77926180e-33 4.77926180e-33
 4.77926180e-33 4.77926180e-33 4.77926180e-33 4.77926180e-33
 4.77926180e-33]
PCA	NoF = 33
CE	Old shape = (2126, 36)		 new shape = (2126, 33)		 components = 33
CE2	Old shape = (2126, 36)		 new shape = (2126, 33)		 components = 33
LLE	Old shape = (2126, 36)		 new shape = (2126, 33)		 components = 33
ISOMAP	Old shape = (2126, 36)		 new shape = (2126, 33)		 components = 33




TSNE	Old shape = (2126, 36)		 new shape = (2126, 2)		 components = 33


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/s

LinearSVM
          DR  accuracy  sensitivity  specificity
0        CFS  0.642633     0.410568     0.956995
0      LLCFS  0.866771     0.776182     0.984684
0  PCA_model  0.573668     0.353292     0.949094
0   CE_Model  0.264890     0.100000     0.900000
0        CE2  0.264890     0.100000     0.900000
0        LLE  0.308777     0.140283     0.908689
0     Isomap  0.148903     0.101308     0.900173
0       TSNE  0.175549     0.076618     0.899687

GaussianSVM
          DR  accuracy  sensitivity  specificity
1        CFS  0.692790     0.499750     0.963640
1      LLCFS  0.876176     0.763755     0.985603
1  PCA_model  0.291536     0.116397     0.904491
1   CE_Model  0.305643     0.156358     0.910279
1        CE2  0.294671     0.142320     0.908760
1        LLE  0.365204     0.199483     0.918400
1     Isomap  0.294671     0.121477     0.905745
1       TSNE  0.283699     0.111704     0.903646

PolySVM
          DR  accuracy  sensitivity  specificity
2        CFS  0.694357     0.506612  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [None]:
DR