In [1]:
"""
Created on Wed Mar  8 12:04:48 2023
@author: Ahmad Al Musawi
"""

from sklearn.manifold import SpectralEmbedding
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import numpy as np
from scipy import stats
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score



In [2]:
def preprocessing(df):
    print('preprocessing...')
    return df

def split_labels(df, cols):
    '''split the dataframe into predicting table and labels
       df: given dataset
       cols: list of labels
    '''
    return df[[i for i in df if i not in cols]], df[cols]

In [3]:
def LinearSVM(X_train,y_train, X_test):
    svm = LinearSVC(random_state=42)
    svm.fit(X_train, y_train)
    
    # make predictions on the test set
    y_pred = svm.predict(X_test)
    return y_pred

def GaussianSVM(X_train,y_train, X_test):
    print('implementing SVM...')
    clf = SVC(kernel='rbf', C=1.0) # Gaussian radial basis function (RBF) kernel
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_pred

def PolySVM(X_train,y_train, X_test):
    print('implementing SVM...')
    clf = SVC(kernel='poly', degree=2, coef0=1, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_pred

def SigmoidSVM(X_train,y_train, X_test):
    print('implementing SVM...')
    clf = SVC(kernel='sigmoid', gamma='scale', random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_pred

def NaiveBayes(X_train,y_train, X_test):
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    return y_pred

def Logistic(X_train,y_train, X_test):
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    
    # make predictions on the test set
    y_pred = lr.predict(X_test)    
    return y_pred

def CART(X_train,y_train, X_test):
    clf = DecisionTreeClassifier(max_depth=5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_pred



In [4]:
def PCA_model(X, y=None):
    print("PCA model")
    pca = PCA(n_components=2)
    pca.fit(X)
    X_pca = pca.transform(X)
    df_pca = pd.DataFrame(data=X_pca)
    #print(df_pca.shape)
    return df_pca

def CE_Model(X, y=None):
    print('CE Model')
    embedding = SpectralEmbedding(n_components=3)
    X_CE = embedding.fit_transform(X)
    
    #print(X_CE.shape)
    return X_CE

def CFS(X, y):
    print('CFS Model')
    selector = SelectKBest(score_func=f_regression, k=5)
    X_new = selector.fit_transform(X, y)
    return X_new

def LLCFS(X, y=None):
    print('LLCFS Model')
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return X

def ILFS(X, y):
    # create a linear regression model
    print('ILFS Model')
    model = LinearRegression()
    
    # define the search space
    k_features = np.arange(1, X.shape[1]+1)
    
    # create a sequential feature selector object
    selector = SequentialFeatureSelector(model, k_features=k_features, forward=True, scoring='r2', cv=5)
    
    # perform incremental feature selection
    selector.fit(X, y)
    
    # print the selected feature indices
    print("Indices of selected features:", selector.k_feature_idx_)

def one_split(X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test 

def get_classification_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    num_classes = cm.shape[0]
    sensitivity = []
    specificity = []
    for i in range(num_classes):
        tp = cm[i,i]
        fn = np.sum(cm[i,:]) - tp
        fp = np.sum(cm[:,i]) - tp
        tn = np.sum(cm) - tp - fp - fn
        
        sensitivity_i = tp / (tp + fn)
        specificity_i = tn / (tn + fp)
        sensitivity.append(sensitivity_i)
        specificity.append(specificity_i)
    
    macro_sensitivity = np.mean(sensitivity)
    macro_specificity = np.mean(specificity)
    
    return accuracy, macro_sensitivity, macro_specificity

from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, average_precision_score

In [5]:
# Load the text file into a DataFrame
# df1 = pd.read_csv('processed.cleveland.data', delimiter=',', header=None)
df1 = pd.read_excel('cleveland data.xlsx')
df2 = pd.read_excel('CTG.xls', sheet_name = 'Raw Data')

df2 = df2[[i for i in df2 if i not in ['FileName','Date','SegFile']]]
print(df1)
print()
print(df2)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   1       145   233    1        2      150      0      2.3   
1     67    1   4       160   286    0        2      108      1      1.5   
2     67    1   4       120   229    0        2      129      1      2.6   
3     37    1   3       130   250    0        0      187      0      3.5   
4     41    0   2       130   204    0        2      172      0      1.4   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   45    1   1       110   264    0        0      132      0      1.2   
299   68    1   4       144   193    1        0      141      0      3.4   
300   57    1   4       130   131    0        0      115      1      1.2   
301   57    0   2       130   236    0        2      174      0      0.0   
302   38    1   3       138   175    0        0      173      0      0.0   

     slope  ca  thal  num  
0        3   0     6    0  
1        2   3     3    2  
2  

In [6]:
predictors = [LinearSVM, GaussianSVM, PolySVM, SigmoidSVM, NaiveBayes, Logistic, CART]
predictorsTXT = ['LinearSVM', 'GaussianSVM', 'PolySVM', 'SigmoidSVM', 'NaiveBayes', 'Logistic', 'CART']

def predict(X,Y):
    X_train, X_test, y_train, y_test = one_split(X, Y)
    pred_Y = [pred(X_train,y_train, X_test) for pred in predictors]
    return [get_classification_metrics(y_test, p) for p in pred_Y]
    

In [9]:
X1, Y1 = split_labels(df1, ['num'])
X2, Y2 = split_labels(df2, ['CLASS'])

results1 = predict(X1, Y1)
results2 = predict(X2, Y2)

print(results1)

print('Heart Disease')
acc, sen, spe = [],[],[]
for a, s, e in results1:
    acc.append(a)
    sen.append(s)
    spe.append(e)
print(pd.DataFrame({'model': predictorsTXT,'accuracy': acc, 'sensitivity': sen, 'specificity': spe}))


print('Heart Disease2')
acc, sen, spe = [],[],[]
for a, s, e  in results2:
    acc.append(a)
    sen.append(s)
    spe.append(e)
print(pd.DataFrame({'model': predictorsTXT,'accuracy': acc, 'sensitivity': sen, 'specificity': spe}))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)


implementing SVM...
implementing SVM...
implementing SVM...


  y = column_or_1d(y, warn=True)


implementing SVM...
implementing SVM...
implementing SVM...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[(0.47540983606557374, 0.2, 0.8120192307692307), (0.47540983606557374, 0.2, 0.8), (0.47540983606557374, 0.2, 0.8), (0.47540983606557374, 0.2, 0.8), (0.4918032786885246, 0.36130268199233717, 0.8767527028805224), (0.5573770491803278, 0.33779419813902567, 0.8742459590673877), (0.5409836065573771, 0.33773946360153256, 0.8709140066282923)]
Heart Disease
         model  accuracy  sensitivity  specificity
0    LinearSVM  0.475410     0.200000     0.812019
1  GaussianSVM  0.475410     0.200000     0.800000
2      PolySVM  0.475410     0.200000     0.800000
3   SigmoidSVM  0.475410     0.200000     0.800000
4   NaiveBayes  0.491803     0.361303     0.876753
5     Logistic  0.557377     0.337794     0.874246
6         CART  0.540984     0.337739     0.870914
Heart Disease2
         model  accuracy  sensitivity  specificity
0    LinearSVM  0.399061     0.330943     0.933304
1  GaussianSVM  0.272300     0.102740     0.900753
2      PolySVM  0.305164     0.125179     0.905571
3   SigmoidSVM  0.2230

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# performing experiment 2    

nX1 = stats.zscore(X1)
nX2 = stats.zscore(X2)

DR = [PCA_model, CE_Model, CFS, LLCFS]

DR_TXT = ['PCA_model', 'CE_Model', 'CFS', 'LLCFS']

X1s = [d(X1, Y1) for d in DR]
results1 = []
for x1 in X1s:
    results1.append(predict(x1, Y1))

for i in range(4):
    rd = DR_TXT[i]
    print(rd)
    acc, sen, spe = [],[],[]
    for a, s, e in results1[i]:
        acc.append(a)
        sen.append(s)
        spe.append(e)
    print(pd.DataFrame({'model': predictorsTXT,'accuracy': acc, 'sensitivity': sen, 'specificity': spe}))


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


PCA model
CE Model
CFS Model
LLCFS Model
implementing SVM...
implementing SVM...
implementing SVM...
implementing SVM...
implementing SVM...
implementing SVM...
implementing SVM...
implementing SVM...
implementing SVM...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


implementing SVM...
implementing SVM...
implementing SVM...
PCA_model
         model  accuracy  sensitivity  specificity
0    LinearSVM  0.508197     0.301779     0.847151
1  GaussianSVM  0.475410     0.200000     0.800000
2      PolySVM  0.475410     0.200000     0.800000
3   SigmoidSVM  0.508197     0.238889     0.821644
4   NaiveBayes  0.540984     0.323262     0.840095
5     Logistic  0.540984     0.323262     0.839993
6         CART  0.426230     0.189080     0.814819
CE_Model
         model  accuracy  sensitivity  specificity
0    LinearSVM  0.475410     0.200000     0.800000
1  GaussianSVM  0.475410     0.200000     0.800000
2      PolySVM  0.475410     0.200000     0.800000
3   SigmoidSVM  0.491803     0.228571     0.815679
4   NaiveBayes  0.491803     0.216667     0.817092
5     Logistic  0.475410     0.200000     0.800000
6         CART  0.426230     0.247756     0.815572
CFS
         model  accuracy  sensitivity  specificity
0    LinearSVM  0.196721     0.200000     0.800000

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [11]:
X2s = [d(X2, Y2) for d in DR]
results2 = []
for x2 in X2s:
    results2.append(predict(x2, Y2))
print(results2)
for i in range(4):
    rd = DR_TXT[i]
    print(rd)
    acc, sen, spe = [],[],[]
    for a, s, e in results2[i]:
        acc.append(a)
        sen.append(s)
        spe.append(e)
    print(pd.DataFrame({'model': predictorsTXT,'accuracy': acc, 'sensitivity': sen, 'specificity': spe}))

PCA model
CE Model
CFS Model
LLCFS Model


  y = column_or_1d(y, warn=True)
  correlation_coefficient /= X_norms
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


implementing SVM...
implementing SVM...
implementing SVM...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


implementing SVM...
implementing SVM...
implementing SVM...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


implementing SVM...
implementing SVM...
implementing SVM...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


implementing SVM...
implementing SVM...
implementing SVM...
[[(0.14553990610328638, 0.09498444918055589, 0.8973812437043472), (0.27699530516431925, 0.10794280221100697, 0.9020638120142369), (0.28169014084507044, 0.10920451814467676, 0.902965424565991), (0.2347417840375587, 0.12424264618929362, 0.9057714441722636), (0.2863849765258216, 0.11265435277691946, 0.9038081917653127), (0.2699530516431925, 0.10207997161854407, 0.9007005838611555), (0.2981220657276995, 0.1645845910878355, 0.9082493927849404)], [(0.2676056338028169, 0.1, 0.9), (0.2699530516431925, 0.10826724345109348, 0.9021446211956127), (0.2605633802816901, 0.10229512136505647, 0.9004530761967022), (0.24178403755868544, 0.11910808222992794, 0.9050552578245666), (0.24882629107981222, 0.09544580629656332, 0.8999504170338017), (0.2676056338028169, 0.1, 0.9), (0.2535211267605634, 0.1116381707693892, 0.901673755651028)], [(0.8333333333333334, 0.7680807410683572, 0.9809512635083089), (0.8286384976525821, 0.7666744475568005, 0.98044028