# Retrieving the Dataset

In [1]:
# Loading the dataset: read the pre-extracted feature table into a DataFrame.
import pandas as pd

# The path is relative, so it is resolved against the kernel's working directory.
cxr_dataset = pd.read_csv("../features.csv")
# Preview the first rows to check the columns (numeric features plus a trailing `label`).
cxr_dataset.head()

Unnamed: 0.1,Unnamed: 0,area,perimeter,eccentricity,major axis,minor axis,contrast,homogeneity,energy,correlation,entropy,mean,variance,skewness,uniformity,snr,label
0,0,6518,526.592929,0.88773,198.105599,91.200867,157.943366,0.854284,0.841933,0.801273,1.533093,49.499035,658.165072,0.008017,120,1.929431,COVID-19
1,1,9375,647.806133,0.828264,205.513966,115.157352,168.538035,0.792434,0.767159,0.826483,2.141924,45.94283,615.794529,0.033062,112,1.851398,COVID-19
2,2,9413,634.8772,0.856131,210.678586,108.870141,165.910877,0.801393,0.770478,0.850527,2.152794,49.395408,654.809157,-0.01058,102,1.93032,COVID-19
3,3,9801,707.546248,0.800774,216.524568,129.690916,180.991978,0.78175,0.751488,0.826203,2.251974,46.978399,555.900959,-0.062853,111,1.992505,COVID-19
4,4,9314,957.504617,0.904771,260.378375,110.89482,217.065132,0.798402,0.772575,0.842419,2.166329,56.588928,738.553988,-0.046816,141,2.082286,COVID-19


In [2]:
# Keep only the two diseased classes: collect every 'No Finding' row and
# remove those rows from the working frame in place.
is_no_finding = cxr_dataset['label'].eq('No Finding')
drop_rows = cxr_dataset[is_no_finding]
cxr_dataset.drop(index=drop_rows.index, inplace=True)

# Show the classes that remain after filtering.
cxr_dataset['label'].unique()

array(['COVID-19', 'Pneumonia'], dtype=object)

# Splitting the dataset into training and test sets

In [3]:
# Getting the features needed

# The class label lives in the last column of the frame.
label = cxr_dataset.columns[-1]

# Feature columns are everything in between: the first column (the index
# column carried over from the CSV) and the trailing label column are excluded.
col_features = list(cxr_dataset.columns[1:-1])

# Plain NumPy arrays for scikit-learn: a 2-D feature matrix and a 1-D label vector.
features = cxr_dataset[col_features].values
labels = cxr_dataset[label].values

# Render the feature matrix as a table for a quick visual check.
display(pd.DataFrame(features))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,6518.0,526.592929,0.887730,198.105599,91.200867,157.943366,0.854284,0.841933,0.801273,1.533093,49.499035,658.165072,0.008017,120.0,1.929431
1,9375.0,647.806133,0.828264,205.513966,115.157352,168.538035,0.792434,0.767159,0.826483,2.141924,45.942830,615.794529,0.033062,112.0,1.851398
2,9413.0,634.877200,0.856131,210.678586,108.870141,165.910877,0.801393,0.770478,0.850527,2.152794,49.395408,654.809157,-0.010580,102.0,1.930320
3,9801.0,707.546248,0.800774,216.524568,129.690916,180.991978,0.781750,0.751488,0.826203,2.251974,46.978399,555.900959,-0.062853,111.0,1.992505
4,9314.0,957.504617,0.904771,260.378375,110.894820,217.065132,0.798402,0.772575,0.842419,2.166329,56.588928,738.553988,-0.046816,141.0,2.082286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,11182.0,703.002092,0.800872,216.271813,129.511278,203.974686,0.755260,0.718953,0.801348,2.516232,42.564953,632.976885,0.325367,107.0,1.691836
196,9618.0,712.700577,0.833417,223.550149,123.543806,260.003644,0.782794,0.758596,0.822393,2.220828,57.168335,733.536040,-0.321502,119.0,2.110789
197,9338.0,642.090404,0.848470,212.157649,112.283192,216.963897,0.792258,0.767125,0.832281,2.159193,54.375191,671.337345,-0.137500,116.0,2.098603
198,10792.0,692.960461,0.854878,220.512460,114.408454,184.125978,0.766835,0.730245,0.838390,2.429811,46.855780,624.884493,-0.098092,102.0,1.874404


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
train_features, test_features, train_lbl, test_lbl = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=0,
)

In [5]:
#Standardizing the Dataset
from sklearn.preprocessing import StandardScaler

# Estimate the scaling statistics from the training split only and scale it
# in the same step, so nothing from the test split influences the scaler.
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)

# Scale the held-out split with the statistics learned from the training split.
test_features = scaler.transform(test_features)

# Applying Principal Component Analysis

In [6]:
# Feeding it to Principal Component Analysis
from sklearn.decomposition import PCA

# Five retained-variance settings; `variances` holds the display labels and
# `pca` the matching estimators (PCA() keeps every component).
variances = ["100%", "99%", "95%", "90%", "85%"]
pca = [PCA(), PCA(.99), PCA(.95), PCA(.90), PCA(.85)]

# One entry per setting: the fitted estimator plus both projected splits.
pca_fit = []
for reducer in pca:
    # Fit on the training split only, then project both splits with that fit.
    fitted = reducer.fit(train_features)
    pca_fit.append(
        {
            "fit": fitted,
            "transforms": {
                "train": reducer.transform(train_features),
                "test": reducer.transform(test_features),
            },
        }
    )

In [7]:
# Checking the number of principal components kept for each variance setting.
# The feature count is read from the training matrix instead of being
# hard-coded, so the header always matches the data that PCA was fitted on.
n_input_features = train_features.shape[1]
print(f"From {n_input_features} features")
for variance, entry in zip(variances, pca_fit):
    # n_components_ is the number of components the fitted PCA actually retained.
    print(f"Principal Components with {variance} variance:", entry["fit"].n_components_)

From 14 features
Principal Components with 100% variance: 15
Principal Components with 99% variance: 8
Principal Components with 95% variance: 6
Principal Components with 90% variance: 5
Principal Components with 85% variance: 4


# Applying Linear Discriminant Analysis

### Libraries Needed

In [8]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

import numpy as np

# Single LDA estimator with default settings; the cells below pass this same
# object to cross_val_score and call fit/predict on it repeatedly.
lda = LinearDiscriminantAnalysis()

### With Principal Component Analysis

In [9]:
# Evaluating the model on every PCA-projected training set.
# The 10-fold stratified splitter is seeded, so one instance yields the same
# folds for each variance setting.
cv = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
train_targets = np.ravel(train_lbl)

for variance, entry in zip(variances, pca_fit):
    # Cross-validated accuracy of LDA on this projection of the training split.
    scores = cross_val_score(lda, entry["transforms"]["train"], train_targets, scoring='accuracy', cv=cv)
    # Report mean accuracy with its standard deviation across folds.
    print('Variance of %s:\nMean Accuracy: %.4f (%.4f)\n' % (variances[variances.index(variance)], np.mean(scores), np.std(scores)))

Variance of 100%:
Mean Accuracy: 0.5250 (0.1053)

Variance of 99%:
Mean Accuracy: 0.5125 (0.1244)

Variance of 95%:
Mean Accuracy: 0.5375 (0.1431)

Variance of 90%:
Mean Accuracy: 0.5625 (0.1152)

Variance of 85%:
Mean Accuracy: 0.5125 (0.1000)



In [10]:
# Train the model once per PCA setting and keep the test-set predictions.
# `results` is aligned with `pca_fit`/`variances`: results[k] belongs to setting k.
# The shared `lda` estimator is refitted on each projection before predicting.
results = [
    lda.fit(entry["transforms"]["train"], np.ravel(train_lbl)).predict(entry["transforms"]["test"])
    for entry in pca_fit
]

In [11]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Print the held-out performance of LDA for every PCA variance setting.
for variance, predicted in zip(variances, results):
    print("Variance of", variance)
    print("Accuracy Score -", accuracy_score(test_lbl, predicted))

    # Each remaining metric is printed as a heading followed by its value.
    for heading, metric in (
        ("Confusion Matrix", confusion_matrix),
        ("Classification Report", classification_report),
    ):
        print(heading)
        print(metric(test_lbl, predicted))

    # Blank line between settings.
    print()

Variance of 100%
Accuracy Score - 0.725
Confusion Matrix
[[18  0]
 [11 11]]
Classification Report
              precision    recall  f1-score   support

    COVID-19       0.62      1.00      0.77        18
   Pneumonia       1.00      0.50      0.67        22

    accuracy                           0.73        40
   macro avg       0.81      0.75      0.72        40
weighted avg       0.83      0.72      0.71        40


Variance of 99%
Accuracy Score - 0.65
Confusion Matrix
[[16  2]
 [12 10]]
Classification Report
              precision    recall  f1-score   support

    COVID-19       0.57      0.89      0.70        18
   Pneumonia       0.83      0.45      0.59        22

    accuracy                           0.65        40
   macro avg       0.70      0.67      0.64        40
weighted avg       0.72      0.65      0.64        40


Variance of 95%
Accuracy Score - 0.625
Confusion Matrix
[[16  2]
 [13  9]]
Classification Report
              precision    recall  f1-score   support

### Without Principal Component Analysis

In [12]:
# Baseline: the same evaluation on the standardized features without PCA.

# Seeded 10-fold stratified splitter (same configuration as the PCA runs).
cv = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

# Cross-validated accuracy of LDA on the scaled training split.
scores = cross_val_score(
    lda,
    train_features,
    np.ravel(train_lbl),
    scoring='accuracy',
    cv=cv,
)

# Mean accuracy with its standard deviation across folds.
mean_accuracy, accuracy_spread = np.mean(scores), np.std(scores)
print('Linear Discriminant Analysis Cross Validation Score:\nMean Accuracy - %.4f (%.4f)\n' % (mean_accuracy, accuracy_spread))

Linear Discriminant Analysis Cross Validation Score:
Mean Accuracy - 0.5250 (0.1053)



In [13]:
# Fit LDA on the scaled training split (no PCA) and predict the held-out split.
predictions = lda.fit(train_features, np.ravel(train_lbl)).predict(test_features)

In [14]:
# Held-out performance of LDA trained on the features without PCA.
print("Linear Disriminant Analysis Performance")
print("Accuracy Score -", accuracy_score(test_lbl, predictions))

# Each remaining metric is printed as a heading followed by its value.
for heading, metric in (
    ("Confusion Matrix", confusion_matrix),
    ("Classification Report", classification_report),
):
    print(heading)
    print(metric(test_lbl, predictions))

Linear Disriminant Analysis Performance
Accuracy Score - 0.725
Confusion Matrix
[[18  0]
 [11 11]]
Classification Report
              precision    recall  f1-score   support

    COVID-19       0.62      1.00      0.77        18
   Pneumonia       1.00      0.50      0.67        22

    accuracy                           0.73        40
   macro avg       0.81      0.75      0.72        40
weighted avg       0.83      0.72      0.71        40

