# Retrieving the Dataset

In [1]:
# claiming the dataset
import pandas as pd

cxr_dataset = pd.read_csv("features-covid-pneu.csv")
cxr_dataset.head()

Unnamed: 0,area,perimeter,eccentricity,major axis,minor axis,contrast,homogeneity,energy,correlation,entropy,mean,variance,skewness,uniformity,snr,label
0,8363,598.534055,0.697577,164.863765,118.126564,296.982764,0.828278,0.784168,0.94417,2.109482,123.556858,599.227516,-0.284395,114,5.047438,COVID-19
1,9469,659.362482,0.793141,190.814257,116.213102,122.146985,0.81268,0.757011,0.923784,2.385392,57.059534,894.897091,-0.121557,122,1.9074,COVID-19
2,11396,669.345238,0.806961,203.127645,119.968148,226.278844,0.774707,0.70815,0.942846,2.879118,87.926816,1395.984641,-0.145711,161,2.35332,COVID-19
3,12532,779.002092,0.55065,187.560325,156.563338,529.477236,0.762863,0.67747,0.942298,2.933532,143.056416,549.747615,0.039205,106,6.101346,COVID-19
4,5902,529.663997,0.83781,189.916672,103.687221,197.688291,0.890493,0.847027,0.945379,1.526313,117.577601,428.891555,-0.396287,98,5.677418,COVID-19


# Splitting the dataset to train and test

In [2]:
# Getting the features needed

#get the feature columns
col_features = list(cxr_dataset.columns)
#get the label column
label = list(cxr_dataset.columns).pop()
#deleting label column from col
del col_features[len(col_features) - 1]

features = cxr_dataset.loc[:, col_features].values
labels = cxr_dataset.loc[:, label].values

display(pd.DataFrame(features))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,8363.0,598.534055,0.697577,164.863765,118.126564,296.982764,0.828278,0.784168,0.944170,2.109482,123.556858,599.227516,-0.284395,114.0,5.047438
1,9469.0,659.362482,0.793141,190.814257,116.213102,122.146985,0.812680,0.757011,0.923784,2.385392,57.059534,894.897091,-0.121557,122.0,1.907400
2,11396.0,669.345238,0.806961,203.127645,119.968148,226.278844,0.774707,0.708150,0.942846,2.879118,87.926816,1395.984641,-0.145711,161.0,2.353320
3,12532.0,779.002092,0.550650,187.560325,156.563338,529.477236,0.762863,0.677470,0.942298,2.933532,143.056416,549.747615,0.039205,106.0,6.101346
4,5902.0,529.663997,0.837810,189.916672,103.687221,197.688291,0.890493,0.847027,0.945379,1.526313,117.577601,428.891555,-0.396287,98.0,5.677418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,9192.0,706.918831,0.791030,187.965310,114.992969,246.659749,0.824464,0.763028,0.924342,2.305524,89.993473,829.674892,0.468450,118.0,3.124331
196,13351.0,750.818326,0.762557,212.662501,137.575791,341.433367,0.723151,0.663199,0.926467,3.315241,87.552429,1906.638008,-0.031844,172.0,2.005091
197,10947.0,816.173665,0.497377,185.397114,160.838387,302.408945,0.781180,0.718152,0.934239,2.719245,101.556134,882.705970,-0.307008,142.0,3.418205
198,13240.0,727.546248,0.797353,211.565280,127.682566,244.350804,0.745893,0.660957,0.935057,3.245768,80.988066,1278.044722,-0.089425,139.0,2.265416


In [3]:
from sklearn.model_selection import train_test_split

# Separating test and train dataset
train_features, test_features, train_lbl, test_lbl = train_test_split( features, labels, test_size=0.2, random_state=0)

In [4]:
#Standardizing the Dataset
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fitting of training dataset only
scaler.fit(train_features)

# Apply transform to both the training set and the test set.
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)

# Applying Principal Component Analysis

In [5]:
# Feeding it to Principal Component Analysis
from sklearn.decomposition import PCA

# 5 different variance
variances = ["100%", "99%", "95%", "90%", "85%"]
pca = [PCA(1), PCA(.99), PCA(.95), PCA(.90), PCA(.85)]
# results of each PCA variance
pca_fit = []

for p in pca:
    pca_content = {"fit": None, "transforms": {"train": None, "test": None}}
    pca_content["fit"] = p.fit(train_features)
    pca_content["transforms"]["train"] = p.transform(train_features)
    pca_content["transforms"]["test"] = p.transform(test_features)
    pca_fit.append(pca_content)

In [6]:
# checking the number of Principal Component
print("From 14 features")
for i in range(len(variances)):
    print(f"Principal Components with {variances[i]} variance:", pca_fit[i]["fit"].n_components_)

From 14 features
Principal Components with 100% variance: 1
Principal Components with 99% variance: 9
Principal Components with 95% variance: 6
Principal Components with 90% variance: 5
Principal Components with 85% variance: 4


# Applying Linear Discriminant Analysis

### Libraries Needed

In [7]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

import numpy as np

lda = LinearDiscriminantAnalysis()

### With Principal Component Analysis

In [8]:
# Evaluating of Model
for i in range(len(pca_fit)):
    # define model evaluation method
    cv = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    # evaluate model
    scores = cross_val_score(lda, pca_fit[i]["transforms"]["train"], np.ravel(train_lbl), scoring='accuracy', cv=cv)
    # summarize result
    print('Variance of %s:\nMean Accuracy: %.4f (%.4f)\n' % (variances[i], np.mean(scores), np.std(scores)))

Variance of 100%:
Mean Accuracy: 0.4938 (0.0337)

Variance of 99%:
Mean Accuracy: 0.6125 (0.1364)

Variance of 95%:
Mean Accuracy: 0.5938 (0.1161)

Variance of 90%:
Mean Accuracy: 0.5625 (0.1425)

Variance of 85%:
Mean Accuracy: 0.5938 (0.1459)



In [9]:
#Train the model
results = [] # where all the results will place

for i in range(len(pca_fit)):
    lda.fit(pca_fit[i]["transforms"]["train"], np.ravel(train_lbl))
    predictions = lda.predict(pca_fit[i]["transforms"]["test"])
    results.append(predictions)

In [10]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

for i in range(len(pca_fit)):
    print("Variance of", variances[i])
    print("Accuracy Score -", accuracy_score(test_lbl, results[i]))
    
    print("Confusion Matrix")
    print(confusion_matrix(test_lbl, results[i]))
    
    print("Classification Report")
    print(classification_report(test_lbl, results[i]))
    
    print()

Variance of 100%
Accuracy Score - 0.45
Confusion Matrix
[[18  0]
 [22  0]]
Classification Report
              precision    recall  f1-score   support

    COVID-19       0.45      1.00      0.62        18
   Pneumonia       0.00      0.00      0.00        22

    accuracy                           0.45        40
   macro avg       0.23      0.50      0.31        40
weighted avg       0.20      0.45      0.28        40


Variance of 99%
Accuracy Score - 0.7
Confusion Matrix
[[13  5]
 [ 7 15]]
Classification Report
              precision    recall  f1-score   support

    COVID-19       0.65      0.72      0.68        18
   Pneumonia       0.75      0.68      0.71        22

    accuracy                           0.70        40
   macro avg       0.70      0.70      0.70        40
weighted avg       0.71      0.70      0.70        40


Variance of 95%
Accuracy Score - 0.625
Confusion Matrix
[[11  7]
 [ 8 14]]
Classification Report
              precision    recall  f1-score   support



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Without Principal Component Analysis

In [11]:
# Performing towards the untransformed dataset

# define model evaluation method
cv = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

# evaluate model
scores = cross_val_score(lda, train_features, np.ravel(train_lbl), scoring='accuracy', cv=cv)

# summarize result
print('Linear Discriminant Analysis Cross Validation Score:\nMean Accuracy - %.4f (%.4f)\n' % (np.mean(scores), np.std(scores)))

Linear Discriminant Analysis Cross Validation Score:
Mean Accuracy - 0.6188 (0.1025)



In [12]:
lda.fit(train_features, np.ravel(train_lbl))
predictions = lda.predict(test_features)

In [13]:
print("Linear Disriminant Analysis Performance")
print("Accuracy Score -", accuracy_score(test_lbl, predictions))

print("Confusion Matrix")
print(confusion_matrix(test_lbl, predictions))

print("Classification Report")
print(classification_report(test_lbl, predictions))

Linear Disriminant Analysis Performance
Accuracy Score - 0.7
Confusion Matrix
[[14  4]
 [ 8 14]]
Classification Report
              precision    recall  f1-score   support

    COVID-19       0.64      0.78      0.70        18
   Pneumonia       0.78      0.64      0.70        22

    accuracy                           0.70        40
   macro avg       0.71      0.71      0.70        40
weighted avg       0.71      0.70      0.70        40

