# Retrieving the Dataset

In [1]:
# claiming the dataset
import pandas as pd

cxr_dataset = pd.read_csv("features-healthy-pneu.csv")
cxr_dataset.head()

Unnamed: 0,area,perimeter,eccentricity,major axis,minor axis,contrast,homogeneity,energy,correlation,entropy,mean,variance,skewness,uniformity,snr,label
0,13459,769.404112,0.779348,209.685418,131.387009,204.721759,0.753029,0.655293,0.934412,3.155457,76.84828,706.521152,0.291785,115,2.891156,Pneumonia
1,10873,750.818326,0.833472,226.605124,125.213454,267.101909,0.797456,0.720304,0.952481,2.621196,115.401729,608.303803,-0.539949,120,4.678989,Pneumonia
2,15930,701.646753,0.764028,210.573735,135.858493,311.126683,0.692684,0.594192,0.958219,3.833121,114.331827,1462.671059,-0.086234,165,2.989467,Pneumonia
3,14592,812.132034,0.719198,210.328721,146.137489,529.961407,0.725397,0.626005,0.953067,3.322109,153.205661,532.98025,-0.007271,127,6.636197,Pneumonia
4,13897,788.232539,0.77866,216.578902,135.891612,390.489899,0.756644,0.644418,0.951109,3.183786,128.025113,769.739385,0.679418,108,4.614486,Pneumonia


# Splitting the dataset to train and test

In [2]:
# Getting the features needed

#get the feature columns
col_features = list(cxr_dataset.columns)
#get the label column
label = list(cxr_dataset.columns).pop()
#deleting label column from col
del col_features[len(col_features) - 1]

features = cxr_dataset.loc[:, col_features].values
labels = cxr_dataset.loc[:, label].values

display(pd.DataFrame(features))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,13459.0,769.404112,0.779348,209.685418,131.387009,204.721759,0.753029,0.655293,0.934412,3.155457,76.848280,706.521152,0.291785,115.0,2.891156
1,10873.0,750.818326,0.833472,226.605124,125.213454,267.101909,0.797456,0.720304,0.952481,2.621196,115.401729,608.303803,-0.539949,120.0,4.678989
2,15930.0,701.646753,0.764028,210.573735,135.858493,311.126683,0.692684,0.594192,0.958219,3.833121,114.331827,1462.671059,-0.086234,165.0,2.989467
3,14592.0,812.132034,0.719198,210.328721,146.137489,529.961407,0.725397,0.626005,0.953067,3.322109,153.205661,532.980250,-0.007271,127.0,6.636197
4,13897.0,788.232539,0.778660,216.578902,135.891612,390.489899,0.756644,0.644418,0.951109,3.183786,128.025113,769.739385,0.679418,108.0,4.614486
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,14388.0,761.161472,0.627040,191.246217,148.978404,384.562915,0.719998,0.631385,0.931202,3.434403,102.508618,1021.830409,0.193422,138.0,3.206793
196,10458.0,710.232539,0.725283,198.499311,136.656937,138.038844,0.847926,0.730934,0.949088,2.218025,82.779021,107.456905,0.332197,49.0,7.985519
197,10938.0,731.688383,0.571715,177.165621,145.355895,390.779950,0.773507,0.718126,0.929993,2.714636,113.064637,885.195950,0.025558,131.0,3.800206
198,14671.0,832.315801,0.537399,200.188207,168.824383,159.016281,0.794901,0.624005,0.949332,2.885598,81.125349,101.646546,0.312934,53.0,8.046560


In [3]:
from sklearn.model_selection import train_test_split

# Separating test and train dataset
train_features, test_features, train_lbl, test_lbl = train_test_split( features, labels, test_size=0.2, random_state=0)

In [4]:
#Standardizing the Dataset
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fitting of training dataset only
scaler.fit(train_features)

# Apply transform to both the training set and the test set.
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)

# Applying Principal Component Analysis

In [5]:
# Feeding it to Principal Component Analysis
from sklearn.decomposition import PCA

# 5 different variance
variances = ["100%", "99%", "95%", "90%", "85%"]
pca = [PCA(1), PCA(.99), PCA(.95), PCA(.90), PCA(.85)]
# results of each PCA variance
pca_fit = []

for p in pca:
    pca_content = {"fit": None, "transforms": {"train": None, "test": None}}
    pca_content["fit"] = p.fit(train_features)
    pca_content["transforms"]["train"] = p.transform(train_features)
    pca_content["transforms"]["test"] = p.transform(test_features)
    pca_fit.append(pca_content)

In [6]:
# checking the number of Principal Component
print("From 14 features")
for i in range(len(variances)):
    print(f"Principal Components with {variances[i]} variance:", pca_fit[i]["fit"].n_components_)

From 14 features
Principal Components with 100% variance: 1
Principal Components with 99% variance: 9
Principal Components with 95% variance: 6
Principal Components with 90% variance: 5
Principal Components with 85% variance: 4


# Applying Linear Discriminant Analysis

### Libraries Needed

In [7]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

import numpy as np

lda = LinearDiscriminantAnalysis()

### With Principal Component Analysis

In [8]:
# Evaluating of Model
for i in range(len(pca_fit)):
    # define model evaluation method
    cv = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    # evaluate model
    scores = cross_val_score(lda, pca_fit[i]["transforms"]["train"], np.ravel(train_lbl), scoring='accuracy', cv=cv)
    # summarize result
    print('Variance of %s:\nMean Accuracy: %.4f (%.4f)\n' % (variances[i], np.mean(scores), np.std(scores)))

Variance of 100%:
Mean Accuracy: 0.6000 (0.1159)

Variance of 99%:
Mean Accuracy: 0.7562 (0.1099)

Variance of 95%:
Mean Accuracy: 0.7500 (0.0927)

Variance of 90%:
Mean Accuracy: 0.7438 (0.1324)

Variance of 85%:
Mean Accuracy: 0.6937 (0.1491)



In [9]:
#Train the model
results = [] # where all the results will place

for i in range(len(pca_fit)):
    lda.fit(pca_fit[i]["transforms"]["train"], np.ravel(train_lbl))
    predictions = lda.predict(pca_fit[i]["transforms"]["test"])
    results.append(predictions)

In [10]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

for i in range(len(pca_fit)):
    print("Variance of", variances[i])
    print("Accuracy Score -", accuracy_score(test_lbl, results[i]))
    
    print("Confusion Matrix")
    print(confusion_matrix(test_lbl, results[i]))
    
    print("Classification Report")
    print(classification_report(test_lbl, results[i]))
    
    print()

Variance of 100%
Accuracy Score - 0.575
Confusion Matrix
[[13  9]
 [ 8 10]]
Classification Report
              precision    recall  f1-score   support

  No Finding       0.62      0.59      0.60        22
   Pneumonia       0.53      0.56      0.54        18

    accuracy                           0.57        40
   macro avg       0.57      0.57      0.57        40
weighted avg       0.58      0.57      0.58        40


Variance of 99%
Accuracy Score - 0.75
Confusion Matrix
[[19  3]
 [ 7 11]]
Classification Report
              precision    recall  f1-score   support

  No Finding       0.73      0.86      0.79        22
   Pneumonia       0.79      0.61      0.69        18

    accuracy                           0.75        40
   macro avg       0.76      0.74      0.74        40
weighted avg       0.76      0.75      0.74        40


Variance of 95%
Accuracy Score - 0.75
Confusion Matrix
[[20  2]
 [ 8 10]]
Classification Report
              precision    recall  f1-score   support


### Without Principal Component Analysis

In [11]:
# Performing towards the untransformed dataset

# define model evaluation method
cv = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

# evaluate model
scores = cross_val_score(lda, train_features, np.ravel(train_lbl), scoring='accuracy', cv=cv)

# summarize result
print('Linear Discriminant Analysis Cross Validation Score:\nMean Accuracy - %.4f (%.4f)\n' % (np.mean(scores), np.std(scores)))

Linear Discriminant Analysis Cross Validation Score:
Mean Accuracy - 0.7125 (0.1159)



In [12]:
lda.fit(train_features, np.ravel(train_lbl))
predictions = lda.predict(test_features)

In [13]:
print("Linear Disriminant Analysis Performance")
print("Accuracy Score -", accuracy_score(test_lbl, predictions))

print("Confusion Matrix")
print(confusion_matrix(test_lbl, predictions))

print("Classification Report")
print(classification_report(test_lbl, predictions))

Linear Disriminant Analysis Performance
Accuracy Score - 0.775
Confusion Matrix
[[20  2]
 [ 7 11]]
Classification Report
              precision    recall  f1-score   support

  No Finding       0.74      0.91      0.82        22
   Pneumonia       0.85      0.61      0.71        18

    accuracy                           0.78        40
   macro avg       0.79      0.76      0.76        40
weighted avg       0.79      0.78      0.77        40

