# Retrieving the Dataset

In [1]:
# claiming the dataset
import pandas as pd

cxr_dataset = pd.read_csv("../features.csv")
cxr_dataset.head()

Unnamed: 0.1,Unnamed: 0,area,perimeter,eccentricity,major axis,minor axis,contrast,homogeneity,energy,correlation,entropy,mean,variance,skewness,uniformity,snr,label
0,0,14009,872.918831,0.772842,220.775083,140.103534,765.908286,0.677188,0.650629,0.857881,3.379262,93.933662,2234.668457,-0.084953,204,1.987077,COVID-19
1,1,15372,814.47518,0.730223,221.170957,151.106051,424.12825,0.658225,0.613227,0.868435,3.489235,70.139269,1276.482359,-0.204411,142,1.963152,COVID-19
2,2,16571,834.534055,0.801781,236.579803,141.384497,458.618048,0.649952,0.591044,0.898604,3.782812,81.605947,1705.915375,-0.209387,167,1.9758,COVID-19
3,3,17063,1125.67114,0.483926,220.717711,193.152064,733.003509,0.613511,0.5675,0.816494,3.897109,74.981767,1561.562807,0.042438,162,1.897474,COVID-19
4,4,13075,1034.960461,0.879634,262.28857,124.757907,406.461545,0.713722,0.676923,0.858812,3.010218,71.963921,1088.206028,-0.278774,143,2.18152,COVID-19


In [2]:
drop_rows = cxr_dataset.loc[cxr_dataset['label'] == 'Pneumonia']
cxr_dataset.drop(drop_rows.index, inplace=True)
cxr_dataset['label'].unique()

array(['COVID-19', 'No Finding', nan], dtype=object)

# Splitting the dataset to train and test

In [3]:
# Getting the features needed

#get the feature columns
col_features = list(cxr_dataset.columns)
#get the label column
label = list(cxr_dataset.columns).pop()
#deleting label column from col
del col_features[0]
del col_features[len(col_features) - 1]

features = cxr_dataset.loc[:, col_features].values
labels = cxr_dataset.loc[:, label].values

display(pd.DataFrame(features))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,14009.0,872.918831,0.772842,220.775083,140.103534,765.908286,0.677188,0.650629,0.857881,3.379262,93.933662,2234.668457,-0.084953,204.0,1.987077
1,15372.0,814.475180,0.730223,221.170957,151.106051,424.128250,0.658225,0.613227,0.868435,3.489235,70.139269,1276.482359,-0.204411,142.0,1.963152
2,16571.0,834.534055,0.801781,236.579803,141.384497,458.618048,0.649952,0.591044,0.898604,3.782812,81.605947,1705.915375,-0.209387,167.0,1.975800
3,17063.0,1125.671140,0.483926,220.717711,193.152064,733.003509,0.613511,0.567500,0.816494,3.897109,74.981767,1561.562807,0.042438,162.0,1.897474
4,13075.0,1034.960461,0.879634,262.288570,124.757907,406.461545,0.713722,0.676923,0.858812,3.010218,71.963921,1088.206028,-0.278774,143.0,2.181520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,15556.0,1015.653896,0.640768,242.049702,185.829988,223.637786,0.660449,0.605939,0.826710,3.308360,44.163774,511.329247,-0.079256,100.0,1.953061
196,14041.0,772.759451,0.793892,226.060095,137.457868,214.151923,0.695627,0.645357,0.832257,3.066808,44.345364,605.196593,0.106919,110.0,1.802603
197,17248.0,945.085353,0.611615,227.983427,180.370303,194.608256,0.631773,0.559814,0.811121,3.547618,37.803748,403.298135,0.284334,101.0,1.882443
198,17236.0,918.842712,0.689811,240.821141,174.352064,218.873845,0.623339,0.560574,0.776001,3.552407,35.716039,435.134642,0.484559,114.0,1.712188


In [4]:
from sklearn.model_selection import train_test_split

# Separating test and train dataset
train_features, test_features, train_lbl, test_lbl = train_test_split( features, labels, test_size=0.2, random_state=0)

In [5]:
#Standardizing the Dataset
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fitting of training dataset only
scaler.fit(train_features)

# Apply transform to both the training set and the test set.
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)

# Applying Principal Component Analysis

In [6]:
# Feeding it to Principal Component Analysis
from sklearn.decomposition import PCA

# 5 different variance
variances = ["100%", "99%", "95%", "90%", "85%"]
pca = [PCA(), PCA(.99), PCA(.95), PCA(.90), PCA(.85)]
# results of each PCA variance
pca_fit = []

for p in pca:
    pca_content = {"fit": None, "transforms": {"train": None, "test": None}}
    pca_content["fit"] = p.fit(train_features)
    pca_content["transforms"]["train"] = p.transform(train_features)
    pca_content["transforms"]["test"] = p.transform(test_features)
    pca_fit.append(pca_content)

In [7]:
# checking the number of Principal Component
print("From 14 features")
for i in range(len(variances)):
    print(f"Principal Components with {variances[i]} variance:", pca_fit[i]["fit"].n_components_)

From 14 features
Principal Components with 100% variance: 15
Principal Components with 99% variance: 8
Principal Components with 95% variance: 5
Principal Components with 90% variance: 4
Principal Components with 85% variance: 4


# Applying Linear Discriminant Analysis

### Libraries Needed

In [8]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

import numpy as np

lda = LinearDiscriminantAnalysis()

### With Principal Component Analysis

In [9]:
# Evaluating of Model
for i in range(len(pca_fit)):
    # define model evaluation method
    cv = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    # evaluate model
    scores = cross_val_score(lda, pca_fit[i]["transforms"]["train"], np.ravel(train_lbl), scoring='accuracy', cv=cv)
    # summarize result
    print('Variance of %s:\nMean Accuracy: %.4f (%.4f)\n' % (variances[i], np.mean(scores), np.std(scores)))

ValueError: Input contains NaN

In [None]:
#Train the model
results = [] # where all the results will place

for i in range(len(pca_fit)):
    lda.fit(pca_fit[i]["transforms"]["train"], np.ravel(train_lbl))
    predictions = lda.predict(pca_fit[i]["transforms"]["test"])
    results.append(predictions)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

for i in range(len(pca_fit)):
    print("Variance of", variances[i])
    print("Accuracy Score -", accuracy_score(test_lbl, results[i]))
    
    print("Confusion Matrix")
    print(confusion_matrix(test_lbl, results[i]))
    
    print("Classification Report")
    print(classification_report(test_lbl, results[i]))
    
    print()

### Without Principal Component Analysis

In [None]:
# Performing towards the untransformed dataset

# define model evaluation method
cv = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

# evaluate model
scores = cross_val_score(lda, train_features, np.ravel(train_lbl), scoring='accuracy', cv=cv)

# summarize result
print('Linear Discriminant Analysis Cross Validation Score:\nMean Accuracy - %.4f (%.4f)\n' % (np.mean(scores), np.std(scores)))

In [None]:
lda.fit(train_features, np.ravel(train_lbl))
predictions = lda.predict(test_features)

In [None]:
print("Linear Disriminant Analysis Performance")
print("Accuracy Score -", accuracy_score(test_lbl, predictions))

print("Confusion Matrix")
print(confusion_matrix(test_lbl, predictions))

print("Classification Report")
print(classification_report(test_lbl, predictions))