# **IMPORTING LIBRARIES**

In [1]:
import numpy as np
from tensorflow.keras.preprocessing import image
import cv2 as cv
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import GridSearchCV, train_test_split
from skimage.io import imread
print("Files imported successfully")
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 30*30 + 1)

Files imported successfully


In [2]:
def load_image_files(container_path, dimension=(64, 64)):
    image_dir = Path(container_path)
    folders = [directory for directory in image_dir.iterdir() if directory.is_dir()]
    categories = [fo.name for fo in folders]

    descr = "A image classification dataset"
    images = []
    flat_data = []
    target = []
    count = 0
    train_img = []
    for i, direc in enumerate(folders):
        for file in direc.iterdir():
            count += 1
            img = imread(file)
            img_pred = cv.resize(img, (50, 50), interpolation=cv.INTER_AREA)
            img_pred = image.img_to_array(img_pred)
            img_pred = img_pred / 255
            train_img.append(img_pred)

    X = np.array(train_img)

    return X

In [3]:
X = []
X = load_image_files("lung_image_sets")

In [4]:
y0 = np.zeros(5000)

y1 = np.ones(5000)

y2 = np.zeros(5000)

In [5]:
y = []
y = np.concatenate((y0,y1,y2), axis=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, random_state=42, test_size=0.5)
print("X_train: "+str(X_train.shape))
print("X_test: "+str(X_test.shape))
print("X_val: "+str(X_val.shape))
print("y_train: "+str(y_train.shape))
print("y_test: "+str(y_test.shape))
print("y_val: "+str(y_val.shape))

X_train: (12000, 50, 50, 3)
X_test: (1500, 50, 50, 3)
X_val: (1500, 50, 50, 3)
y_train: (12000,)
y_test: (1500,)
y_val: (1500,)


In [6]:
from builtins import range
from builtins import object

num_training = X_train.shape[0]
mask = list(range(num_training))
X_train = X_train[mask]
y_train = y_train[mask]

num_test = X_test.shape[0]
mask = list(range(num_test))
X_test = X_test[mask]
y_test = y_test[mask]

num_val = X_val.shape[0]
mask = list(range(num_val))
X_val = X_val[mask]
y_val = y_val[mask]


In [7]:
# Reshape the image data into rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
X_val = np.reshape(X_val, (X_val.shape[0], -1))

In [8]:
print("X_train: "+str(X_train.shape))
print("X_test: "+str(X_test.shape))
print("X_val: "+str(X_val.shape))
print("y_train: "+str(y_train.shape))
print("y_test: "+str(y_test.shape))
print("y_val: "+str(y_val.shape))

X_train: (12000, 7500)
X_test: (1500, 7500)
X_val: (1500, 7500)
y_train: (12000,)
y_test: (1500,)
y_val: (1500,)


# **KNN**

In [9]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier

In [10]:
knn = KNeighborsClassifier(3) # Define classifier
knn.fit(X_train, y_train) # Train model

KNeighborsClassifier(n_neighbors=3)

In [11]:
# Make predictions
y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)

In [12]:
# Training set performance
knn_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
knn_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
knn_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score


In [13]:
# Test set performance
knn_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
knn_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
knn_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score

In [14]:
print('Model performance for Training set')
print('- Accuracy: %s' % knn_train_accuracy)
print('- MCC: %s' % knn_train_mcc)
print('- F1 score: %s' % knn_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % knn_test_accuracy)
print('- MCC: %s' % knn_test_mcc)
print('- F1 score: %s' % knn_test_f1)

Model performance for Training set
- Accuracy: 0.95125
- MCC: 0.8924034566344392
- F1 score: 0.9502052884299153
----------------------------------
Model performance for Test set
- Accuracy: 0.884
- MCC: 0.7422785189281594
- F1 score: 0.876308058203758


# **RANDOM FOREST**

In [15]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=10) # Define classifier
rf.fit(X_train, y_train) # Train model

RandomForestClassifier(n_estimators=10)

In [16]:
# Make predictions
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

In [17]:
# Training set performance
rf_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
rf_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
rf_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score

In [18]:
# Test set performance
rf_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
rf_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
rf_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score

In [19]:
print('Model performance for Training set')
print('- Accuracy: %s' % rf_train_accuracy)
print('- MCC: %s' % rf_train_mcc)
print('- F1 score: %s' % rf_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % rf_test_accuracy)
print('- MCC: %s' % rf_test_mcc)
print('- F1 score: %s' % rf_test_f1)

Model performance for Training set
- Accuracy: 0.9994166666666666
- MCC: 0.9986926597558919
- F1 score: 0.9994165411662213
----------------------------------
Model performance for Test set
- Accuracy: 0.9886666666666667
- MCC: 0.9743708610924107
- F1 score: 0.9886210206045977


# **DECISION TREE**

In [20]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=5) # Define classifier
dt.fit(X_train, y_train) # Train model

DecisionTreeClassifier(max_depth=5)

In [21]:
# Make predictions
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

In [22]:
# Training set performance
dt_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
dt_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
dt_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score

In [23]:
# Test set performance
dt_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
dt_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
dt_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score

In [24]:
print('Model performance for Training set')
print('- Accuracy: %s' % dt_train_accuracy)
print('- MCC: %s' % dt_train_mcc)
print('- F1 score: %s' % dt_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % dt_test_accuracy)
print('- MCC: %s' % dt_test_mcc)
print('- F1 score: %s' % dt_test_f1)

Model performance for Training set
- Accuracy: 0.9854166666666667
- MCC: 0.967253058693916
- F1 score: 0.9853962548586535
----------------------------------
Model performance for Test set
- Accuracy: 0.9766666666666667
- MCC: 0.9469578856278917
- F1 score: 0.9765726894800538


# **NAIVE BAYES CLASSIFIER**

In [25]:
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
clf.fit(X_train, y_train) # Train model

GaussianNB()

In [26]:
# Make predictions
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [27]:
# Training set performance
clf_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
clf_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
clf_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score

In [28]:
# Test set performance
clf_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
clf_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
clf_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score

In [29]:
print('Model performance for Training set')
print('- Accuracy: %s' % clf_train_accuracy)
print('- MCC: %s' % clf_train_mcc)
print('- F1 score: %s' % clf_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % clf_test_accuracy)
print('- MCC: %s' % clf_test_mcc)
print('- F1 score: %s' % clf_test_f1)

Model performance for Training set
- Accuracy: 0.9790833333333333
- MCC: 0.9531924633026071
- F1 score: 0.9789504302967131
----------------------------------
Model performance for Test set
- Accuracy: 0.9793333333333333
- MCC: 0.9531432561058978
- F1 score: 0.9792150967736462


# **SVC**

In [30]:
from sklearn.svm import SVC

svc=SVC()
svc.fit(X_train,y_train)

SVC()

In [31]:
# Make predictions
y_train_pred = svc.predict(X_train)
y_test_pred = svc.predict(X_test)

In [32]:
# Training set performance
svc_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
svc_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
svc_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score

In [33]:
# Test set performance
svc_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
svc_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
svc_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score

In [34]:
print('Model performance for Training set')
print('- Accuracy: %s' % svc_train_accuracy)
print('- MCC: %s' % svc_train_mcc)
print('- F1 score: %s' % svc_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % svc_test_accuracy)
print('- MCC: %s' % svc_test_mcc)
print('- F1 score: %s' % svc_test_f1)

Model performance for Training set
- Accuracy: 0.98875
- MCC: 0.9748037915859492
- F1 score: 0.9887133006352452
----------------------------------
Model performance for Test set
- Accuracy: 0.982
- MCC: 0.9592084147072265
- F1 score: 0.9819072765755927


# **LOGISTIC REGRESSION**

In [35]:
from sklearn.linear_model import LogisticRegression

lr=LogisticRegression()
lr.fit(X_train,y_train)

LogisticRegression()

In [36]:
# Make predictions
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

In [37]:
# Training set performance
lr_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
lr_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
lr_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score

In [38]:
# Test set performance
lr_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
lr_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
lr_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score

In [39]:
print('Model performance for Training set')
print('- Accuracy: %s' % lr_train_accuracy)
print('- MCC: %s' % lr_train_mcc)
print('- F1 score: %s' % lr_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % lr_test_accuracy)
print('- MCC: %s' % lr_test_mcc)
print('- F1 score: %s' % lr_test_f1)

Model performance for Training set
- Accuracy: 0.9995
- MCC: 0.9988793787533877
- F1 score: 0.9995000306477854
----------------------------------
Model performance for Test set
- Accuracy: 0.9846666666666667
- MCC: 0.9651908767928614
- F1 score: 0.9846465771904925


# **CATBOOST**

In [40]:
from catboost import CatBoostClassifier

cat=CatBoostClassifier(iterations=50,learning_rate=0.1)
cat.fit(X_train,y_train)

0:	learn: 0.4867926	total: 1.36s	remaining: 1m 6s
1:	learn: 0.3526675	total: 2.43s	remaining: 58.4s
2:	learn: 0.2586874	total: 3.47s	remaining: 54.3s
3:	learn: 0.2000106	total: 4.5s	remaining: 51.7s
4:	learn: 0.1596906	total: 5.07s	remaining: 45.6s
5:	learn: 0.1299378	total: 5.97s	remaining: 43.8s
6:	learn: 0.1080735	total: 6.96s	remaining: 42.7s
7:	learn: 0.0909536	total: 7.96s	remaining: 41.8s
8:	learn: 0.0785441	total: 8.96s	remaining: 40.8s
9:	learn: 0.0691730	total: 10.1s	remaining: 40.4s
10:	learn: 0.0607536	total: 11.2s	remaining: 39.6s
11:	learn: 0.0553272	total: 11.8s	remaining: 37.4s
12:	learn: 0.0504201	total: 12.6s	remaining: 35.8s
13:	learn: 0.0458955	total: 13.5s	remaining: 34.8s
14:	learn: 0.0431726	total: 14.5s	remaining: 33.8s
15:	learn: 0.0407029	total: 15.5s	remaining: 32.9s
16:	learn: 0.0382038	total: 16.4s	remaining: 31.9s
17:	learn: 0.0366612	total: 17.4s	remaining: 30.9s
18:	learn: 0.0347472	total: 18.3s	remaining: 29.9s
19:	learn: 0.0327361	total: 18.9s	remainin

<catboost.core.CatBoostClassifier at 0x2ed21fece20>

In [41]:
# Make predictions
y_train_pred = cat.predict(X_train)
y_test_pred = cat.predict(X_test)

In [42]:
# Training set performance
cat_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
cat_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
cat_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score

In [43]:
# Test set performance
cat_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
cat_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
cat_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score

In [44]:
print('Model performance for Training set')
print('- Accuracy: %s' % cat_train_accuracy)
print('- MCC: %s' % cat_train_mcc)
print('- F1 score: %s' % cat_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % cat_test_accuracy)
print('- MCC: %s' % cat_test_mcc)
print('- F1 score: %s' % cat_test_f1)

Model performance for Training set
- Accuracy: 0.9995
- MCC: 0.9988793787533877
- F1 score: 0.9995000306477854
----------------------------------
Model performance for Test set
- Accuracy: 0.9926666666666667
- MCC: 0.9833769966407652
- F1 score: 0.9926531494217088


# **XGBoost**

In [45]:
from xgboost import XGBClassifier

xgb=XGBClassifier(booster='gblinear',learning_rate=1,n_estimators=100)
xgb.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gblinear', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=None, gpu_id=-1,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=1, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=0,
              num_parallel_tree=None, predictor=None, random_state=0,
              reg_alpha=0, reg_lambda=0, ...)

In [50]:
# Make predictions
y_train_pred = xgb.predict(X_train)
y_test_pred = xgb.predict(X_test)

In [51]:
# Training set performance
xgb_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
xgb_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
xgb_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score

In [52]:
# Test set performance
xgb_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
xgb_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
xgb_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score

In [53]:
print('Model performance for Training set')
print('- Accuracy: %s' % xgb_train_accuracy)
print('- MCC: %s' % xgb_train_mcc)
print('- F1 score: %s' % xgb_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % xgb_test_accuracy)
print('- MCC: %s' % xgb_test_mcc)
print('- F1 score: %s' % xgb_test_f1)

Model performance for Training set
- Accuracy: 0.9053333333333333
- MCC: 0.785941737477248
- F1 score: 0.9048065132980564
----------------------------------
Model performance for Test set
- Accuracy: 0.7986666666666666
- MCC: 0.5518206421477979
- F1 score: 0.8002057469418611
