# Baseline model

Bertrand Thia (bt2513)

---

In [21]:
# importing the librairies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import cv2
from keras.callbacks.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder

# Loading the data

In [22]:
train_labels = pd.read_csv('./train.csv')

In [23]:
train_labels['label'].value_counts()

normal       350
bacterial    350
viral        350
covid         77
Name: label, dtype: int64

In [24]:
train_labels['label'].value_counts(normalize= True)

normal       0.310559
bacterial    0.310559
viral        0.310559
covid        0.068323
Name: label, dtype: float64

As we saw in the previous notebook, our dataset is imbalanced and the number of covid cases is very low. To address this issue, we will try class weigthing and oversampling later.

* ### Data preparation 

Repeating the steps from the previous notebook:

In [25]:
# Reading the images in the folder and connecting them to their label
n_rows = train_labels.shape[0]
img_list = [np.array(Image.open('./train/' + train_labels.loc[i, 'filename'])) for i in range(n_rows)]
X = train_labels.drop(columns= ['filename'])
X['pix_value'] = img_list
X = X[['id', 'pix_value', 'label']]

# Resizing the images
H = []
W = []
for k in range(X.shape[0]):
    shape = X.loc[k, 'pix_value'].shape
    H.append(shape[0])
    W.append(shape[1])
new_W = int(np.median(W))
new_H = int(H[W.index(np.median(W))])
resized_pix_list = []
for k in range(X.shape[0]):
    dim = (new_H, new_W)
    resized_pix_list.append(cv2.resize(X.loc[k, 'pix_value'], dim))
X = X.assign(resized_pix = resized_pix_list)

# Denoising the images using a blurring technique
blurred_pix_list = []
for k in range(X.shape[0]):
    blurred_pix_list.append(cv2.GaussianBlur(X.loc[k, 'resized_pix'], (5, 5), 0))
X = X.assign(blurred_pix = blurred_pix_list)

X = X[['id', 'pix_value', 'resized_pix', 'blurred_pix', 'label']]
X.head()

Unnamed: 0,id,pix_value,resized_pix,blurred_pix,label
0,0,"[[100, 90, 79, 73, 70, 66, 66, 69, 73, 73, 73,...","[[98, 82, 72, 67, 66, 72, 73, 73, 74, 75, 76, ...","[[88, 83, 75, 70, 70, 71, 73, 74, 75, 77, 79, ...",normal
1,1,"[[110, 130, 128, 124, 141, 142, 130, 132, 142,...","[[112, 129, 126, 137, 142, 130, 136, 137, 134,...","[[123, 125, 130, 134, 136, 135, 135, 136, 137,...",viral
2,2,"[[119, 119, 118, 116, 114, 112, 109, 108, 104,...","[[119, 119, 119, 119, 119, 118, 117, 117, 116,...","[[119, 119, 119, 119, 119, 118, 118, 117, 116,...",viral
3,3,"[[5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6,...","[[5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 4, 4,...","[[5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 5, 5, 4,...",bacterial
4,4,"[[109, 108, 106, 107, 108, 109, 108, 106, 109,...","[[109, 108, 106, 107, 108, 109, 108, 106, 108,...","[[108, 108, 107, 107, 108, 108, 107, 107, 108,...",viral


# Data preprocessing

Let's split our data into a training set and a validation set,and then standardize them. 

* ### Splitting into training set and validation set 

In [26]:
# Keeping the ids for later and separating the features from the target

ids = X['id']
X_resized = X[['resized_pix']]
X_blurred = X[['blurred_pix']]
y = LabelEncoder().fit_transform(X['label']) # encoding the target to classes 0, 1, 2, 3

In [27]:
np.unique(X['label'].values, return_counts= True) # Order of the label encoding

(array(['bacterial', 'covid', 'normal', 'viral'], dtype=object),
 array([350,  77, 350, 350]))

In [28]:
# One split for the resized data

X_train_r, X_val_r, y_train_r, y_val_r = train_test_split(X_resized, y, test_size= 0.2, 
                                                          random_state= 0, stratify= y)

In [29]:
# One split for the denoised data to compare later

X_train_b, X_val_b, y_train_b, y_val_b = train_test_split(X_blurred, y, test_size= 0.2, 
                                                          random_state= 0, stratify= y)

* ### Standardization

Considering the heterogenity of the images observed in the previous study, we are going to standardize our data to improve the uniformity. 

In [30]:
mean_list, std_list = [], []
for i in range(X_train_r.shape[0]):
    mean_list.append(np.mean(X_train_r.iloc[i, 0]))
    std_list.append(np.std(X_train_r.iloc[i, 0]))

In [31]:
mean, std = np.mean(mean_list), np.mean(std_list)
X_train_r['std_pix'] = X_train_r.apply(lambda x: (x['resized_pix'] - mean) / std, axis= 1)
X_val_r['std_pix'] = X_val_r.apply(lambda x: (x['resized_pix'] - mean) / std, axis= 1)

X_train_r, X_val_r = X_train_r[['std_pix']], X_val_r[['std_pix']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


* ### Formatting the inputs 

In [32]:
resized_train, resized_val = [], []

for i in range(X_train_r.shape[0]):
    resized_train.append(X_train_r.iloc[i, 0].reshape(1, -1).flatten())
for i in range(X_val_r.shape[0]): 
    resized_val.append(X_val_r.iloc[i, 0].reshape(1, -1).flatten())
resized_train, resized_val = np.array(resized_train), np.array(resized_val)
print(resized_train.shape)

(901, 706680)


In [33]:
X_train_r2 = resized_train.reshape(X_train_r.shape[0], *(new_H, new_W, 1))
X_val_r2 = resized_val.reshape(X_val_r.shape[0], *(new_H, new_W, 1))
X_train_r, X_val_r = X_train_r2, X_val_r2
print('Size of X_train_r:', X_train_r.shape)

Size of X_train_r: (901, 755, 936, 1)


# Model building

In [34]:
bas_model = Sequential()
bas_model.add(Conv2D(32, 3, 3, input_shape= (new_H, new_W, 1), activation= 'relu')) 
bas_model.add(MaxPooling2D(pool_size= (2, 2)))
bas_model.add(Flatten())
bas_model.add(Dense(output_dim= 32, activation= 'relu')) 
bas_model.add(Dense(output_dim= 4, activation= 'sigmoid'))

  
  """
  


In [35]:
bas_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 753, 934, 32)      320       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 376, 467, 32)      0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 5618944)           0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                179806240 
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 132       
Total params: 179,806,692
Trainable params: 179,806,692
Non-trainable params: 0
_________________________________________________________________


In [36]:
bas_model.compile(loss= 'sparse_categorical_crossentropy',
                 optimizer= Adam(), 
                 metrics = ['accuracy'])

* ### Training 

In [37]:
epochs= 5
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, restore_best_weights= True)

In [None]:
bas_model.fit(X_train_r, y_train_r, batch_size= 32, 
              epochs= epochs, verbose= 1, validation_data = (X_val_r, y_val_r), callbacks= [es])

Train on 901 samples, validate on 226 samples
Epoch 1/5

---

# Appendix