## Image Classification
This notebook implements a baseline solution for the defect classification problem in manufactured
steel.


In [3]:
# Load required packages #
import numpy as np
import pandas as pd
from keras.preprocessing import image
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense , Dropout , Flatten 
from keras.layers import Conv2D, MaxPooling2D , ZeroPadding2D
from keras.utils import to_categorical 
from sklearn.metrics import accuracy_score 

In [4]:
# Helper that reads one image file and scales its values by 1/255 #
def image_load(path):
    """Load the image stored at ``path`` and return it as a scaled array.

    The file is opened with ``image.load_img``, converted to an array with
    ``image.img_to_array``, and every value is then divided by 255.

    Parameters
    ----------
    path : path of the image file to read.

    Returns
    -------
    The array produced by ``image.img_to_array`` divided by 255.
    """
    pixels=image.img_to_array(image.load_img(path))
    return pixels/255

In [7]:
## Define folder locations for train and test data ##
# The data root defaults to the original location but can be overridden through
# the SEVERSTAL_DATA_DIR environment variable, so the notebook is not tied to a
# single machine.
base_dir=os.environ.get('SEVERSTAL_DATA_DIR') or 'E:/severstal/data/'
# Paths are built by plain string concatenation (e.g. base_dir+'train.csv'),
# so the root must always end with a separator.
if not base_dir.endswith(('/','\\')):
    base_dir+='/'
# Both image folders are derived from the single root so they cannot drift apart.
train_image_dir=base_dir+'train_images/'
test_image_dir=base_dir+'test_images/'

In [9]:
# Read the label file #
df_label=pd.read_csv(base_dir+'train.csv')
# 'ImageId_ClassId' is assumed to look like '<image file>_<class id>'.
# Split once, from the right, so that a file name which itself contains "_"
# stays intact, and do it in a single vectorised pass over the column.
id_parts=df_label['ImageId_ClassId'].str.rsplit('_',n=1,expand=True)
df_label['class']=id_parts[1]
df_label['fname']=id_parts[0]

In [10]:
# Partition the label rows on whether 'EncodedPixels' is filled in:
# rows with a value go to image_with_labels, rows without one to image_wo_defects.
has_mask=df_label['EncodedPixels'].notna()
image_with_labels=df_label[has_mask]
image_wo_defects=df_label[~has_mask]

In [13]:
# Unique file names that have at least one defect row (first-seen order) #
fnames=image_with_labels.fname.unique()
# Collect the defect classes of every image in one grouped pass. Filtering the
# frame once per file name would rescan all rows for each image (quadratic);
# groupby builds the same lists in a single pass, and reindex(fnames) pins the
# result to the order of fnames.
class_list=(
    image_with_labels.groupby('fname',sort=False)['class']
    .agg(list)
    .reindex(fnames)
    .tolist()
)
# One row per image that has at least one defect #
uniq_image_with_defects=pd.DataFrame({'image_file':fnames,'class_labels':class_list})
# Unique file names that never appear among the defect rows, i.e. clean images #
clean_fnames=image_wo_defects[~image_wo_defects['fname'].isin(fnames)].fname.unique()


In [14]:
# Every clean image gets the single label '5', an extra class for "no defect" #
a=[['5'] for _ in range(len(clean_fnames))]
clean_df=pd.DataFrame({'image_file':clean_fnames,'class_labels':a})
# Stack the defect rows and the clean rows into one frame with a fresh 0..n-1 index #
all_image_with_labels=pd.concat([uniq_image_with_defects,clean_df],ignore_index=True)

In [15]:
# One-hot encode the multi-label targets: one indicator column per class #
mlb=MultiLabelBinarizer()
x=mlb.fit_transform(all_image_with_labels['class_labels'])
# join aligns on index, so the indicator frame is built on the label frame's
# own index; this pairs every image with its own row of x whatever that index is.
all_image_labels=all_image_with_labels.join(
    pd.DataFrame(x,columns=mlb.classes_,index=all_image_with_labels.index)
)

In [None]:
# Load every training image, following the row order of all_image_labels #
train_image_mat=[
    image_load(train_image_dir+str(file_name))
    for file_name in all_image_labels.image_file
]


In [None]:
# Assemble the model inputs (images) and targets (the one-hot label columns) #
X=np.array(train_image_mat)
Y=np.array(all_image_labels.drop(columns=['class_labels','image_file']))

# Hold out 10% of the data, then cut that hold-out in half:
# one half becomes the validation set, the other half the test set.
X_train,X_holdout,Y_train,Y_holdout=train_test_split(X,Y,test_size=0.1,random_state=42)
X_valid,X_test,Y_valid,Y_test=train_test_split(X_holdout,Y_holdout,test_size=0.5,random_state=42)


# Build CNN model
### The model architecture is based on this blog post and on Coursera course material: https://www.analyticsvidhya.com/blog/2019/04/build-first-multi-label-image-classification-model-python/

In [19]:
# Three conv / max-pool / dropout stages followed by a dense head that ends in
# five sigmoid units. Layers are listed in the order they are applied.
model=Sequential([
    # Pad the 256x1600x3 input by 2 pixels on each side before the first convolution.
    ZeroPadding2D((2,2),input_shape=(256,1600,3)),
    # Stage 1: 16 filters, 3x3 kernels.
    Conv2D(filters=16,kernel_size=(3,3),activation="relu"),
    MaxPooling2D(pool_size=(2,2)),
    Dropout(0.25),
    # Stage 2: 32 filters, 5x5 kernels.
    Conv2D(filters=32,kernel_size=(5,5),activation="relu"),
    MaxPooling2D(pool_size=(2,2)),
    Dropout(0.25),
    # Stage 3: 64 filters, 5x5 kernels.
    Conv2D(filters=64,kernel_size=(5,5),activation="relu"),
    MaxPooling2D(pool_size=(2,2)),
    Dropout(0.25),
    # Dense head.
    Flatten(),
    Dense(128,activation="relu"),
    Dropout(0.5),
    Dense(64,activation="relu"),
    Dropout(0.5),
    # One sigmoid output per label column.
    Dense(5,activation="sigmoid"),
])



Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [20]:
# Print the layer table of the model (layer type, output shape, parameter count).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
zero_padding2d_1 (ZeroPaddin (None, 260, 1604, 3)      0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 258, 1602, 16)     448       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 129, 801, 16)      0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 129, 801, 16)      0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 125, 797, 32)      12832     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 62, 398, 32)       0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 62, 398, 32)       0         
__________

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 5 epochs in batches of 64, with the validation split passed as validation_data.
model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_valid,Y_valid),
)

In [None]:
# Generate predictions on the test split #
# Keep the raw model scores under their own name instead of overwriting them.
Y_prob=model.predict(X_test)
# A class counts as predicted when its score exceeds 0.5.
Y_pred=(Y_prob>0.5)
# accuracy_score takes (y_true, y_pred) in that order. For multi-label input it
# reports subset accuracy: a sample counts only if all of its labels match.
print(accuracy_score(Y_test,Y_pred))