## Make predictions for the X-ray Body Part Classifier competition

In [1]:
import os
from google.colab import drive
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from keras.regularizers import l2
from tensorflow.keras.layers import Dense, Flatten, Dropout, Conv2D, BatchNormalization, AveragePooling2D
from keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score
import json

In [2]:
# Mount Google Drive at /content/drive; the CSV files read below live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


### Load the CSV files with features (X) and target (y)

In [3]:
# Feature table. header=None makes pandas treat the first line as data rather than as column names.
X = pd.read_csv('/content/drive/MyDrive/train_X.csv',  header=None)

In [4]:
# Target labels. The first line is used as the header; later cells access the label column as '0'.
y = pd.read_csv('/content/drive/MyDrive/train_y.csv')

### Show the classes and the number of instances in each

In [5]:
# Number of distinct class labels in the target column.
y['0'].nunique()

41

In [6]:
# Number of samples per class label, indexed by label.
y.groupby(by='0')['0'].count()

0
0      80
1      41
2      77
3     724
4       9
5      23
6      70
7      15
8      12
9      31
10     19
11    102
12     19
13     39
14    120
15     67
16     40
17     23
18     10
19      7
20     15
21     63
22     45
23     42
24     11
25      5
26      1
27      1
28      7
29      4
30      2
31      1
32      1
33      2
34      3
35      1
36      1
37      2
38      1
39      1
40      1
Name: 0, dtype: int64

##### The classes are heavily imbalanced; one way to compensate for this is to oversample with `RandomOverSampler`.

In [7]:
# Flatten the label column into a plain Python list (replaces the DataFrame bound to `y`).
y = [label for label in y['0']]

In [8]:
# Convert the feature table to a 2-D NumPy array (one row per sample).
X = pd.DataFrame(data=X).to_numpy(copy=False)

In [9]:
# Random oversampling by duplicating existing rows.
# sampling_strategy='minority' resamples ONLY the single smallest class up to the
# size of the largest one; all other classes keep their original counts.
# NOTE(review): 'not majority' would balance every class, but it changes the number
# of rows produced, so downstream cells must not assume a fixed row count.
# random_state is fixed so that Restart & Run All yields the same resampled set.
ros = RandomOverSampler(sampling_strategy='minority', random_state=92)
X_resampled, y_resampled = ros.fit_resample(X, y)

In [10]:
# Report how many rows the oversampler added.
print(f'Rows number before RandomOverSampler: {X.shape[0]}')
print(f'Rows number after RandomOverSampler: {X_resampled.shape[0]}')

Rows number before RandomOverSampler: 1738
Rows number after RandomOverSampler: 2461


In [11]:
# Turn each flat row back into an 84x84 image matrix.
# -1 lets NumPy infer the number of samples, so the cell keeps working when the
# dataset or the oversampling strategy changes (and when the cell is re-run).
X_resampled = np.reshape(X_resampled, (-1, 84, 84))

### Split data for training and testing

In [12]:
# Hold out 10% of the oversampled data; the seed is fixed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.1,
    random_state=92,
)

### One-hot encode the target

In [13]:
# One-hot encode the train/test labels into 41 columns (one per class).
y_train_cat = keras.utils.to_categorical(y_train, num_classes=41)
y_test_cat = keras.utils.to_categorical(y_test, num_classes=41)

In [14]:
# One-hot encode the labels of the complete oversampled set (41 classes).
y_cat = keras.utils.to_categorical(y_resampled, num_classes=41)

### Prepare arrays with features

In [15]:
# Append a trailing channel axis: (n, 84, 84) -> (n, 84, 84, 1), matching the Conv2D input shape.
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]

In [16]:
# Full oversampled image set with a trailing channel axis; rebinds `X`, which the training cell fits on.
X = X_resampled[..., np.newaxis]

### Set the model hyperparameters

In [17]:
# Training hyperparameters used by the model cell below.
epo = 52            # number of training epochs
batch = 32          # mini-batch size
l2_lambda = 6e-4    # L2 regularisation strength
myOpt = keras.optimizers.Adam(learning_rate=3e-4)

### Create a convolutional neural network

In [18]:
# Small CNN for 84x84 single-channel images: two conv -> batch-norm -> average-pool
# stages, then a dense head ending in a 41-way softmax.
model = keras.Sequential([
    Conv2D(32, (3, 3),
           padding='same',
           activation='relu',
           kernel_regularizer=l2(l2_lambda),   # only this first conv layer carries the L2 penalty
           input_shape=(84, 84, 1)),
    BatchNormalization(),
    AveragePooling2D(pool_size=(4, 4)),
    Conv2D(64, (3, 3), padding='same',
           activation='relu'),
    BatchNormalization(),
    AveragePooling2D(pool_size=(4, 4)),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(41, activation='softmax')
])

# `metrics` is documented as a list; the bare string form is not accepted by every
# Keras version, so pass ['accuracy'].
model.compile(optimizer=myOpt,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Fit on the whole oversampled set (X, y_cat) instead of X_train, so that no cases
# are left out by the train/test split.
# NOTE: validation_split=0.1 still withholds the LAST 10% of the rows from training;
# Keras takes that slice from the end of the arrays before shuffling.
# NOTE(review): the oversampler presumably appends its duplicated rows at the end,
# in which case this validation slice consists mostly of duplicates - confirm.
his = model.fit(X, y_cat,
                batch_size=batch,
                epochs=epo,
                validation_split=0.1)

# X_test was cut from the same array the model was fitted on, so this score is
# optimistic and is not an estimate of generalisation.
model.evaluate(X_test, y_test_cat)

Epoch 1/52
Epoch 2/52
Epoch 3/52
Epoch 4/52
Epoch 5/52
Epoch 6/52
Epoch 7/52
Epoch 8/52
Epoch 9/52
Epoch 10/52
Epoch 11/52
Epoch 12/52
Epoch 13/52
Epoch 14/52
Epoch 15/52
Epoch 16/52
Epoch 17/52
Epoch 18/52
Epoch 19/52
Epoch 20/52
Epoch 21/52
Epoch 22/52
Epoch 23/52
Epoch 24/52
Epoch 25/52
Epoch 26/52
Epoch 27/52
Epoch 28/52
Epoch 29/52
Epoch 30/52
Epoch 31/52
Epoch 32/52
Epoch 33/52
Epoch 34/52
Epoch 35/52
Epoch 36/52
Epoch 37/52
Epoch 38/52
Epoch 39/52
Epoch 40/52
Epoch 41/52
Epoch 42/52
Epoch 43/52
Epoch 44/52
Epoch 45/52
Epoch 46/52
Epoch 47/52
Epoch 48/52
Epoch 49/52
Epoch 50/52
Epoch 51/52
Epoch 52/52


[0.035626694560050964, 0.9919028282165527]

##### The score is this high because the model was trained on data that includes the test set, so it does not measure how well the model generalises.

### Create the submission file with predictions for the competition

In [None]:
# Competition images to predict on; header=None makes pandas treat the first line as data.
comp = pd.read_csv('/content/drive/MyDrive/comp.csv',  header=None)       # dataset with images for prediction

In [None]:
# Prepare the competition data for the CNN: flat rows -> (n, 84, 84, 1) images.
# -1 lets NumPy infer the number of images instead of hard-coding 743, and
# np.asarray accepts both the DataFrame and an already-converted array, so the
# cell can be re-run safely.
comp = np.asarray(comp).reshape(-1, 84, 84, 1)

In [None]:
# Load the sample Kaggle submission file from Drive; it is used as the template for the submission.
test_df = pd.read_csv('/content/drive/MyDrive/sample_submission.csv')   # sample submission template

In [None]:
# Remove the placeholder 'Target' column; the model's predictions replace it later.
test_df = test_df.drop(columns=['Target'])

In [None]:
# Load the mapping used below to expand a predicted class index into its multi-value code.
with open('/content/codes.json', 'r') as codes_file:
    codes = json.load(codes_file)

In [278]:
# Class probabilities for every competition image, then the most likely class per image.
predictions = model.predict(comp)
y_pred = predictions.argmax(axis=1)
y_pred.shape[0]

743

In [None]:
# Decode the predicted class indices into submission strings: an index listed in
# `codes` becomes its values joined by spaces, any other index is written as-is.
pred_list = []
for i in y_pred:
    key = str(i)
    pred_list.append(' '.join(codes[key]) if key in codes else key)

In [None]:
# Attach the decoded predictions as the 'Target' column of the submission frame.
test_df = test_df.assign(Target=pred_list)

In [None]:
# Distribution of the predicted targets, most frequent first.
test_df['Target'].value_counts()

3        355
14        57
11        50
21        40
0         39
6         35
15        21
1         20
2         17
9         16
13 20     15
16        11
9 21      11
17         8
7          8
13         8
4          5
5          5
12         4
19         3
20         3
10         3
1 6        2
8          2
18         2
0 3        1
0 15       1
1 12       1
Name: Target, dtype: int64

In [279]:
# Write the submission file to the working directory (header row, no index column).
test_df.to_csv(path_or_buf="sample_submission.csv", index=False, header=True)

##### My result in the competition is 82.6%.