In [1]:
import numpy as np
import pandas as pd

In [2]:
df_desc = pd.read_csv('x_ray_image_recognition_data/product_description_and_categories.csv')
df_train = pd.read_csv('x_ray_image_recognition_data/train.csv')

In [3]:
df_desc.head()

Unnamed: 0,product_description,category,sub_category
0,"Neo Fresh 245 L, 3 Star Double Door Frost Free...",Electronics and Appliances,Home and Kitchen Appliances
1,"Professional 340 L, 3 Star Double Door Frost F...",Electronics and Appliances,Home and Kitchen Appliances
2,"Protton 300 L, 3 Door Frost Free Refrigerator-...",Electronics and Appliances,Home and Kitchen Appliances
3,Ace Stainfree 8 Kg Semi Automatic Washing Mach...,Electronics and Appliances,Washing Machines
4,"Protton 260 L, 3 Door Frost Free Refrigerator-...",Electronics and Appliances,Washing Machines


In [4]:
df_train.head()

Unnamed: 0,x_ray_image_file_name,product_description,x_ray_product_description_match_status
0,1.jpg,Acer Aspire SW3-016 10.1-inch Laptop (Atom x5-...,True
1,2.jpg,Apple iPhone 6S 64 GB (Golden),True
2,4.jpg,"7th ,8th(2) , 1st , 5th",True
3,5.jpg,10th Class,True
4,6.jpg,"4th , 5th & English Class",True


In [5]:
X_category = df_desc.iloc[:,1].values
X_desc = df_desc.iloc[:,0].values

In [6]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
import webcolors
import string
#nltk.download('stopwords')

In [7]:
def process_text_special(text):
    # handle format rid- , , product- , ,
    text = re.sub('"', '' ,text)
    if text.split(' ')[0] == "rid-":
        text = re.sub('\(', '', text)
        text = re.sub('\)', '', text)
        text = text.strip(")")
        w = text.split(' ')
        t = []
        for i in range(len(w)):
            if w[i] == "rid-":
                i = i+1
                while(True):
                    t.append(w[i])
                    i += 1
                    if w[i] == "product-":
                        break
            if w[i] == "product-":
                for j in range(i+1, len(w)):
                    t.append(w[j])
                break
        t = ' '.join(t)
        t = t.split(',')
        t = [ti.strip() for ti in t]
        t = [ti.lower() for ti in t if ti != '']
        return t
    text = re.sub(r"[\(\[]*?[\)\]]", "", text)
    #handling , , , ,
    if len(text.split(',')) > 3:
        t = text.split(',')
        t = [ti.strip() for ti in t]
        t = [ti.lower() for ti in t if ti != '']
        return t

In [8]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[\(\[]*?[\)\]]", "", text)
    text = re.sub("months", "", text)
    text = re.sub(r'\d+', "", text) # remove numbers
    words = text.split()
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in words]
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in stripped]
    words = [word.lower() for word in stemmed]
    words = [word for word in words if word not in webcolors.CSS3_NAMES_TO_HEX] # remove colors 
    stop_words = stopwords.words('english')
    words = [w for w in words if not w in stop_words]
    text = ' '.join(words)
    return text

In [None]:
X_train_desc = []
y_train_category = []
for i in range(len(X_desc)):
    print("Processing text %d" %(i+1))
    text = X_desc[i]
    if text.split(' ')[0] == "rid-" or len(text.split(',')) > 3:
        t = process_text_special(text)
        for j in t:
            _t = clean_text(j)
            X_train_desc.append(_t)
            y_train_category.append(X_category[i])
    else:
        _t = clean_text(text)
        X_train_desc.append(_t)
        y_train_category.append(X_category[i])

In [10]:
len(X_train_desc)

81299

In [11]:
max_len = -1
for i in X_train_desc:
    max_len = max(max_len, len(i))
print(max_len)

21603


In [12]:
count = 0
threshold = 100
for i in X_train_desc:
    if(len(i)<= threshold):
        count += 1
print(count)

69929


In [13]:
threshold = 100
X_train_desc_th = []
y_train_category_th = []
for i in range(len(X_train_desc)):
    if(len(X_train_desc[i]) <= threshold):
        X_train_desc_th.append(X_train_desc[i])
        y_train_category_th.append(y_train_category[i])

In [14]:
vocab = set()
for i in X_train_desc_th:
    for j in i.split():
        vocab.add(j)
print(len(vocab))

28426


In [15]:
vocab = list(vocab)

In [16]:
vocab_size = len(vocab)
print(vocab_size)

28426


In [17]:
word2int = {}
int2word = {}
for i in range(len(vocab)):
    word2int[vocab[i]] = i
    int2word[i] = vocab[i]

In [18]:
word2int['<UNK>'] = len(word2int)

In [19]:
X_train_seq = []
for i in X_train_desc_th:
    _t  = []
    for j in i.split():
        _t.append(word2int[j])
    X_train_seq.append(_t)

In [20]:
max_length = -1
for i in X_train_seq:
    max_length = max(max_length, len(i))
print (max_length)

25


In [21]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import Flatten
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
from keras.optimizers import SGD

Using TensorFlow backend.


In [22]:
X_train_seq = pad_sequences(X_train_seq, max_length)

In [23]:
X_train_seq = np.array(X_train_seq)

In [24]:
X_train_seq.shape

(69929, 25)

In [25]:
word2int_category = {}
int2word_category = {}
category = set()
sub_category = set()
for i in range(len(y_train_category_th)):
    category.add(y_train_category_th[i])
category = list(category)
for i in range(len(category)):
    word2int_category[category[i]] = i
    int2word_category[i] = category[i]

In [26]:
int2word_category

{0: 'Movies, Music and Video Games',
 1: 'Mobile Phone, Tablets and Accessories',
 2: 'Electronics and Appliances',
 3: 'Health and Wellness',
 4: 'Stationery and Office Products',
 5: 'Sports and Outdoors',
 6: 'Industrial and Scientific Goods',
 7: 'Books, Software and E-learning',
 8: 'Apparel and Accessories',
 9: 'Baby Care',
 10: 'Stationery and office Products',
 11: ' Gifts ',
 12: 'Grocery and Gourmet Food',
 13: 'Camera and Photos',
 14: 'Toys and Games',
 15: 'Uncategorized',
 16: 'Computers, Laptops and Accessories',
 17: 'Watches, Eyewear and Jewellery',
 18: 'Shoes and Footwear',
 19: 'Tools and Hardware',
 20: 'Handbags, Bags and Luggage',
 21: 'Pet Supplies',
 22: 'Automotive',
 23: 'Beauty Products and Personal Care',
 24: 'Musical Instruments',
 25: 'Home and Kitchen'}

In [27]:
y_train_category_seq = []
for i in range(len(y_train_category_th)):
    #print(y_train_category_th[i])
    y_train_category_seq.append(word2int_category[y_train_category_th[i]])

In [28]:
y_train_category_seq = to_categorical(y_train_category_seq)

In [29]:
y_train_category_seq[:14]

array([[ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  1.,  0.,  0.,  0.,  0.,  0

In [42]:
from sklearn.model_selection import train_test_split

X_train_seq_1 , X_test_seq_1, y_train_category_seq_1, y_test_category_seq_1 = train_test_split(X_train_seq,
                                                                                               y_train_category_seq,
                                                                                               test_size = 0.2, 
                                                                                               random_state = 0)

In [43]:
X_train_seq.shape

(69929, 25)

In [44]:
vocab_size=len(word2int)
input_length=25

In [45]:
model_category = Sequential()
model_category.add(Embedding(vocab_size, 64, input_length=input_length))
model_category.add(Flatten())
model_category.add(Dense(256,kernel_initializer='normal', activation='relu', input_dim = input_length))
model_category.add(Dense(256,kernel_initializer='normal', activation='relu'))
#model_category.add(Dense(1024,kernel_initializer='normal', activation='relu'))
model_category.add(Dense(26,activation='sigmoid'))
model_category.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(model_category.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 25, 64)            1819328   
_________________________________________________________________
flatten_3 (Flatten)          (None, 1600)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 256)               409856    
_________________________________________________________________
dense_8 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_9 (Dense)              (None, 26)                6682      
Total params: 2,301,658
Trainable params: 2,301,658
Non-trainable params: 0
_________________________________________________________________
None


In [46]:
model_category.fit(X_train_seq, y_train_category_seq,
                   epochs=2,  
                   validation_data=[X_test_seq_1, y_test_category_seq_1])

Train on 69929 samples, validate on 13986 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fe8d45c5588>

In [47]:
pred = model_category.predict(X_test_seq_1)

correct_ans = 0
wrong_ans = 0
ans = []
for i in range(len(pred)):
    max_pos = 0
    max_val = -10000000000000000000000
    for j in range(len(pred[i])):
        if max_val < pred[i][j]:
            max_val = pred[i][j]
            max_pos = j
    ans.append(max_pos)
    if y_test_category_seq_1[i][max_pos] == 1:
        correct_ans += 1
    else:
        wrong_ans += 1
print(correct_ans)
print(wrong_ans)
print("Accuracy", 1.0 * correct_ans / (wrong_ans + correct_ans))

13488
498
Accuracy 0.9643929643929644


In [48]:
desc_train_load = df_train.iloc[:,1].values
image_path_load = df_train.iloc[:,0].values
y_train_load = df_train.iloc[:,-1].values

In [49]:
count_true = 0
count_false = 0
for i in y_train_load:
    if i == True:
        count_true += 1
    else:
        count_false += 1
print(count_true)
print(count_false)

103464
9498


In [50]:
# handling nan
nan_count = 0
a = desc_train_load[48066] #nan values
for i in range(len(desc_train_load)):
    if desc_train_load[i] is a:
        desc_train_load[i] = "none"
        nan_count += 1
print("%d nan fixed",  (nan_count))

%d nan fixed 2


In [None]:
desc_train = []
for i in range(len(desc_train_load)):
    print("Processing text",i+1)
    desc_train.append(clean_text(desc_train_load[i]))

In [52]:
y_desc_category = []
image_path = []
y_train = []

for i in range(len(desc_train)):
    if desc_train[i] != 'none' and y_train_load[i] != False:
        _t  = []
        for j in desc_train[i].split():
            if j not in word2int:
                _t.append(word2int['<UNK>'])
            else:
                _t.append(word2int[j])
        y_desc_category.append(_t)
        image_path.append(image_path_load[i])

In [53]:
y_desc_category = pad_sequences(y_desc_category, max_length)

In [54]:
y_desc_category_result = model_category.predict(y_desc_category)

In [55]:
y_cat = []
for i in range(len(y_desc_category_result)):
    max_pos = 0
    max_val = -1000000000000000000
    for j in range(len(y_desc_category_result[i])):
        if max_val < y_desc_category_result[i][j]:
            max_val = y_desc_category_result[i][j]
            max_pos = j
    y_cat.append(max_pos)

In [56]:
cat_count = {}
for i in range(0,26):
    cat_count[i] = 0
for i in range(len(y_cat)):
        cat_count[y_cat[i]] += 1

In [57]:
cat_count

{0: 0,
 1: 52574,
 2: 1751,
 3: 419,
 4: 462,
 5: 1280,
 6: 7,
 7: 1246,
 8: 2459,
 9: 4,
 10: 0,
 11: 77,
 12: 0,
 13: 2206,
 14: 240,
 15: 14293,
 16: 14400,
 17: 7811,
 18: 1094,
 19: 70,
 20: 221,
 21: 1,
 22: 437,
 23: 775,
 24: 3,
 25: 1632}

In [58]:
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Dropout
from keras.optimizers import SGD
from keras.preprocessing import image
from keras.utils import np_utils
from keras.constraints import maxnorm
#from keras import backend as K
#K.set_image_dim_ordering('th')

In [None]:
X_train = []
y_train = []

for i in range(len(image_path)):
    print("Processing",image_path[i], i+1)
    train_image = image.load_img('x_ray_image_recognition_data/x_ray_images/train_xray_images/'+image_path[i], target_size = (32, 32))
    train_image = image.img_to_array(train_image)
    train_image = np.expand_dims(train_image, axis = 0)
    X_train.append(train_image)
    y_train.append(y_desc_category_result[i])

X_train = np.array(X_train)
y_train = np.array(y_train)

print(X_train.shape)
print(y_train.shape)

In [60]:
import pickle

In [59]:
pickle.dump(X_train, open("X_train.pickle", "wb"))
pickle.dump(y_train, open("y_train.pickle", "wb"))

In [61]:
X_train = pickle.load(open("X_train.pickle", "rb"))
y_train = pickle.load(open("y_train.pickle", "rb"))

In [62]:
y_train = np.array(y_cat)
y_train = np_utils.to_categorical(y_train)
y_train = y_train.astype('int32')
print(y_train.shape)

(103462, 26)


In [63]:
X_temp = []
for i in range(len(X_train)):
    X_temp.append(X_train[i].reshape(32,32,3))
X_train = np.array(X_temp)
print(X_train.shape)

(103462, 32, 32, 3)


In [64]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.2, random_state = 0)

In [65]:
#normalize inputs from 0-255 to 0.0 - 1.0
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train = X_train / 255.0
X_test = X_test / 255.0

In [66]:
y_test[0]

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0], dtype=int32)

In [None]:
# creating model
classifier = Sequential()
classifier.add(Conv2D(32, (3, 3), input_shape=(32,32,3), activation='relu', padding='same'))
classifier.add(Dropout(0.2))
classifier.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
classifier.add(MaxPooling2D(pool_size=(2, 2)))
classifier.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
classifier.add(Dropout(0.2))
classifier.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
classifier.add(MaxPooling2D(pool_size=(2, 2)))
classifier.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
classifier.add(Dropout(0.2))
classifier.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
classifier.add(MaxPooling2D(pool_size=(2, 2)))
classifier.add(Flatten())
classifier.add(Dropout(0.2))
classifier.add(Dense(1024, activation='relu', kernel_constraint=maxnorm(3)))
classifier.add(Dropout(0.2))
classifier.add(Dense(512, activation='relu', kernel_constraint=maxnorm(3)))
classifier.add(Dropout(0.2))
classifier.add(Dense(26, activation='softmax'))

# Compiling model
epochs = 10
lrate = 0.01
decay = lrate/epochs
sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
classifier.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
print(classifier.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 32, 32, 32)        896       
_________________________________________________________________
dropout_1 (Dropout)          (None, 32, 32, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 32, 32, 32)        9248      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 16, 16, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 16, 16, 64)        18496     
_________________________________________________________________
dropout_2 (Dropout)          (None, 16, 16, 64)        0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 16, 16, 64)        36928     
__________

In [None]:
classifier.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, batch_size=32)

Train on 82769 samples, validate on 20693 samples
Epoch 1/10
Epoch 2/10
15584/82769 [====>.........................] - ETA: 13:32 - loss: 1.1817 - acc: 0.6540

In [92]:
classifier.predict(X_test).astype('int32')[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0], dtype=int32)

In [73]:
df_test = pd.read_csv('x_ray_image_recognition_data/test.csv')

In [74]:
desc_test_load = df_test.iloc[:, 1].values
image_path_load_test = df_test.iloc[:, 0].values

In [75]:
y_pred = []

In [None]:
desc_test = []
for i in range(len(desc_test_load)):
    print("Processing text",i+1)
    desc_test.append(clean_text(desc_test_load[i]))

In [77]:
y_desc_category_test = []

for i in range(len(desc_test)):
    _t  = []
    for j in desc_test[i].split():
        if j not in word2int:
            _t.append(word2int['<UNK>'])
        else:
            _t.append(word2int[j])
    y_desc_category_test.append(_t)

In [78]:
y_desc_category_test = pad_sequences(y_desc_category_test, max_length)

In [79]:
y_desc_category_result_test = model_category.predict(y_desc_category_test)

In [82]:
y_cat_test = []
for i in range(len(y_desc_category_result_test)):
    max_pos = 0
    max_val = -1000000000000000000
    for j in range(len(y_desc_category_result_test[i])):
        if max_val < y_desc_category_result_test[i][j]:
            max_val = y_desc_category_result_test[i][j]
            max_pos = j
    y_cat_test.append(max_pos)

In [None]:
count_true = 0
y_pred = []
for i in range(len(y_cat_test)):
    test_image = image.load_img('x_ray_image_recognition_data/x_ray_images/test_xray_images/'+image_path_load_test[i], target_size = (32, 32))
    test_image = image.img_to_array(test_image)
    test_image = np.expand_dims(test_image, axis = 0)
    result = classifier.predict(test_image).astype('int32')
    max_pos = 0
    max_val = -1000000000000000000
    for j in range(len(result[0])):
        if max_val < result[0][j]:
            max_val = result[0][j]
            max_pos = j
    if y_cat_test[i] == max_pos:
        y_pred.append('True')
        count_true += 1
    else:
        y_pred.append('False')
    print(result[0])
    print(i, y_cat_test[i], max_pos, count_true)
print(count_true)

In [94]:
df = pd.DataFrame()
df['x_ray_image_file_name'] = image_path_load_test
df['x_ray_product_description_match_status'] = y_pred
df.to_csv('output_3.csv')