In [11]:
# load the libraries used throughout the notebook

from google.colab import drive

import os

import pandas as pd
from tqdm.notebook import tqdm
from PIL import Image
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pprint import pprint


from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPool2D, Bidirectional, GRU, Dropout
from tensorflow.keras.optimizers import Adam

!pip install sentencepiece
import sentencepiece as spm




In [12]:
# grab dataset from google drive
# Mount Google Drive at /content/drive; the cells below read the image archives
# and CSV files from /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!unzip -q '/content/drive/MyDrive/img_test.zip'
!unzip -q '/content/drive/MyDrive/img_train.zip'

!cp '/content/drive/MyDrive/test_x.csv' 'test_x.csv'
!cp '/content/drive/MyDrive/train_xy.csv' 'train_xy.csv'

xy_train_df = pd.read_csv('train_xy.csv')
x_test_df = pd.read_csv('test_x.csv')

In [14]:
# preprocess image data

def load_image(file):
    """Load one image as a fixed-size two-channel (luminance + alpha) array.

    Args:
        file: Path (or file object) passed to ``PIL.Image.open``.

    Returns:
        ``np.ndarray`` of shape (64, 64, 2): the image converted to mode 'LA'
        and resized to 64x64. If the image cannot be opened or decoded, an
        all-zero uint8 array of the same shape is returned instead, so the
        caller can still stack one array per row.
    """
    try:
        # The context manager closes the underlying file handle as soon as the
        # converted/resized copy exists (the copy does not need the file).
        with Image.open(file) as image:
            resized = image.convert('LA').resize((64, 64))
        arr = np.array(resized)
    except Exception:
        # Deliberate best effort: a missing or corrupt image becomes a blank
        # placeholder. Catching Exception instead of using a bare `except`
        # keeps KeyboardInterrupt working inside long loading loops.
        # uint8 matches the 8-bit arrays produced for decoded 'LA' images, so a
        # single bad file no longer upcasts the whole stacked array to float64.
        arr = np.zeros((64, 64, 2), dtype=np.uint8)
    return arr


# Images: decode every file named in the 'image' column of xy_train_df.
# tqdm wraps the column to show a progress bar with the estimated time remaining.
x_image = np.array(list(map(load_image, tqdm(xy_train_df['image']))))

# Summaries: cast the column to str so that non-string cells become text too.
x_text = xy_train_df['summary'].astype(str)


# To summarize: x_image holds one image array per row of xy_train_df and x_text
# holds the summary of that same row, so both inputs stay aligned row by row.

HBox(children=(FloatProgress(value=0.0, max=7627.0), HTML(value='')))




In [29]:
# labels:
# target column: price category (0, 1, 2)
y_price = xy_train_df.price
# second target: the 'type' column encoded as integer category codes.
# It is kept so the model can be trained with two objectives, even though
# only 'price' is written to the submission file.
y_type = xy_train_df.type.astype('category').cat.codes

# sanity checks :)
len_price = len(y_price.unique())
len_type = len(y_type.unique())
print('unique values for price category', len_price, y_price.unique())
print('unique values for type category', len_type, y_type.unique())

# splitting our image and text data 80/20.
# All four arrays are split together so rows stay aligned across inputs and labels.
# random_state pins the shuffle: without it every run produces a different
# train/validation split and validation scores are not comparable between runs.
x_tr_image, x_vl_image, x_tr_text, x_vl_text, y_tr_price, y_vl_price, y_tr_type, y_vl_type = train_test_split(
    x_image,
    x_text,
    y_price,
    y_type,
    test_size=0.2,
    random_state=42)

print(np.shape(x_tr_image))
print(np.shape(x_vl_image))
print(np.shape(y_tr_price))
print(np.shape(y_vl_price))
print(np.shape(y_tr_type))
print(np.shape(y_vl_type))

unique values for price category 3 [1 0 2]
unique values for type category 24 [ 1 17 22 10 18 20  5  2  8  4 23 13 15 16 14 11 19  0 21  3  6 12  7  9]
(6101, 64, 64, 2)
(1526, 64, 64, 2)
(6101,)
(1526,)
(6101,)
(1526,)


In [30]:
# vocab_size: maximum number of words the tokenizer will use when encoding text.
# max_len: length every encoded summary is padded/truncated to.
vocab_size, max_len = 40000, 100


# Build the vocabulary from the training split only.
# fit_on_texts creates the word index from word frequency: every word gets a unique
# integer, and lower integers mean more frequent words.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(x_tr_text)


def _preprocess(list_of_text):
    """Encode texts as a fixed-width matrix of word ids.

    Each text is mapped to the integer ids of its words using the fitted
    module-level ``tokenizer``; the resulting sequences are then brought to
    exactly ``max_len`` entries. Shorter sequences are padded with 0 at the
    end (``padding='post'``).

    Args:
        list_of_text: Iterable of strings.

    Returns:
        2-D integer array with one row per text and ``max_len`` columns.
    """
    sequences = tokenizer.texts_to_sequences(list_of_text)
    return pad_sequences(sequences, maxlen=max_len, padding='post')
    

# Encode both splits; padding to max_len happens inside _preprocess.
x_tr_text_id, x_vl_text_id = (_preprocess(split) for split in (x_tr_text, x_vl_text))

print(x_tr_text_id.shape, x_vl_text_id.shape, sep='\n')


(6101, 100)
(1526, 100)


In [None]:
### ORIGINAL TEMPLATE WITH COMMENTS ADDED

# Symbolic inputs of the two-branch model. batch_shape=(None, max_len) is the same
# as shape=(max_len,): the leading batch dimension is left unspecified.
in_text = keras.Input(batch_shape=(None, max_len))
in_image = keras.Input(batch_shape=(None, 64, 64, 2))

### text part

# Trainable lookup table mapping every token id to a 100-dimensional vector.
# It is sized with vocab_size rather than tokenizer.num_words: num_words is an
# attribute of the Keras Tokenizer only, so the old form breaks once `tokenizer`
# is replaced by the SentencePiece processor from the advanced-tokenizer cell.
embedded = keras.layers.Embedding(vocab_size, 100)(in_text)
# Mean over the sequence axis: (batch, max_len, 100) -> (batch, 100).
# Same value as tf.reduce_mean(embedded, axis=1) here (the embedding emits no
# mask), but expressed as a Keras layer so it is part of the layer graph.
averaged = keras.layers.GlobalAveragePooling1D()(embedded)

### image part

# Convolution layer: 32 filters (output channels), 16x16 kernel (height, width).
cov = Conv2D(32, (16, 16))(in_image)
# Max pooling layer to downsample the feature maps.
pl = MaxPool2D((16, 16))(cov)
# Flatten the pooled feature maps into one vector per image.
flattened = Flatten()(pl)


### fusion:
# Concatenate text and image features along the last (feature) axis.
fused = keras.layers.Concatenate(axis=-1)([averaged, flattened])
# multi-objectives (each is a multi-class classification)
p_price = Dense(len_price, activation='softmax', name='price')(fused)
p_type = Dense(len_type, activation='softmax', name='type')(fused)


# Inputs and outputs are named so fit()/predict() can be fed with dictionaries.
model = keras.Model(
    inputs={
        'summary': in_text,
        'image': in_image
    },
    outputs={
        'price': p_price,
        'type': p_type,
    },
)


# Both heads use integer labels (sparse cross-entropy) and contribute equally
# to the total loss.
model.compile(
    optimizer=Adam(),
    loss={
        'price': 'sparse_categorical_crossentropy',
        'type': 'sparse_categorical_crossentropy',
    },
    loss_weights={
        'price': 0.5,
        'type': 0.5,
    },
    metrics={
        'price': ['SparseCategoricalAccuracy'],
        'type': ['SparseCategoricalAccuracy'],
    },
)


model.summary()

In [None]:
### ATTENTION LAYER ADDED

in_text = keras.Input(batch_shape=(None, max_len))
in_image = keras.Input(batch_shape=(None, 64, 64, 2))

# text part
# Sized with vocab_size (not tokenizer.num_words) so the cell works with either
# tokenizer defined in this notebook.
embedded = keras.layers.Embedding(vocab_size, 100)(in_text)

# The Attention layer expects sequences: query [batch, Tq, dim] and value
# [batch, Tv, dim]. return_sequences=True makes each bidirectional GRU emit one
# 200-dim vector per time step; without it the GRUs return only a 2-D
# (batch, 200) tensor and there is no time axis to attend over.
# Query encoding of shape [batch_size, max_len, 200].
query_seq_encoding = Bidirectional(GRU(units=100, return_sequences=True))(embedded)
# Value encoding of shape [batch_size, max_len, 200].
value_seq_encoding = Bidirectional(GRU(units=100, return_sequences=True))(embedded)

# Query-value attention of shape [batch_size, max_len, 200].
query_value_attention_seq = keras.layers.Attention()([query_seq_encoding, value_seq_encoding])

# Collapse the time axis of both sequences, then join them: (batch, 400).
query_encoding = keras.layers.GlobalAveragePooling1D()(query_seq_encoding)
query_value_attention = keras.layers.GlobalAveragePooling1D()(query_value_attention_seq)
atten = keras.layers.Concatenate()([query_encoding, query_value_attention])
averaged = atten


# image part
cov = Conv2D(32, (16, 16))(in_image)
pl = MaxPool2D((16, 16))(cov)
flattened = Flatten()(pl)


# fusion: concatenate text and image features along the feature axis
fused = keras.layers.Concatenate(axis=-1)([averaged, flattened])

# multi-objectives (each is a multi-class classification)
p_price = Dense(len_price, activation='softmax', name='price')(fused)
p_type = Dense(len_type, activation='softmax', name='type')(fused)


model = keras.Model(
    inputs={
        'summary': in_text,
        'image': in_image
    },
    outputs={
        'price': p_price,
        'type': p_type,
    },
)


model.compile(
    optimizer=Adam(),
    loss={
        'price': 'sparse_categorical_crossentropy',
        'type': 'sparse_categorical_crossentropy',
    },
    loss_weights={
        'price': 0.5,
        'type': 0.5,
    },
    metrics={
        'price': ['SparseCategoricalAccuracy'],
        'type': ['SparseCategoricalAccuracy'],
    },
)


model.summary()

In [37]:
### CUSTOMIZING IMAGE CONVOLUTIONAL LAYERS


in_text = keras.Input(batch_shape=(None, max_len))
in_image = keras.Input(batch_shape=(None, 64, 64, 2))

# text part
# Sized with vocab_size (not tokenizer.num_words) so the cell works with either
# tokenizer defined in this notebook.
embedded = keras.layers.Embedding(vocab_size, 100)(in_text)
# Mean over the sequence axis: (batch, max_len, 100) -> (batch, 100).
averaged = keras.layers.GlobalAveragePooling1D()(embedded)


# image part
# Two conv + max-pool stages; 'same' padding keeps the spatial size through
# each convolution, so only the pooling layers downsample.
cov = Conv2D(32, (16, 16), padding='same', activation='relu')(in_image)
pl = MaxPool2D((16, 16))(cov)

cov2 = Conv2D(64, (16, 16), padding='same', activation='relu')(pl)
p2 = MaxPool2D((16, 16), padding='same')(cov2)
flattened = Flatten()(p2)


# Fully connected head with dropout. The Dense layers need a non-linearity:
# without an activation, stacked Dense layers collapse into a single linear map
# and the second layer adds no capacity.
fc4 = Dense(84, activation='relu')(flattened)
fc5 = Dropout(0.5)(fc4)
fc6 = Dense(84, activation='relu')(fc5)
fc7 = Dropout(0.5)(fc6)


# fusion: concatenate text and image features along the feature axis
fused = keras.layers.Concatenate(axis=-1)([averaged, fc7])

# multi-objectives (each is a multi-class classification)
p_price = Dense(len_price, activation='softmax', name='price')(fused)
p_type = Dense(len_type, activation='softmax', name='type')(fused)


model = keras.Model(
    inputs={
        'summary': in_text,
        'image': in_image
    },
    outputs={
        'price': p_price,
        'type': p_type,
    },
)


model.compile(
    optimizer=Adam(),
    loss={
        'price': 'sparse_categorical_crossentropy',
        'type': 'sparse_categorical_crossentropy',
    },
    loss_weights={
        'price': 0.5,
        'type': 0.5,
    },
    metrics={
        'price': ['SparseCategoricalAccuracy'],
        'type': ['SparseCategoricalAccuracy'],
    },
)


model.summary()

Model: "functional_13"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_28 (InputLayer)           [(None, 64, 64, 2)]  0                                            
__________________________________________________________________________________________________
conv2d_23 (Conv2D)              (None, 64, 64, 32)   16416       input_28[0][0]                   
__________________________________________________________________________________________________
max_pooling2d_18 (MaxPooling2D) (None, 4, 4, 32)     0           conv2d_23[0][0]                  
__________________________________________________________________________________________________
conv2d_24 (Conv2D)              (None, 4, 4, 64)     524352      max_pooling2d_18[0][0]           
______________________________________________________________________________________

In [34]:
### FUSION LAYER

in_text = keras.Input(batch_shape=(None, max_len))
in_image = keras.Input(batch_shape=(None, 64, 64, 2))

# text part
embedded = keras.layers.Embedding(vocab_size, 100)(in_text)
# Mean over the sequence axis: (batch, max_len, 100) -> (batch, 100).
averaged = keras.layers.GlobalAveragePooling1D()(embedded)


# image part
cov = Conv2D(32, (16, 16))(in_image)
pl = MaxPool2D((16, 16))(cov)
flattened = Flatten()(pl)
# Fully connected head ending in 100 units so the image vector has the same
# width as the averaged text embedding. The Dense layers get a ReLU: without an
# activation the two stacked layers collapse into a single linear map.
fc4 = Dense(64, activation='relu')(flattened)
fc5 = Dropout(0.5)(fc4)
fc6 = Dense(100, activation='relu')(fc5)
fc7 = Dropout(0.5)(fc6)


# fusion:
# Pair the two 100-dim vectors element by element: each becomes a (100, 1) column
# and the columns are joined on the last axis, giving (batch, 100, 2) -- the same
# tensor tf.stack([averaged, fc7], axis=-1) produces, built from Keras layers.
averaged_col = keras.layers.Reshape((100, 1))(averaged)
image_col = keras.layers.Reshape((100, 1))(fc7)
fused = keras.layers.Concatenate(axis=-1)([averaged_col, image_col])
flattened2 = Flatten()(fused)
# multi-objectives (each is a multi-class classification)
p_price = Dense(len_price, activation='softmax', name='price')(flattened2)
p_type = Dense(len_type, activation='softmax', name='type')(flattened2)


model = keras.Model(
    inputs={
        'summary': in_text,
        'image': in_image
    },
    outputs={
        'price': p_price,
        'type': p_type,
    },
)


model.compile(
    optimizer=Adam(),
    loss={
        'price': 'sparse_categorical_crossentropy',
        'type': 'sparse_categorical_crossentropy',
    },
    loss_weights={
        'price': 0.5,
        'type': 0.5,
    },
    metrics={
        'price': ['SparseCategoricalAccuracy'],
        'type': ['SparseCategoricalAccuracy'],
    },
)


model.summary()

Model: "functional_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_26 (InputLayer)           [(None, 64, 64, 2)]  0                                            
__________________________________________________________________________________________________
conv2d_22 (Conv2D)              (None, 49, 49, 32)   16416       input_26[0][0]                   
__________________________________________________________________________________________________
max_pooling2d_17 (MaxPooling2D) (None, 3, 3, 32)     0           conv2d_22[0][0]                  
__________________________________________________________________________________________________
flatten_5 (Flatten)             (None, 288)          0           max_pooling2d_17[0][0]           
______________________________________________________________________________________

In [122]:
### ADVANCED TOKENIZER

vocab_size = 15480 # maxsize limit hit for spm
max_len = 100

# SentencePiece trains from a plain-text file with one sentence per line.
# Write only the raw training summaries: dumping the frame with to_csv also put
# the row index, the header and CSV quoting into the training corpus.
# Whitespace (including embedded newlines) is collapsed so that every summary
# stays on a single line.
with open('spm.train.set.txt', 'w', encoding='utf-8') as corpus_file:
    for text in x_tr_text:
        corpus_file.write(' '.join(str(text).split()) + '\n')

# Train on the corpus file; this creates and saves 'tokenmodel.model' / 'tokenmodel.vocab'.
# hard_vocab_limit=False treats vocab_size as an upper bound, so training does not
# abort if the corpus cannot support exactly vocab_size pieces.
spm.SentencePieceTrainer.train(
    input='spm.train.set.txt',
    model_prefix='tokenmodel',
    vocab_size=vocab_size,
    hard_vocab_limit=False,
)
# Load the trained model; from here on `tokenizer` is the SentencePiece processor.
tokenizer = spm.SentencePieceProcessor(model_file='tokenmodel.model')


def _preprocess(list_of_text, enable_sampling=False):
    """Encode texts as SentencePiece ids, padded/truncated to ``max_len``.

    Args:
        list_of_text: Iterable of strings.
        enable_sampling: When True, segmentations are sampled (subword
            regularization, alpha=0.1, nbest_size=-1), which yields a different
            encoding on every call. Intended for training data only; the
            default gives the deterministic encoding needed for validation
            and test data.

    Returns:
        2-D integer array with one row per text and ``max_len`` columns,
        padded with 0 at the end of short sequences.
    """
    if enable_sampling:
        encoded_list = [
            tokenizer.encode(text, enable_sampling=True, alpha=0.1, nbest_size=-1)
            for text in list_of_text
        ]
    else:
        encoded_list = [tokenizer.encode(text) for text in list_of_text]

    return pad_sequences(encoded_list, maxlen=max_len, padding='post')


# padding is done inside:
# sampling only for the training split; validation stays deterministic
x_tr_text_id = _preprocess(x_tr_text, enable_sampling=True)
x_vl_text_id = _preprocess(x_vl_text)

print(x_tr_text_id.shape)
print(x_vl_text_id.shape)

(6101, 100)
(1526, 100)


In [116]:
# Quick check: number of training summaries in the split.
x_tr_text.shape

(6101,)

In [38]:
# Inputs and targets are passed as dictionaries whose keys match the input and
# output names given to keras.Model.
train_inputs = {'summary': x_tr_text_id, 'image': x_tr_image}
train_targets = {'price': y_tr_price, 'type': y_tr_type}
val_inputs = {'summary': x_vl_text_id, 'image': x_vl_image}
val_targets = {'price': y_vl_price, 'type': y_vl_type}

# Stop once the validation price loss has not improved for 10 epochs and
# roll the model back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_price_loss',
    patience=10,
    restore_best_weights=True,
)

history = model.fit(
    x=train_inputs,
    y=train_targets,
    validation_data=(val_inputs, val_targets),
    epochs=40,
    batch_size=50,
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40


In [25]:
# Prepare the test rows with the same steps used for training data:
# summaries are cast to str and encoded, images are decoded file by file.
x_test_summary = _preprocess(x_test_df['summary'].astype(str))
x_test_image = np.array(list(map(load_image, tqdm(x_test_df['image']))))

HBox(children=(FloatProgress(value=0.0, max=7360.0), HTML(value='')))




In [39]:
# Run the model on the test inputs; the result is keyed by output name.
test_inputs = {'summary': x_test_summary, 'image': x_test_image}
y_predict = model.predict(test_inputs)

# Only the price head is needed for the submission.
price_predicted = y_predict['price']
print(price_predicted)
# Most probable price category per row.
price_category_predicted = price_predicted.argmax(axis=1)
print(price_category_predicted)

# Submission file: one row per test id with its predicted price category.
submission = pd.DataFrame({'id': x_test_df.id, 'price': price_category_predicted})
submission.to_csv('conv2D_best_weights.spm.csv', index=False)

[[0.7471423  0.20665012 0.04620761]
 [0.65297514 0.29617396 0.05085089]
 [0.6861692  0.2708084  0.04302243]
 ...
 [0.68382627 0.2691224  0.04705141]
 [0.3892078  0.46857643 0.14221576]
 [0.8221707  0.1432627  0.03456659]]
[0 0 0 ... 0 1 0]
