# Car Price Prediction Project

![](https://whatcar.vn/media/2018/09/car-lot-940x470.jpg)

### The goal of this project was to predict the price of the car based on its technical features, text description and image

### Here is what was done: <br>
* A default ('naive') model was built as a benchmark for the future models
* Exploratory Data Analysis was conducted to handle and normalize the features
* The first model was built based on tabular data and the CatBoost algorithm
* Additional linear models and their ensembles were used to enhance the results of the tabular model. Unfortunately, none of them improved the CatBoost results
* A simple dense neural network model was built
* A multi-input neural network was built using both tabular data and pre-processed text
* The images were added to the neural network
* The final ensemble of Catboost and neural network was used to improve results
* An external dataset was added in an attempt to improve the model results. Unfortunately, it didn't improve them

## Initial Setup

In [None]:
# setup and libraries import
!pip install -q tensorflow==2.3
!pip install albumentations -q
!pip install pymystem3
! pip install pymorphy2

#general libraries
import random
import numpy as np 
import pandas as pd 
import os
import sys
import PIL
import cv2
import re
import pymorphy2
import matplotlib.pyplot as plt
import seaborn as sns
import albumentations


from catboost import CatBoostRegressor

# Sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Tensorflow modules
import tensorflow as tf
import tensorflow.keras.layers as L
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import *


#Text preprocessing modules
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from tqdm import tqdm
from string import punctuation
from pymystem3 import Mystem


# increase the default size of the graphs
from pylab import rcParams
rcParams['figure.figsize'] = 10, 5
# graphs in svg format look better
%config InlineBackend.figure_format = 'svg' 
%matplotlib inline

# Transformations for images
from albumentations import (
    HorizontalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90,
    Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
    IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, IAAPiecewiseAffine,
    IAASharpen, IAAEmboss, RandomBrightnessContrast, Flip, OneOf, Compose
)

In [None]:
# Set common variables

# Fix the random seeds for reproducibility. Both NumPy and the stdlib
# `random` module are seeded so that code drawing from either generator
# starts from the same state on every run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Plot sizes (default figure width and height for the plotting helpers)
x_size = 8
y_size = 5

# TOKENIZER
# Vocabulary size limit passed to the Keras Tokenizer (num_words)
MAX_WORDS = 100000
# Length every tokenized description is padded/truncated to
MAX_SEQUENCE_LENGTH = 256
# Pattern for text cleanup: runs of Latin letters, digits and punctuation.
# A raw string is used so that the regex escapes (\], \", \-) reach the `re`
# module unchanged instead of relying on invalid string escape sequences.
PATTERNS = r"[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"

# Image size as (width, height); this is the order cv2.resize expects
img_size = (320, 240)

### Common Functions 

In [None]:
# A target metric function
def mape(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction (not in percent).

    Both inputs are converted to float NumPy arrays first, so plain lists are
    accepted and two pandas Series are compared position by position instead
    of being aligned on their index labels.

    Args:
        y_true: array-like of ground-truth values (used as the denominator).
        y_pred: array-like of predictions with the same length as ``y_true``.

    Returns:
        The mean of ``|y_pred - y_true| / |y_true|``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_pred - y_true) / y_true))

# A function for drawing a countplot
def getCountPlot(df, feature, x_s=x_size, y_s=y_size):
    """Draw a horizontal count plot of ``df[feature]``, most frequent value first.

    Args:
        df: DataFrame holding the column to plot.
        feature: name of the column whose value counts are plotted.
        x_s: figure width (defaults to the module-level ``x_size``).
        y_s: figure height (defaults to the module-level ``y_size``).
    """
    plt.figure(figsize=(x_s, y_s))
    sns.countplot(data=df,
                  y=feature,
                  order=df[feature].value_counts().index)
    plt.title(f'Feature distribution {feature}')
    # The original referenced `plt.show` without calling it, which is a no-op.
    plt.show()
    

# A function for outliers preprocessing
def emission_replacement(df, column, method='median'):
    '''
    Replace outliers of ``df[column]`` in place.

    A value is treated as an outlier when it lies more than 1.5 * IQR below
    the 25th percentile or above the 75th percentile of the column.

    method='median'   - replace outliers with the column median
    method='average'  - replace outliers with the column mean
    method='probable' - replace outliers (and any other NaN in the column)
                        with values drawn at random according to the
                        empirical distribution of the remaining values

    The DataFrame is modified in place and nothing is returned.

    Raises:
        ValueError: if ``method`` is not one of the three options above
            (previously an unknown method silently did nothing).
    '''
    if method not in ('median', 'average', 'probable'):
        raise ValueError(
            f"Unknown method {method!r}; expected 'median', 'average' or 'probable'")

    perc25 = df[column].quantile(0.25)
    perc75 = df[column].quantile(0.75)
    iqr = perc75 - perc25

    lower = perc25 - 1.5 * iqr
    upper = perc75 + 1.5 * iqr
    outliers = (df[column] < lower) | (df[column] > upper)

    if method == 'median':
        # The median is computed before the assignment, i.e. including outliers
        df.loc[outliers, column] = df[column].median()
    elif method == 'average':
        # The mean is computed before the assignment, i.e. including outliers
        df.loc[outliers, column] = df[column].mean()
    else:
        # replacing outliers with NaN
        df[column] = np.where(outliers, np.nan, df[column])
        mask = df[column].isna()
        # empirical distribution of the remaining (non-NaN) values
        p = df[column].value_counts() / len(df[column].dropna())
        # filling missing values by sampling with the probabilities `p`
        df.loc[mask, column] = np.random.choice(p.index.to_list(),
                                                size=mask.sum(),
                                                p=p.to_list())
        

# A function for distribution visualisation
def visualize_distributions(titles_values_dict):
    """Plot a 20-bin histogram for every entry of ``titles_values_dict``.

    The histograms are laid out on a grid with at most three columns.

    Args:
        titles_values_dict: mapping of subplot title -> array-like of values.
            An empty mapping draws nothing (previously it raised
            ZeroDivisionError while computing the grid size).
    """
    if not titles_values_dict:
        return

    columns = min(3, len(titles_values_dict))
    # Ceiling division: number of rows needed to fit all subplots
    rows = (len(titles_values_dict) - 1) // columns + 1
    fig = plt.figure(figsize=(columns * 5, rows * 3))
    for position, (title, values) in enumerate(titles_values_dict.items(), start=1):
        hist, bins = np.histogram(values, bins=20)
        ax = fig.add_subplot(rows, columns, position)
        # Bars are drawn at the left bin edges, 70% of a bin wide
        ax.bar(bins[:-1], hist, width=(bins[1] - bins[0]) * 0.7)
        ax.set_title(title)
    plt.show()


# A function for removing redundant features, encodings and other preprocessing
def preproc_data(df_input):
    """Return a preprocessed copy of ``df_input`` ready for modelling.

    Steps, in order:
      1. drop the 'description' and 'sell_id' columns;
      2. fill missing values of every column in the module-level
         ``numerical_features`` list with that column's median;
      3. min-max scale each numerical column independently;
      4. replace every column in the module-level ``categorical_features``
         list with its integer category codes;
      5. one-hot encode the categorical columns.

    The input frame is not modified.

    NOTE(review): the medians and the scaler are fitted on whatever frame is
    passed in; when that frame holds train and test rows together, test rows
    influence the statistics - confirm this is intended.
    """
    df_output = df_input.copy()

    # Removing redundant features
    df_output.drop(['description', 'sell_id'], axis=1, inplace=True)

    # Filling NA values. Assign the result back instead of calling
    # `fillna(inplace=True)` on a column selection: the chained in-place form
    # acts on a temporary object under pandas Copy-on-Write and is deprecated.
    for column in numerical_features:
        df_output[column] = df_output[column].fillna(df_output[column].median())

    # Data normalization (the scaler is re-fitted for every column)
    scaler = MinMaxScaler()
    for column in numerical_features:
        df_output[column] = scaler.fit_transform(df_output[[column]])[:, 0]

    # Label Encoding
    for column in categorical_features:
        df_output[column] = df_output[column].astype('category').cat.codes

    # One-Hot Encoding
    df_output = pd.get_dummies(df_output, columns=categorical_features, dummy_na=False)

    return df_output

# A function for text preprocessing (lemmatization)
def lemmatize(doc):
    """Clean ``doc`` with ``PATTERNS`` and return its words in normal form.

    Every match of the module-level ``PATTERNS`` regex is replaced by a space,
    the result is split on whitespace, and each word is replaced by the first
    normal form reported by the module-level ``morph`` analyzer. The words are
    joined back with single spaces.
    """
    cleaned = re.sub(PATTERNS, ' ', doc)
    return ' '.join(
        morph.normal_forms(word.strip())[0]
        for word in cleaned.split()
    )

# A function for getting images from the corresponding folder
def get_image_array(index):
    """Load and resize the images that belong to the given rows of ``data``.

    Args:
        index: row labels of the module-level ``data`` frame; the matching
            'sell_id' values determine the image file names.

    Returns:
        A NumPy array stacking one image per row, each resized to the
        module-level ``img_size``.

    Raises:
        FileNotFoundError: if an image cannot be read. cv2.imread signals a
            failed read by returning None; an explicit exception is used
            instead of ``assert`` so the check survives ``python -O``.
    """
    images = []
    # The loop variable no longer reuses the name of the `index` parameter
    for sell_id in data['sell_id'].loc[index].values:
        path = DATA_DIR + 'img/img/' + str(sell_id) + '.jpg'
        image = cv2.imread(path)
        if image is None:
            raise FileNotFoundError(f'Could not read image: {path}')
        images.append(cv2.resize(image, img_size))
    images = np.array(images)
    print('images shape', images.shape, 'dtype', images.dtype)
    return images

# A function that applies transformations to the images
def make_augmentations(images):
    """Apply the module-level ``augmentation`` pipeline to every image.

    Args:
        images: array whose first axis enumerates the images.

    Returns:
        An array of the same shape and dtype as ``images`` holding the
        augmented images. The output buffer is allocated with the input dtype;
        previously it defaulted to float64, which silently upcast uint8
        images and used eight times the memory.

    A dot is printed for every 200 processed images as a progress indicator.
    """
    print('applying transformations', end='')
    augmented_images = np.empty(images.shape, dtype=images.dtype)
    for i, image in enumerate(images):
        if i % 200 == 0:
            print('.', end='')
        augmented_images[i] = augmentation(image=image)['image']
    print('')
    return augmented_images

# A set of functions for image and text preprocessing
def process_image(image):
    """Run the module-level ``augmentation`` pipeline on one image tensor.

    ``image`` must expose ``.numpy()``; the augmented array stored under the
    'image' key of the pipeline result is returned.
    """
    augmented = augmentation(image=image.numpy())
    return augmented['image']

def tokenize_(descriptions):
    """Convert texts to token-id sequences padded to ``MAX_SEQUENCE_LENGTH``.

    Uses the module-level ``tokenize`` tokenizer.
    """
    token_ids = tokenize.texts_to_sequences(descriptions)
    return sequence.pad_sequences(token_ids, maxlen=MAX_SEQUENCE_LENGTH)

def tokenize_text(text):
    """Tokenize a single string tensor and return its padded id sequence.

    ``text`` must expose ``.numpy()`` returning UTF-8 encoded bytes.
    """
    decoded = text.numpy().decode('utf-8')
    padded = tokenize_([decoded])
    return padded[0]

def tf_process_train_dataset_element(image, table_data, text, price):
    """Map one training element: augment the image and tokenize the text.

    Returns ``((image, table_data, text), price)``.
    """
    # Remember the static shape so it can be re-attached after py_function
    static_shape = image.shape
    (image,) = tf.py_function(process_image, [image], [tf.uint8])
    image.set_shape(static_shape)
    (text,) = tf.py_function(tokenize_text, [text], [tf.int32])
    features = (image, table_data, text)
    return features, price

def tf_process_val_dataset_element(image, table_data, text, price):
    """Map one validation/submission element: tokenize the text only.

    The image is passed through without augmentation.
    Returns ``((image, table_data, text), price)``.
    """
    (text,) = tf.py_function(tokenize_text, [text], [tf.int32])
    features = (image, table_data, text)
    return features, price

In [None]:
!pip freeze > requirements.txt

## Dataset Loading

In [None]:
# Data loading
DATA_DIR = '../input/sf-dst-car-price-prediction-part2/'
train = pd.read_csv(DATA_DIR + 'train.csv')
test = pd.read_csv(DATA_DIR + 'test.csv')
sample_submission = pd.read_csv(DATA_DIR + 'sample_submission.csv')

In [None]:
# Let`s look at the data
train.head()

In [None]:
train.info()

## Model 1: Default model 
#### This model predicts the average car price based on the model name and the production year. This model will serve as a benchmark for other models.



In [None]:
# Splitting data
data_train, data_test = train_test_split(train, test_size=0.15, shuffle=True, random_state=RANDOM_SEED)

In [None]:
# Default model: predict the median train price of the cars that share the
# same model_info and productionDate.
# The medians are computed once per group and joined onto the hold-out rows.
# This replaces a per-row `DataFrame.query` built by string interpolation,
# which broke on values containing quotes, compared the year as a quoted
# string, and rescanned the whole train frame for every row.
group_keys = ['model_info', 'productionDate']
group_medians = data_train.groupby(group_keys, as_index=False)['price'].median()
# A left merge keeps the row order of data_test; unseen groups become NaN
predicts = data_test[group_keys].merge(group_medians, on=group_keys, how='left')[['price']]

# Filling NA values with median
predicts = predicts.fillna(predicts.median())

# Rounding
predicts = (predicts // 1000) * 1000

# Evaluating precision
print(f"The precision of the default model based on MAPE metric is: {(mape(data_test['price'], predicts.values[:, 0]))*100:0.2f}%")

The precision of the default model based on MAPE metric is: 19.88%

## EDA and preprocessing

In [None]:
train['sample'] = 1  # marking the train
test['sample'] = 0  # marking the test
test['price'] = 0  # filling test target values with 0. This is the variable we have to predict

# Combining test and train (test rows first) for joint preprocessing.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
data = pd.concat([test, train], sort=False).reset_index(drop=True)
print(train.shape, test.shape, data.shape)

In [None]:
data.info()

In [None]:
# Let's take a look at the number of empty values
data.isnull().sum()

#### Only two features "Владельцы" and "Владение" have empty values. Let's make a note of this for the future

In [None]:
# Let`s check the dataset for the duplicates
len(data.drop_duplicates()) - len(data)

#### No duplicates in the data

### Features lookup

In [None]:
#bodyType
data['bodyType'].value_counts()

In [None]:
#brand
data['brand'].value_counts()

In [None]:
# color
data['color'].value_counts()

In [None]:
# engineDisplacement
data['engineDisplacement'].value_counts()

#### Let's convert *engineDisplacement* feature into numerical format

In [None]:
# Keep only the first whitespace-separated token of engineDisplacement and
# convert it to float; the 'undefined' placeholder is mapped to 0.0
first_token = data['engineDisplacement'].astype(str).map(lambda value: value.split()[0])
data['engineDisplacement'] = first_token.replace('undefined', 0.0).astype(float)

In [None]:
data['engineDisplacement'].value_counts()

#### Let's convert *enginePower* feature into numerical format

In [None]:
# Keep only the first whitespace-separated token of enginePower and
# convert it to an integer
data['enginePower'] = data['enginePower'].str.split().map(lambda parts: int(parts[0]))

In [None]:
# fuelType
data['fuelType'].value_counts()

In [None]:
# model_info
data['model_info'].value_counts()

In [None]:
# name
data['name'].value_counts()

#### Most of the specs are mentioned in other columns. Let's extract a few new features from the *name* feature and then remove it

In [None]:
# Creating new features 4wd and xdrive and getting rid of the name feature
data['4wd'] = data['name'].apply(lambda x: 1 if '4WD' in x else 0)
data['xdrive'] = data['name'].apply(lambda x: 1 if 'xDrive' in x else 0)
data.drop(['name'], axis=1, inplace=True)

In [None]:
# numberOfDoors
data['numberOfDoors'].value_counts()

In [None]:
# vehicleConfiguration
data['vehicleConfiguration'].value_counts()

#### All parameters are mentioned in other columns, so we can remove the *vehicleConfiguration* feature at all

In [None]:
# getting rid of vehicleConfiguration feature
data.drop(['vehicleConfiguration'], axis=1, inplace=True)

In [None]:
# vehicleTransmission
data['vehicleTransmission'].value_counts()

In [None]:
# Ownership ('Владение')
data['Владение'].value_counts()

In [None]:
data['Владение'].isna().sum()

#### Half of the values are empty. Let's remove 'Владение' feature

In [None]:
# getting rid of "Владение" feature
data.drop(['Владение'], axis=1, inplace=True)

In [None]:
# Owners ('Владельцы')
data['Владельцы'].value_counts()

In [None]:
data['Владельцы'].isnull().sum()

In [None]:
# The Owners ('Владельцы') feature has a single missing value, so the row with
# this missing value is removed for consistency. The drop is restricted to that
# column so that a NaN in any other column cannot silently remove extra rows.
data.dropna(subset=['Владельцы'], axis=0, inplace=True)

In [None]:
data.shape

In [None]:
# ПТС
data['ПТС'].value_counts()

In [None]:
# Руль
data['Руль'].value_counts()

#### There is no variability of this feature, so let's remove it

In [None]:
# getting rid of 'Руль' feature
data.drop(['Руль'], axis=1, inplace=True)

In [None]:
# Привод
data['Привод'].value_counts()

#### By this point we were looking at the distribution of the categorical variables. Let's have a look now at the numerical variables distribution


In [None]:
# Looking at the distribution of the numerical features
visualize_distributions({
    'mileage': train['mileage'].dropna(),
    'modelDate': train['modelDate'].dropna(),
    'productionDate': train['productionDate'].dropna()
})

#### As we can see, the numerical features are not normally distributed, so it makes sense to consider taking the logarithm of these features for further analysis. Let's look at how the distributions of the numerical features look after taking the logarithm

In [None]:
# Visualisation of the numerical features after taking the logarithm
visualize_distributions({
    'mileage': np.log(train['mileage'].dropna()),
    'modelDate': np.log(2021 - train['modelDate'].dropna()),
    'productionDate': np.log(2021 - train['productionDate'].dropna())
})

#### We see that the distributions of the numerical features after taking the logarithm look more like a normal distribution, so let's try to apply the logarithm to these features when building a model

In [None]:
# Getting rid of the outliers
emission_replacement(data, 'mileage', method='average')

In [None]:
# Generation of the new features
data['years_in_use'] = 2021 - data['productionDate']
# Divide by at least one year so that a car produced in 2021 (years_in_use == 0)
# does not yield an infinite mileage_per_year
data['mileage_per_year'] = data['mileage'] / data['years_in_use'].clip(lower=1)

In [None]:
data.info()

In [None]:
# A list of categorical features
categorical_features = ['bodyType', 'brand', 'color', 'fuelType', 'model_info', '4wd', 'xdrive', 'numberOfDoors',
                        'vehicleTransmission', 'Владельцы', 'ПТС', 'Привод']

#A list of numerical features
numerical_features = ['productionDate','modelDate', 'mileage',
                     'years_in_use', 'mileage_per_year', 'enginePower', 'engineDisplacement']

In [None]:
# Data preprocessing
df_preproc = preproc_data(data)
df_preproc.sample(10)

In [None]:
df_preproc.shape

## Splitting data into train and test

In [None]:
# Extracting train and test parts
train_data = df_preproc.query('sample == 1').drop(['sample'], axis=1)
test_data = df_preproc.query('sample == 0').drop(['sample'], axis=1)

y = train_data.price.values     # our target
X = train_data.drop(['price'], axis=1)
X_sub = test_data.drop(['price'], axis=1)

In [None]:
X_sub.shape

## Model 2: CatBoostRegressor

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=True, random_state=RANDOM_SEED)

In [None]:
model_catboost = CatBoostRegressor(iterations = 5000,                       
                          random_seed = RANDOM_SEED,
                          eval_metric='MAPE',
                          custom_metric=['RMSE', 'MAE'],
                          od_wait=500                          
                         )
model_catboost.fit(X_train, np.log(y_train),
         eval_set=(X_test, np.log(y_test)),
         verbose_eval=100,
         use_best_model=True       
         )

In [None]:
test_predict_catboost = np.exp(model_catboost.predict(X_test))
print(f"TEST mape: {(mape(y_test, test_predict_catboost))*100:0.2f}%")

#### TEST MAPE on the original data: 12.91% <br> TEST MAPE with the logarithmized price gave the best result: 11.12% <br> Unfortunately, none of the experiments with logarithmization of the other features improved this result. So for the CatBoost Regressor we use the logarithm of the target variable only. In all other algorithms we did not use the logarithm, just the original values

### Catboost Submission

In [None]:
sub_predict_catboost = np.exp(model_catboost.predict(X_sub))
sample_submission['price'] = sub_predict_catboost
sample_submission.to_csv('catboost_submission.csv', index=False)

## Model 3: Tabular Dense Neural Network

In [None]:
# Simple Dense Neural Network
model_dnn = Sequential()
model_dnn.add(L.Dense(512, input_dim=X_train.shape[1], activation="relu"))
model_dnn.add(L.Dropout(0.5))
model_dnn.add(L.Dense(256, activation="relu"))
model_dnn.add(L.Dropout(0.5))
model_dnn.add(L.Dense(1, activation="linear"))

In [None]:
model_dnn.summary()

In [None]:
optimizer = tf.keras.optimizers.Adam(0.01)
model_dnn.compile(loss='MAPE', optimizer=optimizer, metrics=['MAPE'])
# `monitor` must be a metric name (a string, not a list), and save_best_only=True
# is required for the file to hold the best epoch; without it the checkpoint is
# overwritten after every epoch and ends up containing the last epoch's weights.
checkpoint_dnn = ModelCheckpoint('../working/best_model_dnn.hdf5', monitor='val_MAPE',
                                 save_best_only=True, verbose=0, mode='min')
earlystop_dnn = EarlyStopping(monitor='val_MAPE', patience=50, restore_best_weights=True)
callbacks_list_dnn = [checkpoint_dnn, earlystop_dnn]

In [None]:
history_dnn = model_dnn.fit(X_train, y_train,
                   batch_size=512,
                   epochs = 500,
                   validation_data = (X_test, y_test),
                   callbacks = callbacks_list_dnn,
                   verbose = 0,
                   )

In [None]:
plt.title('Loss')
plt.plot(history_dnn.history['MAPE'], label='train')
plt.plot(history_dnn.history['val_MAPE'], label='test')
plt.legend()  # the line labels are only displayed once a legend is drawn
plt.show()

In [None]:
model_dnn.load_weights('../working/best_model_dnn.hdf5')
model_dnn.save('../working/nn_1.hdf5')

In [None]:
test_predict_nn1 = model_dnn.predict(X_test)[:, 0]
print(f"TEST MAPE:{(mape(y_test, test_predict_nn1))*100:0.2f}%")

#### TEST MAPE of Simple Dense Neural Network:11.47%

### Simple Dense Neural Network Submission

In [None]:
sub_predict_nn1 = model_dnn.predict(X_sub)[:, 0]
sample_submission['price'] = sub_predict_nn1
sample_submission.to_csv('dnn_submission.csv', index=False)

# Model 4: NLP + Multiple Inputs

In [None]:
# Let's take a look at how the description of the item looks
data.description[0]

In [None]:
#Creating an object for text preprocessing
morph = pymorphy2.MorphAnalyzer()
data_lem = data.copy()

In [None]:
# Lemmatize every description. The function is applied to the column directly
# instead of a row-wise DataFrame.apply whose lambda argument shadowed `data_lem`.
data_lem['description'] = data_lem['description'].apply(lemmatize)

In [None]:
# Text tokenization after preprocessing
tokenize = Tokenizer(num_words=MAX_WORDS)
tokenize.fit_on_texts(data_lem['description'])

In [None]:
# Data split
# The texts are taken from `data_lem` (the lemmatized descriptions) because the
# tokenizer was fitted on them; feeding the raw `data.description` would drop
# every word form that is missing from the lemma vocabulary.
text_train = data_lem.description.loc[X_train.index]
text_test = data_lem.description.loc[X_test.index]
text_sub = data_lem.description.loc[X_sub.index]


# %%time
text_train_sequences = sequence.pad_sequences(tokenize.texts_to_sequences(text_train), maxlen=MAX_SEQUENCE_LENGTH)
text_test_sequences = sequence.pad_sequences(tokenize.texts_to_sequences(text_test), maxlen=MAX_SEQUENCE_LENGTH)
text_sub_sequences = sequence.pad_sequences(tokenize.texts_to_sequences(text_sub), maxlen=MAX_SEQUENCE_LENGTH)

print(text_train_sequences.shape, text_test_sequences.shape, text_sub_sequences.shape)

In [None]:
# Let's look at how the tokenized description looks
print(text_train.iloc[6])
print(text_train_sequences[6])

### RNN NLP

In [None]:
model_nlp = Sequential()
model_nlp.add(L.Input(shape=MAX_SEQUENCE_LENGTH, name="seq_description"))
model_nlp.add(L.Embedding(len(tokenize.word_index)+1, MAX_SEQUENCE_LENGTH,))
model_nlp.add(L.LSTM(256, return_sequences=True))
model_nlp.add(L.Dropout(0.5))
model_nlp.add(L.LSTM(128,))
model_nlp.add(L.Dropout(0.25))
model_nlp.add(L.Dense(64, activation="relu"))
model_nlp.add(L.Dropout(0.25))

### MLP

In [None]:
model_mlp = Sequential()
model_mlp.add(L.Dense(512, input_dim=X_train.shape[1], activation="relu"))
model_mlp.add(L.Dropout(0.5))
model_mlp.add(L.Dense(256, activation="relu"))
model_mlp.add(L.Dropout(0.5))

### Multiple Inputs NN

In [None]:
combinedInput = L.concatenate([model_nlp.output, model_mlp.output])
# being our regression head
head = L.Dense(64, activation="relu")(combinedInput)
head = L.Dense(1, activation="linear")(head)

model_comb = Model(inputs=[model_nlp.input, model_mlp.input], outputs=head)

In [None]:
model_comb.summary()

### Fit

In [None]:
optimizer = tf.keras.optimizers.Adam(0.01)
model_comb.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])

In [None]:
# `monitor` must be a metric name (a string, not a list), and save_best_only=True
# is required for the file to hold the best epoch rather than the last one.
checkpoint_comb = ModelCheckpoint('../working/best_model_comb.hdf5', monitor='val_MAPE',
                                  save_best_only=True, verbose=0, mode='min')
earlystop_comb = EarlyStopping(monitor='val_MAPE', patience=10, restore_best_weights=True)
callbacks_list_comb = [checkpoint_comb, earlystop_comb]

In [None]:
history_comb = model_comb.fit([text_train_sequences, X_train], y_train, 
                    batch_size=512,
                    epochs=500, 
                    validation_data=([text_test_sequences, X_test], y_test),
                    callbacks= callbacks_list_comb)

In [None]:
plt.title('Loss')
plt.plot(history_comb.history['MAPE'], label='train', color='green')
plt.plot(history_comb.history['val_MAPE'], label='test')
plt.legend()  # the line labels are only displayed once a legend is drawn
plt.show()

In [None]:
model_comb.load_weights('../working/best_model_comb.hdf5')
model_comb.save('../working/nn_mlp_nlp_comb.hdf5')

In [None]:
test_predict_nn2 = model_comb.predict([text_test_sequences, (X_test)])
print(f"TEST mape: {(mape(y_test, test_predict_nn2[:,0]))*100:0.2f}%")

#### Model 4 TEST mape: 11.84% <br> Model 4 TEST mape with text preprocessing: 11.32% 

In [None]:
sub_predict_nn2 = model_comb.predict([text_sub_sequences, X_sub])[:,0]
sample_submission['price'] = sub_predict_nn2
sample_submission.to_csv('nn2_submission.csv', index=False)

## Model 5: Neural Network with Images

In [None]:
# Displaying few examples of the images
plt.figure(figsize = (12,8))

random_image = train.sample(n = 9)
random_image_paths = random_image['sell_id'].values
random_image_cat = random_image['price'].values

for index, path in enumerate(random_image_paths):
    im = PIL.Image.open(DATA_DIR+'img/img/' + str(path) + '.jpg')
    plt.subplot(3, 3, index + 1)
    plt.imshow(im)
    plt.title('price: ' + str(random_image_cat[index]))
    plt.axis('off')
plt.show()

In [None]:
# Converting images to tensors
images_train = get_image_array(X_train.index)
images_test = get_image_array(X_test.index)
images_sub = get_image_array(X_sub.index)

#### Let's apply transformations to the images from Albumentations library. A lot of different transformations were tested in scope of this project, and these are the ones that showed the best score.

In [None]:
augmentation = albumentations.Compose([
    albumentations.HorizontalFlip(p=0.5),
    albumentations.Rotate(limit=30, interpolation=1, border_mode=4,
                          value=None, mask_value=None, always_apply=False, p=0.5),
    albumentations.OneOf([
        albumentations.CenterCrop(height=224, width=200),
        albumentations.CenterCrop(height=200, width=224),
    ], p=0.5),
    albumentations.OneOf([
        albumentations.RandomBrightnessContrast(
            brightness_limit=0.3, contrast_limit=0.3),
        albumentations.RandomBrightnessContrast(
            brightness_limit=0.1, contrast_limit=0.1)
    ], p=0.5),
    albumentations.GaussianBlur(p=0.05),
    albumentations.HueSaturationValue(p=0.5),
    albumentations.RGBShift(p=0.5),
    albumentations.FancyPCA(alpha=0.1, always_apply=False, p=0.5),
    albumentations.Resize(240, 320)
])

In [None]:
# Building the tf.data pipelines for the train, validation and submission parts.
# The texts come from `data_lem` (the lemmatized descriptions) because the
# tokenizer used inside the map functions was fitted on the lemmatized text.
train_dataset = tf.data.Dataset.from_tensor_slices((
    images_train, X_train, data_lem.description.loc[X_train.index], y_train
)).map(tf_process_train_dataset_element)

test_dataset = tf.data.Dataset.from_tensor_slices((
    images_test, X_test, data_lem.description.loc[X_test.index], y_test
)).map(tf_process_val_dataset_element)

# The submission part has no target, so a zero placeholder is used
y_sub = np.zeros(len(X_sub))
sub_dataset = tf.data.Dataset.from_tensor_slices((
    images_sub, X_sub, data_lem.description.loc[X_sub.index], y_sub
)).map(tf_process_val_dataset_element)


# Pull one element from each dataset to check that no exception is raised
next(iter(train_dataset))
next(iter(test_dataset))
next(iter(sub_dataset))

#### Let's now build a convolutional network. Several models had been tried and EfficientNetB3 showed itself as an optimal one. Unfortunately Kaggle Kernel struggles to handle EfficientNetB7, which potentially could do better

In [None]:
# Building a Convolutional Network without a "head" 
efficientnet_model = tf.keras.applications.efficientnet.EfficientNetB3(weights = 'imagenet', include_top = False, input_shape = (img_size[1], img_size[0], 3))

In [None]:
#Fine-tuning. Making all the layers of the base model trainable. The model showed its best results with such settings
efficientnet_model.trainable = True

In [None]:
efficientnet_output = L.GlobalAveragePooling2D()(efficientnet_model.output)

In [None]:
# A network for tabular data analysis
tabular_model = Sequential([
    L.Input(shape = X.shape[1]),
    L.Dense(512, activation = 'relu'),
    L.Dropout(0.5),
    L.Dense(256, activation = 'relu'),
    L.Dropout(0.5),
    ])

In [None]:
# NLP
nlp_model = Sequential([
    L.Input(shape=MAX_SEQUENCE_LENGTH, name="seq_description"),
    L.Embedding(len(tokenize.word_index)+1, MAX_SEQUENCE_LENGTH,),
    L.LSTM(256, return_sequences=True),
    L.Dropout(0.5),
    L.LSTM(128),
    L.Dropout(0.25),
    L.Dense(64),
    ])

In [None]:
# Combining outputs of three different networks
combinedInput = L.concatenate([efficientnet_output, tabular_model.output, nlp_model.output])

# being our regression head
head = L.Dense(256, activation="relu")(combinedInput)
head = L.Dense(1,)(head)

model_combinput = Model(inputs=[efficientnet_model.input, tabular_model.input, nlp_model.input], outputs=head)

In [None]:
optimizer = tf.keras.optimizers.Adam(0.005)
model_combinput.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])

In [None]:
# `monitor` must be a metric name (a string, not a list), and save_best_only=True
# is required for the file to hold the best epoch rather than the last one.
checkpoint_combinput = ModelCheckpoint('../working/best_model_combinput.hdf5', monitor='val_MAPE',
                                       save_best_only=True, verbose=0, mode='min')
earlystop_combinput = EarlyStopping(monitor='val_MAPE', patience=10, restore_best_weights=True)
callbacks_list_combinput = [checkpoint_combinput, earlystop_combinput]

In [None]:
history_combinput = model_combinput.fit(train_dataset.batch(30),
                    epochs=100, 
                    validation_data = test_dataset.batch(30),
                    callbacks=callbacks_list_combinput)

In [None]:
plt.title('Loss')
plt.plot(history_combinput.history['MAPE'], label='train')
plt.plot(history_combinput.history['val_MAPE'], label='test')
plt.legend()  # the line labels are only displayed once a legend is drawn
plt.show()

In [None]:
model_combinput.load_weights('../working/best_model_combinput.hdf5')
model_combinput.save('../working/nn_final_combinput.hdf5')

In [None]:
test_predict_nn3 = model_combinput.predict(test_dataset.batch(30))[:,0]
print(f"TEST mape: {(mape(y_test, test_predict_nn3))*100:0.2f}%")

#### TEST mape for EfficientNetB3 Convolutional Network: 11.85% 

In [None]:
sub_predict_nn3 = model_combinput.predict(sub_dataset.batch(30))
sample_submission['price'] = sub_predict_nn3[:,0]
sample_submission.to_csv('nn3_submission.csv', index=False)

### Blend

In [None]:
# Blending the predictions of the CatBoost algorithm and the convolutional neural network
blend_predict = (test_predict_catboost + test_predict_nn3) / 2
print(f"TEST mape: {(mape(y_test, blend_predict))*100:0.2f}%")

#### The blend of the CatBoostRegressor and the convolutional neural network, including text and image preprocessing and logarithmization of the target for the CatBoost algorithm, gave a TEST MAPE of 11.32%


### Blend Submission

In [None]:

blend_sub_predict = (sub_predict_catboost + sub_predict_nn3[:,0]) / 2
sample_submission['price'] = blend_sub_predict
sample_submission.to_csv('blend_submission_6.csv', index=False)

## Conclusion

#### The best result was achieved by blending the CatBoostRegressor and the EfficientNetB3-based convolutional neural network, together with text and image preprocessing and logarithmization of the target variable for the CatBoost algorithm. <br> The best TEST MAPE achieved was 11.32%, which gave 11.54% on the submission data and the 59th position on the Leaderboard. <br> Also, there was an attempt to use external data: tabular data that was parsed from the auto.ru website for one of the previous projects. Unfortunately, it didn't improve the result