## Imports

In [2]:
import keras
import pandas as pd
import os
import numpy as np
from PIL import Image
from keras.optimizers import Adam
from keras.models import Sequential, Model
from keras.layers import BatchNormalization, Reshape, Conv2D, MaxPooling2D, Activation, Dropout, Dense, Flatten, Input, concatenate


import warnings
warnings.filterwarnings("ignore")

## Set paths and read data

In [3]:
car_damages_cleaned_path = 'car_damages_cleaned.csv'
data_dir = './data_damages/'

damages_data = pd.read_csv(car_damages_cleaned_path)

## One hot encoding 

In [4]:
def encoder(df, column):
    encoder_dict = {}
    idx = 0
    for i in range(len(df)) : 
        value = df.loc[i, column]
        if (value not in encoder_dict):
            encoder_dict[value] = idx
            df.loc[i, column] = idx
            idx += 1
        else:
            encoding = encoder_dict[value]
            df.loc[i, column] = encoding
    return df[column], encoder_dict
 
modified_data = pd.DataFrame()
modified_data['listing_id'] = damages_data['listing_id']
modified_data['damage'], damage_encoder = encoder(damages_data, 'damage')
modified_data['make'], make_encoder = encoder(damages_data, 'make')
modified_data['model'], model_encoder = encoder(damages_data, 'model')
modified_data['full_label'], full_label_encoder = encoder(damages_data, 'full_label')
modified_data['est_value'] = damages_data['est_value']
modified_data['year'] = damages_data['year']

In [5]:
modified_data.head()

Unnamed: 0,listing_id,damage,make,model,full_label,est_value,year
0,23885009,0,0,0,0,12726,2010
1,24487306,0,0,1,1,1984,2002
2,25667019,0,0,2,2,13095,2012
3,25717579,1,0,2,3,4275,2005
4,25887480,0,0,3,4,7742,2008


In [6]:
print('Number of unique makes in dataset: ', len(modified_data.make.unique()))
print('Number of unique models in dataset: ', len(modified_data.model.unique()))
print('Number of unique damage types in dataset: ', len(modified_data.damage.unique()))
print('Number of unique labels (make, model, and year) in dataset: ', len(modified_data.full_label.unique()))
print('Total number of datapoints: ', len(modified_data))

Number of unique makes in dataset:  25
Number of unique models in dataset:  498
Number of unique damage types in dataset:  18
Number of unique labels (make, model, and year) in dataset:  947
Total number of datapoints:  1180


## Convert images to Numpy arrays

In [7]:
def convert_images(df, column, idx):
    for i in range(len(df)) : 
        listing_id = df.loc[i, 'listing_id']
        image = Image.open(data_dir + '{}-{}.jpg'.format(listing_id, idx)).resize((250,250))
        
        image_array = np.array(image)
        df.at[i, column] = image_array
    return df[column]

modified_data['image1'] = None
modified_data['image2'] = None
modified_data['image3'] = None
modified_data['image1'] = convert_images(modified_data, 'image1', 1)
modified_data['image2'] = convert_images(modified_data, 'image2', 2)
modified_data['image3'] = convert_images(modified_data, 'image3', 4)


In [8]:
modified_data_path = 'modified_damages.csv'
modified_data.to_csv(modified_data_path)

In [9]:
modified_data.head()

Unnamed: 0,listing_id,damage,make,model,full_label,est_value,year,image1,image2,image3
0,23885009,0,0,0,0,12726,2010,"[[[253, 255, 254], [253, 255, 254], [253, 255,...","[[[254, 254, 254], [254, 254, 254], [254, 254,...","[[[244, 246, 236], [244, 246, 237], [244, 245,..."
1,24487306,0,0,1,1,1984,2002,"[[[134, 135, 127], [115, 116, 108], [126, 127,...","[[[74, 94, 118], [72, 93, 117], [73, 94, 119],...","[[[134, 146, 149], [135, 147, 150], [137, 150,..."
2,25667019,0,0,2,2,13095,2012,"[[[72, 120, 169], [72, 120, 169], [72, 120, 16...","[[[78, 92, 94], [95, 109, 112], [91, 105, 108]...","[[[87, 132, 174], [87, 132, 174], [87, 132, 17..."
3,25717579,1,0,2,3,4275,2005,"[[[58, 65, 75], [58, 65, 75], [53, 60, 70], [5...","[[[207, 224, 241], [223, 240, 252], [220, 237,...","[[[224, 225, 246], [224, 225, 246], [225, 226,..."
4,25887480,0,0,3,4,7742,2008,"[[[108, 107, 102], [107, 106, 101], [109, 108,...","[[[183, 182, 187], [213, 212, 217], [197, 196,...","[[[220, 223, 228], [220, 223, 228], [220, 223,..."


In [10]:
def image_consolidator(df):
    images = np.zeros((250, 750, 3))
    for i in range(len(df)):
        image1 = df.loc[i, 'image1']
        image2 = df.loc[i, 'image2']
        image3 = df.loc[i, 'image3']
        images[0:250, 0:250] = image1
        images[0:250, 250:500] = image2
        images[0:250, 500:750] = image3
        df.at[i, 'images'] = images
    return df['images']
    
modified_data['images'] = None
modified_data['images'] = image_consolidator(modified_data)

In [11]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(modified_data[['listing_id', 'damage', 'make', 'model', 'year', 'full_label', 'images']], modified_data[['est_value']], test_size=0.20, random_state=2020)
x_train, x_valid, y_train, y_valid = train_test_split(modified_data[['listing_id', 'damage', 'make', 'model', 'year', 'full_label', 'images']], modified_data[['est_value']], test_size=0.25, random_state=2020)

In [12]:
x_train_images = x_train[['images']]/255
x_valid_images = x_valid[['images']]/255
x_test_images = x_test[['images']]/255

x_train_attributes = x_train[['listing_id', 'damage', 'make', 'model', 'year', 'full_label']]
x_valid_attributes = x_valid[['listing_id', 'damage', 'make', 'model', 'year', 'full_label']]
x_test_attributes = x_test[['listing_id', 'damage', 'make', 'model', 'year', 'full_label']]

In [14]:
def reshape_dimensions(df):
    all_images = []
    for i in range(len(df)):
        idx = df.index[i]
        all_images.append(df.loc[idx, 'images'])
    return np.array(all_images)

x_train_images = reshape_dimensions(x_train_images)
x_valid_images = reshape_dimensions(x_valid_images)
x_test_images = reshape_dimensions(x_test_images)  

In [15]:
def categorical_data(regress=False):
    model = Sequential()
    model.add(Dense(8, input_dim=6, activation='relu'))
    model.add(Dense(4, activation='relu'))
    
    if regress:
        model.add(Dense(1, activation="linear"))
   
    return model

def image_data(regress=False):
    
    inputs = Input((250, 750, 3))
    x = inputs
    
    x = Conv2D(16, (3,3), padding='same')(x)
    x = Activation('relu')(x)
    x = BatchNormalization(axis=-1)(x)
    x = MaxPooling2D(pool_size=(10,10))(x)
    x = Dropout(0.15)(x)
    
    x = Conv2D(64, (50,50), padding='same')(x)
    x = Activation('relu')(x)
    x = MaxPooling2D(pool_size=(20,20))(x)
    x = Dropout(0.15)(x)
    
    x = Flatten()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = BatchNormalization(axis=-1)(x)
    x = Dropout(0.25)(x)
    
    x = Dense(6)(x)
    x = Activation('relu')(x)
    
    if regress:
        x = Dense(1, activation="linear")(x)
 
    model = Model(inputs, x)

    return model

In [16]:
mlp = categorical_data()
cnn = image_data()

final_input = concatenate([mlp.output, cnn.output])

x = Dense(6, activation="relu")(final_input)
x = Dense(1, activation="linear")(x)

model = Model([mlp.input, cnn.input], outputs=x)

opt = Adam(lr=0.01)
model.compile(loss="mean_absolute_percentage_error", optimizer=opt)

In [17]:
print("Training model...")
model.fit([x_train_attributes, x_train_images], y_train, epochs = 3)

Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x255d4e900b8>

In [18]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 250, 750, 3)  0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 250, 750, 16) 448         input_1[0][0]                    
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 250, 750, 16) 0           conv2d_1[0][0]                   
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 250, 750, 16) 64          activation_1[0][0]               
____________________________________________________________________________________________

In [19]:
print("Predicting car prices...")
preds = model.predict([x_valid_attributes, x_valid_images])

Predicting car prices...


In [20]:
y_valid = y_valid.reset_index()

In [21]:
preds = pd.DataFrame(preds)
result = pd.concat([preds, y_valid['est_value']], axis=1, ignore_index = True)
result.columns = ['Predicted', 'Actual']

In [22]:
print("An utter disappointment...")
result

An utter disappointment...


Unnamed: 0,Predicted,Actual
0,1403.830078,13195
1,1403.830078,31168
2,1403.830078,24038
3,1403.830078,10751
4,1403.830078,41205
...,...,...
290,1403.830078,15317
291,1403.830078,22403
292,1403.830078,12039
293,1403.830078,19211


## Random forest classifier

In [24]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=150)
print("Training model...")
clf.fit(x_train_attributes, y_train)

Training model...


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [25]:
print("Predicting car prices (without images)...")
preds2 = clf.predict(x_valid_attributes)

Predicting car prices (without images)...


In [26]:
preds2 = pd.DataFrame(preds2)
result2 = pd.concat([preds2, y_valid['est_value']], axis=1, ignore_index = True)
result2.columns = ['Predicted', 'Actual']

In [27]:
print("Another utter disappointment...")
result2

Another utter disappointment...


Unnamed: 0,Predicted,Actual
0,13892,13195
1,18243,31168
2,12601,24038
3,18047,10751
4,25508,41205
...,...,...
290,12601,15317
291,22830,22403
292,4275,12039
293,7717,19211


## Ignore (currently does not work)

In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(modified_data[['listing_id', 'damage', 'make', 'model', 'year', 'full_label', 'image1', 'image2', 'image3']], modified_data[['est_value']], test_size=0.20, random_state=2020)
x_train, x_valid, y_train, y_valid = train_test_split(modified_data[['listing_id', 'damage', 'make', 'model', 'year', 'full_label', 'image1', 'image2', 'image3']], modified_data[['est_value']], test_size=0.25, random_state=2020)

In [11]:
x_train_images = x_train[['image1', 'image2', 'image3']]
x_valid_images = x_valid[['image1', 'image2', 'image3']]
x_test_images = x_test[['image1', 'image2', 'image3']]

x_train_attributes = x_train[['listing_id', 'damage', 'make', 'model', 'year', 'full_label']]
x_valid_attributes = x_valid[['listing_id', 'damage', 'make', 'model', 'year', 'full_label']]
x_test_attributes = x_test[['listing_id', 'damage', 'make', 'model', 'year', 'full_label']]

In [30]:
def categorical_data():
    model = Sequential()
    model.add(Dense(16, input_dim=6, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='linear'))
    return model

def image_data():
    image1 = Input((250, 250, 3))
    image2 = Input((250, 250, 3))
    image3 = Input((250, 250, 3))

    conv1 = Conv2D(250, (3,3))(image1)
    flat1 = Flatten()(conv1)
    conv2 = Conv2D(250, (3,3))(image2)
    flat2 = Flatten()(conv2)
    conv3 = Conv2D(250, (3,3))(image3)
    flat3 = Flatten()(conv3)

    x = concatenate([image1, image2, image3])

    x = Conv2D(16, (3,3), padding='same')(x)
    x = Activation('relu')(x)
    x = BatchNormalization(axis=-1)(x)
    x = MaxPooling2D(pool_size=(2,2))(x)
    x = Flatten()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = BatchNormalization(axis=-1)(x)
    x = Dropout(0.5)(x)
    
    x = Dense(6)(x)
    x = Activation('relu')(x)
    
    model = Model([image1, image2, image3], x)
    
    return model

In [31]:
mlp = categorical_data(6)
cnn = image_data(250, 250, 3)

final_input = concatenate([mlp.output, cnn.output])

x = Dense(6, activation="relu")(final_input)
x = Dense(1, activation="linear")(x)

model = Model([mlp.input, cnn.input], outputs=x)

ValueError: Layer conv2d_39 was called with an input that isn't a symbolic tensor. Received type: <class 'NoneType'>. Full input: [None]. All inputs to the layer should be tensors.