## Imports

In [11]:
import keras
import pandas as pd
import os
import numpy as np
from PIL import Image
from keras.optimizers import Adam
from keras.models import Sequential, Model
from keras.layers import BatchNormalization, Reshape, Conv2D, MaxPooling2D, Activation, Dropout, Dense, Flatten, Input, concatenate


import warnings
warnings.filterwarnings("ignore")

## Set paths and read data

In [12]:
car_damages_cleaned_path = 'car_damages_cleaned.csv'
data_dir = './data_damages/'

damages_data = pd.read_csv(car_damages_cleaned_path)

In [13]:
pd.set_option('display.max_colwidth', 0)
damages_data.head()

Unnamed: 0,listing_id,year,make,model,damage,est_value,full_label,listing_url,image1_url,image2_url,image3_url,image4_url
0,23885009,2010,ACURA,MDX,FRONT END,12726,2010 ACURA MDX,https://www.copart.com/lot/23885009,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX133/a36fc639-f0d6-43a1-91b5-afac3efeaa53.JPG,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX133/a3405d62-f6eb-4a59-9e1a-af41e4ffb1d2.JPG,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX133/f6fd4cc2-453e-41eb-a532-1c5c07985469.JPG,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX133/a15abbac-bb28-4383-afb0-59cfdd5de5f9.JPG
1,24487306,2002,ACURA,3.2 CL,FRONT END,1984,2002 ACURA 3.2CL,https://www.copart.com/lot/24487306,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX4/fc065d91-59b4-4944-9730-bafc854552f4.JPG,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX4/9e4d3a1f-8fb8-4c8f-99b0-396ee6a99460.JPG,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX4/a6d2e762-89fe-4c28-8f7c-cf567c3fe49f.JPG,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX4/48d446e3-92c4-448e-a1b9-84cfe245a122.JPG
2,25667019,2012,ACURA,TL,FRONT END,13095,2012 ACURA TL,https://www.copart.com/lot/25667019,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX133/554ed268-ad2f-4f82-a45d-4ab0a4d3bce9.JPG,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX133/550e5154-4e24-4a02-b6c2-593579de946f.JPG,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX133/5efb5e18-cd6e-48ee-b53c-d3b410027533.JPG,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX133/8c49217c-232d-4579-bb97-3e5177b0d901.JPG
3,25717579,2005,ACURA,TL,VANDALISM,4275,2005 ACURA TL,https://www.copart.com/lot/25717579,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX134/511a8e49-0a71-4ddc-8195-1d5f402405ad.JPG,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX134/8c1b905e-da04-4957-947d-5e322fe5d2b9.JPG,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX134/295cbc24-6f9a-4dbc-a3e3-972e724126d2.JPG,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX134/2c25ec02-ef8f-4169-a52b-a6bf3858c65f.JPG
4,25887480,2008,ACURA,MDX TECHNO,FRONT END,7742,2008 ACURA MDX TECHNOLOGY,https://www.copart.com/lot/25887480,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX235/2a758cfa-0d39-4d10-b49f-20c0f6468461.JPG,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX235/388d20b0-c9f5-4c22-8363-9cf9df3748b9.JPG,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX235/e64d60ed-4b1b-4b2b-ba80-d9422b83055d.JPG,https://cs.copart.com/v1/AUTH_svc.pdoc00001/PIX235/50abfeba-7cb3-48e3-bcb5-b430e14ec737.JPG


## One hot encoding 

In [14]:
def encoder(df, column):
    encoder_dict = {}
    idx = 0
    for i in range(len(df)) : 
        value = df.loc[i, column]
        if (value not in encoder_dict):
            encoder_dict[value] = idx
            df.loc[i, column] = idx
            idx += 1
        else:
            encoding = encoder_dict[value]
            df.loc[i, column] = encoding
    return df[column], encoder_dict
 
modified_data = pd.DataFrame()
modified_data['listing_id'] = damages_data['listing_id']
modified_data['damage'], damage_encoder = encoder(damages_data, 'damage')
modified_data['make'], make_encoder = encoder(damages_data, 'make')
modified_data['model'], model_encoder = encoder(damages_data, 'model')
modified_data['full_label'], full_label_encoder = encoder(damages_data, 'full_label')
modified_data['est_value'] = damages_data['est_value']
modified_data['year'] = damages_data['year']

In [15]:
modified_data.head()

Unnamed: 0,listing_id,damage,make,model,full_label,est_value,year
0,23885009,0,0,0,0,12726,2010
1,24487306,0,0,1,1,1984,2002
2,25667019,0,0,2,2,13095,2012
3,25717579,1,0,2,3,4275,2005
4,25887480,0,0,3,4,7742,2008


In [55]:
print('Number of unique makes in dataset: ', len(modified_data.make.unique()))
print('Number of unique models in dataset: ', len(modified_data.model.unique()))
print('Number of unique years in dataset: ', len(modified_data.year.unique()))
print('Number of unique damage types in dataset: ', len(modified_data.damage.unique()))
print('Number of unique labels (make, model, and year) in dataset: ', len(modified_data.full_label.unique()))
print('Total number of datapoints: ', len(modified_data))

Number of unique makes in dataset:  25
Number of unique models in dataset:  498
Number of unique years in dataset:  27
Number of unique damage types in dataset:  18
Number of unique labels (make, model, and year) in dataset:  947
Total number of datapoints:  1180


## Convert images to Numpy arrays

In [17]:
def convert_images(df, column, idx):
    for i in range(len(df)) : 
        listing_id = df.loc[i, 'listing_id']
        image = Image.open(data_dir + '{}-{}.jpg'.format(listing_id, idx)).resize((250,250))
        
        image_array = np.array(image)
        df.at[i, column] = image_array
    return df[column]

modified_data['image1'] = None
modified_data['image2'] = None
modified_data['image3'] = None
modified_data['image1'] = convert_images(modified_data, 'image1', 1)
modified_data['image2'] = convert_images(modified_data, 'image2', 2)
modified_data['image3'] = convert_images(modified_data, 'image3', 4)


In [18]:
modified_data_path = 'modified_damages.csv'
modified_data.to_csv(modified_data_path)

In [20]:
def image_consolidator(df):
    images = np.zeros((250, 750, 3))
    for i in range(len(df)):
        image1 = df.loc[i, 'image1']
        image2 = df.loc[i, 'image2']
        image3 = df.loc[i, 'image3']
        images[0:250, 0:250] = image1
        images[0:250, 250:500] = image2
        images[0:250, 500:750] = image3
        df.at[i, 'images'] = images
    return df['images']
    
modified_data['images'] = None
modified_data['images'] = image_consolidator(modified_data)

In [21]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(modified_data[['listing_id', 'damage', 'make', 'model', 'year', 'full_label', 'images']], modified_data[['est_value']], test_size=0.20, random_state=2020)
x_train, x_valid, y_train, y_valid = train_test_split(modified_data[['listing_id', 'damage', 'make', 'model', 'year', 'full_label', 'images']], modified_data[['est_value']], test_size=0.25, random_state=2020)

In [22]:
x_train_images = x_train[['images']]/255
x_valid_images = x_valid[['images']]/255
x_test_images = x_test[['images']]/255

x_train_attributes = x_train[['listing_id', 'damage', 'make', 'model', 'year', 'full_label']]
x_valid_attributes = x_valid[['listing_id', 'damage', 'make', 'model', 'year', 'full_label']]
x_test_attributes = x_test[['listing_id', 'damage', 'make', 'model', 'year', 'full_label']]

In [23]:
def reshape_dimensions(df):
    all_images = []
    for i in range(len(df)):
        idx = df.index[i]
        all_images.append(df.loc[idx, 'images'])
    return np.array(all_images)

x_train_images = reshape_dimensions(x_train_images)
x_valid_images = reshape_dimensions(x_valid_images)
x_test_images = reshape_dimensions(x_test_images)  

In [None]:
def categorical_data(regress=False):
    model = Sequential()
    model.add(Dense(8, input_dim=6, activation='relu'))
    model.add(Dense(4, activation='relu'))
    
    if regress:
        model.add(Dense(1, activation="linear"))
   
    return model

def image_data(regress=False):
    
    inputs = Input((250, 750, 3))
    x = inputs
    
    x = Conv2D(16, (3,3), padding='same')(x)
    x = Activation('relu')(x)
    x = BatchNormalization(axis=-1)(x)
    x = MaxPooling2D(pool_size=(10,10))(x)
    x = Dropout(0.15)(x)
    
    x = Conv2D(64, (50,50), padding='same')(x)
    x = Activation('relu')(x)
    x = MaxPooling2D(pool_size=(20,20))(x)
    x = Dropout(0.15)(x)
    
    x = Flatten()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = BatchNormalization(axis=-1)(x)
    x = Dropout(0.25)(x)
    
    x = Dense(6)(x)
    x = Activation('relu')(x)
    
    if regress:
        x = Dense(1, activation="linear")(x)
 
    model = Model(inputs, x)

    return model

In [None]:
mlp = categorical_data()
cnn = image_data()

final_input = concatenate([mlp.output, cnn.output])

x = Dense(6, activation="relu")(final_input)
x = Dense(1, activation="linear")(x)

model = Model([mlp.input, cnn.input], outputs=x)

opt = Adam(lr=0.001)
model.compile(loss="mean_absolute_percentage_error", optimizer=opt)

In [None]:
print("Training model...")
model.fit([x_train_attributes, x_train_images], y_train, epochs = 2)

In [None]:
model.summary()

In [18]:
print("Predicting car prices...")
preds = model.predict([x_valid_attributes, x_valid_images])

Predicting car prices...


In [19]:
y_valid = y_valid.reset_index()

In [24]:
model.save('damages_model.h5')

In [20]:
preds = pd.DataFrame(preds)
result = pd.concat([preds, y_valid['est_value']], axis=1, ignore_index = True)
result.columns = ['Predicted', 'Actual']

In [21]:
print("An utter disappointment...")
result

An utter disappointment...


Unnamed: 0,Predicted,Actual
0,206.661774,13195
1,206.661774,31168
2,206.661774,24038
3,206.661774,10751
4,206.661774,41205
...,...,...
290,206.661774,15317
291,206.661774,22403
292,206.661774,12039
293,206.661774,19211


In [24]:
import keras
from keras.models import load_model


model = load_model('damages_model.h5')
preds = model.predict([x_valid_attributes, x_valid_images])


In [29]:
preds = pd.DataFrame(preds)
y_valid = y_valid.reset_index()
result = pd.concat([preds, y_valid['est_value']], axis=1, ignore_index = True)
result.columns = ['Predicted', 'Actual']

In [30]:
result

Unnamed: 0,Predicted,Actual
0,206.661774,13195
1,206.661774,31168
2,206.661774,24038
3,206.661774,10751
4,206.661774,41205
...,...,...
290,206.661774,15317
291,206.661774,22403
292,206.661774,12039
293,206.661774,19211


In [34]:
from sklearn.metrics import mean_squared_error

np.sqrt(mean_squared_error(y_valid['est_value'],preds))

110322.80624941268

## Random forest classifier

In [46]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=150)
print("Training model...")
clf.fit(x_train_attributes, y_train)

Training model...


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [47]:
print("Predicting car prices (without images)...")
preds2 = clf.predict(x_valid_attributes)

Predicting car prices (without images)...


In [53]:
preds2 = pd.DataFrame(preds2)
result2 = pd.concat([preds2, y_valid['est_value']], axis=1, ignore_index = True)
result2.columns = ['Predicted Retail Value', 'Listed Retail Value']

In [54]:
print("Another utter disappointment...")
result2

Another utter disappointment...


Unnamed: 0,Predicted Retail Value,Listed Retail Value
0,13892,13195
1,14847,31168
2,12601,24038
3,18047,10751
4,25508,41205
...,...,...
290,12601,15317
291,22830,22403
292,4275,12039
293,7717,19211
