In [3]:
import glob
import os
import re
import socket
import urllib
import urllib.request
from urllib.error import HTTPError, URLError

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [4]:
# Read every scraped workbook and tag its rows with the body style named in the
# workbook's path, then stack everything into one frame.
# NOTE: glob does not promise any particular file order, and later cells name
# the downloaded images by row position, so the order seen here decides which
# image belongs to which row.
path = r'../data/carmax'
all_files = glob.glob(path + "/*.xlsx")
li = []
for filename in all_files:
    df = pd.read_excel(filename, index_col=None, header=0)
    # The first keyword found in the path wins, checked in this fixed order;
    # a workbook matching none of them gets no 'type' value.
    matched_type = next(
        (keyword for keyword in ('coupe', 'crossover', 'sedan', 'suv', 'truck')
         if keyword in filename),
        None,
    )
    if matched_type is not None:
        df['type'] = matched_type
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True)

In [5]:
# Check the row count, column dtypes and non-null counts of the combined raw table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55525 entries, 0 to 55524
Data columns (total 5 columns):
model      55525 non-null object
price      55525 non-null object
mileage    55525 non-null object
src        55525 non-null object
type       55525 non-null object
dtypes: object(5)
memory usage: 2.1+ MB


In [6]:
# Preview the first rows of the combined raw table.
df.head()

Unnamed: 0,model,price,mileage,src,type
0,x,x,x,x,sedan
1,2016 Acura\nILX,"$19,998*",20K Miles,https://img2.carmax.com/img/vehicles/17325462/...,sedan
2,2015 Acura\nTLX,"$21,998*",20K Miles,https://img2.carmax.com/img/vehicles/17373223/...,sedan
3,2015 Acura\nTLX,"$19,998*",41K Miles,https://img2.carmax.com/img/vehicles/17221103/...,sedan
4,2016 Acura\nILX,"$17,998*",29K Miles,https://img2.carmax.com/img/vehicles/17320660/...,sedan


In [7]:
# Keep only rows whose image URL is not the literal placeholder 'x', then
# discard listings that are exact duplicates of an earlier row.
df = df.loc[df.src != 'x'].drop_duplicates()

In [8]:
# Confirm how many rows remain after the placeholder and duplicate rows were removed.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50580 entries, 1 to 55524
Data columns (total 5 columns):
model      50580 non-null object
price      50580 non-null object
mileage    50580 non-null object
src        50580 non-null object
type       50580 non-null object
dtypes: object(5)
memory usage: 2.3+ MB


In [9]:
# Prices are scraped as text such as "$19,998*"; strip everything that is not a
# digit and store the remainder as an integer.
non_digit = re.compile("[^0-9]")
df["price"] = df["price"].map(lambda raw_price: int(non_digit.sub("", raw_price)))

In [10]:
def miles_to_int(x):
    """Convert a scraped mileage label such as ``'20K Miles'`` to whole miles.

    Parameters
    ----------
    x : str
        Mileage text. The placeholder ``'x'`` is treated as zero miles.

    Returns
    -------
    int
        The mileage, with a ``K`` suffix expanded to thousands.

    Raises
    ------
    ValueError
        If ``x`` contains no number.
    """
    if x == 'x':
        return 0
    # Read the number and the optional 'K' suffix separately. Substituting
    # '000' for 'K' and then dropping non-digits turned '1.5K' into 15000.
    found = re.search(r"(\d[\d,]*(?:\.\d+)?)\s*(K?)", x)
    if found is None:
        raise ValueError("no mileage found in %r" % (x,))
    number, thousands_suffix = found.groups()
    miles = float(number.replace(",", ""))
    if thousands_suffix:
        miles *= 1000
    # round() guards against float noise such as 4350.000000000001.
    return int(round(miles))

In [11]:
# Turn the mileage labels into integer miles, one value at a time.
df["mileage"] = df["mileage"].map(miles_to_int)

In [12]:
# The listing title begins with the model year; take the first
# whitespace-separated token and store it as an integer.
df["year"] = df["model"].map(lambda title: int(title.split(maxsplit=1)[0]))

In [13]:
def get_make(x):
    """Return the manufacturer named on the first line of a listing title.

    The first line is expected to read "<year> <make>"; everything after the
    first space on that line is returned, so multi-word makes stay whole.
    Raises IndexError when the first line contains no space.
    """
    first_line = x.partition('\n')[0]
    return first_line.split(' ', 1)[1]

def get_trim(x):
    """Return the second line of a listing title.

    Raises IndexError when the title has no line break.
    """
    title_lines = x.split('\n', 2)
    return title_lines[1]

In [14]:
# Derive the manufacturer column from each listing title.
df["make"] = df["model"].map(get_make)

In [15]:
# Derive the trim column from the second line of each listing title.
df["trim"] = df["model"].map(get_trim)

In [16]:
# Renumber the rows 0..n-1 and discard the old labels, which have gaps after the
# earlier row removals.
df = df.reset_index(drop=True)

In [17]:
# Preview the cleaned table, including the derived year, make and trim columns.
df.head()

Unnamed: 0,model,price,mileage,src,type,year,make,trim
0,2016 Acura\nILX,19998,20000,https://img2.carmax.com/img/vehicles/17325462/...,sedan,2016,Acura,ILX
1,2015 Acura\nTLX,21998,20000,https://img2.carmax.com/img/vehicles/17373223/...,sedan,2015,Acura,TLX
2,2015 Acura\nTLX,19998,41000,https://img2.carmax.com/img/vehicles/17221103/...,sedan,2015,Acura,TLX
3,2016 Acura\nILX,17998,29000,https://img2.carmax.com/img/vehicles/17320660/...,sedan,2016,Acura,ILX
4,2016 Acura\nTLX,20998,44000,https://img2.carmax.com/img/vehicles/15666950/...,sedan,2016,Acura,TLX


In [18]:
# Spot-check the image URL stored for the row at position 641.
df["src"].iloc[641]

'https://img2.carmax.com/img/vehicles/17376968/1/750.jpg'

In [117]:
from urllib.error import HTTPError, URLError
import socket

In [118]:
# All image URLs, in row order.
urls = df.loc[:, 'src']

In [120]:
# Download each listing photo to ../images/carmax/<row position>.jpg.
# To resume an interrupted run, set start_i to the first row position that still
# needs fetching; the loop then covers start_i .. len(urls) - 1 and never indexes
# past the end of `urls`.
start_i = 0
max_attempts = 10
failed_downloads = []  # row positions that never downloaded; their files will be missing

for i in range(start_i, len(urls)):
    target = "../images/carmax/" + str(i) + ".jpg"
    for attempt in range(max_attempts):
        try:
            urllib.request.urlretrieve(urls.iloc[i], target)
            break
        except (HTTPError, URLError, socket.timeout) as error:
            # Report the absolute row position so the message matches the file name.
            print("Error occurred at " + str(i) + ": " + str(error))
    else:
        # Every attempt failed: record the row instead of skipping it silently.
        failed_downloads.append(i)

if failed_downloads:
    print("Gave up on " + str(len(failed_downloads)) + " images: " + str(failed_downloads))

Error occurred at 887
Error occurred at 1107
Error occurred at 1275
Error occurred at 1448
Error occurred at 2500
Error occurred at 2703
Error occurred at 4236
Error occurred at 4316
Error occurred at 4658
Error occurred at 5129
Error occurred at 6768
Error occurred at 6790
Error occurred at 8233
Error occurred at 8435
Error occurred at 9573
Error occurred at 10205
Error occurred at 10322
Error occurred at 10689
Error occurred at 11088
Error occurred at 11289
Error occurred at 12917
Error occurred at 13203
Error occurred at 13557
Error occurred at 14433
Error occurred at 14616
Error occurred at 15371
Error occurred at 19398
Error occurred at 20183
Error occurred at 20315
Error occurred at 20575
Error occurred at 21269
Error occurred at 22417
Error occurred at 22804
Error occurred at 24075
Error occurred at 24271
Error occurred at 24529
Error occurred at 24751
Error occurred at 25056
Error occurred at 25762
Error occurred at 25957
Error occurred at 27594
Error occurred at 28011
Error oc

In [19]:
_SAMPLE_PLACEHOLDER_PATHS = ('../images/carmax/894.jpg', '../images/carmax/7751.jpg')
_placeholder_cache = {}


def _load_placeholder(path):
    """Read a reference placeholder image once and reuse it on later calls."""
    if path not in _placeholder_cache:
        _placeholder_cache[path] = cv2.imread(path)
    return _placeholder_cache[path]


def is_original(image):
    """Label an image file as a stock placeholder or a real photo.

    Parameters
    ----------
    image : str
        Path of the image to classify.

    Returns
    -------
    str
        'sample' when the image is pixel-for-pixel identical to one of the two
        reference placeholder images, otherwise 'original'. An image that
        cannot be read is also reported as 'original'.
    """
    candidate = cv2.imread(image)
    if candidate is None:
        return 'original'
    for reference_path in _SAMPLE_PLACEHOLDER_PATHS:
        reference = _load_placeholder(reference_path)
        # Compare for exact equality. cv2.subtract saturates at 0, so a
        # "difference" of all zeros only shows the candidate is at least as
        # bright as the reference everywhere, not that the two are equal.
        # array_equal also returns False on a shape mismatch, so no exception
        # handling is needed.
        if reference is not None and np.array_equal(reference, candidate):
            return 'sample'
    return 'original'

In [20]:
# Classify the downloaded image of every row as 'sample' or 'original'.
path = r'../images/carmax'
all_files = glob.glob(path + "/*.jpg")
# Images are named by row position, so walk the rows rather than counting the
# files on disk. That yields exactly one label per row, so the list can always
# be assigned as a column even when some downloads are missing.
if len(all_files) != len(df):
    print("Warning: " + str(len(all_files)) + " images on disk for " + str(len(df)) + " rows")
originals = [is_original(path + '/' + str(i) + '.jpg') for i in range(len(df))]

In [21]:
# Attach the per-row labels as a new column; `originals` must hold one entry per row.
df['image-type'] = originals

In [22]:
# Count the listings whose image was labelled 'sample'.
df.loc[df['image-type'] == 'sample'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3070 entries, 7 to 50555
Data columns (total 9 columns):
model         3070 non-null object
price         3070 non-null int64
mileage       3070 non-null int64
src           3070 non-null object
type          3070 non-null object
year          3070 non-null int64
make          3070 non-null object
trim          3070 non-null object
image-type    3070 non-null object
dtypes: int64(3), object(6)
memory usage: 239.8+ KB


In [161]:
from PIL import Image, ImageOps

In [171]:
# Shrink one image so that its longer side equals `desired_size`, then centre it
# on a square canvas of that size (Image.new fills the canvas with black by default).
desired_size = 128
im_pth = "../images/carmax/0.jpg"
# The result goes to a separate folder so the *.jpg files in ../images/carmax
# are left exactly as downloaded.
resized_pth = "../images/carmax_resized/0.jpg"

with Image.open(im_pth) as source_im:
    old_size = source_im.size
    ratio = float(desired_size) / max(old_size)
    new_size = tuple([int(x * ratio) for x in old_size])
    # LANCZOS is the same filter formerly exposed as ANTIALIAS, which recent
    # Pillow releases no longer provide.
    im = source_im.resize(new_size, Image.LANCZOS)

new_im = Image.new("RGB", (desired_size, desired_size))
new_im.paste(im, ((desired_size - new_size[0]) // 2,
                  (desired_size - new_size[1]) // 2))
# save() needs a destination; calling it without one raises TypeError.
os.makedirs(os.path.dirname(resized_pth), exist_ok=True)
new_im.save(resized_pth)

The following cells show how objects could be detected in and extracted from an image. Because running this on roughly 50,000 images would take too long, we skip this step and leave the code commented out.

In [168]:
# from imageai.Detection import ObjectDetection
# import os

Using TensorFlow backend.


In [172]:
# execution_path = os.getcwd()

# detector = ObjectDetection()
# detector.setModelTypeAsRetinaNet()
# detector.setModelPath( os.path.join(execution_path , "resnet50_coco_best_v2.0.1.h5"))
# detector.loadModel()
# detections = detector.detectObjectsFromImage(input_image=os.path.join(execution_path , "../images/carmax/4.jpg"), output_image_path=os.path.join(execution_path , "imagenew.jpg"))

# for eachObject in detections:
#     print(eachObject["name"] , " : " , eachObject["percentage_probability"] )

In [23]:
# Overwrite the raw scraped title in every row with a constant; year, make and
# trim were already split out of it in earlier cells.
# NOTE(review): presumably a real model label is meant to be filled in later — confirm.
df['model'] = 'undefined'

In [24]:
# Preview the table after the model column was overwritten and image-type was added.
df.head()

Unnamed: 0,model,price,mileage,src,type,year,make,trim,image-type
0,undefined,19998,20000,https://img2.carmax.com/img/vehicles/17325462/...,sedan,2016,Acura,ILX,original
1,undefined,21998,20000,https://img2.carmax.com/img/vehicles/17373223/...,sedan,2015,Acura,TLX,original
2,undefined,19998,41000,https://img2.carmax.com/img/vehicles/17221103/...,sedan,2015,Acura,TLX,original
3,undefined,17998,29000,https://img2.carmax.com/img/vehicles/17320660/...,sedan,2016,Acura,ILX,original
4,undefined,20998,44000,https://img2.carmax.com/img/vehicles/15666950/...,sedan,2016,Acura,TLX,original


In [28]:
# Final check of the row count and column dtypes before exporting.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50580 entries, 0 to 50579
Data columns (total 9 columns):
model         50580 non-null object
price         50580 non-null int64
mileage       50580 non-null int64
src           50580 non-null object
type          50580 non-null object
year          50580 non-null int64
make          50580 non-null object
trim          50580 non-null object
image-type    50580 non-null object
dtypes: int64(3), object(6)
memory usage: 3.5+ MB


In [29]:
# Write the cleaned table to disk. Since index=False is not passed, the row
# index is written as an extra, unnamed first column.
df.to_csv('../data/carmax_final_data.csv')