In [1]:
import pandas as pd

# NOTE(review): both paths are absolute and specific to one machine; they must
# be edited before the notebook can run anywhere else.
# Image folder (judging by its name). No cell in this notebook reads DATA_PATH
# directly; presumably the preprocess helpers use their own copy (TODO confirm).
DATA_PATH = '/home/misho/Uni/Vision/YOLO/keras-YOLOv3-model-set/Myauto_data/Car_Images/'
# CSV with one row of listing attributes per car ID; loaded with pd.read_csv below.
CSV_PATH = '/home/misho/Uni/Vision/YOLO/keras-YOLOv3-model-set/Myauto_data/MyAuto_ge_Cars_Data.csv'

## Read dataset csv file and drop useless features

In [2]:
# Load the full listing table; every later frame in this notebook derives from `data`.
data = pd.read_csv(CSV_PATH)
# Report (rows, columns) so a changed source file is noticed immediately.
print('Data shape: {}'.format(data.shape))
data.head()

Data shape: (80577, 20)


Unnamed: 0,ID,Price ($),Levy ($),Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Interior color,Airbags,VIN
0,45568273,11447,501,HONDA,FIT,2014,Hatchback,No,Petrol,1.5,80000 km,4.0,Manual,Front,4/5,Left wheel,Grey,Black,4,
1,45789427,84675,-,MERCEDES-BENZ,C 200 7G-TRONIC,2014,Sedan,Yes,Petrol,2.0 Turbo,66500 km,4.0,Tiptronic,Rear,4/5,Left wheel,White,Black,12,
2,45747002,141124,-,LAND ROVER,Land Rover Sport,2015,Jeep,Yes,Diesel,3.0 Turbo,90000 km,6.0,Tiptronic,4x4,4/5,Left wheel,Black,Red,12,
3,45786808,219527,-,BENTLEY,Continental GT,2012,Coupe,Yes,Petrol,4.0 Turbo,55500 km,8.0,Tiptronic,4x4,2/3,Left wheel,Black,Black,0,
4,45816460,30000,-,BMW,328 F30,2012,Sedan,Yes,Petrol,2.0 Turbo,197000 km,4.0,Tiptronic,Rear,4/5,Left wheel,Black,Black,10,WBA3A5C50CF256782


In [3]:
# Keep the car ID plus the three attributes used as prediction targets below
# (Doors, Color, Category); all other listing columns are discarded.
data = data[['ID', 'Doors', 'Color', 'Category']]

In [4]:
data.head()

Unnamed: 0,ID,Doors,Color,Category
0,45568273,4/5,Grey,Hatchback
1,45789427,4/5,White,Sedan
2,45747002,4/5,Black,Jeep
3,45786808,2/3,Black,Coupe
4,45816460,4/5,Black,Sedan


Keep test data separately

In [5]:
total_rows = len(data)
print('Data size: {}'.format(total_rows))

# Everything from position 60000 onward is written out as the held-out test
# set; only the first 60000 rows continue through the notebook.
data.iloc[60000:].to_csv('test_data.csv', index=False)
data = data.iloc[:60000]

print('Train data size: {}'.format(len(data)))

Data size: 80577
Train data size: 60000


## Get car images for each car ID

For each ID, get the numbers of the images that contain a car, and drop the IDs that have no images at all.

In [6]:
from preprocess.detect_car import car_image
from preprocess import generate 

In [7]:
# Project helper (preprocess.generate). Per the preview below, the result has
# one row per ID with an underscore-joined `images` string; according to the
# markdown above these are the numbers of the images that contain a car.
# NOTE(review): likely expensive (touches the image folder) — confirm before re-running.
car_images = generate.get_car_images(data)

In [8]:
car_images.head()

Unnamed: 0,ID,images
0,45568273.0,15_4_1_2_3_5_11
1,45789427.0,4_1_6_2_3_5
2,45747002.0,8_4_1_2_3
3,45786808.0,8_4_1_2_7
4,45816460.0,8_4_1_6_2_3_7


### Merge car features and car images

In [9]:
# Project helper: attaches the `images` column to the feature rows. The
# recorded output shows the row count falling from 60000 to 58936, so IDs
# without an image entry are presumably dropped here (TODO confirm join type).
data = generate.merge_car_features_and_car_images(data, car_images)

In [10]:
print('Train data size: {}'.format(len(data)))
data.head()

Train data size: 58936


Unnamed: 0,ID,Doors,Color,Category,images
0,45568273,4/5,Grey,Hatchback,15_4_1_2_3_5_11
1,45789427,4/5,White,Sedan,4_1_6_2_3_5
2,45747002,4/5,Black,Jeep,8_4_1_2_3
3,45786808,2/3,Black,Coupe,8_4_1_2_7
4,45816460,4/5,Black,Sedan,8_4_1_6_2_3_7


## Generate image oriented data for model training

In [11]:
# Project helper: reshapes the per-car frame into one row per (ID, img_index)
# with an `is_car` flag, as seen in the preview below (533516 rows recorded).
# How `is_car` is derived from the `images` string is defined in
# preprocess.generate — not visible here.
data = generate.generate_image_oriented_dataset(data)

In [12]:
print('Train data size: {}'.format(len(data)))
data.head()

Train data size: 533516


Unnamed: 0,ID,Category,Doors,Color,img_index,is_car
0,45568273,Hatchback,4/5,Grey,1,1
1,45568273,Hatchback,4/5,Grey,2,1
2,45568273,Hatchback,4/5,Grey,3,1
3,45568273,Hatchback,4/5,Grey,4,1
4,45568273,Hatchback,4/5,Grey,5,1


Shuffle the dataset

In [13]:
# Shuffle with a fixed seed. The class caps applied further down
# (`[:10000]`, `[:20000]`, `[:5000]`) simply take the first rows of each
# class, so an unseeded shuffle would select a different training set on
# every run and the saved CSVs/folders could not be reproduced.
data = data.sample(frac=1, random_state=42)
data.head()

Unnamed: 0,ID,Category,Doors,Color,img_index,is_car
16115,45815626,Sedan,4/5,Black,10,0
149176,45805062,Jeep,4/5,Blue,6,1
57962,45790113,Sedan,4/5,Grey,1,1
502041,45632617,Sedan,4/5,Black,7,0
249624,45791612,Jeep,4/5,Grey,2,0


## Keep only the car images for our tasks

In [14]:
# Keep only rows flagged as car images, then remove the now-constant flag.
# The flag is compared against the string '1' (the recorded output keeps
# 213575 rows, so the column holds strings, not integers).
# The drop is chained instead of `inplace=True`: mutating the result of a
# boolean filter in place is the pattern that triggers SettingWithCopyWarning.
data = data[data.is_car == '1'].drop(columns=['is_car'])

In [15]:
print('Train data size: {}'.format(len(data)))
data.head()

Train data size: 213575


Unnamed: 0,ID,Category,Doors,Color,img_index
149176,45805062,Jeep,4/5,Blue,6
57962,45790113,Sedan,4/5,Grey,1
536,45612049,Jeep,4/5,Grey,1
440420,45759340,Hatchback,4/5,Grey,9
519156,45659003,Hatchback,4/5,White,3


# Generate data for category prediction
Remove useless columns

In [16]:
# The category task only needs ID, Category and img_index.
category_data = data.drop(columns=['Doors', 'Color'])
category_data.head()

Unnamed: 0,ID,Category,img_index
149176,45805062,Jeep,6
57962,45790113,Sedan,1
536,45612049,Jeep,1
440420,45759340,Hatchback,9
519156,45659003,Hatchback,3


In [17]:
# Per-category row count and (truncated) share of the whole frame,
# listed in order of first appearance.
n_rows = len(category_data)
for label in category_data.Category.unique():
    count = int((category_data.Category == label).sum())
    print('{}: {}, percentage: {}'.format(label, count, int(100*count/n_rows)))

Jeep: 77720, percentage: 36
Sedan: 93394, percentage: 43
Hatchback: 27827, percentage: 13
Goods wagon: 946, percentage: 0
Minivan: 4039, percentage: 1
Coupe: 6047, percentage: 2
Universal: 2067, percentage: 0
Microbus: 685, percentage: 0
Pickup: 342, percentage: 0
Cabriolet: 464, percentage: 0
Limousine: 44, percentage: 0


Merge Minivan/Microbus, Hatchback/Universal and Coupe/Cabriolet, and drop Goods wagon, Pickup and Limousine, as they have too few image samples and aren't related to any of the other categories.

In [18]:
def unite_categories(a):
    """Map a raw category label onto the merged label set.

    Minivan/Microbus -> 'Microbus', Hatchback/Universal -> 'Hatchback',
    Coupe/Cabriolet -> 'Coupe'. Limousine, Pickup and Goods wagon map to the
    integer 0, which the caller uses as a "drop this row" marker. Any other
    value is returned unchanged.
    """
    merged_groups = (
        (('Minivan', 'Microbus'), 'Microbus'),
        (('Hatchback', 'Universal'), 'Hatchback'),
        (('Coupe', 'Cabriolet'), 'Coupe'),
        (('Limousine', 'Pickup', 'Goods wagon'), 0),
    )
    for members, merged_label in merged_groups:
        if a in members:
            return merged_label
    return a

# Relabel every row, then discard the rows whose category was mapped to the
# 0 "drop" marker by unite_categories.
category_data['Category'] = category_data['Category'].map(unite_categories)
category_data = category_data[category_data['Category'] != 0]

In [19]:
# Distribution after merging/dropping categories, in order of first appearance.
n_rows = len(category_data)
for label in category_data.Category.unique():
    count = int((category_data.Category == label).sum())
    print('{}: {}, percentage: {}'.format(label, count, int(100*count/n_rows)))

Jeep: 77720, percentage: 36
Sedan: 93394, percentage: 44
Hatchback: 29894, percentage: 14
Microbus: 4724, percentage: 2
Coupe: 6511, percentage: 3


Select balanced dataset for training

In [20]:
# The two small classes are taken whole; each of the three dominant classes
# is capped at its first 10000 rows. Concatenation order matches the order
# in which the pieces are listed.
cat_train = pd.concat([
    category_data[category_data.Category == 'Microbus'],
    category_data[category_data.Category == 'Coupe'],
    category_data[category_data.Category == 'Hatchback'][:10000],
    category_data[category_data.Category == 'Sedan'][:10000],
    category_data[category_data.Category == 'Jeep'][:10000],
])

# Shuffle dataset
cat_train = cat_train.sample(frac=1)

In [21]:
# Class balance of the selected training subset.
n_rows = len(cat_train)
for label in cat_train.Category.unique():
    count = int((cat_train.Category == label).sum())
    print('{}: {}, percentage: {}%'.format(label, count, int(100*count/n_rows)))

Hatchback: 10000, percentage: 24%
Microbus: 4724, percentage: 11%
Jeep: 10000, percentage: 24%
Coupe: 6511, percentage: 15%
Sedan: 10000, percentage: 24%


Generate folder containing training images

In [22]:
# Project helper: builds the 'category_train_41K' image folder and returns the
# frame with a boolean `success` column (it is used as a row mask below).
# Presumably False marks rows whose image could not be produced — TODO confirm.
cat_train = generate.generate_training_folder(cat_train, 'category_train_41K')
# Keep successful rows and drop the flag in one chained step; an in-place drop
# on the filtered frame is the pattern that raises SettingWithCopyWarning.
cat_train = cat_train[cat_train.success].drop(columns=['success'])

In [23]:
print('lenght: {}'.format(len(cat_train)))
cat_train.head()

lenght: 41098


Unnamed: 0,ID,Category,img_index
78277,45809877,Hatchback,3
252729,45791267,Microbus,2
183834,45800188,Jeep,2
31056,45814466,Jeep,14
80189,45809670,Coupe,4


Save dataframe to .csv file

In [24]:
cat_train.to_csv('category_train_41K.csv', index=False)

# Generate data for door number prediction
Remove useless columns

In [25]:
# The door-count task only needs ID, Doors and img_index.
door_data = data.drop(columns=['Category', 'Color'])
door_data.head()

Unnamed: 0,ID,Doors,img_index
149176,45805062,4/5,6
57962,45790113,4/5,1
536,45612049,4/5,1
440420,45759340,4/5,9
519156,45659003,4/5,3


In [26]:
# Per-label row count and truncated share. A NaN label shows up with a count
# of 0 because equality against NaN never matches.
n_rows = len(door_data)
for label in door_data.Doors.unique():
    count = int((door_data.Doors == label).sum())
    print('{}: {}, percentage: {}'.format(label, count, int(100*count/n_rows)))

4/5: 202642, percentage: 94
2/3: 6725, percentage: 3
nan: 0, percentage: 0
>5: 745, percentage: 0


Drop '>5' and nan values

In [27]:
# Remove rows with any missing value, then the '>5' class.
door_data = door_data.dropna().loc[lambda frame: frame.Doors != '>5']

In [28]:
print('length: {}'.format(len(door_data)))
door_data.head()

length: 209367


Unnamed: 0,ID,Doors,img_index
149176,45805062,4/5,6
57962,45790113,4/5,1
536,45612049,4/5,1
440420,45759340,4/5,9
519156,45659003,4/5,3


Create balanced dataset for training

In [29]:
# All '2/3' rows plus the first 20000 '4/5' rows.
two_three_door = door_data[door_data.Doors == '2/3']
four_five_door = door_data[door_data.Doors == '4/5'][:20000]
door_train = pd.concat([two_three_door, four_five_door])
# Shuffle
door_train = door_train.sample(frac=1)

n_rows = len(door_train)
for label in door_train.Doors.unique():
    count = int((door_train.Doors == label).sum())
    print('{}: {}, percentage: {}%'.format(label, count, int(100*count/n_rows)))

2/3: 6725, percentage: 25%
4/5: 20000, percentage: 74%


Generate folder containing training images

In [30]:
# Project helper: builds the 'door_train_26K' image folder and returns the
# frame with a boolean `success` column (it is used as a row mask below).
# Presumably False marks rows whose image could not be produced — TODO confirm.
door_train = generate.generate_training_folder(door_train, 'door_train_26K')
# Keep successful rows and drop the flag in one chained step; an in-place drop
# on the filtered frame is the pattern that raises SettingWithCopyWarning.
door_train = door_train[door_train.success].drop(columns=['success'])

In [31]:
print('lenght: {}'.format(len(door_train)))
door_train.head()

lenght: 26642


Unnamed: 0,ID,Doors,img_index
290909,45785426,2/3,1
532008,45747012,4/5,3
522366,45663151,4/5,3
91472,45802956,4/5,9
147859,45805213,4/5,5


Save dataframe to csv

In [32]:
door_train.to_csv('door_train_26K.csv', index=False)

# Generate data for car color prediction
Remove useless columns

In [33]:
# The colour task only needs ID, Color and img_index; rows with any missing
# value are removed straight away.
color_data = data.drop(columns=['Category', 'Doors']).dropna()
print('lenght: {}'.format(len(color_data)))
color_data.head()

lenght: 209572


Unnamed: 0,ID,Color,img_index
149176,45805062,Blue,6
57962,45790113,Grey,1
536,45612049,Grey,1
440420,45759340,Grey,9
519156,45659003,White,3


In [34]:
# Per-colour row count and truncated share, in order of first appearance.
n_rows = len(color_data)
for label in color_data.Color.unique():
    count = int((color_data.Color == label).sum())
    print('{}: {}, percentage: {}'.format(label, count, int(100*count/n_rows)))

Blue: 16164, percentage: 7
Grey: 29172, percentage: 13
White: 49207, percentage: 23
Black: 56194, percentage: 26
Silver: 38742, percentage: 18
Green: 3421, percentage: 1
Yellow: 692, percentage: 0
Golden: 1032, percentage: 0
Orange: 1078, percentage: 0
Purple: 231, percentage: 0
Red: 7274, percentage: 3
Brown: 2502, percentage: 1
Sky blue: 972, percentage: 0
Carnelian red: 1401, percentage: 0
Pink: 157, percentage: 0
Beige: 1333, percentage: 0


Create balanced training data

In [35]:
def _rows_of(color, limit=None):
    """Return the rows of `color_data` with the given colour, capped at the first `limit` rows (all rows when None)."""
    return color_data[color_data.Color == color][:limit]

# Six colours are capped at 5000 rows; Green is taken whole.
black = _rows_of('Black', 5000)
blue = _rows_of('Blue', 5000)
white = _rows_of('White', 5000)
silver = _rows_of('Silver', 5000)
grey = _rows_of('Grey', 5000)
green = _rows_of('Green')
red = _rows_of('Red', 5000)

color_train = pd.concat([black, blue, white, silver, grey, green, red])
# Shuffle
color_train = color_train.sample(frac=1)

n_rows = len(color_train)
print('Length: {}'.format(n_rows))
for label in color_train.Color.unique():
    count = int((color_train.Color == label).sum())
    print('{}: {}, percentage: {}%'.format(label, count, int(100*count/n_rows)))

Length: 33421
Blue: 5000, percentage: 14%
White: 5000, percentage: 14%
Red: 5000, percentage: 14%
Silver: 5000, percentage: 14%
Grey: 5000, percentage: 14%
Black: 5000, percentage: 14%
Green: 3421, percentage: 10%


In [36]:
# Project helper: builds the 'color_train_33K' image folder and returns the
# frame with a boolean `success` column (it is used as a row mask below).
# Presumably False marks rows whose image could not be produced — TODO confirm.
color_train = generate.generate_training_folder(color_train, 'color_train_33K')
# Keep successful rows and drop the flag in one chained step; an in-place drop
# on the filtered frame is the pattern that raises SettingWithCopyWarning.
color_train = color_train[color_train.success].drop(columns=['success'])
print('lenght: {}'.format(len(color_train)))
color_train.head()

lenght: 33326


Unnamed: 0,ID,Color,img_index
119547,45807645,Blue,1
417106,45763067,Blue,10
269865,45788939,Blue,4
177000,45801298,White,6
107286,45733396,Blue,4


Save dataframe to csv file

In [37]:
color_train.to_csv('color_train_33K.csv', index=False)