# This notebook is EDA for the Honk4Honda data challenge (unsupervised learning)

# Goal is to (as Honda) find promising markets to expand to

## First import modules and data

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt

data = pd.read_csv('./data/craigslistVehiclesFull.csv')
data.head()

In [2]:
len(data)

1723065

# Explore data

In [3]:
# Check for any missing data
print("Table size -", end=' ')
print(data.shape)

print("Checking for missing values..")
# Number of missing values in each column of training data
missing_val_count_by_column = (data.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

# Get names of columns with missing values
missing_cols = [col for col in data.columns
                     if data[col].isnull().any()]

Table size - (1723065, 26)
Checking for missing values..
year               6315
manufacturer     136414
make              69699
condition        700790
cylinders        691291
fuel              10367
odometer         564054
title_status       2554
transmission       9022
vin             1118215
drive            661884
size            1123967
type             702931
paint_color      695650
image_url             1
county_fips       58833
county_name       58833
state_fips        58833
state_code        58833
weather           59428
dtype: int64


In [4]:
data.columns

Index(['url', 'city', 'price', 'year', 'manufacturer', 'make', 'condition',
       'cylinders', 'fuel', 'odometer', 'title_status', 'transmission', 'vin',
       'drive', 'size', 'type', 'paint_color', 'image_url', 'lat', 'long',
       'county_fips', 'county_name', 'state_fips', 'state_code', 'state_name',
       'weather'],
      dtype='object')

In [5]:
len(missing_cols)

20

In [6]:
# What's special about the missing states
data[data['state_code'].isnull()]

Unnamed: 0,url,city,price,year,manufacturer,make,condition,cylinders,fuel,odometer,...,paint_color,image_url,lat,long,county_fips,county_name,state_fips,state_code,state_name,weather
8,https://soo.craigslist.ca/cto/d/ebike-red-zone...,soo,2700,2018.0,,Ebike,excellent,,electric,,...,red,https://images.craigslist.org/01111_7YfxuQ9Nxu...,46.509803,-84.257825,,,,,FAILED,
9,https://soo.craigslist.ca/ctd/d/2003-ford-excu...,soo,12995,2003.0,ford,excursion,excellent,10 cylinders,gas,236000.0,...,blue,https://images.craigslist.org/00707_97tX51ryfc...,46.514191,-84.291307,,,,,FAILED,
10,https://soo.craigslist.ca/cto/d/2006-chev-expr...,soo,4000,2006.0,chev,express 2500 van,good,6 cylinders,gas,138000.0,...,white,https://images.craigslist.org/00J0J_fovkGAUCQI...,46.497529,-84.467464,,,,,FAILED,
11,https://soo.craigslist.ca/cto/d/2002-chev-crew...,soo,13000,2002.0,chevrolet,2500 hd,excellent,8 cylinders,diesel,350000.0,...,white,https://images.craigslist.org/00V0V_bN5BoWGnSS...,46.497529,-84.467464,,,,,FAILED,
12,https://soo.craigslist.ca/cto/d/2016-hyundai-s...,soo,21695,2016.0,hyundai,sonata,like new,4 cylinders,gas,44814.0,...,white,https://images.craigslist.org/00z0z_fLBdcucFWW...,46.517791,-84.342291,,,,,FAILED,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723010,https://reddeer.craigslist.ca/ctd/d/2015-kia-f...,reddeer,10850,2015.0,kia,forte 4dr sdn lx,,4 cylinders,gas,29400.0,...,,https://images.craigslist.org/00t0t_b46ljc5h29...,49.172550,-123.078064,,,,,FAILED,
1723011,https://reddeer.craigslist.ca/ctd/d/sold-but-n...,reddeer,17995,2013.0,nissan,leaf sl,,,electric,52389.0,...,,https://images.craigslist.org/00303_7xGQ0mFuzF...,49.172550,-123.078064,,,,,FAILED,
1723012,https://reddeer.craigslist.ca/ctd/d/2013-kia-s...,reddeer,19995,2013.0,kia,sorento ex 4wd,,6 cylinders,gas,70200.0,...,,https://images.craigslist.org/00000_8YqcDnJ2YS...,49.172550,-123.078064,,,,,FAILED,
1723013,https://reddeer.craigslist.ca/cto/d/2012-kia-o...,reddeer,10000,2012.0,kia,optima lx 4dr sedan,excellent,4 cylinders,gas,78000.0,...,blue,https://images.craigslist.org/00L0L_85RjFB9r13...,52.252165,-113.795804,,,,,FAILED,


In [3]:
# Seems like these countries throw an error and state_name is failed, remove these and check again for missing values
data = data[data['state_name'] != 'FAILED']

### Not interested in data where manufacturer is null. Drop these columns

In [4]:
data = data[data['manufacturer'].notna()]

In [50]:
# recheck for any missing data
print("Table size -", end=' ')
print(data.shape)

print("Checking for missing values..")
# Number of missing values in each column of training data
missing_val_count_by_column = (data.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

# Get names of columns with missing values
missing_cols = [col for col in data.columns
                     if data[col].isnull().any()]

Table size - (1531482, 24)
Checking for missing values..
year              5303
make             68121
condition       626974
cylinders       599905
fuel              8716
odometer        477171
title_status      1250
transmission      7312
vin             969314
drive           569684
size            995046
type            606351
paint_color     601047
weather            509
dtype: int64


In [53]:
# what's going on with weather? Only missing for district of columbia
data[data['weather'].isnull()]['state_name'].value_counts()

District of Columbia    509
Name: state_name, dtype: int64

In [5]:
# set this temp = average temp in Maryland
data.loc[(data['state_name'] == 'District of Columbia'), 'weather'] = data.loc[(data['state_name'] == 'Maryland'), 'weather'].mean()

In [58]:
# recheck for any missing data
print("Table size -", end=' ')
print(data.shape)

print("Checking for missing values..")
# Number of missing values in each column of training data
missing_val_count_by_column = (data.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

# Get names of columns with missing values
missing_cols = [col for col in data.columns
                     if data[col].isnull().any()]

Table size - (1531482, 24)
Checking for missing values..
year              5303
make             68121
condition       626974
cylinders       599905
fuel              8716
odometer        477171
title_status      1250
transmission      7312
vin             969314
drive           569684
size            995046
type            606351
paint_color     601047
dtype: int64


## Remove some columns from model

In [17]:
data['image_url'][0]

'https://images.craigslist.org/00i0i_2ggH1eKDNKZ_600x450.jpg'

### I'm sure webscraping some of these links would be useful some day but as of today not considering them
### Also the links are bad (404)

In [6]:
data = data.drop(['url','image_url'], axis=1)
data.head()

Unnamed: 0,city,price,year,manufacturer,make,condition,cylinders,fuel,odometer,title_status,...,type,paint_color,lat,long,county_fips,county_name,state_fips,state_code,state_name,weather
0,marshall,11900,2010.0,dodge,challenger se,good,6 cylinders,gas,43600.0,clean,...,coupe,red,43.4775,-96.7041,46083.0,Lincoln,46.0,SD,South Dakota,38.0
2,marshall,17550,2008.0,ford,f-150,,,gas,,clean,...,,,44.1415,-103.2052,46093.0,Meade,46.0,SD,South Dakota,38.0
3,marshall,2800,2004.0,ford,taurus,good,6 cylinders,gas,168591.0,clean,...,sedan,grey,43.2845,-95.593,19143.0,Osceola,19.0,IA,Iowa,47.0
5,marshall,9900,2007.0,gmc,yukon,good,8 cylinders,gas,169000.0,clean,...,,,43.328149,-96.774445,46083.0,Lincoln,46.0,SD,South Dakota,38.0
6,marshall,12500,2015.0,jeep,patriot high altitude,like new,4 cylinders,gas,39500.0,rebuilt,...,SUV,grey,43.5486,-96.6332,46099.0,Minnehaha,46.0,SD,South Dakota,38.0


In [59]:
# 446 cities in dataset
data['city'].value_counts()

cosprings      7814
grandrapids    7702
anchorage      7690
sacramento     7685
omaha          7585
               ... 
halifax           3
kelowna           2
abbotsford        2
vancouver         1
toronto           1
Name: city, Length: 446, dtype: int64

In [60]:
# 51 states in dataset
data['state_name'].value_counts()

California              141697
Florida                 101407
Texas                    88418
Michigan                 73853
New York                 65098
North Carolina           58968
Ohio                     57263
Pennsylvania             53026
Wisconsin                49804
Washington               48551
Virginia                 43099
Oregon                   43065
Colorado                 38343
Tennessee                38327
Missouri                 36985
Minnesota                34860
Illinois                 32549
Georgia                  31869
Indiana                  29524
New Jersey               29288
Massachusetts            29043
Iowa                     27423
South Carolina           26119
Alabama                  25667
Arizona                  25469
Connecticut              21877
Oklahoma                 21524
Maryland                 20397
Kentucky                 19732
Montana                  18662
Idaho                    18658
Nebraska                 17863
Arkansas

In [61]:
# 117 years in dataset, some very possibly wrong ones (302, 718)
print(len(data['year'].value_counts()))
data['year'].value_counts()

117


2007.0    98127
2008.0    94059
2006.0    91160
2015.0    85588
2005.0    83696
          ...  
1906.0        2
1902.0        2
1907.0        1
1905.0        1
1903.0        1
Name: year, Length: 117, dtype: int64

In [75]:
# First car was in 1885 so take dates between 1885 and 2020 only, new value counts
data = data[(data['year'] >= 1885) & (data['year'] <= 2020)]
data['year'].value_counts()

2007.0    98127
2008.0    94059
2006.0    91160
2015.0    85588
2005.0    83696
          ...  
1906.0        2
1902.0        2
1907.0        1
1905.0        1
1903.0        1
Name: year, Length: 117, dtype: int64

In [62]:
# 53 brands, can group some together eg (mercedes-benz, mercedesbenz), (chevrolet, chevy, chev). honda is overall #4
print(len(data['manufacturer'].value_counts()))
data['manufacturer'].value_counts()

43


ford               286825
chevrolet          256115
toyota             113267
honda               86059
nissan              76993
dodge               72771
jeep                72533
gmc                 67845
ram                 54656
bmw                 41161
volkswagen          35837
hyundai             30856
chrysler            30397
mercedes-benz       29187
subaru              28376
cadillac            24591
kia                 22708
buick               22443
pontiac             21840
mazda               21208
lexus               20020
acura               16959
audi                14218
lincoln             12629
infiniti            11636
volvo               11007
mercury             10229
mitsubishi          10068
saturn               9290
mini                 6157
rover                5152
jaguar               4180
fiat                 1684
harley-davidson       835
datsun                598
infinity              500
ferrari               221
alfa-romeo            212
porche      

In [7]:
data.loc[((data['manufacturer'] == 'mercedesbenz') | (data['manufacturer'] == 'mercedes')), 
         'manufacturer'] = 'mercedes-benz'

In [8]:
data.loc[((data['manufacturer'] == 'chevy') | (data['manufacturer'] == 'chev')), 
         'manufacturer'] = 'chevrolet'

In [9]:
data.loc[(data['manufacturer'] == 'vw'), 'manufacturer'] = 'volkswagen'

In [10]:
data.loc[(data['manufacturer'] == 'aston'), 'manufacturer'] = 'aston-martin'

In [11]:
data.loc[((data['manufacturer'] == 'land rover') | (data['manufacturer'] == 'landrover')), 
         'manufacturer'] = 'rover'

In [12]:
data.loc[(data['manufacturer'] == 'alfa'), 'manufacturer'] = 'alfa-romeo'

In [13]:
data.loc[(data['manufacturer'] == 'harley'), 'manufacturer'] = 'harley-davidson'

In [44]:
# recheck manufacturers, 43 unique
print(len(data['manufacturer'].value_counts()))
data['manufacturer'].value_counts()

43


ford               286825
chevrolet          256115
toyota             113267
honda               86059
nissan              76993
dodge               72771
jeep                72533
gmc                 67845
ram                 54656
bmw                 41161
volkswagen          35837
hyundai             30856
chrysler            30397
mercedes-benz       29187
subaru              28376
cadillac            24591
kia                 22708
buick               22443
pontiac             21840
mazda               21208
lexus               20020
acura               16959
audi                14218
lincoln             12629
infiniti            11636
volvo               11007
mercury             10229
mitsubishi          10068
saturn               9290
mini                 6157
rover                5152
jaguar               4180
fiat                 1684
harley-davidson       835
datsun                598
infinity              500
ferrari               221
alfa-romeo            212
porche      

# Explore on a state-wise basis

## What's special about Omaha, NE? Mentioned that they found a strong regional demand

In [64]:
data.loc[data['state_name'] == 'Nebraska','manufacturer'].value_counts().head(10)

ford         3972
chevrolet    3765
dodge         941
gmc           869
toyota        862
nissan        833
jeep          771
honda         763
ram           622
chrysler      527
Name: manufacturer, dtype: int64

In [68]:
data.loc[(data['city'] == 'omaha') & (data['state_name'] == 'Nebraska'),'manufacturer'].value_counts().head(10)

ford         1262
chevrolet    1125
nissan        358
dodge         347
toyota        337
honda         313
jeep          259
gmc           243
ram           217
chrysler      192
Name: manufacturer, dtype: int64

In [71]:
# There is definitely an increase in demand for Honda in Omaha
print(f'Honda/Ford in Omaha vs all of Nebraska = {round(763/3972,3)}:{round(313/1262,3)}')

Honda/Ford in Omaha vs all of Nebraska = 0.192:0.248


### A better representation would be market share so let's do that

In [72]:
# There is definitely an increase in demand for Honda in Omaha
ms_om = len(data[(data['city'] == 'omaha') & (data['state_name'] == 'Nebraska') & (data['manufacturer'] == 'honda')])/len(data[(data['city'] == 'omaha') & (data['state_name'] == 'Nebraska')])
ms_ne = len(data[(data['state_name'] == 'Nebraska') & (data['manufacturer'] == 'honda')])/len(data[(data['state_name'] == 'Nebraska')])
print(f'Market share of Honda in Omaha vs all of Nebraska = {round(ms_om,3)}:{round(ms_ne,3)}')

Market share of Honda in Omaha vs all of Nebraska = 0.051:0.043


In [73]:
# What is overall market share? Weirdly greater than Omaha. I guess bc opportunity?
ms_overall = len(data[(data['manufacturer'] == 'honda')])/len(data)
print(f'Market share of Honda in US overall = {round(ms_overall,3)}')

Market share of Honda in US overall = 0.056


# Overall I want to do some kind of clustering
# Leave out state location info because I don't want clusters to be based on regionality

In [14]:
data.head()

Unnamed: 0,city,price,year,manufacturer,make,condition,cylinders,fuel,odometer,title_status,...,type,paint_color,lat,long,county_fips,county_name,state_fips,state_code,state_name,weather
0,marshall,11900,2010.0,dodge,challenger se,good,6 cylinders,gas,43600.0,clean,...,coupe,red,43.4775,-96.7041,46083.0,Lincoln,46.0,SD,South Dakota,38.0
2,marshall,17550,2008.0,ford,f-150,,,gas,,clean,...,,,44.1415,-103.2052,46093.0,Meade,46.0,SD,South Dakota,38.0
3,marshall,2800,2004.0,ford,taurus,good,6 cylinders,gas,168591.0,clean,...,sedan,grey,43.2845,-95.593,19143.0,Osceola,19.0,IA,Iowa,47.0
5,marshall,9900,2007.0,gmc,yukon,good,8 cylinders,gas,169000.0,clean,...,,,43.328149,-96.774445,46083.0,Lincoln,46.0,SD,South Dakota,38.0
6,marshall,12500,2015.0,jeep,patriot high altitude,like new,4 cylinders,gas,39500.0,rebuilt,...,SUV,grey,43.5486,-96.6332,46099.0,Minnehaha,46.0,SD,South Dakota,38.0


In [78]:
data.columns

Index(['city', 'price', 'year', 'manufacturer', 'make', 'condition',
       'cylinders', 'fuel', 'odometer', 'title_status', 'transmission', 'vin',
       'drive', 'size', 'type', 'paint_color', 'lat', 'long', 'county_fips',
       'county_name', 'state_fips', 'state_code', 'state_name', 'weather'],
      dtype='object')

Unnamed: 0,city,price,year,manufacturer,make,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,county_name,state_code,state_name,weather
0,marshall,11900,2010.0,dodge,challenger se,good,6 cylinders,gas,43600.0,clean,automatic,rwd,,coupe,red,Lincoln,SD,South Dakota,38.0
2,marshall,17550,2008.0,ford,f-150,,,gas,,clean,automatic,,,,,Meade,SD,South Dakota,38.0
3,marshall,2800,2004.0,ford,taurus,good,6 cylinders,gas,168591.0,clean,automatic,fwd,full-size,sedan,grey,Osceola,IA,Iowa,47.0
5,marshall,9900,2007.0,gmc,yukon,good,8 cylinders,gas,169000.0,clean,automatic,4wd,,,,Lincoln,SD,South Dakota,38.0
6,marshall,12500,2015.0,jeep,patriot high altitude,like new,4 cylinders,gas,39500.0,rebuilt,automatic,4wd,sub-compact,SUV,grey,Minnehaha,SD,South Dakota,38.0


In [16]:
# Convert cylinders string to just a number
def format_cylinders(x):
    if type(x) == str:
        x=x.strip(' cylinders')
    return x

In [17]:
data['cylinders'] = data.apply(lambda x: format_cylinders(x['cylinders']), axis=1)
data.head()

Unnamed: 0,city,price,year,manufacturer,make,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,county_name,state_code,state_name,weather
0,marshall,11900,2010.0,dodge,challenger se,good,6.0,gas,43600.0,clean,automatic,rwd,,coupe,red,Lincoln,SD,South Dakota,38.0
2,marshall,17550,2008.0,ford,f-150,,,gas,,clean,automatic,,,,,Meade,SD,South Dakota,38.0
3,marshall,2800,2004.0,ford,taurus,good,6.0,gas,168591.0,clean,automatic,fwd,full-size,sedan,grey,Osceola,IA,Iowa,47.0
5,marshall,9900,2007.0,gmc,yukon,good,8.0,gas,169000.0,clean,automatic,4wd,,,,Lincoln,SD,South Dakota,38.0
6,marshall,12500,2015.0,jeep,patriot high altitude,like new,4.0,gas,39500.0,rebuilt,automatic,4wd,sub-compact,SUV,grey,Minnehaha,SD,South Dakota,38.0


# Make a composite location string - city, state_code (don't need county)

In [18]:
# No missing values here so we're good
data['location'] = data.apply(lambda x: x['city']+', '+x['state_code'], axis=1)

In [19]:
data.head()

Unnamed: 0,city,price,year,manufacturer,make,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,county_name,state_code,state_name,weather,location
0,marshall,11900,2010.0,dodge,challenger se,good,6.0,gas,43600.0,clean,automatic,rwd,,coupe,red,Lincoln,SD,South Dakota,38.0,"marshall, SD"
2,marshall,17550,2008.0,ford,f-150,,,gas,,clean,automatic,,,,,Meade,SD,South Dakota,38.0,"marshall, SD"
3,marshall,2800,2004.0,ford,taurus,good,6.0,gas,168591.0,clean,automatic,fwd,full-size,sedan,grey,Osceola,IA,Iowa,47.0,"marshall, IA"
5,marshall,9900,2007.0,gmc,yukon,good,8.0,gas,169000.0,clean,automatic,4wd,,,,Lincoln,SD,South Dakota,38.0,"marshall, SD"
6,marshall,12500,2015.0,jeep,patriot high altitude,like new,4.0,gas,39500.0,rebuilt,automatic,4wd,sub-compact,SUV,grey,Minnehaha,SD,South Dakota,38.0,"marshall, SD"


In [114]:
# check value counts of location, should be more than cities because repeated names
# 5650 locations in dataset, almost 10 times just cities
data['location'].value_counts()

cosprings, CO          7651
anchorage, AK          7568
grandrapids, MI        7567
sacramento, CA         7500
sfbay, CA              7425
                       ... 
charlottesville, WI       1
morgantown, CA            1
longisland, NC            1
corvallis, MA             1
sanantonio, FL            1
Name: location, Length: 5650, dtype: int64

# Drop unnecessary columns and save file

In [116]:
final = data.drop(['vin','lat','long','county_fips','state_fips','city', 'county_name', 'state_code', 'state_name'], axis=1)
final.head()

Unnamed: 0,price,year,manufacturer,make,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,weather,location
0,11900,2010.0,dodge,challenger se,good,6.0,gas,43600.0,clean,automatic,rwd,,coupe,red,38.0,"marshall, SD"
2,17550,2008.0,ford,f-150,,,gas,,clean,automatic,,,,,38.0,"marshall, SD"
3,2800,2004.0,ford,taurus,good,6.0,gas,168591.0,clean,automatic,fwd,full-size,sedan,grey,47.0,"marshall, IA"
5,9900,2007.0,gmc,yukon,good,8.0,gas,169000.0,clean,automatic,4wd,,,,38.0,"marshall, SD"
6,12500,2015.0,jeep,patriot high altitude,like new,4.0,gas,39500.0,rebuilt,automatic,4wd,sub-compact,SUV,grey,38.0,"marshall, SD"


In [117]:
export = final.to_csv('./data/reduced_data.csv', header=True, index = False)