In [82]:
import pandas as pd
import numpy as np
import collections

In [83]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## eBay used Cars

This project will be based on dataset of used cars from German eBay website.  I have a data from [Kaggle](https://www.kaggle.com/orgesleka/used-cars-database/data)

A brief description of the columns
- dateCrawled - When this ad was first crawled. All field-values are taken from this date.
- name - Name of the car.
- seller - Whether the seller is private or a dealer.
- offerType - The type of listing
- price - The price on the ad to sell the car.
- abtest - Whether the listing is included in an A/B test.
- vehicleType - The vehicle Type.
- yearOfRegistration - The year in which the car was first registered.
- gearbox - The transmission type.
- powerPS - The power of the car in PS.
- model - The car model name.
- kilometer - How many kilometers the car has driven.
- monthOfRegistration - The month in which the car was first registered.
- fuelType - What type of fuel the car uses.
- brand - The brand of the car.
- notRepairedDamage - If the car has a damage which is not yet repaired.
- dateCreated - The date on which the eBay listing was created.
- nrOfPictures - The number of pictures in the ad.
- postalCode - The postal code for the location of the vehicle.
- lastSeenOnline - When the crawler saw this ad last online.

In [84]:
autos = pd.read_csv('autos.csv', encoding='Latin-1')

In [85]:
autos.head()

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-24 11:52:17,Golf_3_1.6,privat,Angebot,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,privat,Angebot,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",privat,Angebot,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,GOLF_4_1_4__3TÜRER,privat,Angebot,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,privat,Angebot,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21


In [86]:
autos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371528 entries, 0 to 371527
Data columns (total 20 columns):
dateCrawled            371528 non-null object
name                   371528 non-null object
seller                 371528 non-null object
offerType              371528 non-null object
price                  371528 non-null int64
abtest                 371528 non-null object
vehicleType            333659 non-null object
yearOfRegistration     371528 non-null int64
gearbox                351319 non-null object
powerPS                371528 non-null int64
model                  351044 non-null object
kilometer              371528 non-null int64
monthOfRegistration    371528 non-null int64
fuelType               338142 non-null object
brand                  371528 non-null object
notRepairedDamage      299468 non-null object
dateCreated            371528 non-null object
nrOfPictures           371528 non-null int64
postalCode             371528 non-null int64
lastSeen              

There is 20 column with mostly non-null objects. Only the column 'notRepairedDamage' is missing a lot of data. Seven columns are int64, the rest is a string. The column use camelcase instead of snakecase. Below I will change naming convention.

In [87]:
autos.columns

Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest', 'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model', 'kilometer', 'monthOfRegistration', 'fuelType', 'brand', 'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode', 'lastSeen'], dtype='object')

In [88]:
new_columns = ['date_crawled', 'name', 'seller', 'offer_type', 'price', 'abtest',
       'vehicle_type', 'registration_year', 'gearbox', 'power_ps', 'model',
       'kilometer', 'registration_month', 'fuel_type', 'brand',
       'unrepaired_damage', 'ad_created', 'nr_of_pictures', 'postal_code',
       'last_seen']

In [89]:
autos.columns = new_columns

In [90]:
autos.head()

Unnamed: 0,date_crawled,name,seller,offer_type,price,abtest,vehicle_type,registration_year,gearbox,power_ps,model,kilometer,registration_month,fuel_type,brand,unrepaired_damage,ad_created,nr_of_pictures,postal_code,last_seen
0,2016-03-24 11:52:17,Golf_3_1.6,privat,Angebot,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,privat,Angebot,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",privat,Angebot,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,GOLF_4_1_4__3TÜRER,privat,Angebot,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,privat,Angebot,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21


In [91]:
autos.describe(include='all')

Unnamed: 0,date_crawled,name,seller,offer_type,price,abtest,vehicle_type,registration_year,gearbox,power_ps,model,kilometer,registration_month,fuel_type,brand,unrepaired_damage,ad_created,nr_of_pictures,postal_code,last_seen
count,371528,371528,371528,371528,371528.0,371528,333659,371528.0,351319,371528.0,351044,371528.0,371528.0,338142,371528,299468,371528,371528.0,371528.0,371528
unique,280500,233531,2,2,,2,8,,2,,251,,,7,40,2,114,,,182806
top,2016-03-24 14:49:47,Ford_Fiesta,privat,Angebot,,test,limousine,,manuell,,golf,,,benzin,volkswagen,nein,2016-04-03 00:00:00,,,2016-04-07 06:45:59
freq,7,657,371525,371516,,192585,95894,,274214,,30070,,,223857,79640,263182,14450,,,17
mean,,,,,17295.14,,,2004.577997,,115.549477,,125618.688228,5.734445,,,,,0.0,50820.66764,
std,,,,,3587954.0,,,92.866598,,192.139578,,40112.337051,3.712412,,,,,0.0,25799.08247,
min,,,,,0.0,,,1000.0,,0.0,,5000.0,0.0,,,,,0.0,1067.0,
25%,,,,,1150.0,,,1999.0,,70.0,,125000.0,3.0,,,,,0.0,30459.0,
50%,,,,,2950.0,,,2003.0,,105.0,,150000.0,6.0,,,,,0.0,49610.0,
75%,,,,,7200.0,,,2008.0,,150.0,,150000.0,9.0,,,,,0.0,71546.0,


In [92]:
autos.seller.value_counts()

privat        371525
gewerblich         3
Name: seller, dtype: int64

In [93]:
autos.offer_type.value_counts()

Angebot    371516
Gesuch         12
Name: offer_type, dtype: int64

In [94]:
autos.abtest.value_counts()

test       192585
control    178943
Name: abtest, dtype: int64

In [95]:
autos.nr_of_pictures.value_counts()

0    371528
Name: nr_of_pictures, dtype: int64

The four columns above, for which I checked the values, are not relevant for further analysis, so I will remove them.

In [96]:
autos = autos.drop(columns=['seller', 'offer_type', 'abtest', 'nr_of_pictures'])

In [97]:
autos.head()

Unnamed: 0,date_crawled,name,price,vehicle_type,registration_year,gearbox,power_ps,model,kilometer,registration_month,fuel_type,brand,unrepaired_damage,ad_created,postal_code,last_seen
0,2016-03-24 11:52:17,Golf_3_1.6,480,,1993,manuell,0,golf,150000,0,benzin,volkswagen,,2016-03-24 00:00:00,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,18300,coupe,2011,manuell,190,,125000,5,diesel,audi,ja,2016-03-24 00:00:00,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",9800,suv,2004,automatik,163,grand,125000,8,diesel,jeep,,2016-03-14 00:00:00,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,GOLF_4_1_4__3TÜRER,1500,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein,2016-03-17 00:00:00,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,3600,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein,2016-03-31 00:00:00,60437,2016-04-06 10:17:21


In the next steps I will check the quality and correctness of the data.

In [98]:
autos.price.value_counts().sort_index(ascending=False).head()

2147483647     1
99999999      15
99000000       1
74185296       1
32545461       1
Name: price, dtype: int64

In [99]:
autos[autos.price > 500000].sort_values(by='price').head(10)

Unnamed: 0,date_crawled,name,price,vehicle_type,registration_year,gearbox,power_ps,model,kilometer,registration_month,fuel_type,brand,unrepaired_damage,ad_created,postal_code,last_seen
118058,2016-04-07 04:36:19,Audi__A6_allroad_quattro,517895,kombi,2013,automatik,245,andere,40000,1,diesel,audi,nein,2016-04-07 00:00:00,85092,2016-04-07 04:36:19
1846,2016-03-10 22:47:05,BMW_M1_Museumsfahrzeug_Neuwagenzustand_Glossy_...,579000,coupe,1980,manuell,277,andere,20000,12,benzin,bmw,nein,2016-03-10 00:00:00,60435,2016-03-23 10:45:27
136495,2016-03-27 17:39:13,Mercedes_Benz_300_d_Cabriolet__W189_,585000,cabrio,1960,manuell,160,andere,5000,7,benzin,mercedes_benz,nein,2016-03-27 00:00:00,50667,2016-04-05 13:17:43
365461,2016-03-31 18:53:48,BMW_M1_mit_4900_Km_Neuwagenzustand_Glossy_Orange,599000,coupe,1980,manuell,377,andere,5000,3,benzin,bmw,nein,2016-03-31 00:00:00,60435,2016-04-06 11:44:48
161773,2016-03-12 16:39:54,Porsche_Carrera_GT,600000,cabrio,2005,manuell,612,andere,10000,4,benzin,porsche,nein,2016-03-12 00:00:00,70569,2016-03-19 18:46:28
26327,2016-03-21 19:43:54,Porsche_911_R,600000,coupe,2016,manuell,500,911,5000,3,benzin,porsche,nein,2016-03-21 00:00:00,76275,2016-03-21 19:43:54
137856,2016-03-19 01:36:24,Porsche_911R_Der_letzte_Samurai!_Sauger__Schal...,619000,coupe,2016,manuell,500,911,5000,3,benzin,porsche,nein,2016-03-19 00:00:00,60435,2016-03-20 16:30:13
169098,2016-03-08 10:36:17,Ferrari_206,650000,coupe,1969,manuell,163,,90000,1,benzin,sonstige_autos,nein,2016-03-08 00:00:00,80796,2016-03-20 23:47:46
100851,2016-03-12 18:46:35,Porsche_911R,700000,coupe,2016,,0,911,5000,3,,porsche,nein,2016-03-12 00:00:00,76275,2016-03-12 18:46:35
111647,2016-03-17 00:57:28,Lamborghini_Miura_S_Project,725000,coupe,1971,manuell,625,,5000,4,benzin,sonstige_autos,nein,2016-03-16 00:00:00,70173,2016-03-17 00:57:28


I'm removing all the data above 500000.

In [100]:
autos = autos[(autos.price < 500000)]

In [101]:
autos.shape

(371426, 16)

In [102]:
autos.kilometer.value_counts().sort_values()

10000       1945
20000       5673
30000       6040
40000       6374
5000        7042
50000       7611
60000       8668
70000       9773
80000      11052
90000      12522
100000     15915
125000     38063
150000    240748
Name: kilometer, dtype: int64

### Analyzing datetime columns

In [103]:
autos[['date_crawled','ad_created','last_seen']][0:5]

Unnamed: 0,date_crawled,ad_created,last_seen
0,2016-03-24 11:52:17,2016-03-24 00:00:00,2016-04-07 03:16:57
1,2016-03-24 10:58:45,2016-03-24 00:00:00,2016-04-07 01:46:50
2,2016-03-14 12:52:21,2016-03-14 00:00:00,2016-04-05 12:47:46
3,2016-03-17 16:54:04,2016-03-17 00:00:00,2016-03-17 17:40:17
4,2016-03-31 17:25:20,2016-03-31 00:00:00,2016-04-06 10:17:21


In [104]:
autos[['date_crawled','ad_created','last_seen']][0:5].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 0 to 4
Data columns (total 3 columns):
date_crawled    5 non-null object
ad_created      5 non-null object
last_seen       5 non-null object
dtypes: object(3)
memory usage: 160.0+ bytes


This is a datetime format but in string.

In [105]:
autos.date_crawled = pd.to_datetime(autos.date_crawled)
autos.ad_created = pd.to_datetime(autos.ad_created)
autos.last_seen = pd.to_datetime(autos.last_seen)

In [106]:
autos[['date_crawled','ad_created','last_seen']][0:5]

Unnamed: 0,date_crawled,ad_created,last_seen
0,2016-03-24 11:52:17,2016-03-24,2016-04-07 03:16:57
1,2016-03-24 10:58:45,2016-03-24,2016-04-07 01:46:50
2,2016-03-14 12:52:21,2016-03-14,2016-04-05 12:47:46
3,2016-03-17 16:54:04,2016-03-17,2016-03-17 17:40:17
4,2016-03-31 17:25:20,2016-03-31,2016-04-06 10:17:21


In [107]:
autos[['date_crawled','ad_created','last_seen']][0:5].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 0 to 4
Data columns (total 3 columns):
date_crawled    5 non-null datetime64[ns]
ad_created      5 non-null datetime64[ns]
last_seen       5 non-null datetime64[ns]
dtypes: datetime64[ns](3)
memory usage: 160.0 bytes


In [108]:
autos.registration_year.describe()

count    371426.000000
mean       2004.561022
std          91.936113
min        1000.000000
25%        1999.000000
50%        2003.000000
75%        2008.000000
max        9999.000000
Name: registration_year, dtype: float64

As seen above, the data is incorrect, because there was no cars in 1000 year and the future data is also wrong,so I will remove wrong data.

I also see that this data is still incorrect because it it impossible to get the data in 2016 with the year of car registration 2019. This is an issue that should be solved according to the purpose.

In [109]:
autos = autos[autos.registration_year.between(1900, 2019)]

In [110]:
autos.registration_year.value_counts().head()

2000    24540
1999    22764
2005    22312
2006    20226
2001    20215
Name: registration_year, dtype: int64

In [111]:
autos.registration_year.value_counts(normalize=True).head()

2000    0.066102
1999    0.061318
2005    0.060100
2006    0.054481
2001    0.054452
Name: registration_year, dtype: float64

The setting 'normalize = True' allows me to check the distribution - what is the most common yesr of registration. It is 2000 the next most frequent values are 1999, 2005, 2006, 2001.

### Exploring the Brand column

In [112]:
autos.brand.describe()

count         371246
unique            40
top       volkswagen
freq           79581
Name: brand, dtype: object

In [113]:
autos.brand.value_counts()

volkswagen        79581
bmw               40247
opel              40113
mercedes_benz     35290
audi              32860
ford              25563
renault           17965
peugeot           11025
fiat               9668
seat               7020
mazda              5688
skoda              5639
smart              5249
citroen            5178
nissan             5037
toyota             4694
sonstige_autos     3923
hyundai            3645
mini               3393
volvo              3326
mitsubishi         3058
honda              2836
kia                2555
alfa_romeo         2342
suzuki             2327
porsche            2200
chevrolet          1841
chrysler           1451
dacia               900
daihatsu            806
jeep                805
subaru              776
land_rover          769
jaguar              621
trabant             586
daewoo              542
saab                529
rover               489
lancia              484
lada                225
Name: brand, dtype: int64

In [114]:
autos.brand.value_counts(normalize=True).head()

volkswagen       0.214362
bmw              0.108411
opel             0.108050
mercedes_benz    0.095058
audi             0.088513
Name: brand, dtype: float64

### Analyzing prices

I want to find mean price for the most common brand 

In [115]:
brand_counts = autos.brand.value_counts(normalize=True)
common_brands = brand_counts[brand_counts > 0.01].index
common_brands

Index(['volkswagen', 'bmw', 'opel', 'mercedes_benz', 'audi', 'ford', 'renault', 'peugeot', 'fiat', 'seat', 'mazda', 'skoda', 'smart', 'citroen', 'nissan', 'toyota', 'sonstige_autos'], dtype='object')

In [116]:
brand_mean_price = {}
for brand in common_brands:
    brand_only = autos[autos.brand == brand]
    mean_price = brand_only.price.mean()
    brand_mean_price[brand] = mean_price

In [117]:
brand_mean_price

{'volkswagen': 5148.798921853207,
 'bmw': 8141.082739086143,
 'opel': 2838.120484630918,
 'mercedes_benz': 8308.688240294701,
 'audi': 8723.979214850882,
 'ford': 3557.5554121190785,
 'renault': 2334.430782076259,
 'peugeot': 3167.542494331066,
 'fiat': 2774.772134877948,
 'seat': 4354.239173789174,
 'mazda': 3925.34405766526,
 'skoda': 6413.3686823904945,
 'smart': 3531.5620118117736,
 'citroen': 3609.2827346465815,
 'nissan': 4527.5108199324995,
 'toyota': 5232.657008947593,
 'sonstige_autos': 12514.42161611012}

In [118]:
sorted_brand_mean_price = sorted(brand_mean_price.items(), key=lambda kv: kv[1],reverse=True)
sorted_brand_mean_price

[('sonstige_autos', 12514.42161611012),
 ('audi', 8723.979214850882),
 ('mercedes_benz', 8308.688240294701),
 ('bmw', 8141.082739086143),
 ('skoda', 6413.3686823904945),
 ('toyota', 5232.657008947593),
 ('volkswagen', 5148.798921853207),
 ('nissan', 4527.5108199324995),
 ('seat', 4354.239173789174),
 ('mazda', 3925.34405766526),
 ('citroen', 3609.2827346465815),
 ('ford', 3557.5554121190785),
 ('smart', 3531.5620118117736),
 ('peugeot', 3167.542494331066),
 ('opel', 2838.120484630918),
 ('fiat', 2774.772134877948),
 ('renault', 2334.430782076259)]

'sonstige_autos' is a group of different types of cars, this group covers about 1 percent. The mean is quite high so it means in this group are rare and expensive car. On the next positions I see in order Audi, Mercedes and BMW that prices are quite close to each other. Then after some gap price I see Skoda. 

### Analyzing kilometers

In [119]:
brand_mean_kilometers = {}

for brand in common_brands:
    brand_only = autos[autos["brand"] == brand]
    mean_kilometers = brand_only["kilometer"].mean()
    brand_mean_kilometers[brand] = int(mean_kilometers)

mean_kilometers = pd.Series(brand_mean_kilometers).sort_values(ascending=False)
mean_prices = pd.Series(brand_mean_price).sort_values(ascending=False)

In [120]:
brand_info = pd.DataFrame(mean_kilometers,columns=['mean_kilometers'])
brand_info

Unnamed: 0,mean_kilometers
bmw,132818
mercedes_benz,130712
audi,129739
opel,128965
volkswagen,128629
renault,128076
mazda,126107
peugeot,124991
ford,123862
seat,121506


The average mileage for brands are above. BMW is on the first place and Smart on the last but between the majority this difference are not so significant. Smart stands out probably because this is a city car.

In [121]:
brand_info["mean_price"] = mean_prices
brand_info

Unnamed: 0,mean_kilometers,mean_price
bmw,132818,8141.082739
mercedes_benz,130712,8308.68824
audi,129739,8723.979215
opel,128965,2838.120485
volkswagen,128629,5148.798922
renault,128076,2334.430782
mazda,126107,3925.344058
peugeot,124991,3167.542494
ford,123862,3557.555412
seat,121506,4354.239174


### Translating german word into english

I need to find out what are the words to change, below I will create a dictionary so I can map the words.

In [122]:
autos.head(15)

Unnamed: 0,date_crawled,name,price,vehicle_type,registration_year,gearbox,power_ps,model,kilometer,registration_month,fuel_type,brand,unrepaired_damage,ad_created,postal_code,last_seen
0,2016-03-24 11:52:17,Golf_3_1.6,480,,1993,manuell,0,golf,150000,0,benzin,volkswagen,,2016-03-24,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,18300,coupe,2011,manuell,190,,125000,5,diesel,audi,ja,2016-03-24,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",9800,suv,2004,automatik,163,grand,125000,8,diesel,jeep,,2016-03-14,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,GOLF_4_1_4__3TÜRER,1500,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein,2016-03-17,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,3600,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein,2016-03-31,60437,2016-04-06 10:17:21
5,2016-04-04 17:36:23,BMW_316i___e36_Limousine___Bastlerfahrzeug__Ex...,650,limousine,1995,manuell,102,3er,150000,10,benzin,bmw,ja,2016-04-04,33775,2016-04-06 19:17:07
6,2016-04-01 20:48:51,Peugeot_206_CC_110_Platinum,2200,cabrio,2004,manuell,109,2_reihe,150000,8,benzin,peugeot,nein,2016-04-01,67112,2016-04-05 18:18:39
7,2016-03-21 18:54:38,VW_Derby_Bj_80__Scheunenfund,0,limousine,1980,manuell,50,andere,40000,7,benzin,volkswagen,nein,2016-03-21,19348,2016-03-25 16:47:58
8,2016-04-04 23:42:13,Ford_C___Max_Titanium_1_0_L_EcoBoost,14500,bus,2014,manuell,125,c_max,30000,8,benzin,ford,,2016-04-04,94505,2016-04-04 23:42:13
9,2016-03-17 10:53:50,VW_Golf_4_5_tuerig_zu_verkaufen_mit_Anhaengerk...,999,kleinwagen,1998,manuell,101,golf,150000,0,,volkswagen,,2016-03-17,27472,2016-03-31 17:17:06


In [123]:
translate = {'kleinwagen':'hatchback',
             'automatik':'automatic',
             'manuell':'manually',
             'benzin':'petrol',
             'c_klasse':'c_class',
             'e_klasse':'e_class',
             'nein':'no',
             'ja':'yes'
            }

In [124]:
col_to_translate = ['vehicle_type', 'gearbox', 'fuel_type', 'unrepaired_damage', 'model']

for col in col_to_translate:
    autos[col] = autos[col].replace(translate)

In [125]:
autos.head(5)

Unnamed: 0,date_crawled,name,price,vehicle_type,registration_year,gearbox,power_ps,model,kilometer,registration_month,fuel_type,brand,unrepaired_damage,ad_created,postal_code,last_seen
0,2016-03-24 11:52:17,Golf_3_1.6,480,,1993,manually,0,golf,150000,0,petrol,volkswagen,,2016-03-24,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,18300,coupe,2011,manually,190,,125000,5,diesel,audi,yes,2016-03-24,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",9800,suv,2004,automatic,163,grand,125000,8,diesel,jeep,,2016-03-14,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,GOLF_4_1_4__3TÜRER,1500,hatchback,2001,manually,75,golf,150000,6,petrol,volkswagen,no,2016-03-17,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,3600,hatchback,2008,manually,69,fabia,90000,7,diesel,skoda,no,2016-03-31,60437,2016-04-06 10:17:21


In [126]:
autos.head(2)

Unnamed: 0,date_crawled,name,price,vehicle_type,registration_year,gearbox,power_ps,model,kilometer,registration_month,fuel_type,brand,unrepaired_damage,ad_created,postal_code,last_seen
0,2016-03-24 11:52:17,Golf_3_1.6,480,,1993,manually,0,golf,150000,0,petrol,volkswagen,,2016-03-24,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,18300,coupe,2011,manually,190,,125000,5,diesel,audi,yes,2016-03-24,66954,2016-04-07 01:46:50


### Searching for the most common brand - model  combination

In [127]:
autos_grouped_m = autos.groupby(['brand','model'])['model'].count()

In [128]:
autos_grouped_m.max()

30047

In [129]:
autos_grouped_m.sort_values(ascending=False).head(10)

brand          model  
volkswagen     golf       30047
bmw            3er        20560
volkswagen     polo       13085
opel           corsa      12567
               astra      10828
volkswagen     passat     10303
audi           a4         10255
mercedes_benz  c_class     8775
bmw            5er         8542
mercedes_benz  e_class     7560
Name: model, dtype: int64

Thanks to grouping I could find that the most frequent cars that are selling on the german ebay are:
- volkswagen     golf        
- bmw            3er         
- volkswagen     polo        
- opel           corsa       
- opel           astra       
- volkswagen     passat      
- audi           a4          
- mercedes_benz  c_class    
- bmw            5er          
- mercedes_benz  e_class          

In [130]:
autos_grouped_km = autos.groupby(['kilometer']).count()

In [131]:
autos_grouped_km.name.sort_values(ascending=False)

kilometer
150000    240744
125000     38062
100000     15914
90000      12522
80000      11052
70000       9773
60000       8668
50000       7611
5000        6932
40000       6374
30000       6040
20000       5673
10000       1881
Name: name, dtype: int64

This data shows that the more kilometers a car has travelled, the more offers I can find on the list.

### Checking connection between price and kilometers

Putting the price into the bins

In [132]:
bins = range(0,1000000,10000)

In [133]:
bins

range(0, 1000000, 10000)

In [134]:
autos_price_bins = pd.cut(autos.price, bins)

In [135]:
autos['price_bins'] = autos_price_bins

In [136]:
autos_grouped_km_p = autos.groupby(['kilometer', 'price_bins']).count()

In [137]:
autos_grouped_km_p.name.sort_values(ascending=False).head(30)

kilometer  price_bins    
150000     (0, 10000]        215475.0
125000     (0, 10000]         30321.0
150000     (10000, 20000]     15064.0
100000     (0, 10000]         11485.0
90000      (0, 10000]          8612.0
80000      (0, 10000]          7185.0
70000      (0, 10000]          5785.0
125000     (10000, 20000]      5440.0
60000      (0, 10000]          4786.0
5000       (0, 10000]          4749.0
50000      (0, 10000]          3740.0
100000     (10000, 20000]      2960.0
90000      (10000, 20000]      2782.0
40000      (0, 10000]          2771.0
80000      (10000, 20000]      2715.0
70000      (10000, 20000]      2709.0
60000      (10000, 20000]      2583.0
30000      (0, 10000]          2493.0
50000      (10000, 20000]      2414.0
20000      (0, 10000]          2278.0
40000      (10000, 20000]      2085.0
150000     (20000, 30000]      1976.0
30000      (10000, 20000]      1839.0
20000      (10000, 20000]      1559.0
125000     (20000, 30000]      1205.0
30000      (20000, 30000

This data shows that the most frequent is car with more than 150000 km and cheaper than 10000. The most cars with mileage less than 6000 has a higher price.

### Checking connection between price, kilometers and unrepaired damage

In [138]:
autos_grouped_d = autos.groupby(['kilometer', 'unrepaired_damage', 'price_bins'])['price_bins'].count()

In [139]:
autos_grouped_d.sort_index(level=0, ascending=False).head()

kilometer  unrepaired_damage  price_bins      
150000     yes                (100000, 110000]     1
                              (90000, 100000]      2
                              (50000, 60000]       3
                              (30000, 40000]       8
                              (20000, 30000]      32
Name: price_bins, dtype: int64

In [140]:
autos_grouped_d.index[0]

(5000, 'no', Interval(0, 10000, closed='right'))

In [141]:
autos_grouped_d.filter(like='Interval(0, 10000, closed=\'right\')').sort_values(ascending=False).head()

kilometer  unrepaired_damage  price_bins
150000     no                 (0, 10000]    141542
           yes                (0, 10000]     26247
125000     no                 (0, 10000]     22673
100000     no                 (0, 10000]      8585
90000      no                 (0, 10000]      6658
Name: price_bins, dtype: int64

In [142]:
repair_150000 = autos_grouped_d.filter(like='150000').sort_index(ascending=False) > 100

In [143]:
repair_125000 = autos_grouped_d.filter(like='125000').sort_index(ascending=False) > 100

In [144]:
repair_100000 = autos_grouped_d.filter(like='100000').sort_index(ascending=False) > 100

In [145]:
repair_150000[repair_150000 == True]

kilometer  unrepaired_damage  price_bins    
150000     yes                (10000, 20000]    True
                              (0, 10000]        True
           no                 (30000, 40000]    True
                              (20000, 30000]    True
                              (10000, 20000]    True
                              (0, 10000]        True
Name: price_bins, dtype: bool

In [146]:
repair_125000[repair_125000 == True]

kilometer  unrepaired_damage  price_bins    
125000     yes                (0, 10000]        True
           no                 (30000, 40000]    True
                              (20000, 30000]    True
                              (10000, 20000]    True
                              (0, 10000]        True
Name: price_bins, dtype: bool

In [147]:
repair_100000[repair_100000 == True]

kilometer  unrepaired_damage  price_bins    
100000     yes                (0, 10000]        True
           no                 (30000, 40000]    True
                              (20000, 30000]    True
                              (10000, 20000]    True
                              (0, 10000]        True
Name: price_bins, dtype: bool

#### Conclusions

I took the results if there is more that 100 offers. It is clear that there are price differences that depend on the need for repair. There is no offer or not too much offer(below 100) with cars that have status - unrepaired_damage.

In [148]:
autos.price[(autos.kilometer == 150000) & (autos.unrepaired_damage == 'no')].head(2)

3    1500
6    2200
Name: price, dtype: int64

In [149]:
autos[autos.unrepaired_damage == 'yes'].head(2)

Unnamed: 0,date_crawled,name,price,vehicle_type,registration_year,gearbox,power_ps,model,kilometer,registration_month,fuel_type,brand,unrepaired_damage,ad_created,postal_code,last_seen,price_bins
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,18300,coupe,2011,manually,190,,125000,5,diesel,audi,yes,2016-03-24,66954,2016-04-07 01:46:50,"(10000, 20000]"
5,2016-04-04 17:36:23,BMW_316i___e36_Limousine___Bastlerfahrzeug__Ex...,650,limousine,1995,manually,102,3er,150000,10,petrol,bmw,yes,2016-04-04,33775,2016-04-06 19:17:07,"(0, 10000]"


I want to create now new DataFrame that will allowa comparision between the prices of the cars that need to be repaired and those that do not. For this purpose, I will also create two Series and insert them into DataFrame after counting the mean value.

In [150]:
kilometer_unique = autos.kilometer.sort_values(ascending=False).unique()

In [151]:
autos_repair = pd.DataFrame(kilometer_unique, columns=['kilometer'])

In [152]:
repaired_mean_s = pd.Series([])
unrepaired_mean_s = pd.Series([])

In [153]:
for item in range(len(kilometer_unique)):
    repaired_mean = autos.price[(autos.kilometer == kilometer_unique[item]) & (autos.unrepaired_damage == 'no')].mean()
    repaired_mean_s[item] = repaired_mean
    unrepaired_mean = autos.price[(autos.kilometer == kilometer_unique[item]) & (autos.unrepaired_damage == 'yes')].mean()
    unrepaired_mean_s[item] = unrepaired_mean   

In [154]:
autos_repair.insert(1, 'repaired_mean', repaired_mean_s)
autos_repair.insert(2, 'unrepaired_mean', unrepaired_mean_s)

In [155]:
autos_repair.head(2)

Unnamed: 0,kilometer,repaired_mean,unrepaired_mean
0,150000,4397.373921,1682.10266
1,125000,6967.852848,2577.926938


In [156]:
autos_repair['difference'] = autos_repair.repaired_mean - autos_repair.unrepaired_mean

In [157]:
autos_repair.head(2)

Unnamed: 0,kilometer,repaired_mean,unrepaired_mean,difference
0,150000,4397.373921,1682.10266,2715.271261
1,125000,6967.852848,2577.926938,4389.925911


In [158]:
autos_repair['fraction'] = (autos_repair.unrepaired_mean *100)/autos_repair.repaired_mean

In [159]:
autos_repair

Unnamed: 0,kilometer,repaired_mean,unrepaired_mean,difference,fraction
0,150000,4397.373921,1682.10266,2715.271261,38.252436
1,125000,6967.852848,2577.926938,4389.925911,36.997437
2,100000,8792.026418,2932.562216,5859.464202,33.354793
3,90000,9369.318507,3638.129534,5731.188973,38.830247
4,80000,10316.360679,3785.181208,6531.179471,36.691051
5,70000,11637.100024,4508.327314,7128.77271,38.740986
6,60000,12772.831375,5542.544199,7230.287176,43.393231
7,50000,14134.171065,6402.016287,7732.154778,45.2946
8,40000,15657.258581,9718.211538,5939.047042,62.068411
9,30000,17078.501343,6476.633466,10601.867876,37.922727


The dataframe 'autos_repair' show the difference and the proportion between cars that need a repairand that doesn't. The difference is often about 40 percent. The smallest difference is 12% for the smallest mileage - 5000. The highest difference is for 40000 - more that 60%, which is in value at almost 6000.