In [None]:
import math 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to the matplotlib figures created below
sns.set()

In [None]:
# Read the same CSV twice:
#   dataset    - default NaN parsing switched off, so text such as 'null' stays a string
#   dataset_na - pandas' default NaN parsing
csv_path = './cleaned_df_with_manufact.csv'
dataset = pd.read_csv(csv_path, keep_default_na=False)
dataset_na = pd.read_csv(csv_path)
dataset.head(3)

## Individual Variable Description   

## Price, Year and Mileage

In [None]:
# Side-by-side histograms of price, year and odometer, each overlaid with
# vertical lines at the column mean (black) and median (red).
def _mark_mean_median(axis, values):
    """Draw mean (black) and median (red) vertical lines on `axis` and add the legend."""
    axis.axvline(x=values.mean(), color='black', label='mean')
    axis.axvline(x=values.median(), color='r', label='median')
    axis.legend()


fig, ax = plt.subplots(1, 3, figsize=(12, 4), constrained_layout=True)

# --- Price ---
ax[0].hist(dataset['price'], 25)
ax[0].set(title='Histogram of Price', xlabel='Price', ylabel='Count')
_mark_mean_median(ax[0], dataset['price'])

# --- Year: one bin per distinct year present in the data ---
ax[1].hist(dataset['year'], dataset['year'].nunique(), align='right')
ax[1].set(title='Histogram of Vehicle Year', xlabel='Year')
# Hand-placed annotations labelling the 2009 and 2017 bars. Coordinates are in
# data units and presumably tuned to this dataset - re-check if the data changes.
ax[1].arrow(2009.2, 5900, 0, -2500, head_width=0.5, head_length=150, color='gray')
ax[1].text(2017, 8300, '2017', horizontalalignment='center')
ax[1].text(2008.5, 6000, '2009', horizontalalignment='center')
_mark_mean_median(ax[1], dataset['year'])

# --- Mileage ---
ax[2].hist(dataset['odometer'], 50, align='right')
ax[2].set(title='Histogram of Mileage', xlabel='Mile')
_mark_mean_median(ax[2], dataset['odometer'])

fig.savefig('./graphs/histograms.png')
# plt.show() instead of fig.show(): Figure.show() warns under non-GUI (inline) backends
plt.show()

## Make

In [None]:
# Horizontal bar chart of the ten most frequent manufacturers.
fig, ax = plt.subplots(figsize=(8, 6))
# Labels and bar lengths must both come from the manufacturer counts
# (the bar lengths were previously taken from the 'type' column).
top_makes = dataset['manufacturer'].value_counts().iloc[:10]
ax.barh(top_makes.index, top_makes.values)
ax.set(title='Top 10 Popular Make', xlabel='Count')
ax.invert_yaxis()  # value_counts sorts descending; flip so the largest bar is on top
# plt.show() instead of fig.show(): Figure.show() warns under non-GUI (inline) backends
plt.show()

## Model

In [None]:
# Horizontal bar chart of the ten most frequent vehicle models.
fig, ax = plt.subplots(figsize=(8, 6))
# Count once and reuse for both the labels and the bar lengths
top_models = dataset['model'].value_counts().iloc[:10]
ax.barh(top_models.index, top_models.values)
ax.set(title='Top 10 Popular Model', xlabel='Count')
ax.invert_yaxis()  # value_counts sorts descending; flip so the largest bar is on top
# plt.show() instead of fig.show(): Figure.show() warns under non-GUI (inline) backends
plt.show()

## Condition

In [None]:
# Horizontal bar chart of listing counts per vehicle condition.
fig, ax = plt.subplots(figsize=(8, 6))
# Count once and reuse for both the labels and the bar lengths
condition_counts = dataset['condition'].value_counts()
ax.barh(condition_counts.index, condition_counts.values)
ax.set(title='Vehicle Condition', xlabel='Count')
ax.invert_yaxis()  # value_counts sorts descending; flip so the largest bar is on top
# plt.show() instead of fig.show(): Figure.show() warns under non-GUI (inline) backends
plt.show()

Clearly, the condition of a large number of vehicles (25,416) is not reported in this dataset.

## Engine Cylinder Count

In [None]:
# Horizontal bar chart of listing counts per engine cylinder category.
fig, ax = plt.subplots(figsize=(8, 6))
# Count once and reuse for both the labels and the bar lengths
cylinder_counts = dataset['cylinders'].value_counts()
ax.barh(cylinder_counts.index, cylinder_counts.values)
ax.set(title='Engine Cylinder Count', xlabel='Count')
ax.invert_yaxis()  # value_counts sorts descending; flip so the largest bar is on top
# plt.show() instead of fig.show(): Figure.show() warns under non-GUI (inline) backends
plt.show()

## Fuel Type

In [None]:
# Vertical bar chart of listing counts per fuel type.
fig, ax = plt.subplots(figsize=(8, 6))
# Count once and reuse for both the labels and the bar heights
fuel_counts = dataset['fuel'].value_counts()
ax.bar(fuel_counts.index, fuel_counts.values)
ax.set(title='Fuel Type', ylabel='Count')
# plt.show() instead of fig.show(): Figure.show() warns under non-GUI (inline) backends
plt.show()

## Transmission

In [None]:
# Vertical bar chart of listing counts per transmission type.
fig, ax = plt.subplots(figsize=(8, 6))
# Count once and reuse for both the labels and the bar heights
transmission_counts = dataset['transmission'].value_counts()
ax.bar(transmission_counts.index, transmission_counts.values)
ax.set(title='Transmission Type', ylabel='Count')
# plt.show() instead of fig.show(): Figure.show() warns under non-GUI (inline) backends
plt.show()

## Drive

In [None]:
# Vertical bar chart of listing counts per drive type.
fig, ax = plt.subplots(figsize=(8, 6))
# Count once and reuse for both the labels and the bar heights
drive_counts = dataset['drive'].value_counts()
ax.bar(drive_counts.index, drive_counts.values)
ax.set(title='Drive Type', ylabel='Count')
# plt.show() instead of fig.show(): Figure.show() warns under non-GUI (inline) backends
plt.show()

## Vehicle Type

In [None]:
# Horizontal bar chart of listing counts per vehicle type.
fig, ax = plt.subplots(figsize=(8, 6))
# Count once and reuse for both the labels and the bar lengths
type_counts = dataset['type'].value_counts()
ax.barh(type_counts.index, type_counts.values)
ax.set(title='Vehicle Type', xlabel='Count')
ax.invert_yaxis()  # value_counts sorts descending; flip so the largest bar is on top
# plt.show() instead of fig.show(): Figure.show() warns under non-GUI (inline) backends
plt.show()

## Paint Color

In [None]:
# Horizontal bar chart of listing counts per paint colour.
fig, ax = plt.subplots(figsize=(8, 6))
# Count once and reuse for both the labels and the bar lengths
paint_counts = dataset['paint_color'].value_counts()
ax.barh(paint_counts.index, paint_counts.values)
ax.set(title='Paint Color', xlabel='Count')
ax.invert_yaxis()  # value_counts sorts descending; flip so the largest bar is on top
# plt.show() instead of fig.show(): Figure.show() warns under non-GUI (inline) backends
plt.show()

## State

In [None]:
# Horizontal bar chart of listing counts per state (taller figure to fit every label).
fig, ax = plt.subplots(figsize=(8, 8))
# Count once and reuse for both the labels and the bar lengths
state_counts = dataset['state'].value_counts()
ax.barh(state_counts.index, state_counts.values)
# Bars are horizontal, so the counts run along the x axis (was labelled on y)
ax.set(title='State', xlabel='Count')
ax.invert_yaxis()  # value_counts sorts descending; flip so the largest bar is on top
fig.tight_layout()
# plt.show() instead of fig.show(): Figure.show() warns under non-GUI (inline) backends
plt.show()

# Multivariate Analysis

In [None]:
# Pairwise plots (lower triangle only) of the three numeric columns.
numeric_cols = dataset[['price', 'odometer', 'year']]
sns.pairplot(
    numeric_cols,
    corner=True,
    plot_kws={'marker': '+', 'linewidth': 1},
)
plt.show()

In [None]:
# Hex-binned joint distribution of price (x) against odometer (y),
# drawn with seaborn's 'white' axes style, then saved to disk.
with sns.axes_style('white'):
    # x / y / data are passed by keyword: current seaborn rejects them positionally
    p_price = sns.jointplot(x='price', y='odometer', data=dataset, kind='hex')
plt.savefig('./graphs/price_vs_odometer.png')

In [None]:
# Hex-binned joint distribution of price (x) against year (y),
# drawn with seaborn's 'white' axes style, then saved to disk.
with sns.axes_style('white'):
    # x / y / data are passed by keyword: current seaborn rejects them positionally
    p_year = sns.jointplot(x='price', y='year', data=dataset, kind='hex')
    plt.savefig('./graphs/price_vs_year.png')

## Price and Odometer vs. Condition

In [None]:
# Boxen plot of price for each vehicle condition.
# Uses dataset_na (the copy read with pandas' default NaN parsing) -
# presumably so unreported conditions become NaN and are left out; confirm.
price_condition = dataset_na[['price', 'condition']]
sns.catplot(
    data=price_condition,
    x='price',
    y='condition',
    kind='boxen',
)
plt.savefig('./graphs/price_vs_condition.png')
plt.show()

## Price vs. Vehicle Type

In [None]:
# The five most common vehicle types with their listing counts
top5_type_counts = dataset['type'].value_counts().head(5)
top5_type_counts

In [None]:
# Boxen plot of price, restricted to the five most common vehicle types.
top5_types = dataset['type'].value_counts().iloc[:5].index.tolist()
in_top5 = dataset['type'].isin(top5_types)
top5_rows = dataset[['price', 'type']].loc[in_top5]
sns.catplot(
    data=top5_rows,
    x='price',
    y='type',
    kind='boxen',
)
plt.savefig('./graphs/price_vs_type.png')
plt.show()

## Price vs. Transmission, Drive, Cylinders and Fuel Type

In [None]:
# Boxen plots of price per fuel type, one facet column per drive type.
fuel_drive = dataset[['price', 'drive', 'fuel']]
sns.catplot(
    data=fuel_drive,
    x='price',
    y='fuel',
    col='drive',
    kind='boxen',
)
plt.savefig('./graphs/price_vs_fuel_drive.png')
plt.show()

## Price vs. Paint Color

In [None]:
# Listing count for every paint colour, most common first
paint_color_counts = dataset['paint_color'].value_counts()
paint_color_counts

In [None]:
# Boxen plots of price for five selected paint colours, each box drawn in its own colour.
selected_colors = ['white', 'silver', 'purple', 'black', 'green']
# One palette entry per entry of `selected_colors`, in the same order.
# Without an explicit `order`, seaborn lays the boxes out in order of appearance
# in the data, so the palette would be assigned to arbitrary colours.
box_colors = ['white', 'silver', 'm', 'k', 'g']
color_rows = dataset[['price', 'paint_color']].loc[dataset['paint_color'].isin(selected_colors)]
sns.catplot(
    data=color_rows,
    x='price',
    y='paint_color',
    kind='boxen',
    order=selected_colors,
    palette=sns.color_palette(box_colors),
)
plt.savefig('./graphs/price_vs_color.png')
plt.show()