## Get data and packages

In [9]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os

# Step up to the parent directory (presumably the project root) so that the
# `src` package is importable and relative paths such as 'data/...' resolve.
# The chdir is guarded: done unconditionally, it climbs one more directory each
# time this cell is re-run, after which relative data paths no longer resolve.
if not os.path.isdir('src'):
    os.chdir(os.path.join(os.getcwd(), '..'))
import src.preprocessing

In [10]:
# Load the pricing CSV and pass it through the project's preprocessing helpers,
# each step consuming the previous step's result.
# The CSV path is relative, so it is resolved against the current working directory.
# NOTE(review): the helpers live in src/preprocessing (not visible here); the
# per-step notes below are inferred from the helper names only — confirm.
data = src.preprocessing.load_data('data/data_pricing_challenge.csv')
# presumably removes outlier rows
df_1 = src.preprocessing.delete_outliers(data)
# presumably parses the date strings into datetimes
df_2 = src.preprocessing.strings_to_date(df_1)
# presumably converts the boolean feature columns to numbers
df_3 = src.preprocessing.booleans_to_numeric(df_2)
# presumably converts numeric strings to numbers; result is the frame used below
dataframe = src.preprocessing.strings_to_numeric(df_3)

FileNotFoundError: [Errno 2] No such file or directory: 'data/data_pricing_challenge.csv'

In [None]:
# Display the dtype of every column of the preprocessed dataframe
dataframe.dtypes

maker_key                    object
model_key                    object
mileage                       int64
engine_power                  int64
registration_date    datetime64[ns]
fuel                         object
paint_color                  object
car_type                     object
feature_1                     int32
feature_2                     int32
feature_3                     int32
feature_4                     int32
feature_5                     int32
feature_6                     int32
feature_7                     int32
feature_8                     int32
price                         int64
sold_at              datetime64[ns]
dtype: object

## Initial EDA

In [None]:
# Display summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
dataframe.describe()

Unnamed: 0,mileage,engine_power,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,price
count,4840.0,4840.0,4840.0,4840.0,4840.0,4840.0,4840.0,4840.0,4840.0,4840.0,4840.0
mean,140826.602479,129.002066,0.549587,0.792975,0.202066,0.198554,0.460744,0.241529,0.932231,0.541116,15819.690083
std,58893.195228,38.930239,0.497586,0.405215,0.401583,0.398952,0.498508,0.428054,0.251374,0.498358,9181.296224
min,476.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
25%,103011.25,100.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,10800.0
50%,141084.5,120.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,14200.0
75%,175184.75,135.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,18600.0
max,484615.0,423.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,178500.0


In [None]:
# Report how many distinct values each object-dtype column holds
for column in dataframe.columns:
    # Only object-dtype columns are of interest here; skip everything else
    if dataframe[column].dtype != object:
        continue
    unique_values = dataframe[column].nunique()
    print(f'{unique_values} unique values in {column}')

1 unique values in maker_key
75 unique values in model_key
4 unique values in fuel
10 unique values in paint_color
8 unique values in car_type


Create ranges for numerical variables

In [None]:
def create_ranges(dataframe, variable, min_value, max_value, num_ranges):
    """Return a copy of ``dataframe`` with an added ``'<variable>_range'`` column.

    The interval from ``min_value`` to ``max_value`` is split into
    ``num_ranges`` equal-width bins. Each row gets the label of the bin that
    contains its ``variable`` value, formatted as
    ``'Rank <k>: from <lower> to <upper>'`` with the limits truncated to ints.

    Bins are closed on the left and open on the right (``right=False``), so a
    value equal to ``max_value`` (or outside the limits) gets a missing label.
    The input dataframe is not modified.
    """
    # Width of a single bin
    step = (max_value - min_value) / num_ranges

    # num_ranges + 1 edges delimit num_ranges bins
    edges = [min_value + k * step for k in range(num_ranges + 1)]

    # One label per pair of consecutive edges, numbered from 1
    labels = []
    for rank, (lower, upper) in enumerate(zip(edges, edges[1:]), start=1):
        labels.append(f'Rank {rank}: from {int(lower)} to {int(upper)}')

    binned = pd.cut(
        dataframe[variable],
        bins=edges,
        labels=labels,
        include_lowest=True,
        right=False,
    )

    # Work on a copy so the caller's dataframe is left untouched
    result = dataframe.copy()
    result[f'{variable}_range'] = binned
    return result



# Define the minimum and maximum value and the number of ranges.
# The observed min/max are padded by one unit on each side. The upper pad matters
# because create_ranges bins with right=False (upper edge exclusive): without it
# the maximum observation would fall outside the last bin.
min_value_mileage = dataframe['mileage'].min()-1
max_value_mileage = dataframe['mileage'].max()+1
num_ranges_mileage = 8
min_value_engine_power = dataframe['engine_power'].min()-1
max_value_engine_power = dataframe['engine_power'].max()+1
num_ranges_engine_power = 5


# Apply the function to create the ranges: each call returns a copy with a new
# '<variable>_range' label column ('mileage_range', then 'engine_power_range')
dataframe = create_ranges(dataframe, 'mileage', min_value_mileage, max_value_mileage, num_ranges_mileage)
dataframe = create_ranges(dataframe, 'engine_power', min_value_engine_power, max_value_engine_power, num_ranges_engine_power)

Plot aggregated data

In [None]:
def aggregate_data(dataframe, column, aggregation):
    """Aggregate ``dataframe`` per distinct value of ``column``.

    With ``aggregation == 'count'`` the result has a ``'count'`` column holding
    the number of rows in each group. With any other value, ``aggregation`` is
    applied to the ``'price'`` column of each group. The result has one row per
    group and is sorted by ``column``.
    """
    groups = dataframe.groupby(dataframe[column])

    if aggregation == 'count':
        summary = groups.size().reset_index(name='count')
    else:
        summary = groups.agg({'price': aggregation}).reset_index()

    return summary.sort_values(by=column)

def plot_data(df, x, y):
    """Show a bar chart of an aggregated dataframe, one bar per value of ``x``.

    With ``y == 'count'`` the ``'count'`` column is plotted as the number of
    cars sold; for any other ``y`` the ``'price'`` column is plotted as the
    average price (rounded to whole units in the bar text).
    """
    # Pick the column, captions and bar-text format for the requested metric
    if y == 'count':
        value_column = 'count'
        title = f'# Cars Sold per {x}'
        axis_label = '# Cars Sold'
        text_format = '%{y}'
    else:
        value_column = 'price'
        title = f'Avg Price per {x}'
        axis_label = 'Avg Price'
        text_format = '%{y:.0f}'

    fig = px.bar(df, x=x, y=value_column, title=title)
    fig.update_traces(texttemplate=text_format)
    fig.update_layout(xaxis_title=x, yaxis_title=axis_label, hovermode='x')
    fig.show()


# Derive the registration year so cars can be grouped by year of registration
dataframe['registration_year'] = pd.to_datetime(dataframe['registration_date']).dt.year
# Grouping columns; each one produces two bar charts (average price, then number of cars)
columns_to_analyze = ['maker_key', 'model_key', 'registration_year', 'mileage_range', 'engine_power_range', 'fuel', 'paint_color', 'car_type', 'sold_at']
for column in columns_to_analyze:
    # Mean price per group
    df_mean = aggregate_data(dataframe, column, 'mean')
    plot_data(df_mean, column, 'price')

    # Number of rows (cars) per group
    df_count = aggregate_data(dataframe, column, 'count')
    plot_data(df_count, column, 'count')

In [None]:
def aggregate_data_sold(dataframe, column, aggregation):
    """Aggregate ``dataframe`` per (``'sold_at'``, ``column``) combination.

    With ``aggregation == 'count'`` the result has a ``'count'`` column holding
    the number of rows in each group; otherwise ``aggregation`` is applied to
    the ``'price'`` column of each group. The group keys are returned as
    regular columns.
    """
    groups = dataframe.groupby(['sold_at', column])

    if aggregation != 'count':
        return groups.agg({'price': aggregation}).reset_index()
    return groups.size().reset_index(name='count')


def plot_data_sold(df, color_plot, y):
    """Show a line chart over ``'sold_at'`` with one line per ``color_plot`` value.

    With ``y == 'count'`` the ``'count'`` column is plotted as the number of
    cars sold; for any other ``y`` the ``'price'`` column is plotted as the
    average price. When ``color_plot`` is ``'paint_color'`` each line is drawn
    in a colour matching the paint name; otherwise the default colours are used.
    """
    # Pick the column, captions and text format for the requested metric
    if y == 'count':
        value_column = 'count'
        title = f'# Cars Sold per month (by {color_plot})'
        axis_label = '# Cars Sold'
        text_format = '%{y}'
    else:
        value_column = 'price'
        title = f'Avg Price per month (by {color_plot})'
        axis_label = 'Avg Price'
        text_format = '%{y:.0f}'

    # The explicit colour map is only passed when lines represent paint colours
    line_options = {}
    if color_plot == 'paint_color':
        line_options['color_discrete_map'] = {
            'beige': '#F5F5DC',
            'black': '#000000',
            'blue': '#0000FF',
            'brown': '#A52A2A',
            'green': '#008000',
            'grey': '#808080',
            'orange': '#FFA500',
            'red': '#FF0000',
            'silver': '#C0C0C0',
            'white': '#FFFFFF',
        }

    fig = px.line(df, x='sold_at', y=value_column, title=title, color=color_plot, **line_options)
    # NOTE(review): the text template may only be rendered when the line traces
    # are drawn in a mode that includes text — confirm it has a visible effect.
    fig.update_traces(texttemplate=text_format)
    fig.update_layout(xaxis_title='Sold at', yaxis_title=axis_label, hovermode='x')
    fig.show()


# Breakdown columns; each one produces two line charts over 'sold_at'
# (average price, then number of cars), with one line per value of the column
columns_to_analyze_sold = ['mileage_range', 'engine_power_range', 'fuel', 'paint_color', 'car_type']
for column in columns_to_analyze_sold:
    # Mean price per (sold_at, column) group
    df_mean = aggregate_data_sold(dataframe, column, 'mean')
    plot_data_sold(df_mean, column, 'price')

    # Number of rows (cars) per (sold_at, column) group
    df_count = aggregate_data_sold(dataframe, column, 'count')
    plot_data_sold(df_count, column, 'count')

## Conclusions

1) Data overview
    - The data is generally correct and ready to use; there is no need to change the dataframe structure.
    - No significant outliers or inconsistencies were detected in the data.
    - However, the following ranges were applied before using the data for training:
             0 <= mileage <= 500000             -->     2 records excluded
             10 <= engine_power                 -->     1 record excluded


2) As expected:
    - Car price decreases with age.
        - 2012, 2013 and 2014 are the years with the most registered cars.
    - Car price decreases with mileage.
        - Mean mileage is 140827; the most common range is from 120000 to 180000.
    - Car price increases with engine power.
        - Mean engine power is 129; the most common range is from 104 to 184.


3) Most common fuel is diesel.
    - Petrol cars have a significant sales peak in May.

4) Subdued colors (i.e., black, grey, brown and white) are the most sold; only blue is comparable to them.
    - Black and blue suffer the most with the arrival of summer.
        
5) Estate, hatchback, sedan and SUV are the most sold car types.
    - SUVs sell more in summer.




