# Title

## Loading libraries

In [None]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sys
import os
from scipy.stats import norm
import json

## Import dataset

In [18]:
# Build the relative path to the raw input file, then load it into a DataFrame
csv_path = os.path.join('..', 'data', 'input', 'title.csv')
df = pd.read_csv(csv_path)

In [None]:
# df.head(10)

In [None]:
# Work on an independent (deep) copy so that the loaded frame `df` stays untouched
df_title = df.copy(deep=True)

In [None]:
# Import the JSON file with the graphic colours.
# The path is built with os.path.join, like the dataset path, and the encoding
# is fixed to UTF-8 so the file is decoded the same way on every platform
# instead of relying on the locale's default encoding.
colours_path = os.path.join('..', 'scripts', 'colours.json')
with open(colours_path, 'r', encoding='utf-8') as colours_file:
    colours = json.load(colours_file)

## Data exploration

In [None]:
# Show the column labels of the working DataFrame
column_labels = df_title.columns
print(column_labels)

### Find and delete duplicate rows

In [None]:
# Count of rows before deduplication
print('Row count before deduplication:', len(df_title))

# Keep the first occurrence of every fully identical row and drop the repeats.
# drop_duplicates does this directly, so no temporary 'duplicate' helper column
# has to be created and removed again.
df_title = df_title.drop_duplicates(keep='first')

# Count of rows after deduplication
print('Row count after deduplication:', len(df_title))

#### Relevant rows NaN values
The calculation below gives insight into the count and share of NaN values for each column.

In [12]:
# Number of rows in df_title; used as the denominator for the NaN share
total_rows = df_title.shape[0]

# Number of NaN values per column of df_title
na_counts = df_title.isna().sum()

# Number of non-NaN values per column
non_na_counts = total_rows - na_counts

# Share of NaN values per column, in percent, rounded to one decimal
na_percentage = na_counts.div(total_rows).mul(100).round(1)

# Collect counts and shares in one overview table (one row per column of df_title)
output_df = pd.DataFrame(
    {
        '# NaN values': na_counts,
        '% NaN values': na_percentage.astype(str) + '%',
        '# non NaN values': non_na_counts,
    }
)

print(output_df)

                     # NaN values % NaN values  # non NaN values
manufacturer                    0         0.0%            752946
year                            0         0.0%            752946
mileage                       478         0.1%            752468
fuel_type                   22153         2.9%            730793
accidents_or_damage         23658         3.1%            729288
price                           0         0.0%            752946


In [13]:
# Delete rows that have a NaN value in any of the listed columns.
# The result is reassigned instead of mutating the frame with inplace=True,
# which keeps the step chainable and avoids modifying an object in place
# that earlier cells may still reference.
df_title = df_title.dropna(subset=['columntitle', 'columntitle', 'columntitle'])

#### OPTIONAL: Relevant columns with categorical variables

In [None]:
# Drop irrelevant columns; errors='ignore' skips labels that are not present,
# so the cell can be re-run without raising a KeyError
df_title = df_title.drop(
    columns=['columntitle', 'columntitle'],
    errors='ignore',
)

# Show the remaining columns of the frame that was just reduced
print(df_title.columns)

## Data cleansing

#### Data cleansing categorical variables
Look for inconsistencies.

In [None]:
# TODO: remove leading and trailing white space from the categorical values with .str.strip()

In [5]:
# Collect the distinct values of the selected column and print them
manufacturers = df_title.loc[:, 'columntitle'].unique()
print(manufacturers)

#### OPTIONAL: calculate shares

In [15]:
# Total count of records in DataFrame
total_record_count = len(df_title)

# Percentage share of records where the column equals 0 and where it equals 1.
# The two shares do not have to add up to 100%: rows holding any other value
# (or NaN) are counted in the denominator but in neither numerator.
share_0 = (df_title['columntitle'] == 0).sum() / total_record_count * 100
share_1 = (df_title['columntitle'] == 1).sum() / total_record_count * 100

print()  # Print blank line
print("Share 0: {:.2f}%".format(share_0))
print("Share 1: {:.2f}%".format(share_1))


Share accidents_or_damage=0: 74.77%
Share accidents_or_damage=1: 22.09%


#### OPTIONAL: delete rows where value equals XXXX

In [18]:
# Delete rows where the column equals 'Other' and make a copy, so that the
# column assignment below works on an independent frame.
# This is a relatively small share.
df_title_cleaned = df_title[df_title['columntitle'] != 'Other'].copy()

# Assign values of 'fuel_type_mapped' to 'fuel_type' in df_title_cleaned
df_title_cleaned['fuel_type'] = df_title_cleaned['fuel_type_mapped']

# Delete the now redundant 'fuel_type_mapped' column
df_title_cleaned = df_title_cleaned.drop(columns=['fuel_type_mapped'])

#### Data cleaning of numeric variables
Look for inconsistencies & outliers. What does the distribution of our numeric variables look like?

In [10]:
# Select necessary variables for boxplots.
# NaN values are dropped first: with a NaN in the input the box statistics
# become NaN and the box is not drawn.
column1_data = df_title['columntitle'].dropna()
column2_data = df_title['columntitle'].dropna()

# Make a figure with two subplots in one row
fig, axs = plt.subplots(1, 2, figsize=(18, 6))

# Boxplot of the first variable
axs[0].boxplot(column1_data)
axs[0].set_title('Year')

# Boxplot of the second variable
axs[1].boxplot(column2_data)
axs[1].set_title('Mileage')

plt.tight_layout()
plt.show()

In [11]:
# Show the ten highest values of the column
print('Top XXX COLUMNTITLE:')

# Cast to int first, then order from high to low and keep the first ten rows
values_as_int = df_title['columntitle'].astype(int)
values_descending = values_as_int.sort_values(ascending=False)
print(values_descending.head(10))

print()  # Blank line

In [8]:
# Filter out outliers: keep only the rows whose value is at most 1,000,000
df_title_cleaned = df_title_cleaned.loc[df_title_cleaned['columntitle'] <= 1_000_000]

In [9]:
# Make DataFrame with only 1 category (taken from the cleaned data)
df_title_cat = df_title_cleaned[df_title_cleaned['columntitle'] == 'columnvalue']

# Calculate median for the category
median_cat = df_title_cat['columntitle'].median()

# Calculate average of the category.
# It is taken from the same cleaned category subset as the median, so both
# statistics describe the same rows and can be compared directly.
cat_average = df_title_cat['columntitle'].mean()

# Find the row of this category whose value lies closest to the median.
# The search is restricted to df_title_cat: searching the full, uncleaned
# frame could return a row from a different category or a filtered-out outlier.
cat_median = df_title_cat.loc[(df_title_cat['columntitle'] - median_cat).abs().idxmin()]

# Last expression of the cell, so the selected row is displayed
cat_median


## Visualisations

#### Histogram

In [None]:
# Make a histogram of 2nd hand Volvo prices

# Prices of all Volvos in the cleaned data (NaN prices cannot be binned)
volvo_prices = df_title_cleaned.loc[
    df_title_cleaned['manufacturer'] == 'Volvo', 'price'
].dropna()

# Median and average are computed here from the plotted prices, so the cell
# does not depend on variables defined elsewhere
median_price_volvo = volvo_prices.median()
avg_price_volvo = volvo_prices.mean()

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(volvo_prices,
        bins=30,
        color='skyblue',
        edgecolor='black')
ax.set_title("Distribution 2nd Hand Volvo Prices")
ax.set_xlabel('Prijs')
ax.set_ylabel("Vehicle Count")

# Add vertical lines for median (red) and average (purple)
ax.axvline(median_price_volvo,
           color='red',
           linestyle='solid',
           linewidth=2,
           label='Mediaan')
ax.axvline(avg_price_volvo,
           color='purple',
           linestyle='solid',
           linewidth=2,
           label='Gemiddelde')

# Draw the legend; without it the labels of the two lines are never shown
ax.legend()

# Delete lines on top and right
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Add grid
ax.grid(True)
plt.show()

In [17]:
# multiple histograms

# Unique fuel types. NaN is dropped: it never matches the equality filter
# below and would only produce an empty panel.
fuel_types = df_title_cleaned['fuel_type'].dropna().unique()

# Make subplots: one panel per fuel type in a single row.
# squeeze=False always returns a 2-D array, also for a single fuel type.
fig, axes = plt.subplots(1,
                         len(fuel_types),
                         figsize=(15, 5),
                         squeeze=False)
axes = axes.flatten()

# Number of bins
num_bins = 20  # Increase this number to increase the number of bins

# Plot histogram for each fuel type
for ax, fuel_type in zip(axes, fuel_types):
    fuel_type_prices = df_title_cleaned.loc[
        df_title_cleaned['fuel_type'] == fuel_type, 'price'
    ]
    fuel_type_prices.plot(kind='hist', ax=ax, bins=num_bins)
    ax.set_title(fuel_type)
    ax.set_xlabel('Price')
    ax.set_ylabel("Vehicle count")

# Change layout
plt.tight_layout()

# Show plot
plt.show()

#### KDE

In [14]:
# Plot KDE of price distribution of all fuel types
fig, ax = plt.subplots(figsize=(10, 6))

# `colours` maps each fuel type to its plot colour (loaded from colours.json)
for fuel_type, color in colours.items():
    fuel_data = df_title_cleaned[df_title_cleaned['fuel_type'] == fuel_type]

    # Skip fuel types from the colour file that do not occur in the data;
    # there is nothing to estimate a density from
    if fuel_data.empty:
        continue

    sns.kdeplot(fuel_data['price'],
                label=fuel_type,
                fill=True,
                color=color,
                ax=ax)

ax.set_title('Price Distribution for all fuel types')
ax.set_xlabel('Price')
ax.set_ylabel('Density')
ax.legend()

# Lower and upper bound of the x axis
ax.set_xlim(0, 250000)

# Delete right and top lines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

plt.show()

#### Violin plot

In [16]:
# Make violin plot of price distribution for all fuel types
fig, ax = plt.subplots(figsize=(10, 6))

# hue is set to the x variable so that `palette` (fuel type -> colour) is
# applied per violin; the legend would only repeat the x axis and is disabled
sns.violinplot(data=df_title_cleaned,
               x='fuel_type',
               y='price',
               hue='fuel_type',
               legend=False,
               palette=colours,
               ax=ax
              )

ax.set_title('Price distribution for all fuel types')
ax.set_xlabel('Fuel type')
ax.set_ylabel('Price')
ax.set_ylim(0, 250000)  # Upper bound of y axis to make plot less flat
ax.grid(True)

# Delete right and top lines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

plt.show()

NameError: name 'plt' is not defined