# Loading and Exploring the Data

## Importing Libraries


In [53]:
import numpy as np
import pandas as pd
import cufflinks as cf
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Make "plotly_white" the default template for Plotly figures created after this point.
pio.templates.default = "plotly_white"

# Prepare Plotly for in-notebook rendering, then switch cufflinks to offline mode.
# NOTE(review): connected=True presumably loads plotly.js from a CDN rather than
# embedding it in the notebook — confirm against the plotly.offline docs.
init_notebook_mode(connected=True)
cf.go_offline()

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including pandas deprecation / chained-assignment warnings.
warnings.filterwarnings("ignore")

# ANSI escape sequences wrapped around labels in print() calls: bold on / reset.
bold_s = "\033[1m"
bold_e = "\033[0;0m"

In [54]:
# Read the listings file from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='cars.csv')
df.head(n=5)

Unnamed: 0,brand,model,condition,year,color,body_type,fuel_type,transmission_type,engine_capacity,location,km,price
0,Mitsubishi,Xpander,New,2023,,,Benzine,Automatic,,"Shorouk City, Cairo",,"EGP 1,240,000"
1,Other make,Other,New,2022,Blue- Navy Blue,,Electric,Automatic,,"Heliopolis, Cairo",,"EGP 100,000"
2,Mercedes-Benz,C200,Used,2019,Other Color,Cabriolet,Benzine,Automatic,1400 - 1500,"Heliopolis, Cairo",60000 to 69999,"EGP 1,350,000"
3,MINI,Cooper,Used,2022,,Hatchback,Benzine,Automatic,1600,"Sheikh Zayed, Giza",0 to 9999,"EGP 1,050,000"
4,Haval,H6,New,2022,White,SUV,Benzine,Automatic,1400 - 1500,"Mansura, Dakahlia",,"EGP 300,000"


In [55]:
# Print the (rows, columns) tuple straight after a bold label.
print(f'{bold_s}DataFrame Shape: {bold_e}', df.shape, sep='')

[1mDataFrame Shape: [0;0m(8895, 12)


In [56]:
# Print a per-column summary: non-null counts and dtypes, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8895 entries, 0 to 8894
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   brand              8895 non-null   object
 1   model              8895 non-null   object
 2   condition          8895 non-null   object
 3   year               8895 non-null   int64 
 4   color              7443 non-null   object
 5   body_type          7785 non-null   object
 6   fuel_type          8895 non-null   object
 7   transmission_type  8895 non-null   object
 8   engine_capacity    8060 non-null   object
 9   location           8880 non-null   object
 10  km                 7106 non-null   object
 11  price              8895 non-null   object
dtypes: int64(1), object(11)
memory usage: 834.0+ KB


In [57]:
# Print the column names as a plain Python list beneath a bold label.
print(f'{bold_s}DataFrame columns: \n{bold_e}{df.columns.to_list()}')

[1mDataFrame columns: 
[0;0m['brand', 'model', 'condition', 'year', 'color', 'body_type', 'fuel_type', 'transmission_type', 'engine_capacity', 'location', 'km', 'price']


# Data Cleaning


In [58]:
# Convert the text columns price / engine_capacity / km to numbers, derive
# car_age, and place car_age next to year.
#
# Every step is guarded so the cell can be re-run on an already-cleaned `df`
# without raising on the `.str` accessor or shuffling the wrong column.

# Year that car_age is measured against.
REFERENCE_YEAR = 2023


def _last_token_as_number(col):
    """Parse the last space-separated token of each value as a number.

    Range-like text such as '1400 - 1500' or '60000 to 69999' maps to its
    upper bound, and a plain '1600' maps to itself. Missing values and
    tokens that are not numeric become NaN. A column that is already numeric
    is returned unchanged.
    """
    if pd.api.types.is_numeric_dtype(col):
        return col
    return pd.to_numeric(col.str.split(' ').str[-1], errors='coerce')


if not pd.api.types.is_numeric_dtype(df['price']):
    # 'EGP 1,240,000' -> 1240000. regex=False makes both patterns literal text
    # regardless of the pandas version's default. Unparseable prices still
    # raise here instead of silently becoming NaN.
    df['price'] = pd.to_numeric(
        df['price']
        .str.replace('EGP ', '', regex=False)
        .str.replace(',', '', regex=False))

df['engine_capacity'] = _last_token_as_number(df['engine_capacity'])
df['km'] = _last_token_as_number(df['km'])

df['car_age'] = REFERENCE_YEAR - df['year']

# Put car_age directly after year. Locating the columns by name (rather than
# by position) keeps the order stable when the cell is executed again.
cols = [name for name in df.columns if name != 'car_age']
cols.insert(cols.index('year') + 1, 'car_age')
df = df[cols]

In [59]:
# Preview the first five rows; as the cell's last expression the frame is rendered as output.
df.head()

Unnamed: 0,brand,model,condition,year,car_age,color,body_type,fuel_type,transmission_type,engine_capacity,location,km,price
0,Mitsubishi,Xpander,New,2023,0,,,Benzine,Automatic,,"Shorouk City, Cairo",,1240000.0
1,Other make,Other,New,2022,1,Blue- Navy Blue,,Electric,Automatic,,"Heliopolis, Cairo",,100000.0
2,Mercedes-Benz,C200,Used,2019,4,Other Color,Cabriolet,Benzine,Automatic,1500.0,"Heliopolis, Cairo",69999.0,1350000.0
3,MINI,Cooper,Used,2022,1,,Hatchback,Benzine,Automatic,1600.0,"Sheikh Zayed, Giza",9999.0,1050000.0
4,Haval,H6,New,2022,1,White,SUV,Benzine,Automatic,1500.0,"Mansura, Dakahlia",,300000.0


In [60]:
# Show each column's dtype on the lines below a bold label.
print(f'{bold_s}Features data types: {bold_e}', df.dtypes, sep='\n')

[1mFeatures data types: [0;0m
brand                 object
model                 object
condition             object
year                   int64
car_age                int64
color                 object
body_type             object
fuel_type             object
transmission_type     object
engine_capacity      float64
location              object
km                   float64
price                float64
dtype: object


In [61]:
# Summarise missing data: the five columns with the most nulls, first as raw
# counts and then as a percentage of all rows.
missing_mask = df.isna()

nan_sum = missing_mask.sum().sort_values(ascending=False).head(5)

nan_share = missing_mask.mean().round(4)
nan_percent = (nan_share * 100).sort_values(ascending=False).head(5)

print(f'{bold_s}Missing values count: {bold_e}', nan_sum, sep='\n')
print(f'\n{bold_s}Missing values contribution:{bold_e} ', nan_percent, sep='\n')

[1mMissing values count: [0;0m
km                 1789
color              1452
body_type          1110
engine_capacity     835
location             15
dtype: int64

[1mMissing values contribution:[0;0m 
km                 20.11
color              16.32
body_type          12.48
engine_capacity     9.39
location            0.17
dtype: float64


In [62]:
# Partition the listings by their `condition` label into a used-car frame and a new-car frame.
used, new = (df[df['condition'].eq(label)] for label in ('Used', 'New'))