In [1]:
import pyreadr as pyr
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer # used with one hot encoding
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.inspection import permutation_importance

# Read the R workspace file; the result is looked up by object name below.
carAd_file = pyr.read_r("car_ads_fp.RData")

# Take the "carAd" object, wrap it in pd.DataFrame (the original author's
# note: so the editor highlights pandas functions), and renumber the rows,
# discarding the previous index.
carAd = pd.DataFrame(carAd_file["carAd"]).reset_index(drop=True)
print(carAd)

            Maker Genmodel Genmodel_ID   Adv_ID  Adv_year  Adv_month   Color  \
0         Bentley   Arnage        10_1  10_1$$1    2018.0        4.0  Silver   
1         Bentley   Arnage        10_1  10_1$$2    2018.0        6.0    Grey   
2         Bentley   Arnage        10_1  10_1$$3    2017.0       11.0    Blue   
3         Bentley   Arnage        10_1  10_1$$4    2018.0        4.0   Green   
4         Bentley   Arnage        10_1  10_1$$5    2017.0       11.0    Grey   
...           ...      ...         ...      ...       ...        ...     ...   
268250  Westfield    Sport        97_1  97_1$$1    2018.0        5.0  Yellow   
268251  Westfield    Sport        97_1  97_1$$2    2018.0        5.0  Yellow   
268252      Zenos      E10        99_1  99_1$$1    2018.0        3.0     Red   
268253      Zenos      E10        99_1  99_1$$2    2018.0        3.0   Green   
268254      Zenos      E10        99_1  99_1$$3    2018.0        5.0    Grey   

        Reg_year     Bodytype Runned_Mi

In [2]:
import pandas as pd

# Genmodel values whose advert counts we want to inspect
models_of_interest = ['C3', 'DS3', 'Grande Punto', 'Panda']

# Count adverts per Genmodel (missing values get their own entry) ...
genmodel_tally = carAd["Genmodel"].value_counts(dropna=False)

# ... then pick out, by label and in the listed order, the chosen models.
# .loc raises KeyError if one of the listed models never appears.
filtered_models_counts = genmodel_tally.loc[models_of_interest]

# Show the counts for the chosen models
print(filtered_models_counts)


C3              1429
DS3             1036
Grande Punto     663
Panda            391
Name: Genmodel, dtype: int64


In [3]:
# Row conditions: body type is 'Hatchback', fuel type is 'Petrol' or 'Diesel'
is_hatchback = carAd['Bodytype'].eq('Hatchback')
is_petrol_or_diesel = carAd['Fuel_type'].isin(['Petrol', 'Diesel'])

# Keep only the adverts that satisfy both conditions
filtered_cars = carAd.loc[is_hatchback & is_petrol_or_diesel]

# Show the rows that passed the filter
print(filtered_cars)

            Maker           Genmodel Genmodel_ID     Adv_ID  Adv_year  \
1120    Chevrolet              Kalos       16_10   16_10$$1    2018.0   
1121    Chevrolet              Kalos       16_10   16_10$$2    2018.0   
1122    Chevrolet              Kalos       16_10   16_10$$3    2018.0   
1123    Chevrolet              Kalos       16_10   16_10$$4    2018.0   
1124    Chevrolet              Kalos       16_10   16_10$$5    2018.0   
...           ...                ...         ...        ...       ...   
267719      Volvo  V40 Cross Country        96_8  96_8$$694    2021.0   
267720      Volvo  V40 Cross Country        96_8  96_8$$695    2021.0   
267721      Volvo  V40 Cross Country        96_8  96_8$$696    2021.0   
267722      Volvo  V40 Cross Country        96_8  96_8$$697    2021.0   
267723      Volvo  V40 Cross Country        96_8  96_8$$698    2021.0   

        Adv_month   Color  Reg_year   Bodytype Runned_Miles Engin_size  \
1120          4.0    Grey    2007.0  Hatchback   

In [4]:
# Frequency of every advert colour, with missing values counted as their own row
print(carAd["Color"].value_counts(dropna = False))

# Names of the five most frequent colours (positions 0-4). This second count
# uses the default dropna, so NaN is left out of the ranking.
color_ranking = carAd["Color"].value_counts()
cols = color_ranking.head(5).index.tolist()
print(cols) # inspect


Black          48751
Silver         40214
Blue           38376
Grey           37678
White          34270
Red            25987
NaN            21875
Green           5027
Yellow          3072
Brown           2878
Orange          2829
Beige           1982
Purple          1361
Gold            1223
Bronze          1200
Multicolour      800
Pink             299
Maroon           179
Turquoise        176
Burgundy          48
Magenta           18
Navy               8
Indigo             4
Name: Color, dtype: int64
['Black', 'Silver', 'Blue', 'Grey', 'White']


In [5]:
# Rows belonging to the models of interest. `models_of_interest` is the list
# defined in the earlier model-count cell; reusing it keeps a single source
# of truth instead of repeating the literal list in several places.
is_target_model = carAd['Genmodel'].isin(models_of_interest)

# Five most frequent colours among adverts for those models.
# Note: the ranking is taken over every advert of those models, before the
# body-type and fuel-type conditions below are applied.
top_5_colors = carAd.loc[is_target_model, 'Color'].value_counts().head(5).index.tolist()

# Remaining row conditions: hatchback body, petrol or diesel fuel, and a
# colour from the top-five list computed above.
is_hatchback = carAd['Bodytype'] == 'Hatchback'
is_petrol_or_diesel = carAd['Fuel_type'].isin(['Petrol', 'Diesel'])
has_top_color = carAd['Color'].isin(top_5_colors)

# Columns to keep in the result
selected_columns = ['Maker', 'Genmodel', 'Color', 'Gearbox', 'Fuel_type', 'Bodytype']

# Select rows and columns in one .loc call. The explicit .copy() makes the
# result an independent frame, so later assignments to it do not trigger
# SettingWithCopyWarning.
filtered_cars = carAd.loc[
    is_hatchback & is_petrol_or_diesel & is_target_model & has_top_color,
    selected_columns,
].copy()

# Display the filtered DataFrame with the specified columns
print(filtered_cars)



         Maker Genmodel  Color Gearbox Fuel_type   Bodytype
7740   Citroen       C3   Blue  Manual    Diesel  Hatchback
7742   Citroen       C3  Black  Manual    Diesel  Hatchback
7743   Citroen       C3  Black  Manual    Petrol  Hatchback
7745   Citroen       C3    Red  Manual    Petrol  Hatchback
7746   Citroen       C3   Blue  Manual    Petrol  Hatchback
...        ...      ...    ...     ...       ...        ...
16262     Fiat    Panda  White  Manual    Petrol  Hatchback
16264     Fiat    Panda  White  Manual    Petrol  Hatchback
16265     Fiat    Panda    Red  Manual    Petrol  Hatchback
16268     Fiat    Panda    Red  Manual    Petrol  Hatchback
16269     Fiat    Panda   Blue  Manual    Petrol  Hatchback

[2662 rows x 6 columns]
