In [1]:
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from IPython.core.display import Markdown
from IPython.core.display_functions import display
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

import notebooks.config as config
import notebooks.utils as nb_utils
from src.config import FeaturesInfo
from src.features.univariate_analysis import UACleaner
from src.utils import init_cols_nan_strategy, preprocess_init

# Stage of the pipeline this notebook implements; the stage before it supplies
# the input (PREVIOUS_STAGE is the key used to look it up in nb_utils.STAGES_DICT).
STAGE = 1
PREVIOUS_STAGE = STAGE - 1

# Reload imported modules automatically before each cell runs, so edits to the
# project sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Load cleaned data and metadata in artifacts

In [2]:
# Look up the entry of the previous stage once, then load what it points to:
# its dataset name and the folder it is stored in.
prev_stage_entry = nb_utils.STAGES_DICT[PREVIOUS_STAGE]
df, features_info = nb_utils.load_dataset_and_metadata(
    prev_stage_entry["name"], prev_stage_entry["folder_path"]
)

In [3]:
# Column-level overview of the loaded frame (dtypes, non-null counts, deep memory usage).
df.info(memory_usage="deep", max_cols=200)
print()

# Largest number of missing values found in any single row.
max_nans_for_rows = df.isna().sum(axis=1).max()
print(f"Max NaNs for rows: {max_nans_for_rows}")
print()

# Lift the row limit only while the per-column NaN report is produced.
# `option_context` restores the previous setting on exit -- also when the
# display raises -- so no manual `pd.reset_option` call is needed afterwards.
with pd.option_context("display.max_rows", None):
    print("NaNs for cols")
    nans_for_cols = nb_utils.get_nas(df)
    display(nans_for_cols[nans_for_cols > 0])

<class 'pandas.core.frame.DataFrame'>
Index: 29991 entries, 9249043 to 23246511
Data columns (total 170 columns):
 #    Column                                   Non-Null Count  Dtype         
---   ------                                   --------------  -----         
 0    name                                     29991 non-null  string        
 1    short_url                                29991 non-null  string        
 2    price                                    29991 non-null  UInt32        
 3    listing_followers_no                     27205 non-null  UInt16        
 4    location                                 29991 non-null  category      
 5    images_no                                29991 non-null  UInt8         
 6    description                              29991 non-null  string        
 7    gi_brand                                 29991 non-null  category      
 8    gi_model                                 29991 non-null  category      
 9    gi_production_year    

Unnamed: 0,missing count,missing [%]
ai_range_on_full_battery_km,29864,99.57654
gi_battery_capacity,29834,99.47651
ai_interest_free_credit,29805,99.379814
ai_cash_payment,29708,99.056384
ai_deposit,29518,98.42286
ai_installment_amount,29413,98.072755
ai_installment_no,29381,97.966056
ai_leasing,29204,97.375879
gi_certified,28474,94.941816
ai_credit,28174,93.941516


In [4]:
# Keep a handle on the complete frame; `df` is re-pointed at the training split below.
df_orig = df

# Hold out a test split using the size and seed from the notebook configuration.
df_train, df_test = train_test_split(
    df_orig,
    random_state=config.RANDOM_SEED,
    test_size=config.TEST_SIZE,
)

# From here on `df` is just another name for the training split (an alias, not a copy).
df = df_train

# NaN-handling strategy mapping from the project helper, plus an initially
# empty list (presumably row indices to drop are collected in it later -- TODO confirm).
cols_nan_strategy, idx_to_remove = init_cols_nan_strategy(), []

In [5]:
# Row/column counts of both splits, unpacked in one go.
(n_rows_train, n_cols_train), (n_rows_test, n_cols_test) = df_train.shape, df_test.shape

# Emit the whole report with a single print; the empty string yields the blank
# line between the two sections.
print(
    "\n".join(
        [
            "Train dataset shape",
            20 * "-",
            f"No of rows: {n_rows_train}",
            f"No of cols: {n_cols_train}",
            "",
            "Test dataset shape",
            20 * "-",
            f"No of rows: {n_rows_test}",
            f"No of cols: {n_cols_test}",
            20 * "-",
        ]
    )
)

Train dataset shape
--------------------
No of rows: 23992
No of cols: 170

Test dataset shape
--------------------
No of rows: 5999
No of cols: 170
--------------------


### nominal features

In [7]:
def _regroup_categories(column: pd.Series, mapping: Dict[str, str]) -> pd.Categorical:
    """Return ``column`` as an unordered categorical with labels merged per ``mapping``.

    The column is cast to the ``string`` dtype so labels can be replaced freely;
    the categories of the result are inferred from the values that remain.
    """
    return pd.Categorical(column.astype("string").replace(mapping), ordered=False)


def _append_missing(target: List[str], candidates: List[str]) -> None:
    """Append to ``target`` (in place, order kept) every candidate it does not hold yet."""
    for candidate in candidates:
        if candidate not in target:
            target.append(candidate)


@preprocess_init
def ua_nominal_features_nb(
    df: pd.DataFrame,
    features_info: FeaturesInfo,
    cols_nan_strategy: Dict[str, List[str]],
) -> Tuple[pd.DataFrame, FeaturesInfo, Dict[str, List[str]]]:
    """Inspect and clean the nominal features, and register their NaN strategies.

    Steps, in order:

    1. For every column listed in ``features_info["nominal"]``: drop unused
       categories and display its frequency table.
    2. Replace blank labels of ``gi_body_type`` (empty or whitespace-only) with
       NaN and drop the category that became unused.
    3. Merge similar labels of ``gi_fuel_type`` and ``ai_gearbox_type``.
    4. Register a fixed set of columns under ``cols_nan_strategy["const"]`` and
       all remaining nominal columns under ``cols_nan_strategy["modus"]``.
       Columns that are already registered are not added again, so re-running
       the cell does not pile up duplicates.

    Args:
        df: Frame holding the nominal columns; modified column by column.
        features_info: Metadata mapping; only its ``"nominal"`` entry is read.
        cols_nan_strategy: Mapping with ``"const"`` and ``"modus"`` lists,
            extended in place.

    Returns:
        The ``(df, features_info, cols_nan_strategy)`` triple after the changes.

    Note:
        The function is wrapped by the project decorator ``preprocess_init``,
        whose behaviour is not visible in this notebook.
    """
    print("Frequencies for nominal features")
    for col in features_info["nominal"]:
        df[col] = df[col].cat.remove_unused_categories()
        display(nb_utils.get_value_counts_freq_with_perc(df, col))
    print("Dropped empty categories")
    print()

    # Treat both "" and whitespace-only labels as blank: the log line below
    # speaks of a ' ' category, so an exact comparison against "" alone could
    # silently match nothing. Missing values stay untouched (NA -> False).
    body_type_labels = df["gi_body_type"].astype("string")
    is_blank = body_type_labels.str.strip().eq("").fillna(False).astype(bool)
    df.loc[is_blank, "gi_body_type"] = np.nan
    if isinstance(df["gi_body_type"].dtype, pd.CategoricalDtype):
        # The blank label has no rows left; keep the categories free of unused
        # entries, as the loop above did for every nominal column.
        df["gi_body_type"] = df["gi_body_type"].cat.remove_unused_categories()
    print("Replaced ' ' category for 'gi_body_type' with NaN")
    # NOTE(review): the frequency table of 'ai_vehicle_origin' also shows a
    # blank label -- confirm whether it needs the same treatment.

    df["gi_fuel_type"] = _regroup_categories(
        df["gi_fuel_type"],
        {
            "Hibridni pogon (Benzin)": "Hibridni pogon",
            "Hibridni pogon (Dizel)": "Hibridni pogon",
            "Plug-in hibrid": "Hibridni pogon",
            "Metan CNG": "Benzin + Metan (CNG)",
        },
    )
    print("Grouped simillar categories for 'gi_fuel_type'")

    df["ai_gearbox_type"] = _regroup_categories(
        df["ai_gearbox_type"],
        {
            "Automatski": "Automatski / poluautomatski",
            "Poluautomatski": "Automatski / poluautomatski",
        },
    )
    print("Grouped simillar categories for 'ai_gearbox_type'")

    # Columns registered under the "const" strategy; every other nominal
    # column is registered under the "modus" strategy.
    constant_strat_cols = [
        "ai_floating_flywheel",
        "ai_interior_material",
        "ai_interior_color",
        "ai_ownership",
        "ai_import_country",
        "ai_sales_method",
    ]
    modus_strat_cols = [
        col for col in features_info["nominal"] if col not in constant_strat_cols
    ]
    _append_missing(cols_nan_strategy["const"], constant_strat_cols)
    _append_missing(cols_nan_strategy["modus"], modus_strat_cols)
    print("NaN constant strategy columns:")
    print(constant_strat_cols)
    print("Extended 'constant' and 'modus' column_handle_nan_dict")

    return df, features_info, cols_nan_strategy


# Run the nominal-feature step and rebind the notebook state to what it returns.
(df, features_info, cols_nan_strategy) = ua_nominal_features_nb(
    df=df,
    features_info=features_info,
    cols_nan_strategy=cols_nan_strategy,
)

Frequencies for nominal features


Unnamed: 0_level_0,count,percentage [%]
location,Unnamed: 1_level_1,Unnamed: 2_level_1
Beograd,3925,16.359620
Novi Sad,1682,7.010670
Niš,977,4.072191
Kragujevac,890,3.709570
Čačak,716,2.984328
...,...,...
Rudnik,1,0.004168
Rudna Glava,1,0.004168
Riđica,1,0.004168
Jaša Tomić,1,0.004168


Unnamed: 0_level_0,count,percentage [%]
gi_brand,Unnamed: 1_level_1,Unnamed: 2_level_1
Volkswagen,3855,16.067856
Audi,2503,10.432644
BMW,2173,9.057186
Opel,1886,7.860954
Peugeot,1612,6.718906
...,...,...
Great Wall,1,0.004168
Geely,1,0.004168
Cadillac,1,0.004168
Buick,1,0.004168


Unnamed: 0_level_0,count,percentage [%]
gi_model,Unnamed: 1_level_1,Unnamed: 2_level_1
A4,677,2.821774
Octavia,499,2.079860
A3,496,2.067356
A6,480,2.000667
Golf 7,465,1.938146
...,...,...
E 230,1,0.004168
SL 280,1,0.004168
SL 300,1,0.004168
SL 63 AMG,1,0.004168


Unnamed: 0_level_0,count,percentage [%]
gi_body_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Hečbek,8212,34.228076
Džip/SUV,4740,19.756586
Limuzina,4429,18.46032
Karavan,3633,15.142548
Monovolumen (MiniVan),1944,8.102701
Kupe,645,2.688396
Kabriolet/Roadster,247,1.02951
Pickup,140,0.583528
,2,0.008336


Unnamed: 0_level_0,count,percentage [%]
gi_fuel_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Dizel,16394,68.33111
Benzin,5895,24.57069
Benzin + Gas (TNG),895,3.73041
Benzin + Metan (CNG),443,1.846449
Hibridni pogon,196,0.816939
Električni pogon,129,0.537679
Hibridni pogon (Benzin),24,0.100033
Hibridni pogon (Dizel),13,0.054185
Plug-in hibrid,2,0.008336
Metan CNG,1,0.004168


Unnamed: 0_level_0,count,percentage [%]
gi_trade_in,Unnamed: 1_level_1,Unnamed: 2_level_1
NE,17683,73.703735
DA,2981,12.424975
DA (za jeftinije),2206,9.194732
DA (za skuplje),775,3.230243
DA (u istoj ceni),347,1.446315


Unnamed: 0_level_0,count,percentage [%]
ai_floating_flywheel,Unnamed: 1_level_1,Unnamed: 2_level_1
,16319,68.018506
Sa plivajućim zamajcem,4941,20.594365
Bez plivajućeg zamajca,2732,11.387129


Unnamed: 0_level_0,count,percentage [%]
ai_gearbox_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Manuelni 5 brzina,8499,35.424308
Manuelni 6 brzina,8116,33.827943
Automatski / poluautomatski,7252,30.226742
Manuelni 4 brzine,111,0.462654
Automatski,13,0.054185
Poluautomatski,1,0.004168


Unnamed: 0_level_0,count,percentage [%]
ai_air_conditioning,Unnamed: 1_level_1,Unnamed: 2_level_1
Automatska klima,16266,67.797599
Manuelna klima,6940,28.926309
Nema klimu,786,3.276092


Unnamed: 0_level_0,count,percentage [%]
ai_color,Unnamed: 1_level_1,Unnamed: 2_level_1
Siva,7659,31.923141
Crna,6265,26.112871
Bela,3565,14.85912
Plava,2177,9.073858
Crvena,1094,4.559853
Srebrna,594,2.475825
Teget,576,2.4008
Braon,462,1.925642
Zelena,386,1.60887
Bordo,256,1.067022


Unnamed: 0_level_0,count,percentage [%]
ai_interior_material,Unnamed: 1_level_1,Unnamed: 2_level_1
Štof,12955,53.997166
Prirodna koža,4103,17.101534
,3266,13.612871
Kombinovana koža,2933,12.224908
Drugi,380,1.583861
Velur,355,1.47966


Unnamed: 0_level_0,count,percentage [%]
ai_interior_color,Unnamed: 1_level_1,Unnamed: 2_level_1
Crna,12217,50.92114
Siva,4596,19.156385
,4182,17.43081
Druga,1351,5.631044
Bež,1139,4.747416
Smeđa,507,2.113204


Unnamed: 0_level_0,count,percentage [%]
ai_propulsion,Unnamed: 1_level_1,Unnamed: 2_level_1
Prednji,17406,72.549183
4x4,4012,16.722241
Zadnji,2091,8.715405
4x4 reduktor,483,2.013171


Unnamed: 0_level_0,count,percentage [%]
ai_vehicle_origin,Unnamed: 1_level_1,Unnamed: 2_level_1
Domaće tablice,14673,61.157886
Na ime kupca,9171,38.225242
Strane tablice,146,0.608536
,2,0.008336


Unnamed: 0_level_0,count,percentage [%]
ai_ownership,Unnamed: 1_level_1,Unnamed: 2_level_1
Vodi se na prodavca,10279,42.843448
,9338,38.921307
Ne vodi se na prodavca,4375,18.235245


Unnamed: 0_level_0,count,percentage [%]
ai_import_country,Unnamed: 1_level_1,Unnamed: 2_level_1
,17948,74.808269
Švajcarska,1609,6.706402
Italija,1109,4.622374
Francuska,995,4.147216
Nemačka,846,3.526175
Holandija,534,2.225742
Belgija,294,1.225408
Norveška,209,0.871124
Austrija,155,0.646049
Slovenija,131,0.546015


Unnamed: 0_level_0,count,percentage [%]
ai_sales_method,Unnamed: 1_level_1,Unnamed: 2_level_1
,21563,89.875792
Komisiona,2210,9.211404
Konsignaciona,219,0.912804


Dropped empty categories

Replaced ' ' category for 'gi_body_type' with NaN
Grouped simillar categories for 'gi_fuel_type'
Grouped simillar categories for 'ai_gearbox_type'
NaN constant strategy columns:
['ai_floating_flywheel', 'ai_interior_material', 'ai_interior_color', 'ai_ownership', 'ai_import_country', 'ai_sales_method']
Extended 'constant' and 'modus' column_handle_nan_dict
