In [1]:
from typing import Tuple

import numpy as np
import pandas as pd
from IPython.core.display import Markdown
from IPython.core.display_functions import display

import notebooks.utils as nb_utils
import notebooks.config as config
from src.config import FeaturesInfo
from src.db.broker import DbBroker
from src.features.initial_cleaning import InitialCleaner
from src.logger import logging
from src.utils import init_features_info, preprocess_init

# NOTE(review): STAGE is not referenced in the visible cells; presumably it
# identifies this notebook's position in the processing pipeline — confirm.
STAGE = 0

# Load IPython's autoreload extension in mode 2, so that imported modules are
# re-imported before each cell execution and edits to project code are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Get dataset and metadata

In [2]:
def get_dataset_from_db() -> pd.DataFrame:
    """Load all listings from the database into a DataFrame.

    The statement returned by ``DbBroker.get_all_listings_statement()`` is
    executed through the broker's engine; ``config.INDEX`` becomes the index
    and ``config.DTYPE_BACKEND`` selects the dtype backend used by pandas.

    Returns:
        The listings frame with every column label converted to ``str``.

    Raises:
        Whatever ``pd.read_sql`` raises when the query fails; the engine is
        disposed before the exception propagates.
    """
    db_broker = DbBroker()
    try:
        df = pd.read_sql(
            db_broker.get_all_listings_statement(),
            db_broker.engine,
            dtype_backend=config.DTYPE_BACKEND,
            index_col=config.INDEX,
        )
    finally:
        # Dispose of the engine on every path so a failed query does not
        # leave it (and whatever connections it holds) alive in the kernel.
        db_broker.engine.dispose()

    # Column labels are normalised to plain strings for later name handling.
    return df.rename(str, axis="columns")


def get_metadata() -> FeaturesInfo:
    """Return the features-info structure built by ``init_features_info``."""
    return init_features_info()

In [3]:
# Load the raw listings and the features-info structure; the cleaning cells
# below receive both and return updated versions of them.
df = get_dataset_from_db()
features_info = get_metadata()

## Dataset info

In [4]:
# Overview of the freshly loaded frame: count of fully duplicated rows,
# then per-column non-null counts and dtypes.
print("Entire dataset")
print(f"Duplicate rows: {df.duplicated().sum()}")
print(20 * "-")
df.info()

Entire dataset


Duplicate rows: 0
--------------------
<class 'pandas.core.frame.DataFrame'>
Index: 30788 entries, 9249043 to 23246511
Data columns (total 50 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   name                      30788 non-null  string
 1   short_url                 30788 non-null  string
 2   price                     30788 non-null  string
 3   listing_followers_no      30788 non-null  string
 4   location                  30788 non-null  string
 5   images_no                 30788 non-null  string
 6   safety                    30788 non-null  string
 7   equipment                 30788 non-null  string
 8   other                     30788 non-null  string
 9   description               30788 non-null  string
 10  id_1                      30788 non-null  Int64 
 11  condition                 30788 non-null  string
 12  brand                     30788 non-null  string
 13  model                     30788 n

## Entire dataset cleaning

### Initial clean

In [5]:
@preprocess_init
def initial_preparation_nb(
    df: pd.DataFrame, features_info: FeaturesInfo
) -> Tuple[pd.DataFrame, FeaturesInfo]:
    """Normalise column labels and drop helper columns of the raw frame.

    Steps, in order:
      1. Cast the column labels to the ``string`` dtype.
      2. Prefix the columns located between ``id_1`` and ``id_2`` with
         ``gi_`` (table general_informations) and the columns located after
         ``id_2`` with ``ai_`` (table additional_informations).
      3. Drop ``id_1``, ``id_2`` and ``gi_fixed_price``.

    Args:
        df: Frame whose columns still have the layout produced by the query,
            i.e. it contains ``id_1`` followed later by ``id_2``.
        features_info: Passed through unchanged.

    Returns:
        The modified frame together with ``features_info``.
    """
    df.columns = df.columns.astype("string")
    print("Transformed column type to string")

    # "id_1" and "id_2" delimit the two groups of columns: everything
    # strictly between them gets "gi_", everything after "id_2" gets "ai_".
    gi_start = df.columns.get_loc("id_1") + 1
    gi_stop = df.columns.get_loc("id_2")

    gi_names = df.columns[gi_start:gi_stop].values
    ai_names = df.columns[gi_stop + 1 :].values

    gi_renames = {old_name: "gi_" + old_name for old_name in gi_names}
    df.rename(columns=gi_renames, inplace=True)
    print("Added prefix 'gi_' to columns from table general_informations")

    ai_renames = {old_name: "ai_" + old_name for old_name in ai_names}
    df.rename(columns=ai_renames, inplace=True)
    print("Added prefix 'ai_' to columns from table additional_informations")

    df.drop(columns=["id_1", "id_2"], inplace=True)
    print("Removed redundant ids")

    df.drop(columns=["gi_fixed_price"], inplace=True)
    print("Removing gi_fixed_price that is poorly scraped")

    return df, features_info

In [6]:
# Apply the initial preparation step; `df` and `features_info` are rebound to
# the returned values, so re-running this cell starts from the updated frame.
df, features_info = initial_preparation_nb(df=df, features_info=features_info)

Transformed column type to string
Added prefix 'gi_' to columns from table general_informations
Added prefix 'ai_' to columns from table additional_informations
Removed redundant ids
Removing gi_fixed_price that is poorly scraped


### Feature 'name'

In [7]:
@preprocess_init
def cf_name_nb(
    df: pd.DataFrame, features_info: FeaturesInfo
) -> Tuple[pd.DataFrame, FeaturesInfo]:
    """Inspect the listing-name feature and register it under 'other'.

    Shows the first rows, the 20 most frequent values and the number of rows
    whose value contains "ucesc" or "učešć" (case-insensitive). The frame is
    not modified; only ``features_info["other"]`` is extended.

    Returns:
        The unchanged frame and the updated ``features_info``.
    """
    # presumably resolves to the column handled by this function — the
    # helper's implementation is not visible here.
    feature_name = nb_utils.get_feature_name()
    name_values = df[feature_name]

    print("First 5 rows")
    display(df[[feature_name]].head())
    print()

    print("Counts by frequencies for the first 20 categories")
    top_counts = name_values.value_counts().head(20)
    display(top_counts)
    print()

    irregular_mask = name_values.str.contains("ucesc|učešć", case=False)
    sum_irregular_name_rows = irregular_mask.sum()
    print(f"Irregular 'name' rows count: {sum_irregular_name_rows}")
    display(Markdown("This feature will be left for the Multivariate analysis."))
    print()

    features_info["other"].append(feature_name)
    print(f"Added {feature_name} to 'other' features")

    return df, features_info

In [8]:
# Run the 'name' feature step and rebind the frame and features info.
df, features_info = cf_name_nb(df=df, features_info=features_info)

First 5 rows


Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
9249043,Volvo 940 DIPLOMATA NOV
9650557,"SsangYong Kyron 2,0 XDI4x4RESTAJLING"
9657017,Audi A4 2.5 TDI Quattro CH
9755964,Dacia Sandero 1.0 TCE 90 LAGER
9921885,Zastava 750 presla 38.400



Counts by frequencies for the first 20 categories


name
Audi A4                 179
Volkswagen Golf 5       123
Audi A6                 119
BMW 320                 115
Volkswagen Polo         114
Volkswagen Passat B6    106
Audi A3                 105
Renault Megane          102
Škoda Octavia           100
Volkswagen Golf 6        98
Volkswagen Golf 7        87
Ford Focus               82
Renault Clio             79
BMW 520                  73
Ford Fiesta              69
Fiat Punto               68
Opel Corsa D             68
Peugeot 308              67
Citroen C5               65
Audi A4 2.0 TDI          64
Name: count, dtype: Int64


Irregular 'name' rows count: 18


This feature will be left for the Multivariate analysis.


Added name to 'other' features


### Feature 'short_url'

In [9]:
@preprocess_init
def cf_short_url_nb(
    df: pd.DataFrame, features_info: FeaturesInfo
) -> Tuple[pd.DataFrame, FeaturesInfo]:
    """Preview the short-URL feature and register it under 'other'.

    The frame itself is returned untouched; only ``features_info["other"]``
    gains the feature name.
    """
    feature_name = nb_utils.get_feature_name()

    print("Display first 5 rows")
    preview = df[[feature_name]].head()
    display(preview)
    print()

    features_info["other"].append(feature_name)
    print(f"Added {feature_name} to 'other' features")

    return df, features_info

In [10]:
# Run the 'short_url' feature step and rebind the frame and features info.
df, features_info = cf_short_url_nb(df=df, features_info=features_info)

Display first 5 rows


Unnamed: 0_level_0,short_url
id,Unnamed: 1_level_1
9249043,https://www.polovniautomobili.com/auto-oglasi/...
9650557,https://www.polovniautomobili.com/auto-oglasi/...
9657017,https://www.polovniautomobili.com/auto-oglasi/...
9755964,https://www.polovniautomobili.com/auto-oglasi/...
9921885,https://www.polovniautomobili.com/auto-oglasi/...



Added short_url to 'other' features


### Feature 'price'

In [11]:
@preprocess_init
def cf_price_nb(
    df: pd.DataFrame, features_info: FeaturesInfo
) -> Tuple[pd.DataFrame, FeaturesInfo]:
    """Convert the price column to numbers and drop listings without a price.

    Raw values are strings; each one has its last character dropped, every
    '.' removed and surrounding whitespace stripped before being parsed as a
    number. Rows with price == 1 and rows with price < 100 are then removed.

    The check for the literal text 'Po dogovoru' runs on the raw strings,
    *before* the numeric conversion. Comparing the converted numeric column
    with a string can never match, so running the check after the conversion
    would always display an empty frame.

    Args:
        df: Frame containing the raw string price column and ``short_url``.
        features_info: Passed through unchanged.

    Returns:
        The filtered frame with a numeric price column, and ``features_info``.

    Raises:
        ValueError: If any price cannot be parsed (``errors="raise"``).
    """
    feature_name = nb_utils.get_feature_name()

    print("First 5 rows")
    display(df[[feature_name]].head())
    print()

    # Evaluate on the raw text; na=False keeps the mask usable for indexing
    # even if some price values are missing.
    agreed_price_mask = df[feature_name].str.contains(
        "Po dogovoru", regex=False, na=False
    )
    print("Cars where price = 'Po dogovoru'")
    display(df[agreed_price_mask])
    print()

    df[feature_name] = pd.to_numeric(
        df[feature_name]
        .str.slice(stop=-1)  # drop the trailing character of the raw value
        .str.replace(".", "", regex=False)  # '.' is a literal, not a regex
        .str.strip(),
        errors="raise",
        downcast="unsigned",
    )
    print("Removed '.' and spaces from values and transformed to numerical")
    print()

    print("Cars where price = 1")
    display(df[df[feature_name] == 1])
    display(
        Markdown(
            "These are cars where price='Po dogovoru' on the website, so that means they didn't have a price to begin with and they will be dropped."
        )
    )
    print()

    df = df[df[feature_name] != 1]
    print("Removed cars that had price = 1")

    cars_price_less_than_100 = df.loc[df[feature_name] < 100, feature_name]
    print("Short url of cars that had price < 100")
    display(df.loc[cars_price_less_than_100.index, "short_url"].tolist())
    print()

    display(
        Markdown(
            "All of these cars are going to be deleted because they actually don't have a value of price (their current value is the result of poor handling the case when price is set by an agreement)."
        )
    )

    df = df.drop(cars_price_less_than_100.index, axis=0)
    print("Removed cars that had price < 100")

    return df, features_info

In [12]:
# Run the 'price' feature step; the returned frame has rows removed, so `df`
# is rebound to the filtered result.
df, features_info = cf_price_nb(df=df, features_info=features_info)

First 5 rows


Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
9249043,18.000 €
9650557,4.500 €
9657017,3.300 €
9755964,16.100 €
9921885,12.999 €



Removed '.' and spaces from values and transformed to numerical

Cars where price = 1


Unnamed: 0_level_0,name,short_url,price,listing_followers_no,location,images_no,safety,equipment,other,description,...,ai_import_country,ai_sales_method,ai_credit,ai_deposit,ai_installment_no,ai_installment_amount,ai_interest_free_credit,ai_leasing,ai_cash_payment,ai_range_on_full_battery_km
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23220124,Audi A4 RATA 183e BEZ UCESCA,https://www.polovniautomobili.com/auto-oglasi/...,1,2.0,Beograd,39,"Airbag za vozača,Airbag za suvozača,Bočni airb...","Servo volan,Daljinsko zaključavanje,Tonirana s...",,,...,,,DA,,,183 €,,,,
23220566,Renault Scenic RATA 148e BEZ UCESCA,https://www.polovniautomobili.com/auto-oglasi/...,1,,Beograd,28,"Airbag za vozača,Airbag za suvozača,Bočni airb...","Servo volan,Multifunkcionalni volan,Tempomat,D...",,VOZILO U DOLASKU ...............................,...,,,DA,,,148 €,,,,
23221119,Opel Astra J RATA 148e BEZ UCESCA,https://www.polovniautomobili.com/auto-oglasi/...,1,1.0,Beograd,14,"Airbag za vozača,Airbag za suvozača,Bočni airb...","Servo volan,Multifunkcionalni volan,Tempomat,D...",,Vozilo u dolasku ...............................,...,,,DA,,,148 €,,,,
23228430,Toyota Yaris RATA 78e BEZ UCESCA,https://www.polovniautomobili.com/auto-oglasi/...,1,1.0,Beograd,17,"Airbag za vozača,Airbag za suvozača,Bočni airb...","Servo volan,Daljinsko zaključavanje,Putni raču...",,VOZILO U DOLASKU,...,,,DA,,,78 €,,,,
23230448,Peugeot 306,https://www.polovniautomobili.com/auto-oglasi/...,1,2.0,Landol,19,"Airbag za vozača,Airbag za suvozača,ABS","Metalik boja,Servo volan,Tonirana stakla,Radio...",Garažiran,,...,,,,,,,,,,


These are cars where price='Po dogovoru' on the website, so that means they didn't have a price to begin with and they will be dropped.


Removed cars that had price = 1
Cars where price = 'Po dogovoru'


Unnamed: 0_level_0,name,short_url,price,listing_followers_no,location,images_no,safety,equipment,other,description,...,ai_import_country,ai_sales_method,ai_credit,ai_deposit,ai_installment_no,ai_installment_amount,ai_interest_free_credit,ai_leasing,ai_cash_payment,ai_range_on_full_battery_km
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1



Short url of cars that had price < 100


['https://www.polovniautomobili.com/auto-oglasi/22675403/Opel Meriva RATA 78e BEZ UCESCA',
 'https://www.polovniautomobili.com/auto-oglasi/22681873/Ford Focus RATA 83e BEZ UČEŠĆA',
 'https://www.polovniautomobili.com/auto-oglasi/22681890/Opel Astra H RATA 99e BEZ UČEŠĆA',
 'https://www.polovniautomobili.com/auto-oglasi/23225453/Zastava Yugo Tempo']




All of these cars are going to be deleted because they actually don't have a value of price (their current value is the result of poor handling the case when price is set by an agreement).

Removed cars that had price < 100


### Feature 'listing_followers_no'

In [13]:
@preprocess_init
def cf_listing_followers_no_nb(
    df: pd.DataFrame, features_info: FeaturesInfo
) -> Tuple[pd.DataFrame, FeaturesInfo]:
    """Convert the listing-followers count to a numeric column.

    The column is parsed with ``pd.to_numeric`` (downcast to the smallest
    unsigned type that fits) and its name is appended to
    ``features_info["numerical"]``.

    Returns:
        The frame with the converted column and the updated ``features_info``.

    Raises:
        ValueError: If a value cannot be parsed (``errors="raise"``).
    """
    feature_name = nb_utils.get_feature_name()

    print("First 5 rows")
    display(df[[feature_name]].head())
    print()

    df[feature_name] = pd.to_numeric(
        df[feature_name], downcast="unsigned", errors="raise"
    )
    print("Transformed to numerical")

    features_info["numerical"].append(feature_name)
    # Message previously ended with a stray quote and left the group name
    # unquoted; it now follows the same pattern as the other feature steps.
    print(f"Added '{feature_name}' to 'numerical' features")

    return df, features_info

In [14]:
# Run the 'listing_followers_no' feature step and rebind the results.
df, features_info = cf_listing_followers_no_nb(df=df, features_info=features_info)

First 5 rows


Unnamed: 0_level_0,listing_followers_no
id,Unnamed: 1_level_1
9249043,58
9650557,87
9657017,151
9755964,50
9921885,421



Transformed to numerical
Added 'listing_followers_no' to numerical features'


### Feature 'location'

In [15]:
@preprocess_init
def cf_location_nb(
    df: pd.DataFrame, features_info: FeaturesInfo
) -> Tuple[pd.DataFrame, FeaturesInfo]:
    """Turn the location column into an unordered categorical.

    After the conversion the feature name is appended to
    ``features_info["nominal"]``.

    Returns:
        The frame with the categorical column and the updated
        ``features_info``.
    """
    feature_name = nb_utils.get_feature_name()

    print("First 5 rows")
    preview = df[[feature_name]].head()
    display(preview)
    print()

    as_nominal = pd.Categorical(df[feature_name], ordered=False)
    df[feature_name] = as_nominal
    print(f"Converted '{feature_name}' to categorical type (nominal)")

    features_info["nominal"].append(feature_name)
    print(f"Added '{feature_name}' to 'nominal' features")

    return df, features_info

In [16]:
# Run the 'location' feature step and rebind the frame and features info.
df, features_info = cf_location_nb(df=df, features_info=features_info)

First 5 rows


Unnamed: 0_level_0,location
id,Unnamed: 1_level_1
9249043,Vrbas
9650557,Novi Sad
9657017,Loznica
9755964,Novi Sad
9921885,Zemun



Converted 'location' to categorical type (nominal)
Added 'location' to 'nominal' features


### Feature 'images_no'

In [17]:
@preprocess_init
def cf_images_no_nb(
    df: pd.DataFrame, features_info: FeaturesInfo
) -> Tuple[pd.DataFrame, FeaturesInfo]:
    """Convert the image-count column to a numeric column.

    Parsing uses ``pd.to_numeric`` with an unsigned downcast and raises on
    any unparsable value; the feature name is then appended to
    ``features_info["numerical"]``.

    Returns:
        The frame with the converted column and the updated ``features_info``.
    """
    feature_name = nb_utils.get_feature_name()

    print("First 5 rows")
    preview = df[[feature_name]].head()
    display(preview)
    print()

    numeric_values = pd.to_numeric(
        df[feature_name], downcast="unsigned", errors="raise"
    )
    df[feature_name] = numeric_values
    print("Transformed to numerical")

    features_info["numerical"].append(feature_name)
    print(f"Added {feature_name} to 'numerical' features")

    return df, features_info

In [18]:
# Run the 'images_no' feature step and rebind the frame and features info.
df, features_info = cf_images_no_nb(df=df, features_info=features_info)

First 5 rows


Unnamed: 0_level_0,images_no
id,Unnamed: 1_level_1
9249043,30
9650557,17
9657017,19
9755964,9
9921885,15



Transformed to numerical
Added images_no to 'numerical' features


### Feature 'safety'

In [19]:
@preprocess_init
def cf_safety_nb(
    df: pd.DataFrame, features_info: FeaturesInfo
) -> Tuple[pd.DataFrame, FeaturesInfo]:
    """Expand the comma-separated safety column into boolean indicator columns.

    Every distinct item of the comma-separated value becomes a column
    prefixed with ``s_``. The original column is removed, the indicator
    columns are cast to the nullable ``boolean`` dtype, and their names are
    cleaned: surrounding whitespace is stripped, '-' and ' ' become '_', and
    '/' becomes '_ili_'. The cleaned names are appended to
    ``features_info["binary"]``.

    Only the columns created here are converted and renamed. Selecting by
    the ``s_`` prefix over the whole frame would also capture any unrelated
    column that happens to start with the same prefix.

    Returns:
        The extended frame and the updated ``features_info``.
    """
    feature_name = nb_utils.get_feature_name()

    print("First 5 rows")
    display(df[[feature_name]].head())
    print()

    prefix = "s_"

    # Create data frame with dummy columns
    df_safety_dummies = df[feature_name].str.get_dummies(sep=",").add_prefix(prefix)
    df = pd.concat([df, df_safety_dummies], axis=1)
    print("Extended the data frame with dummy columns")

    del df[feature_name]
    print(f"Deleted '{feature_name}' column")

    # Take the names from the dummy frame itself, as the equipment and
    # other-feature steps do, instead of re-discovering them by prefix.
    safety_columns = df_safety_dummies.columns.tolist()

    df[safety_columns] = df[safety_columns].astype("boolean")
    print("Converted all remaining safety columns to boolean")

    safety_columns_fixed = (
        df_safety_dummies.columns.str.strip()
        .str.replace(r"[- ]", "_", regex=True)
        .str.replace("/", "_ili_", regex=False)
    ).tolist()

    df.rename(columns=dict(zip(safety_columns, safety_columns_fixed)), inplace=True)
    print("Fixed column names")

    features_info["binary"].extend(safety_columns_fixed)
    print("Added all remaining safety columns to 'binary' features")

    return df, features_info

In [20]:
# Run the 'safety' feature step; the returned frame has the original column
# replaced by indicator columns, so `df` is rebound to it.
df, features_info = cf_safety_nb(df=df, features_info=features_info)

First 5 rows


Unnamed: 0_level_0,safety
id,Unnamed: 1_level_1
9249043,
9650557,"Airbag za vozača,Airbag za suvozača,Bočni airb..."
9657017,"Airbag za vozača,Airbag za suvozača,Bočni airb..."
9755964,"Airbag za vozača,Airbag za suvozača,Bočni airb..."
9921885,



Extended the data frame with dummy columns
Deleted 'safety' column
Converted all remaining safety columns to boolean
Fixed column names
Added all remaining safety columns to 'binary' features


### Feature 'equipment'

In [21]:
@preprocess_init
def cf_equipment_nb(
    df: pd.DataFrame, features_info: FeaturesInfo
) -> Tuple[pd.DataFrame, FeaturesInfo]:
    """Expand the comma-separated equipment column into boolean indicators.

    Each distinct item becomes a column prefixed with ``e_``; the source
    column is dropped, the indicators are cast to nullable ``boolean`` and
    their names are cleaned (whitespace stripped, '-' and ' ' replaced by
    '_', '/' replaced by '_ili_'). The cleaned names are appended to
    ``features_info["binary"]``.

    Returns:
        The extended frame and the updated ``features_info``.
    """
    feature_name = nb_utils.get_feature_name()

    print("First 5 rows")
    display(df[[feature_name]].head())
    print()

    prefix = "e_"

    # One indicator column per distinct item of the comma-separated value.
    indicators = df[feature_name].str.get_dummies(sep=",").add_prefix(prefix)
    df = pd.concat([df, indicators], axis=1)
    print("Extended the data frame with dummy columns")

    df = df.drop(columns=feature_name)
    print(f"Deleted '{feature_name}' column")

    raw_names = indicators.columns.tolist()

    df[raw_names] = df[raw_names].astype("boolean")
    print("Converted all remaining equipment columns to boolean")

    stripped = indicators.columns.str.strip()
    underscored = stripped.str.replace(r"[- ]", "_", regex=True)
    cleaned_names = underscored.str.replace("/", "_ili_").tolist()

    name_mapping = dict(zip(raw_names, cleaned_names))
    df = df.rename(columns=name_mapping)
    print("Fixed column names")

    features_info["binary"].extend(cleaned_names)
    print("Added all remaining equipment columns to 'binary' features")

    return df, features_info

In [22]:
# Run the 'equipment' feature step and rebind the frame and features info.
df, features_info = cf_equipment_nb(df=df, features_info=features_info)

First 5 rows


Unnamed: 0_level_0,equipment
id,Unnamed: 1_level_1
9249043,
9650557,"Metalik boja,Branici u boji auta,Servo volan,M..."
9657017,"Metalik boja,Branici u boji auta,Servo volan,T..."
9755964,"Metalik boja,Branici u boji auta,Servo volan,M..."
9921885,Svetla za maglu



Extended the data frame with dummy columns
Deleted 'equipment' column
Converted all remaining equipment columns to boolean
Fixed column names
Added all remaining equipment columns to 'binary' features


### Feature 'other'

In [23]:
@preprocess_init
def cf_other_nb(
    df: pd.DataFrame, features_info: FeaturesInfo
) -> Tuple[pd.DataFrame, FeaturesInfo]:
    """Expand the comma-separated 'other' column into boolean indicators.

    Each distinct item becomes a column prefixed with ``o_``; the source
    column is dropped, the indicators are cast to nullable ``boolean`` and
    their names are cleaned (whitespace stripped, '-' and ' ' replaced by
    '_', '/' replaced by '_ili_'). The cleaned names are appended to
    ``features_info["binary"]``.

    Returns:
        The extended frame and the updated ``features_info``.
    """
    feature_name = nb_utils.get_feature_name()

    print("First 5 rows")
    display(df[[feature_name]].head())
    print()

    prefix = "o_"

    # One indicator column per distinct item of the comma-separated value.
    indicators = df[feature_name].str.get_dummies(sep=",").add_prefix(prefix)
    df = pd.concat([df, indicators], axis=1)
    print("Extended the data frame with dummy columns")

    df = df.drop(columns=feature_name)
    print(f"Deleted '{feature_name}' column")

    raw_names = indicators.columns.tolist()

    df[raw_names] = df[raw_names].astype("boolean")
    print("Converted all remaining other columns to boolean")

    stripped = indicators.columns.str.strip()
    underscored = stripped.str.replace(r"[- ]", "_", regex=True)
    cleaned_names = underscored.str.replace("/", "_ili_").tolist()

    name_mapping = dict(zip(raw_names, cleaned_names))
    df = df.rename(columns=name_mapping)
    print("Fixed column names")

    features_info["binary"].extend(cleaned_names)
    print("Added all remaining other columns to 'binary' features")

    return df, features_info

In [24]:
# Run the 'other' feature step and rebind the frame and features info.
df, features_info = cf_other_nb(df=df, features_info=features_info)

First 5 rows


Unnamed: 0_level_0,other
id,Unnamed: 1_level_1
9249043,
9650557,Rezervni ključ
9657017,"Servisna knjiga,Rezervni ključ"
9755964,"Prvi vlasnik,Kupljen nov u Srbiji,Garancija,Ga..."
9921885,"Kupljen nov u Srbiji,Garancija,Garažiran,Servi..."



Extended the data frame with dummy columns
Deleted 'other' column
Converted all remaining other columns to boolean
Fixed column names
Added all remaining other columns to 'binary' features


### Feature 'description'

In [25]:
@preprocess_init
def cf_description_nb(
    df: pd.DataFrame, features_info: FeaturesInfo
) -> Tuple[pd.DataFrame, FeaturesInfo]:
    """Preview the description feature and register it under 'other'.

    The frame is returned untouched; only ``features_info["other"]`` gains
    the feature name.
    """
    feature_name = nb_utils.get_feature_name()

    print("First 5 rows")
    preview = df[[feature_name]].head()
    display(preview)
    print()

    features_info["other"].append(feature_name)
    print(f"Added '{feature_name}' to 'other' features")

    return df, features_info

In [26]:
# Run the 'description' feature step and rebind the frame and features info.
df, features_info = cf_description_nb(df=df, features_info=features_info)

First 5 rows


Unnamed: 0_level_0,description
id,Unnamed: 1_level_1
9249043,"DIPLOMATA,JEDINSTVEN SA MALOM KILOMETRAZOM,(25..."
9650557,Vozilo u odlicnom stanju ...
9657017,- A4 Quattro - manuelni menjac - Uvoz iz Svjca...
9755964,"NOVO VOZILO, ODLICNI USLOVI FINANSIRANJA . OVL..."
9921885,Fica ima pravih 38.400 km !!! Nikad nije nist...



Added 'description' to 'other' features


### "GeneralInformation" features

In [27]:
@preprocess_init
def c_general_informations_nb(
    df: pd.DataFrame, features_info: FeaturesInfo
) -> Tuple[pd.DataFrame, FeaturesInfo]:
    """Clean the ``gi_*`` columns and register them in ``features_info``.

    Steps, in order:
      1. Display value frequencies for every listed ``gi_*`` column.
      2. Drop rows whose ``gi_condition`` equals "Novo vozilo", then drop the
         ``gi_condition`` column itself.
      3. Remove unit suffixes/prefixes and separators from the textual
         numeric columns and from ``gi_certified``.
      4. Parse ``gi_certified`` as a datetime ("Nije atestiran" becomes
         missing), cast the nominal columns to unordered categoricals and
         the numeric columns to numbers.
      5. Append the column names to the 'nominal', 'numerical' and 'other'
         groups of ``features_info``.

    ``gi_engine_power`` keeps the text before the first '/', which for
    values shaped like "110/150 (kW/KS)" is the kW figure.

    The chained-assignment warning is silenced only for the duration of this
    function via ``pd.option_context``, so the caller's setting is restored
    afterwards even if a step raises.

    Args:
        df: Frame containing the ``gi_*`` columns as raw strings.
        features_info: Mapping of feature groups to lists of column names.

    Returns:
        The cleaned frame and the updated ``features_info``.

    Raises:
        ValueError: If a numeric or date value cannot be parsed
            (``errors="raise"``).
    """
    gi_columns = [
        "gi_condition",
        "gi_brand",
        "gi_model",
        "gi_production_year",
        "gi_kilometerage",
        "gi_body_type",
        "gi_fuel_type",
        "gi_engine_capacity",
        "gi_engine_power",
        "gi_trade_in",
        "gi_certified",
        "gi_battery_capacity",
    ]
    nominal_cols = [
        "gi_brand",
        "gi_model",
        "gi_body_type",
        "gi_fuel_type",
        "gi_trade_in",
    ]
    numerical_cols = [
        "gi_kilometerage",
        "gi_production_year",
        "gi_engine_capacity",
        "gi_engine_power",
        "gi_battery_capacity",
    ]
    other_cols = ["gi_certified"]

    # The row filter below returns a slice of the input frame; assigning
    # columns on it would otherwise emit SettingWithCopyWarning.
    with pd.option_context("mode.chained_assignment", None):
        print("Frequencies for GeneralInformation columns")
        for col in gi_columns:
            display(nb_utils.get_value_counts_freq_with_perc(df, col))

        display(
            Markdown(
                "Only used cars will be left in the dataset, so all new cars are going to be deleted."
            )
        )

        new_cars_cond = df["gi_condition"] == "Novo vozilo"
        df = df.loc[~new_cars_cond, :]
        print(f"Deleted from df {new_cars_cond.sum()} rows that represent new cars")

        del df["gi_condition"]
        print("Deleted 'gi_condition' feature")

        # removesuffix/removeprefix drop the exact affix; rstrip/lstrip with a
        # multi-character argument would strip any of those characters.
        df["gi_kilometerage"] = (
            df["gi_kilometerage"]
            .str.strip()
            .str.removesuffix("km")
            .str.replace(".", "", regex=False)
            .str.strip()
        )
        print("Stripped 'km' and spaces and removed '.' from 'gi_kilometerage'")

        df["gi_production_year"] = (
            df["gi_production_year"].str.rstrip(".").str.strip()
        )
        print("Removed '.' and stripped spaces from 'gi_production_year'")

        df["gi_engine_capacity"] = (
            df["gi_engine_capacity"].str.strip().str.removesuffix("cm3").str.strip()
        )
        print("Stripped 'cm3' and spaces from 'gi_engine_capacity'")

        # Text before the first '/' — the kW part of "<kW>/<KS> (kW/KS)".
        df["gi_engine_power"] = (
            df["gi_engine_power"].str.split("/", n=1).str.get(0).str.strip()
        )
        print("Extracted only value of kW (kilowatts) and stripped spaces")

        df["gi_certified"] = pd.to_datetime(
            df["gi_certified"]
            .str.strip()
            .str.removeprefix("do:")
            .str.strip()
            .replace({"Nije atestiran": np.nan}),
            format="%m.%Y",
            errors="raise",
        )
        print(
            "Stripped spaces and 'do: ' from 'gi_certified', replaced 'Nije atestiran' with NA and transformed to datetime"
        )

        df["gi_battery_capacity"] = (
            df["gi_battery_capacity"].str.strip().str.removesuffix("kWh").str.strip()
        )
        print("Stripped 'kWh' and spaces from 'gi_battery_capacity'")

        for col in nominal_cols:
            df[col] = pd.Categorical(df[col], ordered=False)
        print("Converted nominal columns to categorical types (nominal)")

        for col in numerical_cols:
            df[col] = pd.to_numeric(df[col], errors="raise", downcast="unsigned")
        print("Converted numerical columns to numerical types")

        features_info["nominal"].extend(nominal_cols)
        print(f"Added {nominal_cols} columns to 'nominal' features")
        features_info["numerical"].extend(numerical_cols)
        print(f"Added {numerical_cols} columns to 'numerical' features")
        features_info["other"].extend(other_cols)
        print(f"Added {other_cols} columns to 'other' features")

    return df, features_info

In [28]:
# Run the GeneralInformation cleaning step; the returned frame has rows and a
# column removed, so `df` is rebound to it.
df, features_info = c_general_informations_nb(df=df, features_info=features_info)

Frequencies for GeneralInformation columns


Unnamed: 0_level_0,count,percentage [%]
gi_condition,Unnamed: 1_level_1,Unnamed: 2_level_1
Polovno vozilo,30135,97.907664
Novo vozilo,644,2.092336


Unnamed: 0_level_0,count,percentage [%]
gi_brand,Unnamed: 1_level_1,Unnamed: 2_level_1
Volkswagen,4811,15.630787
Audi,3189,10.36096
BMW,2770,8.999643
Opel,2377,7.722798
Peugeot,2055,6.67663
...,...,...
Oldsmobile,1,0.003249
Jinpeng,1,0.003249
Lamborghini,1,0.003249
Buick,1,0.003249


Unnamed: 0_level_0,count,percentage [%]
gi_model,Unnamed: 1_level_1,Unnamed: 2_level_1
A4,866,2.813607
Octavia,630,2.04685
A3,628,2.040352
A6,621,2.017609
Golf 7,574,1.864908
...,...,...
ML 420,1,0.003249
469,1,0.003249
bZ4X,1,0.003249
GLE 63 AMG,1,0.003249


Unnamed: 0_level_0,count,percentage [%]
gi_production_year,Unnamed: 1_level_1,Unnamed: 2_level_1
2008.,1984,6.445953
2011.,1976,6.419962
2010.,1871,6.07882
2009.,1838,5.971604
2007.,1833,5.955359
...,...,...
1957.,1,0.003249
1961.,1,0.003249
1973.,1,0.003249
1958.,1,0.003249


Unnamed: 0_level_0,count,percentage [%]
gi_kilometerage,Unnamed: 1_level_1,Unnamed: 2_level_1
250.000 km,379,1.231359
230.000 km,334,1.085155
220.000 km,326,1.059164
0 km,323,1.049417
200.000 km,303,0.984437
...,...,...
183.548 km,1,0.003249
280.200 km,1,0.003249
77.080 km,1,0.003249
232.788 km,1,0.003249


Unnamed: 0_level_0,count,percentage [%]
gi_body_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Hečbek,10419,33.851002
Džip/SUV,6346,20.617954
Limuzina,5634,18.304688
Karavan,4603,14.955002
Monovolumen (MiniVan),2443,7.93723
Kupe,809,2.628415
Kabriolet/Roadster,315,1.023425
Pickup,208,0.675785
,2,0.006498


Unnamed: 0_level_0,count,percentage [%]
gi_fuel_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Dizel,20740,67.383606
Benzin,7787,25.299717
Benzin + Gas (TNG),1136,3.690828
Benzin + Metan (CNG),555,1.803177
Hibridni pogon,327,1.062413
Električni pogon,181,0.588063
Hibridni pogon (Benzin),35,0.113714
Hibridni pogon (Dizel),14,0.045486
Plug-in hibrid,3,0.009747
Metan CNG,1,0.003249


Unnamed: 0_level_0,count,percentage [%]
gi_engine_capacity,Unnamed: 1_level_1,Unnamed: 2_level_1
1968 cm3,3569,11.595568
1598 cm3,2743,8.91192
1995 cm3,2111,6.858572
1560 cm3,1474,4.788979
1461 cm3,1219,3.960493
...,...,...
1480 cm3,1,0.003249
190 cm3,1,0.003249
699 cm3,1,0.003249
5216 cm3,1,0.003249


Unnamed: 0_level_0,count,percentage [%]
gi_engine_power,Unnamed: 1_level_1,Unnamed: 2_level_1
110/150 (kW/KS),2513,8.164658
103/140 (kW/KS),1923,6.247766
85/116 (kW/KS),1653,5.370545
77/105 (kW/KS),1439,4.675266
81/110 (kW/KS),1334,4.334124
...,...,...
60/81 (kW/KS),1,0.003249
146/199 (kW/KS),1,0.003249
500/680 (kW/KS),1,0.003249
167/227 (kW/KS),1,0.003249


Unnamed: 0_level_0,count,percentage [%]
gi_trade_in,Unnamed: 1_level_1,Unnamed: 2_level_1
NE,22580,73.361708
DA,3740,12.151142
DA (za jeftinije),2827,9.184834
DA (za skuplje),991,3.219728
DA (u istoj ceni),438,1.423048
DA (staro za novo),203,0.659541


Unnamed: 0_level_0,count,percentage [%]
gi_certified,Unnamed: 1_level_1,Unnamed: 2_level_1
,29087,94.502745
Nije atestiran,164,0.532831
do: 10.2027,54,0.175444
do: 07.2027,49,0.159199
do: 01.2028,49,0.159199
...,...,...
do: 10.2032,1,0.003249
do: 10.2030,1,0.003249
do: 04.2031,1,0.003249
do: 10.2029,1,0.003249


Unnamed: 0_level_0,count,percentage [%]
gi_battery_capacity,Unnamed: 1_level_1,Unnamed: 2_level_1
,30598,99.411937
17 kWh,11,0.035739
75 kWh,10,0.03249
58 kWh,7,0.022743
22 kWh,7,0.022743
...,...,...
21.6 kWh,1,0.003249
38.3 kWh,1,0.003249
72.6 kWh,1,0.003249
36.6 kWh,1,0.003249


Only used cars will be left in the dataset, so all new cars are going to be deleted.

Deleted from df 644 rows that represent new cars
Deleted 'gi_condition' feature
Stripped 'km' and spaces and removed '.' from 'gi_kilometerage'
Removed '.' and stripped spaces from 'gi_production_year'
Stripped 'cm3' and spaces from 'gi_engine_capacity'
Extracted only value of KS (stands for horse powers) and stripped spaces
Stripped spaces and 'do: ' from 'gi_certified', replaced 'Nije atestiran' with NA and transformed to datetime
Stripped 'kWh' and spaces from 'gi_battery_capacity'
Converted nominal columns to categorical types (nominal)
Converted numerical columns to numerical types
Added ['gi_brand', 'gi_model', 'gi_body_type', 'gi_fuel_type', 'gi_trade_in'] columns to 'nominal' features
Added ['gi_kilometerage', 'gi_production_year', 'gi_engine_capacity', 'gi_engine_power', 'gi_battery_capacity'] columns to 'numerical' features
Added ['gi_certified'] columns to 'other' features


### "AdditionalInformation" features

In [29]:
@preprocess_init
def c_additional_informations_nb(
    df: pd.DataFrame, features_info: FeaturesInfo
) -> Tuple[pd.DataFrame, FeaturesInfo]:
    """Clean the "AdditionalInformation" columns and register them in the metadata.

    Steps, in order:
      1. Display value frequencies for every column whose name contains "ai_".
      2. Strip label/unit text ('Euro', 'sedišta', '€') from raw string values.
      3. Drop rows whose steering wheel side is 'Desni volan' (right-hand
         drive) and then delete the 'ai_steering_wheel_side' column.
      4. Convert columns to boolean, categorical, numeric and datetime types.
      5. Append the column names to the matching lists in ``features_info``.

    Args:
        df: Dataset containing the 'ai_*' columns referenced below.
        features_info: Metadata mapping with 'binary', 'ordinal', 'nominal',
            'numerical' and 'other' lists; each list is extended in place.

    Returns:
        The cleaned dataframe and the updated ``features_info``.

    Raises:
        Whatever ``pd.to_datetime`` / ``pd.to_numeric`` raise for values that
        cannot be parsed (both are called with ``errors="raise"``).
    """
    ai_columns = [col for col in df.columns if "ai_" in col]

    # The frame is re-assigned to a filtered slice further down, so later column
    # assignments would trigger chained-assignment warnings. option_context puts
    # the previous setting back even when one of the conversions raises.
    with pd.option_context("mode.chained_assignment", None):
        print("Frequencies for AdditionalInformation columns")
        for col in ai_columns:
            display(nb_utils.get_value_counts_freq_with_perc(df, col))

        # str.lstrip/rstrip remove any leading/trailing characters from the given
        # set (not a literal prefix/suffix); that is sufficient for the
        # 'Euro N' / 'N sedišta' / 'N €' value shapes handled here.
        df["ai_engine_emission_class"] = (
            df["ai_engine_emission_class"].str.lstrip("Euro").str.strip()
        )
        print("Stripped 'Euro' and spaces from 'ai_engine_emission_class'")

        # Exact-key mapping: any value that is not one of the two keys becomes NA.
        df["ai_doors_no"] = df["ai_doors_no"].map(
            {"4/5 vrata": True, "2/3 vrata": False}
        )
        print(
            "Mapped 'ai_doors_no' so that True represents 4/5 doors and False 2/3 doors"
        )

        df["ai_seats_no"] = df["ai_seats_no"].str.rstrip("sedišta").str.strip()
        print("Stripped 'sedišta' and spaces from 'ai_seats_no")

        # 'Desni volan' means right-hand drive; those listings are removed.
        df = df.loc[df["ai_steering_wheel_side"].str.strip() != "Desni volan", :]
        print("Removed cars that have steering wheel on the right side")

        del df["ai_steering_wheel_side"]
        print("Deleted 'ai_steering_wheel_side' feature (no longer useful)")

        df["ai_registered_until"] = pd.to_datetime(
            df["ai_registered_until"]
            .str.strip()
            .replace({"Nije registrovan": np.nan}),
            format="%m.%Y.",
            errors="raise",
        )
        print(
            "Stripped spaces from 'ai_registered_until', replaced 'Nije registrovan' with NA and transformed to datetime"
        )

        df["ai_credit"] = df["ai_credit"].str.strip().map({"DA": True, np.nan: False})
        print(
            "Stripped spaces and mapped 'ai_credit' so that True represents 'DA' and False <NA>"
        )

        df["ai_deposit"] = df["ai_deposit"].str.rstrip("€").str.strip()
        print("Stripped '€' and spaces from 'ai_deposit'")

        df["ai_installment_amount"] = (
            df["ai_installment_amount"].str.rstrip("€").str.strip()
        )
        print("Stripped '€' and spaces from 'ai_installment_amount'")

        df["ai_interest_free_credit"] = (
            df["ai_interest_free_credit"].str.strip().map({"DA": True, np.nan: False})
        )
        print(
            "Stripped spaces and mapped 'ai_interest_free_credit' so that True represents 'DA' and False <NA>"
        )

        df["ai_leasing"] = df["ai_leasing"].str.strip().map({"DA": True, np.nan: False})
        print(
            "Stripped spaces and mappped 'ai_leasing' so that True represents 'DA' and False <NA>"
        )

        df["ai_cash_payment"] = df["ai_cash_payment"].str.rstrip("€").str.strip()
        print("Stripped '€' and spaces from 'ai_cash_payment'")

        # Column groups: they drive both the dtype conversion below and the
        # metadata registration at the end.
        binary_cols = [
            "ai_doors_no",
            "ai_credit",
            "ai_interest_free_credit",
            "ai_leasing",
        ]
        ordinal_cols = ["ai_engine_emission_class", "ai_damage"]
        nominal_cols = [
            "ai_floating_flywheel",
            "ai_gearbox_type",
            "ai_air_conditioning",
            "ai_color",
            "ai_interior_material",
            "ai_interior_color",
            "ai_propulsion",
            "ai_vehicle_origin",
            "ai_ownership",
            "ai_import_country",
            "ai_sales_method",
        ]
        numerical_cols = [
            "ai_seats_no",
            "ai_deposit",
            "ai_installment_no",
            "ai_installment_amount",
            "ai_cash_payment",
            "ai_range_on_full_battery_km",
        ]
        other_cols = [
            "ai_registered_until",
        ]

        df[binary_cols] = df[binary_cols].astype("boolean")
        print("Converted binary columns to boolean")

        for col in ordinal_cols:
            df[col] = pd.Categorical(df[col], ordered=True)
        print("Converted ordinal columns to categorical types (ordinal)")

        for col in nominal_cols:
            df[col] = pd.Categorical(df[col], ordered=False)
        print("Converted nominal columns to categorical types (nominal)")

        for col in numerical_cols:
            df[col] = pd.to_numeric(df[col], errors="raise", downcast="unsigned")
        print("Converted numerical columns to numerical types")

    features_info["binary"].extend(binary_cols)
    print(f"Added {binary_cols} columns to 'binary' features")
    features_info["ordinal"].extend(ordinal_cols)
    print(f"Added {ordinal_cols} columns to 'ordinal' features")
    features_info["nominal"].extend(nominal_cols)
    print(f"Added {nominal_cols} columns to 'nominal' features")
    features_info["numerical"].extend(numerical_cols)
    print(f"Added {numerical_cols} columns to 'numerical' features")
    features_info["other"].extend(other_cols)
    print(f"Added {other_cols} columns to 'other' features")

    return df, features_info

In [30]:
# Run the AdditionalInformation cleaning step and rebind the notebook-level
# dataset and metadata to the values it returns.
df, features_info = c_additional_informations_nb(df=df, features_info=features_info)

Frequencies for AdditionalInformation columns


Unnamed: 0_level_0,count,percentage [%]
ai_floating_flywheel,Unnamed: 1_level_1,Unnamed: 2_level_1
,20498,68.020574
Sa plivajućim zamajcem,6211,20.610586
Bez plivajućeg zamajca,3426,11.36884


Unnamed: 0_level_0,count,percentage [%]
ai_engine_emission_class,Unnamed: 1_level_1,Unnamed: 2_level_1
Euro 4,8987,29.822466
Euro 6,8925,29.616725
Euro 5,8360,27.741828
Euro 3,3210,10.652066
Euro 2,386,1.280903
Euro 1,260,0.862784
,7,0.023229


Unnamed: 0_level_0,count,percentage [%]
ai_propulsion,Unnamed: 1_level_1,Unnamed: 2_level_1
Prednji,21834,72.453957
4x4,5034,16.704828
Zadnji,2659,8.823627
4x4 reduktor,608,2.017588


Unnamed: 0_level_0,count,percentage [%]
ai_gearbox_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Manuelni 5 brzina,10642,35.314418
Manuelni 6 brzina,10192,33.821138
Automatski / poluautomatski,9129,30.293678
Manuelni 4 brzine,154,0.511034
Automatski,16,0.053094
Poluautomatski,1,0.003318
,1,0.003318


Unnamed: 0_level_0,count,percentage [%]
ai_doors_no,Unnamed: 1_level_1,Unnamed: 2_level_1
4/5 vrata,27034,89.70964
2/3 vrata,3099,10.283723
,2,0.006637


Unnamed: 0_level_0,count,percentage [%]
ai_seats_no,Unnamed: 1_level_1,Unnamed: 2_level_1
5 sedišta,26987,89.553675
4 sedišta,1785,5.923345
7 sedišta,932,3.092749
2 sedišta,344,1.14153
6 sedišta,48,0.159283
3 sedišta,18,0.059731
8 sedišta,13,0.043139
9 sedišta,8,0.026547


Unnamed: 0_level_0,count,percentage [%]
ai_steering_wheel_side,Unnamed: 1_level_1,Unnamed: 2_level_1
Levi volan,29991,99.52215
Desni volan,144,0.47785


Unnamed: 0_level_0,count,percentage [%]
ai_air_conditioning,Unnamed: 1_level_1,Unnamed: 2_level_1
Automatska klima,20447,67.851336
Manuelna klima,8709,28.89995
Nema klimu,979,3.248714


Unnamed: 0_level_0,count,percentage [%]
ai_color,Unnamed: 1_level_1,Unnamed: 2_level_1
Siva,9677,32.112162
Crna,7828,25.976439
Bela,4450,14.766882
Plava,2768,9.185333
Crvena,1382,4.58603
Srebrna,740,2.455616
Teget,714,2.369338
Braon,563,1.868259
Zelena,507,1.682429
Bordo,311,1.032023


Unnamed: 0_level_0,count,percentage [%]
ai_interior_material,Unnamed: 1_level_1,Unnamed: 2_level_1
Štof,16262,53.963829
Prirodna koža,5191,17.225817
,4086,13.558985
Kombinovana koža,3655,12.128754
Drugi,484,1.606106
Velur,457,1.516509


Unnamed: 0_level_0,count,percentage [%]
ai_interior_color,Unnamed: 1_level_1,Unnamed: 2_level_1
Crna,15373,51.013771
Siva,5790,19.213539
,5244,17.401692
Druga,1694,5.62137
Bež,1401,4.649079
Smeđa,633,2.100548


Unnamed: 0_level_0,count,percentage [%]
ai_registered_until,Unnamed: 1_level_1,Unnamed: 2_level_1
Nije registrovan,12986,43.092749
07.2024.,1493,4.954372
06.2024.,1487,4.934462
05.2024.,1438,4.77186
04.2024.,1419,4.70881
12.2024.,1390,4.612577
08.2024.,1381,4.582711
11.2024.,1365,4.529617
10.2024.,1351,4.483159
09.2024.,1304,4.327194


Unnamed: 0_level_0,count,percentage [%]
ai_vehicle_origin,Unnamed: 1_level_1,Unnamed: 2_level_1
Domaće tablice,18410,61.091754
Na ime kupca,11516,38.214701
Strane tablice,207,0.686909
,2,0.006637


Unnamed: 0_level_0,count,percentage [%]
ai_ownership,Unnamed: 1_level_1,Unnamed: 2_level_1
Vodi se na prodavca,12877,42.731044
,11749,38.987888
Ne vodi se na prodavca,5509,18.281069


Unnamed: 0_level_0,count,percentage [%]
ai_damage,Unnamed: 1_level_1,Unnamed: 2_level_1
Nije oštećen,29753,98.732371
Oštećen - u voznom stanju,233,0.773187
Oštećen - nije u voznom stanju,144,0.47785
,5,0.016592


Unnamed: 0_level_0,count,percentage [%]
ai_import_country,Unnamed: 1_level_1,Unnamed: 2_level_1
,22539,74.79343
Švajcarska,2035,6.752945
Italija,1370,4.546209
Francuska,1253,4.157956
Nemačka,1075,3.567281
Holandija,667,2.213373
Belgija,367,1.217853
Norveška,261,0.866103
Austrija,203,0.673635
Slovenija,157,0.520989


Unnamed: 0_level_0,count,percentage [%]
ai_sales_method,Unnamed: 1_level_1,Unnamed: 2_level_1
,27091,89.898789
Komisiona,2766,9.178696
Konsignaciona,278,0.922515


Unnamed: 0_level_0,count,percentage [%]
ai_credit,Unnamed: 1_level_1,Unnamed: 2_level_1
,28318,93.970466
DA,1817,6.029534


Unnamed: 0_level_0,count,percentage [%]
ai_deposit,Unnamed: 1_level_1,Unnamed: 2_level_1
,29662,98.430397
30 €,31,0.10287
3000 €,20,0.066368
6000 €,17,0.056413
4800 €,9,0.029866
...,...,...
1948 €,1,0.003318
3225 €,1,0.003318
3380 €,1,0.003318
2610 €,1,0.003318


Unnamed: 0_level_0,count,percentage [%]
ai_installment_no,Unnamed: 1_level_1,Unnamed: 2_level_1
,29525,97.975776
84.0,331,1.098391
60.0,137,0.454621
70.0,42,0.139373
71.0,24,0.079642
36.0,15,0.049776
48.0,13,0.043139
24.0,10,0.033184
42.0,8,0.026547
1.0,6,0.01991


Unnamed: 0_level_0,count,percentage [%]
ai_installment_amount,Unnamed: 1_level_1,Unnamed: 2_level_1
,29557,98.081964
115 €,18,0.059731
150 €,11,0.036502
94 €,9,0.029866
140 €,9,0.029866
...,...,...
304 €,1,0.003318
227 €,1,0.003318
700 €,1,0.003318
18 €,1,0.003318


Unnamed: 0_level_0,count,percentage [%]
ai_interest_free_credit,Unnamed: 1_level_1,Unnamed: 2_level_1
,29949,99.382778
DA,186,0.617222


Unnamed: 0_level_0,count,percentage [%]
ai_leasing,Unnamed: 1_level_1,Unnamed: 2_level_1
,29348,97.388419
DA,787,2.611581


Unnamed: 0_level_0,count,percentage [%]
ai_cash_payment,Unnamed: 1_level_1,Unnamed: 2_level_1
,29852,99.060893
3000 €,10,0.033184
5000 €,10,0.033184
6000 €,8,0.026547
2500 €,8,0.026547
...,...,...
19970 €,1,0.003318
3660 €,1,0.003318
23700 €,1,0.003318
2150 €,1,0.003318


Unnamed: 0_level_0,count,percentage [%]
ai_range_on_full_battery_km,Unnamed: 1_level_1,Unnamed: 2_level_1
,30008,99.578563
300.0,10,0.033184
140.0,7,0.023229
150.0,6,0.01991
400.0,6,0.01991
50.0,5,0.016592
120.0,5,0.016592
160.0,5,0.016592
130.0,5,0.016592
230.0,4,0.013274


Stripped 'Euro' and spaces from 'ai_engine_emission_class'
Stripped spaces and mapped 'ai_doors_no' so that True represents 4/5 doors and False 2/3 doors
Stripped 'sedišta' and spaces from 'ai_seats_no
Kept only cars that have steering wheele on the right side
Deleted 'ai_steering_wheel_side' feature (no longer useful)
Stripped spaces from 'ai_registered_until', replaced 'Nije registrovan' with NA and transformed to datetime
Stripped spaces and mapped 'ai_credit' so that True represents 'DA' and False <NA>
Stripped '€' and spaces from 'ai_deposit'
Stripped '€' and spaces from 'ai_installment_amount'
Stripped spaces and mapped 'ai_interest_free_credit' so that True represents 'DA' and False <NA>
Stripped spaces and mappped 'ai_leasing' so that True represents 'DA' and False <NA>
Stripped '€' and spaces from 'ai_cash_payment'
Converted binary columns to boolean
Converted ordinal columns to categorical types (ordinal)
Converted nominal columns to categorical types (nominal)
Converted nume

## Testing

### Test individual components

In [31]:
class TestNotebook:
    # @pytest.fixture
    def df(self):
        df = get_dataset_from_db()
        return df

    # @pytest.fixture
    def features_info(self):
        features_info = get_metadata()
        return features_info

    def run_cf_test(self, func_name_ua):
        df, features_info = self.df(), self.features_info()
        func_name_nb = f"{func_name_ua}{nb_utils.NB_SUFFIX}"

        initial_cleaner_obj = InitialCleaner()

        # Get function from class object from .py file
        func_ic = getattr(initial_cleaner_obj, func_name_ua)
        # Get function defined in this notebook
        func_nb = globals()[func_name_nb]

        df_ic, features_info_ua = func_ic(df=df, features_info=features_info)
        df_nb, features_info_nb = func_nb(df=df, features_info=features_info)

        pd.testing.assert_frame_equal(df_ic, df_nb)
        assert features_info_ua == features_info_nb

    def test_c_general_informations(self):
        df, features_info = self.df(), self.features_info()

        initial_cleaner_obj = InitialCleaner()

        df_ua, features_info_ua = initial_cleaner_obj.initial_preparation(
            df=df, features_info=features_info
        )
        df_nb, features_info_nb = initial_preparation_nb(
            df=df, features_info=features_info
        )

        df_ua, features_info_ua = initial_cleaner_obj.c_general_informations(
            df=df_ua, features_info=features_info_ua
        )
        df_nb, features_info_nb = c_general_informations_nb(
            df=df_nb, features_info=features_info_nb
        )

        pd.testing.assert_frame_equal(df_ua, df_nb)
        assert features_info_ua == features_info_nb

    def test_c_additional_informations(self):
        df, features_info = self.df(), self.features_info()

        initial_cleaner_obj = InitialCleaner()

        df_ua, features_info_ua = initial_cleaner_obj.initial_preparation(
            df=df, features_info=features_info
        )
        df_nb, features_info_nb = initial_preparation_nb(
            df=df, features_info=features_info
        )

        df_ua, features_info_ua = initial_cleaner_obj.c_additional_informations(
            df=df_ua, features_info=features_info_ua
        )
        df_nb, features_info_nb = c_additional_informations_nb(
            df=df_nb, features_info=features_info_nb
        )

        pd.testing.assert_frame_equal(df_ua, df_nb)
        assert features_info_ua == features_info_nb

    def start(self):
        # Get all methods in UACleaner class with prefix 'cf_'
        cf_methods = [
            method
            for method in dir(InitialCleaner)
            if method.startswith(nb_utils.CF_PREFIX)
            and callable(getattr(InitialCleaner, method))
        ]

        # Run tests for each cf_method
        for cf_method in cf_methods:
            logging.info(cf_method)
            self.run_cf_test(cf_method)

        self.test_c_general_informations()
        self.test_c_additional_informations()

In [None]:
# Instantiate the comparison class and run all of its checks through start().
test_obj = TestNotebook()
test_obj.start()

### Test whole component

In [33]:
@preprocess_init
def test_whole_component(df: pd.DataFrame, features_info: FeaturesInfo):
    """Check the notebook's end result against a full ``InitialCleaner.clean`` run.

    A fresh dataset is loaded from the database and cleaned by a new
    ``InitialCleaner``; the frame it returns and the cleaner's
    ``features_info`` attribute must equal the arguments passed in.

    Args:
        df: Dataframe produced by the notebook cleaning steps.
        features_info: Metadata produced by the notebook cleaning steps.

    Raises:
        AssertionError: If the dataframes or the metadata differ.
    """
    cleaner = InitialCleaner()

    raw_df = get_dataset_from_db()
    reference_df = cleaner.clean(df=raw_df)
    reference_features_info = cleaner.features_info

    # Frame comparison first, metadata second.
    pd.testing.assert_frame_equal(df, reference_df)
    assert features_info == reference_features_info

In [34]:
# Compare the notebook-level df / features_info with the output of
# InitialCleaner.clean() (see test_whole_component).
test_whole_component(df=df, features_info=features_info)

## Save cleaned data and metadata in artifacts

In [35]:
# Persist the cleaned dataset and its metadata, using the name and folder path
# registered for this notebook's STAGE in nb_utils.STAGES_DICT.
nb_utils.save_dataset_and_metadata(
    nb_utils.STAGES_DICT[STAGE]["name"],
    nb_utils.STAGES_DICT[STAGE]["folder_path"],
    df,
    features_info,
)