In [1]:
from __future__ import print_function
from packaging.version import parse as Version
from platform import python_version

# ANSI-coloured status tags printed in front of every check result
# (SGR 42 = green background, SGR 41 = red background, SGR 0 = reset).
OK = '\x1b[42m[ OK ]\x1b[0m'
FAIL = "\x1b[41m[FAIL]\x1b[0m"

try:
    import importlib
except ImportError:
    # `sys` is not imported at the top of this cell; import it here so the
    # failure message can be printed instead of raising NameError.
    import sys
    print(FAIL, "Python version 3.10 is required,"
                " but %s is installed." % sys.version)

def import_version(pkg, min_ver, fail_msg=""):
    """Import ``pkg`` and print whether its version matches ``min_ver``.

    Exactly one status line (prefixed with OK or FAIL) is printed. A package
    that cannot be imported is reported rather than raised.

    Parameters
    ----------
    pkg : str
        Name of the module to import.
    min_ver : str
        Required version string. Despite the parameter name, the check is
        an exact equality test, not a lower bound.
    fail_msg : str, optional
        Extra text appended to the "not installed" message.

    Returns
    -------
    module or None
        The imported module, or None when the import failed.
    """
    mod = None
    try:
        mod = importlib.import_module(pkg)
        if pkg in {'PIL'}:
            # PIL is read from VERSION first; fall back to __version__ when
            # that attribute is absent.
            ver = getattr(mod, 'VERSION', None) or getattr(mod, '__version__', None)
        else:
            ver = getattr(mod, '__version__', None)

        if ver is None:
            # No version attribute: report it instead of raising AttributeError.
            print(FAIL, "%s is installed, but its version could not be determined."
                  % pkg)
        elif Version(ver) == Version(min_ver):
            # Report using the `pkg` argument, not a global loop variable.
            print(OK, "%s version %s is installed."
                  % (pkg, min_ver))
        else:
            print(FAIL, "%s version %s is required, but %s installed."
                  % (pkg, min_ver, ver))
    except ImportError:
        print(FAIL, '%s not installed. %s' % (pkg, fail_msg))
    return mod


# first check the python version
pyversion = Version(python_version())

# The comparison is a simple two-way split: either the interpreter is at
# least 3.10 or it is older, so no third "unknown" branch is needed.
if pyversion >= Version("3.10"):
    print(OK, "Python version is %s" % pyversion)
else:
    print(FAIL, "Python version 3.10 is required,"
                " but %s is installed." % pyversion)

    
print()
# Package name -> version string to check, in the order they are reported.
requirements = {
    'numpy': "1.22.4",
    'matplotlib': "3.5.2",
    'sklearn': "1.1.1",
    'pandas': "1.4.2",
    'xgboost': "1.5.1",
    'shap': "0.40.0",
}

# now the dependencies
for lib, pinned_version in requirements.items():
    import_version(lib, pinned_version)

[42m[ OK ][0m Python version is 3.10.5

[42m[ OK ][0m numpy version 1.22.4 is installed.
[42m[ OK ][0m matplotlib version 3.5.2 is installed.
[42m[ OK ][0m sklearn version 1.1.1 is installed.
[42m[ OK ][0m pandas version 1.4.2 is installed.


  from pandas import MultiIndex, Int64Index


[42m[ OK ][0m xgboost version 1.5.1 is installed.
[42m[ OK ][0m shap version 0.40.0 is installed.


In [7]:
import pandas as pd
import numpy as np
from matplotlib import pylab as plt
import seaborn as sb
from sklearn.model_selection import train_test_split 


In [4]:
# Read the Audi CSV (path is relative to the notebook's working directory).
audi_csv_path = "../data/audi.csv"
df = pd.read_csv(audi_csv_path)

### Preprocessing

**Splitting**

In [16]:
# The dataset is large enough that a single train/validation/test split is
# used instead of KFolds.
random_state = 42

y = df['price']
X = df.drop(columns='price')

# Stratify on price: cut the continuous target into 10 quantile bins.
y_binned = pd.qcut(y, q=10)

# first split to separate out the training set (80% train, 20% other)
X_train, X_other, y_train, y_other = train_test_split(
    X, y,
    train_size=0.8,
    random_state=random_state,
    stratify=y_binned,
)
print('training set:', X_train.shape, y_train.shape)
print(X_other.shape, y_other.shape)

# Re-bin using only the held-out targets so the strata match that subset.
y_binned = pd.qcut(y_other, q=10)

# second split: halve "other" into validation and test (10% each overall)
X_val, X_test, y_val, y_test = train_test_split(
    X_other, y_other,
    train_size=0.5,
    random_state=random_state,
    stratify=y_binned,
)
print('validation set:', X_val.shape, y_val.shape)
print('test set:', X_test.shape, y_test.shape)

training set: (8534, 8) (8534,)
(2134, 8) (2134,)
validation set: (1067, 8) (1067,)
test set: (1067, 8) (1067,)


**Preprocessing**

In [38]:
# Group very rare models into an "Other" category: any model that accounts
# for less than 0.5% of the rows in df.
# NOTE(review): the share is computed over all of df, not only the training
# split - confirm that is intended.
# NOTE(review): in file order X / X_train are derived from df before this
# cell, so this relabelling presumably does not reach them unless the split
# is re-run afterwards - confirm the intended cell order.

model_share = df['model'].value_counts() / len(df)

# Build the one-column frame with an explicit column name. Relying on the
# name pandas gives the value_counts() result breaks on newer pandas, where
# that name is 'count' and rare_models['model'] raises KeyError.
rare_models = model_share[model_share < 0.005].to_frame(name='model')

df['model'] = np.where(df['model'].isin(rare_models.index), 'Other', df['model'])

In [27]:
# Build the preprocessing ColumnTransformer and apply it to every split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler

# categorical features -> one-hot encoded
onehot_ftrs = ['model', 'fuelType', 'transmission']

# continuous features with a well-defined min/max -> MinMaxScaler
minmax_ftrs = ['year', 'engineSize']
# continuous features better suited to StandardScaler
std_ftrs = ['tax', 'mileage', 'mpg']

# Dense one-hot output; categories below the 0.01 min_frequency threshold are
# pooled into a single "infrequent" column.
onehot_encoder = OneHotEncoder(
    sparse=False,
    min_frequency=0.01,
    handle_unknown='infrequent_if_exist',
)

# collect all the encoders
preprocessor = ColumnTransformer(
    transformers=[
        ('minmax', MinMaxScaler(), minmax_ftrs),
        ('std', StandardScaler(), std_ftrs),
        ('onehot', onehot_encoder, onehot_ftrs),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

clf = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on the training split only, then reuse the fitted transformer and its
# output column names for the validation and test splits.
train_values = clf.fit_transform(X_train)
feature_names = clf.get_feature_names_out()

# NOTE(review): no index is passed, so these frames get a fresh default index
# rather than the index of X_train / X_val / X_test.
X_train_prep = pd.DataFrame(train_values, columns=feature_names)
X_val_prep = pd.DataFrame(clf.transform(X_val), columns=feature_names)
X_test_prep = pd.DataFrame(clf.transform(X_test), columns=feature_names)




In [41]:
# Fraction of rows flagged in the 'model_infrequent_sklearn' column, printed
# for the train, test and validation frames in that order.
for prepared in (X_train_prep, X_test_prep, X_val_prep):
    print(prepared['model_infrequent_sklearn'].sum() / len(prepared))

0.02636512772439653
0.033739456419868794
0.029053420805998126
