## imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

%matplotlib inline

## preprocessing

### Data Loading

In [None]:
# Remote location of the zipped vehicles CSV.
file_path = "https://aml-team18.s3.amazonaws.com/vehicles.csv.zip"

# Read directly from the URL; the compression format is stated explicitly
# instead of being inferred from the file extension.
dataset = pd.read_csv(filepath_or_buffer=file_path, compression="zip")

### Pre-transform

In [None]:
# Columns excluded from the modelling frame.
drop_columns = ['id', 'url', 'region_url', 'VIN', 'image_url', 'posting_date', 'county']

# Inclusive bounds used to filter rows.
PRICE_CAP = 100000
YEAR_CAP_MIN = 2000
YEAR_CAP_MAX = 2020

df = dataset.drop(columns=drop_columns)

# Keep rows priced at or below the cap and with a year inside the window.
# Rows with a missing price or year fail the comparisons and are dropped.
# .copy() detaches the filtered frame from its parent so that adding a column
# below does not raise pandas' SettingWithCopyWarning.
in_range = (df.price <= PRICE_CAP) & df.year.between(YEAR_CAP_MIN, YEAR_CAP_MAX)
df = df[in_range].copy()

# Summary statistics of the filtered prices; the quartiles drive the bucketing.
df_price_stat = df.price.describe()

# Bucket price by quartile in one vectorized pass. Conditions are evaluated in
# order, so a value equal to a quartile lands in the lower bucket and the bins
# do not overlap. Anything above the 75th percentile falls through to 'high'.
# Prices below zero (if any) are treated as 'low' instead of falling through
# to 'high'.
df['price_cate'] = np.select(
    [
        df.price <= df_price_stat['25%'],
        df.price <= df_price_stat['50%'],
        df.price <= df_price_stat['75%'],
    ],
    ['low', 'medium low', 'medium high'],
    default='high',
)

### Data Imputer

In [None]:
def simple_imputer():
    """Build an imputer that fills missing entries with each column's most frequent value."""
    mode_filler = SimpleImputer(strategy='most_frequent')
    return mode_filler

def groupby_imputer(by_cate = 'manufacturer'):
    """Build a pipeline step that fills missing values from rows sharing a group key.

    For every column of the DataFrame it receives, a missing entry is replaced
    by the first non-null value observed in that column among rows with the
    same ``by_cate`` value. When no such value exists (the key itself is
    missing, or the whole group is null for that column) the string ``'nan'``
    is used instead.

    Parameters
    ----------
    by_cate : str, default 'manufacturer'
        Name of the grouping column. It must be one of the columns handed to
        the transformer.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Single-step pipeline (step name ``'indication'``) wrapping a
        ``FunctionTransformer``. Transforming a DataFrame yields a 2-D array
        with one column per input column, in input order.

    Notes
    -----
    The transformer is stateless: the lookup table is rebuilt from whichever
    frame is passed in, so each split is imputed from its own rows rather than
    from the data seen during ``fit``.
    """
    def _groupby_imputer(frame):
        keys = frame[by_cate]

        def _fill_one_column(col):
            # groupby() leaves out rows whose key is null and first() skips
            # null values, so this is "first non-null value per group";
            # a group with no non-null value yields a null here.
            first_seen = frame.groupby(by_cate)[col].first()
            # Rows with a null key, and groups with no usable value, have no
            # replacement candidate -> flag them with the 'nan' placeholder.
            fallback = keys.map(first_seen).fillna('nan')
            # Positional selection avoids any dependence on index alignment.
            return np.where(
                frame[col].isna().to_numpy(),
                fallback.to_numpy(),
                frame[col].to_numpy(),
            )

        filled = [_fill_one_column(col) for col in frame.columns]
        # Each entry is one imputed column; stack then transpose to rows x columns.
        return np.vstack(filled).T

    return Pipeline(
        steps=[
            ('indication', FunctionTransformer(_groupby_imputer, validate=False)),
        ]
    )

def unknown_flag_imputer():
    """Build an imputer that replaces every missing entry with the constant string 'nan'."""
    placeholder_filler = SimpleImputer(fill_value='nan', strategy='constant')
    return placeholder_filler


def imputer(groupby_imputing_feats, unknown_imputing_feats, most_freq_imputing_feats):
    """Assemble the column-wise imputation stage.

    Parameters
    ----------
    groupby_imputing_feats : list of str
        Columns filled by ``groupby_imputer()``. Must be a list, because
        ``'manufacturer'`` is appended to it by list concatenation.
    unknown_imputing_feats : list of str
        Columns filled by ``unknown_flag_imputer()``.
    most_freq_imputing_feats : list of str
        Columns filled by ``simple_imputer()``.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        Unfitted transformer with the three imputers, in the order above.
    """
    # groupby_imputer() groups on 'manufacturer' by default, so that column has
    # to travel with the columns it fills.
    group_cols = groupby_imputing_feats + ['manufacturer']
    stages = [
        ('group_imputer', groupby_imputer(), group_cols),
        ('unknown_imputing', unknown_flag_imputer(), unknown_imputing_feats),
        ('most_freq_imputing', simple_imputer(), most_freq_imputing_feats),
    ]
    return ColumnTransformer(transformers=stages)

### Preprocessor

In [None]:
def feature_preproc():
    """Build the unfitted feature preprocessor.

    Two branches are combined in a ``ColumnTransformer``:

    * ``'imputation'``: the categorical columns go through ``imputer(...)`` and
      are then one-hot encoded, ignoring categories unseen during fit.
    * ``'standard'``: the numerical columns go through a default
      ``SimpleImputer`` followed by ``StandardScaler``.

    Returns
    -------
    sklearn.compose.ColumnTransformer
    """
    # Categorical columns, split by the way their missing values are filled.
    groupby_imputing_feats = ['cylinders', 'fuel', 'transmission', 'drive', 'size', 'type']
    unknown_imputing_feats = ['manufacturer', 'title_status', 'paint_color', 'state']
    most_freq_imputing_feats = []
    categorical_cols = (
        groupby_imputing_feats + unknown_imputing_feats + most_freq_imputing_feats
    )

    numeric_cols = ['year', 'odometer', 'lat', 'long']

    categorical_branch = Pipeline(steps=[
        ('inputation', imputer(groupby_imputing_feats,
                               unknown_imputing_feats,
                               most_freq_imputing_feats)),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    numeric_branch = Pipeline(steps=[
        ('inputation', SimpleImputer()),
        ('scaler', StandardScaler()),
    ])

    return ColumnTransformer([
        ('imputation', categorical_branch, categorical_cols),
        ('standard', numeric_branch, numeric_cols),
    ])

def _log_target(x):
    """Return the natural log of ``x`` as a plain NumPy array.

    Accepts a pandas Series/DataFrame as well as an ndarray or list.
    """
    return np.asarray(np.log(x))


def target_preproc():
    """Build a transformer that maps the target to its natural logarithm.

    A named module-level function is used instead of a lambda so the resulting
    transformer can be pickled, and ``np.asarray`` replaces ``.values`` so that
    array or list input works as well as pandas input.

    Returns
    -------
    sklearn.preprocessing.FunctionTransformer
        Unfitted, stateless transformer returning a NumPy array.

    Notes
    -----
    ``np.log`` maps 0 to ``-inf``; inputs are not clipped or shifted here.
    """
    return FunctionTransformer(_log_target, validate=False)

### Data Splitting

In [None]:
# Features: every column except the price and its bucketed version.
X = df.drop(columns=['price', 'price_cate'])

# Target: the raw price column.
# NOTE(review): the model cell below fits a DecisionTreeClassifier on this
# target, so every distinct price becomes a class. Confirm whether
# df.price_cate was meant to be the target instead.
y = df['price']

# First hold out 20% as the test split ...
X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size=.2, random_state=42)

# ... then carve 20% of the remainder off as the validation split.
X_train, X_valid, y_train, y_valid = train_test_split(X_dev, y_dev, test_size=.2, random_state=42)

In [None]:
# Instantiate the (still unfitted) feature and target transformers.
feature_preprocessor, target_preprocessor = feature_preproc(), target_preproc()

In [None]:
# Fit on the training split only, then reuse the fitted transformer for the
# validation and test splits.
# The split names are rebound to the transformed output, so re-running this
# cell would feed already-transformed data back into the preprocessor; re-run
# the split cell first.
X_train = feature_preprocessor.fit_transform(X_train)
X_valid, X_test = (
    feature_preprocessor.transform(split) for split in (X_valid, X_test)
)
# target_preprocessor is not applied here: the targets stay on their original scale.

## model

### Vanilla Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
import gc
import time 

In [None]:
# Drop the names bound to the raw frame, the filtered frame and the pre-split
# intermediates, then run a garbage-collection pass.
# After this cell those names no longer exist, so earlier cells must be re-run
# before this one can be executed again.
del dataset, df, df_price_stat
del X, y, X_dev, y_dev
gc.collect()

98

In [None]:
# NOTE(review): y_train is the raw price column, so this classifier treats each
# distinct price as its own class. Confirm this is intended.
tree = DecisionTreeClassifier(random_state = 84, max_depth=30)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
tree.fit(X_train, y_train)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

In [None]:
# The "Development" figure is computed on the validation split (X_valid / y_valid).
valid_accuracy = tree.score(X_valid, y_valid)
print("Performance on Development data :", valid_accuracy * 100, "%")

# Held-out test split.
test_accuracy = tree.score(X_test, y_test)
print("Performance on Test data :", test_accuracy * 100, '%')