## imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

%matplotlib inline

## preprocessing

### Data Loading

In [None]:
# Location of the zipped CSV that the whole notebook is built from.
file_path = "https://aml-team18.s3.amazonaws.com/vehicles.csv.zip"
# The archive is read directly; pandas unpacks the zip on the fly.
dataset = pd.read_csv(filepath_or_buffer=file_path, compression='zip')

### Pre-transform

In [None]:
# Columns excluded from modelling.
drop_columns = ['id','url', 'region_url', 'VIN', 'image_url', 'posting_date', 'county']

# Rows outside these bounds are discarded before any statistics are computed.
PRICE_CAP = 100000
YEAR_CAP_MIN = 2000
YEAR_CAP_MAX = 2020

df = dataset.drop(columns=drop_columns)
in_range = (df.price <= PRICE_CAP) & (df.year >= YEAR_CAP_MIN) & (df.year <= YEAR_CAP_MAX)
# .copy() gives an independent frame, so adding `price_cate` below never
# writes into a slice of the unfiltered data (no SettingWithCopyWarning).
df = df[in_range].copy()

# Quartiles of the *filtered* prices define the four price buckets.
df_price_stat = df.price.describe()
q25, q50, q75 = (df_price_stat[q] for q in ('25%', '50%', '75%'))

# Vectorised bucket assignment. np.select picks the first matching condition,
# which reproduces the original if/elif order (ties on a quartile boundary go
# to the lower bucket; anything matching no condition is labelled 'high').
price = df.price
df['price_cate'] = np.select(
    [
        (price >= 0) & (price <= q25),
        (price >= q25) & (price <= q50),
        (price >= q50) & (price <= q75),
    ],
    ['low', 'medium low', 'medium high'],
    default='high',
)

### Data Imputer

In [None]:
def simple_imputer():
    """Return a SimpleImputer that fills gaps with each column's most frequent value."""
    most_frequent = SimpleImputer(strategy='most_frequent')
    return most_frequent

def groupby_imputer(by_cate = 'manufacturer'):
    """Build a pipeline that fills missing values from rows sharing the same group key.

    For every column of the frame it receives, a missing entry is replaced by
    the first non-null value of that column found among rows with the same
    ``by_cate`` value. If the group has no non-null value, or the row's own
    ``by_cate`` is missing, the string ``'nan'`` is used instead.

    Parameters
    ----------
    by_cate : str, default 'manufacturer'
        Name of the grouping column. It must be one of the columns handed to
        the transformer; it is imputed (missing -> ``'nan'``) and returned
        like any other column.

    Returns
    -------
    Pipeline
        Single-step pipeline wrapping a ``FunctionTransformer``. Its output is
        a 2-D object array with one column per input column, in input order.

    Notes
    -----
    The transformer keeps no fitted state: the per-group lookup is rebuilt
    from whichever frame is passed in, so each data split is imputed from its
    own rows.
    """
    def _groupby_imputer(x):
        group_keys = x[by_cate]

        def _fill_column(col):
            # First non-null value of `col` per group; all-missing groups stay NaN.
            first_seen = x.groupby(by_cate)[col].first()
            # Align the per-group value with every row. Rows whose key is
            # missing (or whose group had nothing to offer) get the 'nan' flag.
            fallback = group_keys.map(first_seen).to_numpy(dtype=object)
            fallback = np.where(pd.isnull(fallback), 'nan', fallback)
            observed = x[col].to_numpy(dtype=object)
            # Keep observed values; only holes take the group fallback.
            return np.where(pd.isnull(observed), fallback, observed)

        return np.column_stack([_fill_column(col) for col in x.columns])

    pl = Pipeline(
        steps = [
            ('indication', FunctionTransformer(_groupby_imputer, validate=False))
        ]
    )
    return pl

def unknown_flag_imputer():
    """Return a SimpleImputer that replaces missing entries with the constant 'nan'."""
    flag_missing = SimpleImputer(fill_value='nan', strategy='constant')
    return flag_missing


def imputer(groupby_imputing_feats, unknown_imputing_feats, most_freq_imputing_feats,
            by_cate='manufacturer'):
    """Assemble the categorical imputation stage as a ColumnTransformer.

    Parameters
    ----------
    groupby_imputing_feats : sequence of str
        Columns filled from other rows that share the same ``by_cate`` value.
    unknown_imputing_feats : sequence of str
        Columns whose missing entries become the constant ``'nan'``.
    most_freq_imputing_feats : sequence of str
        Columns filled with their most frequent value.
    by_cate : str, default 'manufacturer'
        Grouping column for the group-wise imputer. It is added to the
        group-imputed selection (once) so the imputer can read it.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer with the three imputers, in the order above.
    """
    group_cols = list(groupby_imputing_feats)
    # The group imputer needs the key column in its input. Selecting it twice
    # would produce duplicate column labels, so only add it when absent.
    if by_cate not in group_cols:
        group_cols.append(by_cate)

    data_imputation = ColumnTransformer(
        transformers = [
            ('group_imputer', groupby_imputer(by_cate), group_cols),
            ('unknown_imputing', unknown_flag_imputer(), unknown_imputing_feats),
            ('most_freq_imputing', simple_imputer(), most_freq_imputing_feats)
        ]
    )
    return data_imputation

### Decategorizer

In [None]:
# todo

### Preprocessor

In [None]:
def feature_preproc():
    """Build the unfitted feature preprocessor for the notebook's feature frame.

    Categorical columns are imputed and one-hot encoded; numerical columns are
    imputed with ``SimpleImputer``'s default strategy and standardised.

    Returns
    -------
    ColumnTransformer
        Transformer with an 'imputation' branch (categoricals) and a
        'standard' branch (numericals).
    """
    groupby_imputing_feats = ['cylinders', 'fuel', 'transmission', 'drive', 'size', 'type']
    # 'manufacturer' is deliberately absent from this list: `imputer` already
    # sends it through the group imputer (as the grouping key), which emits it
    # with missing values flagged as 'nan'. Listing it here as well would feed
    # the same column to the one-hot encoder twice and duplicate its indicators.
    unknown_imputing_feats = ['title_status', 'paint_color', 'state']
    most_freq_imputing_feats = []

    # Every column the categorical branch must receive, including the group key.
    onehotfeats = (
        groupby_imputing_feats
        + ['manufacturer']
        + unknown_imputing_feats
        + most_freq_imputing_feats
    )

    # Step names are kept as-is because they are part of the parameter keys
    # exposed by get_params / set_params.
    imputation_pipe = Pipeline(
        steps = [
                 ('inputation', imputer(
                                groupby_imputing_feats,
                                unknown_imputing_feats,
                                most_freq_imputing_feats
                               )),
                 # Categories not seen during fit are encoded as all zeros instead of raising.
                 ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]
    )
    numericalfeats = ['year', 'odometer', 'lat', 'long']

    numerical_pipe = Pipeline(
        steps = [
                 ('inputation', SimpleImputer()),
                 ('scaler', StandardScaler())
        ]
    )

    preprocessor = ColumnTransformer(
        [
         ('imputation',imputation_pipe, onehotfeats),
         ('standard', numerical_pipe, numericalfeats),
        #  ('target', )
         ]
    )
    return preprocessor

def _log_values(x):
    """Return the natural log of ``x`` as a NumPy array (accepts pandas objects or array-likes)."""
    return np.log(np.asarray(x))


def target_preproc():
    """Build the target transformer: natural log forward, exponential back.

    A named module-level function is used instead of a lambda so the
    transformer can be pickled, and ``np.asarray`` lets it accept plain arrays
    as well as pandas objects. ``inverse_func`` makes ``inverse_transform``
    map log-scale predictions back to the original scale.

    NOTE(review): the log of a zero target is -inf. Confirm zero prices are
    removed, or switch to log1p/expm1, before enabling this transform.

    Returns
    -------
    FunctionTransformer
    """
    return FunctionTransformer(
        _log_values,
        inverse_func=np.exp,
        validate=False,
        # Skip the fit-time round-trip check; it adds nothing for log/exp.
        check_inverse=False,
    )

### Data Splitting

In [None]:
# Target is the raw price; features are everything except the price and its bucket label.
y = df['price']
X = df.drop(columns=['price', 'price_cate'])
# y_cate = df.price_cate

# First cut: hold out 20% of all rows as the test set.
X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size=.2, random_state=42)

# Second cut: 20% of the remaining development rows become the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(X_dev, y_dev, test_size=.2, random_state=42)

In [None]:
# Instantiate the (still unfitted) feature and target transformers.
feature_preprocessor, target_preprocessor = feature_preproc(), target_preproc()

In [None]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformer to validation and test. Each name is rebound to its transformed matrix.
X_train = feature_preprocessor.fit_transform(X_train)
X_valid, X_test = (feature_preprocessor.transform(split) for split in (X_valid, X_test))
# y_train = target_preprocessor.transform(y_train)
# y_valid = target_preprocessor.transform(y_valid)
# y_test = target_preprocessor.transform(y_test)

## model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear baseline fitted on the preprocessed training matrix.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

LinearRegression()

In [None]:
lr.score(X_train, y_train)

0.4761100216010916

In [None]:
lr.score(X_valid, y_valid)

0.45508906041959907

In [None]:
lr.score(X_test, y_test)

0.46631611567360765

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# random_state pins the bootstrap and feature sampling so the scores reported
# below are reproducible (same seed as the data splits).
rfr = RandomForestRegressor(n_jobs=-1, n_estimators=500, max_depth=13, random_state=42)
rfr.fit(X_train, y_train)

RandomForestRegressor(max_depth=13, n_estimators=500, n_jobs=-1)

In [None]:
rfr.score(X_train, y_train)

0.7227569056786043

In [None]:
rfr.score(X_valid, y_valid)

0.625410056901474

In [None]:
rfr.score(X_test, y_test)

0.6313915802997974

### AdaBoost Regressor

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# The base learner is passed positionally: the keyword for it was renamed from
# `base_estimator` to `estimator` in later scikit-learn releases, while the
# first positional slot works in both. random_state makes the boosting
# resampling reproducible.
abr = AdaBoostRegressor(LinearRegression(), n_estimators=500, random_state=42)
abr.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=LinearRegression(), n_estimators=500)

In [None]:
abr.score(X_train, y_train)

0.43152729810492396

In [None]:
abr.score(X_valid, y_valid)

0.3990289646001648

In [None]:
abr.score(X_test, y_test)

0.41231979661178

### Histogram-based Gradient Boosting Regression Tree

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# 3 x 3 grid over boosting iterations and tree depth.
params = {'max_iter': [500, 1000, 1500], 'max_depth': [9, 13, 17]}
# A fixed random_state keeps the search results repeatable between runs.
gs_hgbr = GridSearchCV(HistGradientBoostingRegressor(random_state=42), params, n_jobs=-1)
# The estimator is fed a dense copy of the feature matrix.
gs_hgbr.fit(X_train.toarray(), y_train)

  "Since version 1.0, "


In [None]:
gs_hgbr.best_params_

In [None]:
gs_hgbr.best_score_

In [None]:
best_params = gs_hgbr.best_params_
# GridSearchCV (default refit=True) has already refit the winning
# configuration on the full training data, so reuse that estimator instead of
# paying for an identical second fit.
hgbr = gs_hgbr.best_estimator_
hgbr

In [None]:
hgbr.score(X_train.toarray(), y_train)

In [None]:
hgbr.score(X_valid.toarray(), y_valid)

In [None]:
hgbr.score(X_test.toarray(), y_test)

### Multi-layer Perceptron Regressor

In [None]:
from sklearn.neural_network import MLPRegressor

# random_state fixes weight initialisation and batch shuffling so the reported
# scores can be reproduced.
mlpr = MLPRegressor(hidden_layer_sizes=(256, 64, 16), max_iter=100, alpha=0.1, random_state=42)
mlpr.fit(X_train, y_train)

In [None]:
mlpr.score(X_train, y_train)

In [None]:
mlpr.score(X_valid, y_valid)

In [None]:
mlpr.score(X_test, y_test)

### XGBoost

In [None]:
def print_scores(model_name, splits=None):
  """Print a fitted model's ``score`` on the train, validation and test splits.

  Parameters
  ----------
  model_name : object
      Fitted estimator exposing ``score(X, y)`` (despite the parameter name,
      this is the model itself, not a string).
  splits : sequence of (X, y) pairs, optional
      Exactly three pairs in train, valid, test order. Defaults to the
      notebook-level ``X_train``/``X_valid``/``X_test`` matrices and their
      targets. Pass explicit pairs to score on other representations
      (e.g. dense arrays).

  Returns
  -------
  None
      The scores are only printed, each formatted to four decimals.
  """
  if splits is None:
    # Resolved at call time so the current notebook globals are used.
    splits = [(X_train, y_train), (X_valid, y_valid), (X_test, y_test)]
  scores = [model_name.score(features, target) for features, target in splits]
  print("Scores: \n\tTrain {:.4f}\n\tValid {:.4f}\n\tTest  {:.4f}".format(*scores))

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# params = {'learning_rate':[0.1, 0.3], 'n_estimators':[30,300], 'max_depth':[12, 15]}

# gs_xgb_model = GridSearchCV(xgb.XGBRegressor(objective='reg:squarederror', alpha = 10), params)
# gs_xgb_model.fit(X_train, y_train)

In [None]:
# best_params = gs_xgb_model.best_params_
# print("Best params:", best_params)
# xgb_model_1 = xgb.XGBRegressor(objective='reg:squarederror', alpha = 10, max_depth=best_params['max_depth'],
#                              n_estimators = best_params['n_estimators'], learning_rate=best_params['learning_rate'])

# xgb_model_1.fit(X_train, y_train)

In [None]:
# Hand-picked hyper-parameters for the second XGBoost model.
xgb_params = dict(
    objective='reg:squarederror',
    max_depth=11,
    alpha=4,
    n_estimators=550,
    eta=.3,
)
xgb_model_2 = xgb.XGBRegressor(**xgb_params)
xgb_model_2.fit(X_train, y_train)


XGBRegressor(alpha=4, eta=0.3, max_depth=11, n_estimators=550,
             objective='reg:squarederror')

In [None]:
print_scores(xgb_model_2)

Scores: 
	Train 0.9406
	Valid 0.7919
	Test  0.8074


In [None]:
# xgb_model_2.save_model('xgb_model_v1.1.json')

# xgb_model_2 = xgb.XGBRegressor()
# xgb_model_2.load_model('xgb_model_v1.1.json')

### CatBoost

In [None]:
try: 
  from catboost import CatBoostRegressor
except:
  !pip install catboost
  from catboost import CatBoostRegressor

# CatBoost model with fixed hyper-parameters; verbose=0 silences per-iteration logging.
cb_model = CatBoostRegressor(
    max_depth=13,
    n_estimators=300,
    learning_rate=.3,
    verbose=0,
)
cb_model.fit(X_train, y_train)


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

<catboost.core.CatBoostRegressor at 0x7f5876e35350>

In [None]:
print_scores(cb_model)

Scores: 
	Train 0.8954
	Valid 0.7281
	Test  0.7384
