# Cross Validation

In [1]:
import numpy as np
import pandas as pd
import used_cars

## Importing the dataset

In [2]:
# Directory that holds the dataset files, relative to this notebook.
dataset_path = "../Datasets/"

# Load the feather file, selecting the columns named in used_cars.Info.columns,
# then fill missing entries with pd.NA.
fullrawdata = (
    pd.read_feather(
        f"{dataset_path}used_cars_data_medium.feather",
        columns=used_cars.Info.columns,
    )
    .fillna(pd.NA)
)

# Alternative loader for the CSV export (kept for reference):
# fullrawdata = pd.read_csv("used_cars_data.csv", sep=",", engine="c", header=0,
#                           names=used_cars.Info.columns,
#                           dtype=used_cars.Info.columns_dtype)

In [3]:
# Columns handed to used_cars.remove_null_rows in this cell.
cols = [
    "back_legroom",
    "front_legroom",
    "fuel_tank_volume",
    "height",
    "length",
    "maximum_seating",
    "width",
    "body_type",
    "fuel_type",
    "transmission",
    "wheel_system",
    "engine_type",
    "power",
    "torque",
]
# Rebind fullrawdata to the result of the project helper.
# NOTE(review): presumably drops rows with nulls in any of `cols` — confirm
# against used_cars.remove_null_rows.
fullrawdata = used_cars.remove_null_rows(fullrawdata, cols)

## Cross validation

In [4]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [5]:
# Experiment settings used by the sampling, splitting and evaluation cells.
sample_size, test_size, random_state = 10_000, 0.1, 0

# Number of splits is the truncated reciprocal of the test fraction
# (1 / 0.1 -> 10).
n_splits = int(1 / test_size)

In [6]:
# Draw a seeded random sample of `sample_size` rows from the loaded data.
df = fullrawdata.sample(n=sample_size, random_state=random_state)

In [7]:
# Move the target to the end of df (pop removes it, assignment re-appends it
# as the last column), then separate the features from the target series.
df["price"] = df.pop("price")
X = df.drop(columns="price")
y = df["price"]

In [8]:
# Candidate regressors as (label, estimator) pairs; the label is later used to
# key the fitted pipelines. Seeds come from the notebook-level `random_state`
# so that changing the seed in the configuration cell also changes the models
# (they were previously pinned to a literal 0).

# Default hyperparameters
models = [
    ("Linear Regression", LinearRegression()),
    ("Decision Tree", DecisionTreeRegressor(random_state=random_state)),
    ("Random Forest", RandomForestRegressor(random_state=random_state)),
    ("XGBoost", XGBRegressor(random_state=random_state)),
]

# Final Hyperparameters from Grid Search
# models = [
#     ("Linear Regression", LinearRegression()),
#     ("Decision Tree", DecisionTreeRegressor(random_state=random_state)),
#     ("Random Forest", RandomForestRegressor(random_state=random_state,
#                                             n_estimators=50)),
#     ("XGBoost", XGBRegressor(random_state=random_state, learning_rate=0.15,
#                              max_depth=10, n_estimators=125)),
# ]

# Imputers to combine with each model; SimpleImputer() is left at its defaults.
imputers = [SimpleImputer()]

In [9]:
# Seeded shuffle-based splitter: `n_splits` random partitions, each holding out
# a `test_size` fraction of the rows.
cv = ShuffleSplit(n_splits, test_size=test_size, random_state=random_state)

In [10]:
# Eight-column feature subset; in this notebook it is only referenced by the
# commented-out `final_cols` argument of the run_test call.
cols_8 = [
    "powerRPM",
    "savings_amount",
    "torqueRPM",
    "pca_fuel_economy_1",
    "pca_engine_1",
    "pca_car_usage_1",
    "avg_car_space",
    "franchise_make_encoded_1",
]

In [11]:
# Run the evaluation through the project helper, handing it the features and
# target, the split/sample settings, the candidate models and imputers, and
# the splitter.
# NOTE(review): all arguments are positional, so their order must match
# used_cars.run_test's signature — confirm there before reordering.
test_results = used_cars.run_test(
    X, y, n_splits, sample_size, test_size, random_state, models, imputers, cv, 
    # Optional: uncomment to pass the cols_8 feature subset.
    # final_cols=cols_8
)

## Exporting results

In [12]:
from datetime import datetime

In [13]:
# Directory for exported files, relative to this notebook.
output_path = "../Output/"

# Results file name: output directory + "medium_" + sample size + timestamp
# (minute resolution). The size is taken from `sample_size` rather than a
# literal so the name cannot drift from the configured sample.
used_cars.write_test_results(
    "{}medium_{}_{}".format(
        output_path, sample_size, datetime.now().strftime("%Y-%m-%d_%H-%M")
    ),
    test_results,
    models,
)

## Exporting fitted models

In [14]:
import joblib
from sklearn.model_selection import train_test_split

In [15]:
# Single seeded hold-out partition, using the same test fraction and seed as
# the rest of the notebook.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

In [16]:
# Everything that gets exported: the train/test partition under "dataset",
# plus one fitted pipeline per (model, imputer) pair keyed "<label>_<imputer>".
fitted_models = {
    "dataset": {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
    },
}
for label, model in models:
    for imputer in imputers:
        # Build each pipeline from fresh, unfitted copies. scikit-learn
        # pipelines hold references to their steps, so reusing the same
        # estimator/imputer objects would make pipelines stored under
        # different keys share (and overwrite) one fitted instance as soon as
        # more than one imputer is listed.
        fit_model = used_cars.make_used_cars_pipeline(
            model=clone(model),
            imputer=clone(imputer),
            # Only a model labelled "Ridge Regression" gets a scaler; the
            # `models` list defined in this notebook has no such entry.
            scaler=StandardScaler() if label == "Ridge Regression" else None,
        )
        fit_model.fit(X_train, y_train)
        # Key uses the original imputer's repr, e.g. "SimpleImputer()".
        fitted_models[f"{label}_{imputer}"] = fit_model

In [17]:
def export_model(file_name, fitted_models=None):
    """Export a fitted-models dictionary to ``<file_name>.joblib``.

    Parameters
    ----------
    file_name : str
        Destination path without the ``.joblib`` extension.
    fitted_models : dict, optional
        Object to serialise. When omitted, the notebook-level
        ``fitted_models`` variable is looked up at call time, so re-running
        the fitting cell is picked up without redefining this function.

    Raises
    ------
    NameError
        If ``fitted_models`` is omitted and no notebook-level variable of that
        name exists.
    """
    if fitted_models is None:
        # The parameter shadows the global of the same name, so fetch the
        # current global explicitly instead of freezing it as a default.
        try:
            fitted_models = globals()["fitted_models"]
        except KeyError:
            raise NameError("name 'fitted_models' is not defined") from None
    with open(f"{file_name}.joblib", "wb") as f:
        joblib.dump(fitted_models, f)

In [18]:
# Model file name: output directory + "medium_" + sample size + "_model_" +
# timestamp (minute resolution). The size is taken from `sample_size` rather
# than a literal so the name reflects the configured sample.
export_model(
    "{}medium_{}_model_{}".format(
        output_path, sample_size, datetime.now().strftime("%Y-%m-%d_%H-%M")
    )
)