In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sys
import os
from os.path import join, isfile
import warnings
warnings.filterwarnings("ignore")

## Project Setup

### Loading external Lib

In [2]:
# Load CONSTANT definition (As always, this is a bad practice but it keeps things tidy ;))
import constants as cnst

sys.path.append(cnst.BS_LIB_PATH)
from bs_lib import bs_file as bsf

In [3]:
# Load some data cleaning and outlier processing functions
import prepare_data as prepare
import build_model as build

### Creating needed directories

In [4]:
# Create every directory the pipeline writes into (prepared data, processed
# files, saved models). Paths come from the project's `constants` module.
for folder in [cnst.PREPARED_DATASET_PATH, cnst.PROCESSED_FILES_PATH, cnst.MODEL_DIR_PATH]:
    try:
        # makedirs also creates missing parent directories; os.mkdir would
        # raise FileNotFoundError on a fresh checkout with no parent folder.
        os.makedirs(folder)
    except FileExistsError:
        # The path may exist as a regular file: only an existing directory is
        # acceptable, otherwise later writes into it would fail obscurely.
        if not os.path.isdir(folder):
            raise
        print(f"{folder} already created")

In [5]:
#os.rmdir(cnst.PREPARED_DATASET_PATH)

## Processing Files 

Standardization of the different data structures:
* Opinionated correction of minor typing errors
* Changed the name of the `tax(£)` column.
* Added the brand name as a characteristic
* Cleaned up the empty space in the `model`.

Processed files are saved as **{brand_name}.csv** in the `processed_files` directory 

In [6]:
# Source files that must not be loaded from the original dataset directory.
file_to_exclude = ['cclass.csv','unclean focus.csv','unclean cclass.csv','focus.csv']

# Load the remaining CSV files; the result is used below as a
# {brand: DataFrame} mapping.
all_df = bsf.load_all_csv(dataset_path=cnst.ORIGINAL_DATASET_PATH, exclude=file_to_exclude)

# Column layout of every processed file written by this cell.
columns = ['model', 'year',
           'price', 'transmission',
           'mileage', 'fuel_type',
           'tax', 'mpg',
           'engine_size', 'brand']

# Opinionated brand typo correction: re-register each frame under its fixed name.
for source_name, brand_name in (('hyundi', 'hyundai'),
                                ('merc', 'mercedes'),
                                ('vauxhall', 'opel')):
    all_df[brand_name] = all_df.pop(source_name)

# Hyundai stores its tax in 'tax(£)': copy it into 'tax', then drop the original.
hyundai_df = all_df['hyundai']
hyundai_df['tax'] = hyundai_df['tax(£)']
hyundai_df.drop(columns='tax(£)', inplace=True)

for brand, dataframe in all_df.items():
    print(f"Source: {brand}, Shape: {dataframe.shape}")

    # The brand name becomes a feature of its own.
    dataframe['brand'] = brand

    # Remove leading/trailing blanks from the `model` labels.
    dataframe['model'] = dataframe['model'].str.strip()

    # Restrict/reorder to the standard column layout and write it out as CSV.
    brand_df = pd.DataFrame(dataframe, columns=columns)
    dest_file_path = join(cnst.PROCESSED_FILES_PATH, f'{brand}.csv')
    brand_df.to_csv(dest_file_path)
    print(f"Dest:   {brand} data saved @ {dest_file_path}\n")

## Preparing Data

The following routine will be performed brand by brand:
1. Modification of the data type:
   * Object to categorical data
   * All others to numeric data
2. Standardization and normalization
3. Removal of 'Other' and 'Electrical' categories
4. Replacement of the target by the logarithm of the target (RMSLE) 
5. Marking of outliers
6. Outlier imputation

In [7]:
# Prepare each brand once: reuse the prepared file when one can be loaded,
# otherwise run the cleaning pipeline and save its result.
for brand, dataframe in all_df.items():
    filename = f"{brand}.csv"
    prepared_df = prepare.load_prepared_file(filename=filename)

    # A DataFrame here means the prepared file was loaded: nothing to compute.
    if isinstance(prepared_df, pd.DataFrame):
        current_df = prepared_df
        continue

    # Pipeline: clean variables -> mark outliers as NaN -> KNN imputation.
    current_df = prepare.numerical_imputer(
        prepare.nan_outliers(prepare.clean_variables(dataframe)),
        n_neighbors=10,
        weights='distance',
        imputer_type='KNN',
    )
    prepare.save_prepared_file(current_df, filename=filename)

## Merge an "all brands" dataset

In [8]:
 # if file doesn't already exist
filename = "all_brands_dataset.csv"
file_path = join(cnst.OUTPUT_PATH, filename)
if isfile(file_path):
    all_brands_df = bsf.load_csv_file(file_path, index=0)
else:
    all_df_dict = bsf.load_all_csv(dataset_path=cnst.PREPARED_DATASET_PATH, 
                               exclude=[], 
                               index=0)
    # load all csv files containing prepared data
    all_brands_df = pd.DataFrame(columns=columns, dtype=float)
    
    # merge all those files
    for brand, dataframe in all_df_dict.items():
        all_brands_df = pd.concat([all_brands_df, dataframe], 
                                  ignore_index=True)
    
    # clean, standardize and normalize again but using all data
    all_brands_df = prepare.clean_variables(all_brands_df)
    all_brands_df = prepare.nan_outliers(all_brands_df)
    all_brands_df = prepare.numerical_imputer(all_brands_df, 
                                              n_neighbors=10, 
                                              weights='distance', 
                                              imputer_type='KNN')
    # save all_brand_dataset.csv 
    all_brands_df.to_csv(file_path)

In [9]:
# Sanity check of the merged dataset: print its column/dtype/non-null summary.
all_brands_df.info()

## Split dataset

1. Extract the dependent variable `price`
2. Create a dictionary containing, for each categorical variable, an ordered list according to the target value
3. Split the dataset into training, validation and test data. 
   
The resulting sizes of these split data sets will be respectively 75%, 15%, 10% of the size of the original data set

The dictionary of ordered categories will be used to give an implicit order to categories that do not intrinsically have one.

In [10]:
# Name of the dependent variable.
target = 'price'

# Keep the target values aside before the column is removed from the features.
df_target = all_brands_df[target]

# Per-category ordering derived from the target; computed while the target
# column is still part of the frame.
ordered_categories = prepare.get_ordered_categories(data=all_brands_df, by=target)

# NOTE: the drop happens in place, so this cell cannot be re-run on its own:
# a second run fails on the lookup above because `price` is already gone.
all_brands_df.drop(columns=target, inplace=True)

# 75% / 15% / 10% train / validation / test split with a fixed seed.
X_train, X_val, X_test, y_train, y_val, y_test = prepare.train_val_test_split(
    X=all_brands_df,
    y=df_target,
    train_size=.75,
    val_size=.15,
    test_size=.1,
    random_state=1,
    show=True,
)

## Model building

In [11]:
# Obtain the model for the merged dataset from the project's `build_model`
# helper, passing the training and validation splits plus the ordered categories.
# NOTE(review): how "best" is decided, and whether the 'all_brands' label is
# also used as a cache/file key, is defined in build_model — confirm there.
model = build.get_best_model('all_brands', X_train, y_train, X_val, y_val, ordered_categories, verbose=False)

## Model Evaluation