## Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_log_error

## Define functions
1. `get_all_data()` will return combined train and test data 
2. `get_result(return_preds = False)` will return the msle error
3. `get_result(return_preds = True)` will return test predictions

In [8]:
def get_all_data():
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')

    all_data = pd.concat([train, test])
    all_data.drop('ID', axis = 1, inplace = True)
    all_data.columns = all_data.columns.str.replace(' ', '_')
    return all_data

n = 19237
n_test = 8245

In [9]:
def get_result(model, return_preds = False):
    folds = 5

    train_oof = np.zeros((n_train,))
    if return_preds:
        preds = np.zeros((n_test,))

    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    for fold, (train_idx, valid_idx) in enumerate(kf.split(train[features], train['Price'])):
        X_train, X_valid = train.iloc[train_idx], train.iloc[valid_idx]
        y_train = np.log1p(X_train['Price'])
        y_valid = np.log1p(X_valid['Price'])
        X_train = X_train.drop('Price', axis=1)
        X_valid = X_valid.drop('Price', axis=1)

        model =  model.fit(X_train, y_train)

        y_pred = abs(model.predict(X_valid))
        y_pred = np.exp(y_pred)
        y_valid = np.exp(y_valid)
        train_oof[valid_idx] = y_pred

        err = np.sqrt(mean_squared_log_error(y_valid, y_pred))

        if return_preds:
            preds += model.predict(test)

        print(f'Fold {fold} Error: ', err)
    error = np.sqrt(mean_squared_log_error(train['Price'], train_oof))
    print(f'OOF Error: ', error)
    
    if return_preds:
        preds = preds/5
        return preds
    else:
        error = error 
        return (error)

## EDA & FE

In [4]:
idx = 'supercombo1'
iteration = 'all'

all_data = get_all_data()

# ---------------------Outliers----------------------------#
all_data.drop([16983,8541,17777,18957,18984], axis = 0, inplace = True)


# ---------------------Mileage----------------------------#
all_data['Mileage'] = all_data['Mileage'].apply(lambda x: x.split(' ')[0]).astype(float)

all_data['Mileage_trans'] = np.log1p(all_data['Mileage'])
all_data['is0_or_outlier'] = all_data['Mileage_trans'].apply(lambda x: 1 if ((x == 0) or (x>13.8155)) else 0).astype(str)
all_data.drop('Mileage_trans', axis = 1, inplace = True)


#--------------------Engine-----------------------------#
all_data['Engine_size'] = all_data['Engine_volume'].apply(lambda x: x.split(' ')[0]).astype(float)
all_data['isTurbo'] = all_data['Engine_volume'].apply(lambda x: 1 if len(x.split(' '))>1 else 0)


#--------------------Manuf & Model-----------------------------#
all_data['Model'] = all_data['Model'].str.lower()
all_data['Manufacturer'] = all_data['Manufacturer'].str.lower()

all_data['Model_type'] = all_data['Model'].str.split(' ').str.get(0)

all_data['Model_Name'] = all_data['Manufacturer'] + all_data['Model']

all_data.drop(['Model', 'Manufacturer'], axis = 1, inplace = True)


#--------------------Levi-----------------------------#
all_data['islevi0'] = all_data['Levy'].apply(lambda x: 1 if (x == '-') else 0).astype(str)


#------------------Prod. year-------------------------#
all_data['Prod._year'] = all_data['Prod._year'] - all_data['Prod._year'].min()


#--------------------Cylinders-----------------------------#
all_data['Cylinders'] = all_data['Cylinders'].astype(str)


#-------------------------------------------------------#

cat_features = ['Levy', 'Category', 'Leather_interior', 'Fuel_type', 'Engine_volume', 
                'Gear_box_type', 'Drive_wheels', 'Doors', 'Wheel', 'Color',
               'is0_or_outlier','isTurbo','Model_Name', 'Model_type','islevi0', 'Cylinders']

n_train = n-5

## Encoding

In [5]:
cont_features = [col for col in all_data.columns if col not in cat_features + ['Price']]
features = cat_features + cont_features

label_encoder = LabelEncoder()
for col in cat_features:
    all_data[col] = label_encoder.fit_transform(all_data[col])

train = all_data.iloc[:n_train, :]
test = all_data.iloc[n_train:, :]
test = test.drop('Price', axis=1)

## Training data

In [6]:
model = ExtraTreesRegressor(
    max_depth = 100,
    max_features = 0.7352752459022777,
    n_estimators = 150,
    min_samples_split = 2,
    random_state = 42,
    n_jobs = -1,
    verbose = 0,
)

err = get_result(model)

Fold 0 Error:  0.9428321231896353
Fold 1 Error:  0.9135322650663795
Fold 2 Error:  0.9214210871082499
Fold 3 Error:  0.9667184785592778
Fold 4 Error:  0.9245400467267056
OOF Error:  0.937538713345269


## Test set predictions

In [10]:
preds = get_result(model, return_preds = True)

Fold 0 Error:  0.9428321231896353
Fold 1 Error:  0.9135322650663795
Fold 2 Error:  0.9214210871082499
Fold 3 Error:  0.9667184785592777
Fold 4 Error:  0.9245400467267056
OOF Error:  0.9375387133452688


In [11]:
submission = pd.read_csv('submission.csv')
submission.loc[:, 'Price'] = preds
submission.to_csv('my_submission_file.csv', index=False)

___