In [1]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from category_encoders import TargetEncoder, OrdinalEncoder, BinaryEncoder
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
import dill

In [2]:
def get_data(path):
    return pd.read_csv(path)

def clear_data(data):
    data = data.drop_duplicates(keep=False)
    q1 = data.quantile(0.1)
    q3 = data.quantile(0.9)
    iqr = q3-q1
    data = data[~(((data>q3+3*iqr) | (data<q1-3*iqr)).any(axis=1))]
    return data

def data_split(data, target_col_name):
    y = data[target_col_name]
    X = data.drop(target_col_name, axis=1)
    return train_test_split(X, y, test_size=.3, random_state=10, shuffle=True)

In [3]:
def calc_score(y, y_pred):
    return r2_score(y , y_pred)

In [4]:
class TypeConverter(BaseEstimator, TransformerMixin):
    def __init__(self, key, to_type):
        self.key = key
        self.to_type = to_type

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X[self.key] = X[self.key].apply(self.to_type)
        return X

class StringReplace(BaseEstimator, TransformerMixin):
    def __init__(self, key, target, replacer):
        self.key = key
        self.target = target
        self.replacer = replacer

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X[self.key] = X[self.key].str.replace(self.target, self.replacer)
        return X

class GenderateTurboFeature(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X['Turbo'] = X['Engine volume'].str.strip().apply(lambda x : 1 if len(x) > 3 else 0)
        return X

In [5]:
num_features = ['Prod. year', 'Airbags']
one_hot_encoded_features = ['Fuel type', 'Gear box type', 'Drive wheels']
ordinal_encoded_features = ['Manufacturer', 'Color']
target_encoded_features = ['Model', 'Category']
binary_encoded_features = ['Wheel', 'Leather interior']

str_scaler_step = ('std_scaler', StandardScaler())

levy_transformer = Pipeline([
    ('replacer', StringReplace(key='Levy', target='-', replacer='0')),
    ('converter', TypeConverter(key='Levy', to_type=int)),
    str_scaler_step
])

mileage_transformer = Pipeline([
    ('replacer', StringReplace(key='Mileage', target='km', replacer='')),
    ('converter', TypeConverter(key='Mileage', to_type=int)),
    str_scaler_step
])

doors_transformer = Pipeline([
    ('replacer_1', StringReplace(key='Doors', target='04-May', replacer='4-5')),
    ('replacer_2', StringReplace(key='Doors', target='02-Mar', replacer='2-3')),
    ('encoder', OneHotEncoder())
])

engine_volume_transformer = Pipeline([
    ('genderate Turbo feature', GenderateTurboFeature()),
    ('replacer', StringReplace(key='Engine volume', target=r"([a-z,' ',A-Z])", replacer='')),
    ('converter', TypeConverter(key='Engine volume', to_type=float)),
    str_scaler_step
])

num_features_transform = Pipeline([str_scaler_step])

In [6]:
preprocessor = ColumnTransformer(transformers=[
    ('Levy', levy_transformer, ['Levy']),
    ('Mileage', mileage_transformer, ['Mileage']),
    ('Doors', doors_transformer, ['Doors']),
    ('Engine volume', engine_volume_transformer, ['Engine volume']),
    ('rest numeric features', num_features_transform, num_features),
    ('binary encoder', BinaryEncoder(), binary_encoded_features),
    ('ordinal encoder', OrdinalEncoder(), ordinal_encoded_features),
    ('one hot encoder', OneHotEncoder(), one_hot_encoded_features),
    ('target encoder', TargetEncoder(), target_encoded_features),
])



In [7]:
model_params = {
    'random_state': 10,
    'booster': 'gbtree',
    'learning_rate': .1,
    'max_depth': 6,
    'n_estimators': 500
}
model = XGBRegressor(**model_params)

pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('model', model)
])

In [8]:
df = get_data('car_price_prediction.csv')
cleared_df = clear_data(df)

X_train, X_test, y_train, y_test = data_split(cleared_df, 'Price')

  data = data[~(((data>q3+3*iqr) | (data<q1-3*iqr)).any(axis=1))]


In [9]:
pipeline.fit(X_train, y_train)

  X[self.key] = X[self.key].str.replace(self.target, self.replacer)


In [10]:
y_pred_train = pipeline.predict(X_train)
y_pred = pipeline.predict(X_test)

  X[self.key] = X[self.key].str.replace(self.target, self.replacer)
  X[self.key] = X[self.key].str.replace(self.target, self.replacer)


In [11]:
score_train = calc_score(y_train , y_pred_train)
score = calc_score(y_test , y_pred)
print("Train R-square: ", score_train)
print("Test R-square: ", score)

Train R-square:  0.9456300759507689
Test R-square:  0.795480298278294


In [12]:
with open("pipeline.dill", "wb") as pipeline_file:
    dill.dump(pipeline, pipeline_file)