## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, Imputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

## Old Feature

In [2]:
class OldFeature(BaseEstimator, TransformerMixin):
    """Adds the ``old`` feature: acquisition year minus the ``year`` column.

    The transformer is stateless; ``fit`` exists only so the class can be
    used as a step inside an sklearn ``Pipeline``.
    """

    def fit(self, X:pd.DataFrame, y:pd.Series = None) -> 'OldFeature':
        """Fit statement to accommodate the sklearn pipeline (nothing is learned)."""

        return self

    def transform(self, X:pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with parsed date columns and the new ``old`` column.

        Parameters
        ----------
        X : pd.DataFrame
            Must contain ``acquisition_date`` (parseable by ``pd.to_datetime``)
            and ``year``. ``last_updated`` is parsed as well when present.

        Returns
        -------
        pd.DataFrame
            Copy of ``X`` with ``old = acquisition_date.dt.year - year`` added.
        """

        X = X.copy()

        # Bracket indexing is used for every column write: attribute-style
        # assignment only works for columns that already exist.
        X['acquisition_date'] = pd.to_datetime(X['acquisition_date'])

        # `last_updated` is not used to compute `old`, so it is converted only
        # when the column is supplied instead of failing when it is absent.
        if 'last_updated' in X.columns:
            X['last_updated'] = pd.to_datetime(X['last_updated'])

        X['old'] = X['acquisition_date'].dt.year - X['year']

        return X

## Categorical Imputer

In [3]:
class CategoricalImputer(BaseEstimator, TransformerMixin):
    """Replaces missing values in the configured columns with the string 'Missing'."""

    def __init__(self, variables=None)-> None:
        # A non-list argument is wrapped so `self.variables` is always a list.
        self.variables = variables if isinstance(variables, list) else [variables]

    def fit(self, X:pd.DataFrame, y:pd.Series = None) -> 'CategoricalImputer':
        """No-op fit, present so the class can be used as an sklearn pipeline step."""
        return self

    def transform(self, X:pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with NaNs in each configured column filled with 'Missing'."""
        imputed = X.copy()
        for column in self.variables:
            imputed[column] = imputed[column].fillna('Missing')
        return imputed

## Numerical Imputer

In [4]:
class NumericalImputer(BaseEstimator, TransformerMixin):
    """Fills missing numerical values with the column mean observed during ``fit``."""

    def __init__(self, variables = None) -> None:
        # A non-list argument is wrapped so `self.variables` is always a list.
        self.variables = variables if isinstance(variables, list) else [variables]

    def fit(self, X:pd.DataFrame, y:pd.Series = None) -> 'NumericalImputer':
        """Store the mean of every configured column in ``self.imputer_dict``."""
        self.imputer_dict = {column: X[column].mean() for column in self.variables}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with NaNs replaced by the means stored by ``fit``."""
        filled = X.copy()
        for column in self.variables:
            fitted_mean = self.imputer_dict[column]
            filled[column] = filled[column].fillna(fitted_mean)
        return filled

## Feature Selector

In [5]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Selects the subset of the features listed in ``variables``."""

    def __init__(self,variables=None) -> None:
        # A non-list argument is wrapped so `self.variables` is always a list.
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X:pd.DataFrame, y:pd.Series = None) -> 'FeatureSelector':
        """Fit statement to accommodate the sklearn pipeline (nothing is learned)."""

        return self

    def transform(self, X:pd.DataFrame) -> pd.DataFrame:
        """Return a new DataFrame holding only the configured columns.

        Raises ``KeyError`` (from pandas) if a configured column is missing.
        """
        # Select first, then copy: only the kept columns are duplicated, and
        # the result is independent of the caller's frame.
        return X[self.variables].copy()

## Categorical Encoder

In [6]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Label-encodes categorical features, with a fallback class for unseen values.

    ``fit`` trains one ``LabelEncoder`` per configured column and appends the
    extra class ``'Not Found'``. ``transform`` maps every value that was not
    seen during ``fit`` to that class before encoding.
    """

    # Placeholder label assigned to values that were not present at fit time.
    _UNSEEN_LABEL = 'Not Found'

    def __init__(self, variables = None) -> None:
        # A non-list argument is wrapped so `self.variables` is always a list.
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X:pd.DataFrame, y:pd.Series = None) -> 'CategoricalEncoder':
        """Fit a label encoder on each configured column of ``X``.

        The fitted encoders are stored in ``self.le_dict`` keyed by column name.
        """
        self.le_dict = {feature:LabelEncoder() for feature in self.variables}
        for key,le in self.le_dict.items():
            le.fit(X[key])
            # Extend the learned classes with the fallback label so that
            # transform() can encode values it has never seen.
            le.classes_ = np.append(le.classes_, self._UNSEEN_LABEL)

        return self

    def transform(self, X:pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with each configured column replaced by integer codes."""
        X = X.copy()
        for key, le in self.le_dict.items():
            unseen = ~X[key].isin(le.classes_)
            # Single .loc assignment instead of chained indexing
            # (X[key][mask] = ...): the chained form writes to a temporary,
            # emits SettingWithCopyWarning, and has no effect under pandas
            # Copy-on-Write.
            X.loc[unseen, key] = self._UNSEEN_LABEL
            X[key] = le.transform(X[key])

        return X

## Setting up the features

In [7]:
# Columns treated as categorical (imputed with a placeholder, then label-encoded).
CAT_FEATURES = [
    'badge',
    'body_type',
    'category',
    'colour',
    'fuel',
    'make',
    'model',
    'transmission',
]

# Columns treated as numerical (mean-imputed); 'old' is created by OldFeature.
NUM_FEATURES = [
    'cylinders',
    'economy',
    'litres',
    'odometer',
    'old',
]

## Creating the pipeline

In [8]:
# Step order matters: 'old' must be created before the selector keeps it, and
# categorical NaNs are filled before the encoder is fitted on those columns.
pipeline = Pipeline(steps=[
    ("old_feature", OldFeature()),
    ("feature_selector", FeatureSelector(variables=CAT_FEATURES + NUM_FEATURES)),
    ("cat_imputer", CategoricalImputer(variables=CAT_FEATURES)),
    ("cat_encoder", CategoricalEncoder(variables=CAT_FEATURES)),
    ("num_imputer", NumericalImputer(variables=NUM_FEATURES)),
    ("lr", LinearRegression()),
])

## Reading the data

In [16]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv("car_train.csv")

In [17]:
# Rows without a price cannot be used as training targets, so drop them.
df = df.dropna(subset=['price'])

In [18]:
# Features are every column except the price; the target is the natural log of the price.
X = df.drop(columns=['price'])
y = np.log(df['price'])

In [19]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

In [20]:
# Fit every transformer step and the final regressor on the training split.
pipeline.fit(X_train,y_train)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Pipeline(memory=None,
     steps=[('old_feature', OldFeature()), ('feature_selector', FeatureSelector(variables=['badge', 'body_type', 'category', 'colour', 'fuel', 'make', 'model', 'transmission', 'cylinders', 'economy', 'litres', 'odometer', 'old'])), ('cat_imputer', CategoricalImputer(variables=['badge', 'body_type', 'cate...), ('lr', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))])

In [21]:
# Score the fitted pipeline on the held-out split; note the target is log(price), not price.
pipeline.score(X_test, y_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.8606362817574529

In [22]:
# Count missing values per column (df has already had rows without a price removed).
df.isna().sum()

rownum                 0
price                  0
acquisition_date       0
badge               1573
body_type              2
category            2485
colour              1961
cylinders           2485
economy             3917
fuel                  28
last_updated           0
litres              2485
location               0
make                   0
model                  0
odometer            1550
transmission           7
year                   0
dtype: int64