# Feature engineering

Create new features using existing knowledge and features.

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Apply seaborn's default theme so every matplotlib figure in the notebook shares one style.
sns.set()
import warnings
# NOTE(review): this silences *every* warning for the rest of the session, including
# pandas diagnostics such as chained-assignment warnings. Consider narrowing the filter
# to the specific category/module that is noisy.
warnings.filterwarnings('ignore')



In [2]:
# Read the cleaned listings. The columns named below are parsed as dates,
# interpreting ambiguous values as day-first (DD/MM/YYYY).
date_columns = ['First_Seen', 'Last_Seen', 'Last_Modified', 'Registration_Expiry']

df = pd.read_csv('BikeSales-Dusty2.csv', parse_dates=date_columns, dayfirst=True)


In [3]:
# Preview the first five rows to confirm the load and the parsed columns.
df.head(n=5)

Unnamed: 0,ABS,Adjustable_Seat,Bore,Cam_Type,Carburettor,Charging_Method,Clutch_Type,Colour,Compression_Ratio,Cooling,...,Trail,URL,Valves_Per_Cylinder,Warranty_in_Months_from_First_Registration,Wet_Operational_Weight,Wheel_Type,Wheelbase,Width,Release_Year,Seller
0,Standard,No,78.0,DOHC_(Double_Over_Head_Cam),Electronic_fuel_injection,Engine,Multi_plate,black,13.0,Liquid,...,105.0,https://www.bikesales.com.au/bikes/details/201...,4.0,24,202.0,Spoke,1420.0,735.0,2011,Private
1,Standard,No,78.0,DOHC_(Double_Over_Head_Cam),Electronic_fuel_injection,Engine,Multi_plate,yellow,13.0,Liquid,...,105.0,https://www.bikesales.com.au/bikes/details/201...,4.0,24,202.0,Spoke,1420.0,735.0,2012,Private
2,Standard,No,78.0,DOHC_(Double_Over_Head_Cam),Electronic_fuel_injection,Engine,Other,black,13.0,Liquid,...,105.0,https://www.bikesales.com.au/bikes/details/201...,4.0,24,202.0,Spoke,1443.0,720.385,2014,Private
3,Standard,No,78.0,DOHC_(Double_Over_Head_Cam),Electronic_fuel_injection,Engine,Other,black,13.0,Liquid,...,105.0,https://www.bikesales.com.au/bikes/details/201...,4.0,24,202.0,Spoke,1443.0,720.385,2014,Private
4,Standard,No,78.0,DOHC_(Double_Over_Head_Cam),Electronic_fuel_injection,Engine,Other,black,13.0,Liquid,...,105.0,https://www.bikesales.com.au/bikes/details/201...,4.0,24,202.0,Spoke,1443.0,720.385,2014,Private


In [4]:
# Total number of missing cells across the whole frame (per-column counts, then summed).
df.isna().sum().sum()

0

In [5]:
# Network_ID is essentially a unique identifier per listing, so it carries
# no information that generalises across bikes; remove it.
df.drop(columns=['Network_ID'], inplace=True)

In [6]:
# Give the manufacturer column a clearer label: move 'Make' to a new
# 'Brand' column (appended at the end) and remove the original in one step.
df['Brand'] = df.pop('Make')

## Remaining Warranty

This is the warranty left on the bike, counted from when it was first registered. Here we assume the bike is registered at the end of the release year.

In [7]:
def diff_month(d1, d2):
    """Return the number of calendar months from ``d2`` to ``d1``.

    Only the ``year`` and ``month`` attributes are used, so days are ignored.
    The result is negative when ``d1`` is earlier than ``d2``.
    """
    return (d1.year - d2.year) * 12 + d1.month - d2.month


# Evaluate "now" once so every row is measured against the same reference date.
today = datetime.now()

warranty_months = df['Warranty_in_Months_from_First_Registration']

# Months elapsed since the assumed registration date (December of the release year).
# A negative value means that date is still in the future, so nothing has been used yet.
months_elapsed = ((today.year - df['Release_Year']) * 12 + today.month - 12).clip(lower=0)

# Remaining warranty = total warranty minus months already used, never below zero.
# Bikes sold without a warranty (0 months) therefore stay at 0, and bikes whose
# assumed registration is still in the future keep their full warranty.
# Assigning the whole column at once avoids chained-indexing assignment
# (df[col][mask] = ...), which is not guaranteed to write back to df.
df['Remaining_Warranty'] = (warranty_months - months_elapsed).clip(lower=0)

df.drop(['Warranty_in_Months_from_First_Registration'], axis=1, inplace=True)


## Remaining Registration

This is the registration that remains on the bike as of the last date the data was processed. The bike is assumed to have a current registration if there is an expiry, otherwise, the bike is assumed not to have a current registration.

In [8]:

# Evaluate "now" once so the year and month come from the same instant.
today = datetime.now()
expiry = df['Registration_Expiry']

# Whole calendar months between today and the registration expiry.
months_left = (expiry.dt.year - today.year) * 12 + (expiry.dt.month - today.month)

# An expiry in the past means no registration remains, so floor at zero.
# clip() replaces the chained-indexing assignment (df[col][mask] = 0), which is
# not guaranteed to write back to df.
df['Remaining_Registration'] = months_left.clip(lower=0)

df.drop(['Registration_Expiry'], axis=1, inplace=True)


## Age

Any asset decreases in value with age. We can determine the age using today's date and the release year.



In [9]:
# Age in whole years relative to the current calendar year. pop() removes
# 'Release_Year' from the frame while handing its values to the calculation.
current_year = datetime.now().year
df['Age'] = current_year - df.pop('Release_Year')

## Live Advertisement

This will be the time the advertisement is live on the website. I would expect this to provide some insight into how long it takes to sell a bike on the site, once there is a sufficient number of bikes sold in the data set.


In [10]:
# How long each advertisement was observed online.
days = df['Last_Seen'] - df['First_Seen']

# Whole days as a float column. `.dt.days` is used instead of
# `astype('timedelta64[D]')`, which pandas >= 2.0 rejects because only the
# s/ms/us/ns resolutions are supported. The float cast keeps the dtype the
# older conversion produced.
df['Live_Add'] = days.dt.days.astype('float64')

## Sold

We can assume the bike is sold if the last time the advertisement was seen isn't the most recent time in the data, i.e. the advertisement has been taken down. We will assume this means the bike has been sold, even though the seller could have decided not to sell the bike.


In [11]:
# Every listing starts as 'For Sale'. Those not seen on the most recent
# observation date in the data are assumed to have been taken down, i.e. sold.
most_recent_seen = df['Last_Seen'].max()

df['Sold'] = 'For Sale'
# .loc writes directly into df; the chained form df['Sold'][mask] = ... is not
# guaranteed to write back to df.
df.loc[df['Last_Seen'] != most_recent_seen, 'Sold'] = 'Sold'


## Engine Size

This is the same as the Engine description, just binned to the nearest commonly quoted engine size.


In [12]:

# Bin edges (right-inclusive) and the nominal engine size assigned to each bin.
# Each label must lie inside its own bin, i.e. allbins[i] < labels[i] <= allbins[i + 1].
# The 770 edge separates the 750 and 800 classes, and the 1000 label covers
# (970, 1050]; without them the 800-950 labels land on bins that do not contain
# them and ~1000 engines are reported as 950.
allbins = [0, 65, 80, 90, 110, 130, 180, 225, 270, 320, 370, 420, 470, 520, 570, 625, 675, 730,
           770, 820, 870, 920, 970, 1050, 1150, 1250, 1350, 1450, 1550, 1650, 1750, 1850, 1950,
           2050, 2250, 2400]
labels = [50, 70, 90, 100, 125, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750,
          800, 850, 900, 950, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000,
          2100, 2300]

# Guard against the bins and labels drifting out of step when either list is edited.
assert len(labels) == len(allbins) - 1, "need exactly one label per bin"
assert all(lo < label <= hi for lo, label, hi in zip(allbins, labels, allbins[1:])), \
    "every label must fall inside its own bin"

# pd.cut yields a categorical; to_numeric turns the labels back into numbers.
# Values outside (0, 2400] fall in no bin and become NaN.
df['Engine_Size'] = pd.to_numeric(pd.cut(df['Engine_Description'],
                                    bins=allbins,
                                    labels=labels))

df.drop(['Engine_Description'], axis=1, inplace=True)


Write the updated data to file.

In [13]:

# Persist the engineered features; the row index is not written to the file.
df.to_csv(path_or_buf='Bikesales_Features.csv', index=False)