# BikeSales Modelling

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

sns.set()
import warnings
warnings.filterwarnings('ignore')



In [45]:
#Load the data and infer the date format for the date features

df = pd.read_csv('Bikesales_Features.csv', 
                  parse_dates=['First_Seen','Last_Seen','Last_Modified'], 
                  infer_datetime_format=True)
df.head()

Unnamed: 0,ABS,Adjustable_Seat,Bike,Body,Bore,Cam_Type,Carburettor,Carburettor(s),Charging_Method,Clutch_Type,...,Wheel_Type,Wheelbase,Width,Windscreen,Release_Year,Seller,Brand,Model,Age,Time_to_Sale
0,No,No,Kuda Pro 250,Adventure Touring,62.0,SOHC (Single Over Head Cam),Electronic fuel injection,1.0,Alternator & Regulator-Rectifier,Multi plate,...,Spoke,1480.0,860.0,Standard,2009,Private,Atomik,Kuda pro 250,10,43.0
1,No,No,250 R Road Manual 5sp 250cc,Super Sport,77.0,SOHC (Single Over Head Cam),Electronic fuel injection,1.0,Engine,Multi plate,...,Spoke,1350.0,818.262,Standard,2013,Private,Megelli,250 r,6,43.0
2,No,No,GN250,Naked,72.0,SOHC (Single Over Head Cam),Electronic fuel injection,1.0,Engine,Multi plate,...,Spoke,1504.0,818.5,Standard,1991,Private,Suzuki,Gn250,28,43.0
3,No,No,GSX1100 ESD Road Manual 5sp 1100cc,Sport Touring,74.319,DOHC (Double Over Head Cam),Constant velocity,4.0,Engine,Multi plate,...,Spoke,1439.0,769.659,Standard,1984,Private,Suzuki,Gsx1100 esd,35,43.0
4,No,No,TS250 Trail Manual 5sp 250cc,Trail,81.5,Reed Valve,Electronic fuel injection,1.0,Engine,Multi plate,...,Spoke,1400.0,818.262,Standard,1976,Private,Suzuki,Ts250,43,0.0


In [28]:
# SELECT INDIVIDUAL FEATURES - we want to simplify the model

In [29]:
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Regression metrics
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.metrics import mean_absolute_error, r2_score

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [44]:
def add_dummy_variables(df, categorical_features):
    
    additional = pd.get_dummies(df[categorical_features])
    df[additional.columns] = additional
    return df.drop(categorical_features, axis=1)

def feature_plot(model,X,top=10):
    
    feats_imp = pd.DataFrame(model.feature_importances_, index=X.columns, columns=['FeatureImportance'])
    feats_imp = feats_imp.sort_values('FeatureImportance', ascending=False)[:top]

    feats_imp.plot(kind='barh', figsize=(12,6), legend=False)
    plt.title('Feature Importance from XgBoost Classifier')
    sns.despine(left=True, bottom=True)
    plt.gca().invert_yaxis()
    
def plot_accuracy_predictions(y_test, pred):
    fig, ax = plt.subplots(figsize=(12,10))
    ax.scatter(y_test,pred)
    ax.plot([y_test(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=0.8)
    ax.set_xlabel('Measured');
    ax.set_ylabel('Predicted');
    ax.set_title('Accuracy of Predictions');
    

In [46]:
sport = df[df['Body'] == 'Super Sport']
sport_numbers = sport.select_dtypes(include='number')
sport_numbers.shape


(3133, 40)

In [47]:
sport['Bike'].unique()

array(['250 R Road Manual 5sp 250cc', 'CBR900RR Fireblade',
       'CBR125R Road Manual 6sp 125cc', 'VJF250 MY12 Road Manual 5sp',
       'RS 125 Road Manual 6sp 125cc', 'GT250R EFI Road Manual 5sp 250cc',
       'Ninja 250R  Road Manual 6sp 250cc',
       'Ninja 250R Special Edition Road Manual 6sp 250cc',
       'V-Night Road 150cc', 'CBR600 F Road Manual 6sp 600cc',
       'CBR250R Road Manual 6sp 250cc', 'CBR125R MY09 Road Manual 6sp',
       'CBR250R MY13 Road Manual 6sp', 'CBR250RR Road Manual 6sp 250cc',
       'CBR600RR Road Manual 6sp 599cc',
       'Ninja 250R MY09 Road Manual 6sp',
       'Ninja ZX-6R  Road Manual 6sp 600cc',
       'GSX-R750 Road Manual 6sp 750cc', 'YZF-R15 MY13 Road Manual 5sp',
       'YZF-R6 Road Manual 6sp 600cc', 'YZF-R15 Road Manual 5sp 150cc',
       'VTR1000F  Road Manual 6sp 1000cc', 'ZX-2R Road Manual 6sp 250cc',
       'RS4 125  Road Manual 6sp 125cc',
       'CBR600F4i Road Manual 6sp 600cc', 'YZF-R125 MY09 Road Manual 6sp',
       'YZF-R125 Roa

In [35]:
sport.head()

Unnamed: 0,ABS,Adjustable_Seat,Bike,Body,Bore,Cam_Type,Carburettor,Carburettor(s),Charging_Method,Clutch_Type,...,Wheel_Type,Wheelbase,Width,Windscreen,Release_Year,Seller,Brand,Model,Age,Time_to_Sale
1,No,No,250 R Road Manual 5sp 250cc,Super Sport,77.0,SOHC (Single Over Head Cam),Electronic fuel injection,1.0,Engine,Multi plate,...,Spoke,1350.0,818.262,Standard,2013,Private,Megelli,250 r,6,43.0
8,No,No,CBR900RR Fireblade,Super Sport,74.0,DOHC (Double Over Head Cam),Electronic fuel injection,4.0,Engine,Multi plate,...,Spoke,1397.0,680.0,Standard,2000,Private,Honda,Cbr900rr fireblade 929,19,43.0
36,No,No,CBR125R Road Manual 6sp 125cc,Super Sport,58.0,SOHC (Single Over Head Cam),Electronic fuel injection,1.0,Engine,Multi plate,...,Spoke,1294.0,675.0,Standard,2007,Dealer,Honda,Cbr125r,12,43.0
64,No,No,VJF250 MY12 Road Manual 5sp,Super Sport,73.0,DOHC (Double Over Head Cam),Electronic fuel injection,1.0,Engine,Multi plate,...,Spoke,1380.0,764.0,Standard,2011,Private,Daelim,Vjf250,8,43.0
106,No,No,CBR125R Road Manual 6sp 125cc,Super Sport,58.0,SOHC (Single Over Head Cam),Electronic fuel injection,1.0,Engine,Multi plate,...,Spoke,1294.0,675.0,Standard,2008,Private,Honda,Cbr125r,11,43.0


In [48]:
numerical_features = list(df.select_dtypes(include='number').columns)
#categorical_features = list(df.select_dtypes(exclude='number').columns)

# categorical_features = ['Bike', 'Body', 'Colour', 'Cooling', 'Country_of_Origin', 
#                         'Engine_Config', 'Fuel_Type', 'State', 'Suburb', 'Seller', 
#                         'Brand', 'Model'] 

categorical_features = ['Bike', 'Body', 'Brand', 'Model', 'Seller']

In [49]:
sport = add_dummy_variables(sport, categorical_features)

In [56]:
sport.head()

Unnamed: 0,ABS,Adjustable_Seat,Bore,Cam_Type,Carburettor,Carburettor(s),Charging_Method,Clutch_Type,Colour,Compression_Ratio,...,Model_f4 rr 1000,Model_f4 serie oro,Model_f4ago 1000,Model_spyder rs se5,Model_spyder rs sm5,Model_spyder rss se5,Model_spyder rss sm5,Model_v11 sport,Seller_Dealer,Seller_Private
1,No,No,77.0,SOHC (Single Over Head Cam),Electronic fuel injection,1.0,Engine,Multi plate,Red,11.0,...,0,0,0,0,0,0,0,0,0,1
8,No,No,74.0,DOHC (Double Over Head Cam),Electronic fuel injection,4.0,Engine,Multi plate,Gold,11.0,...,0,0,0,0,0,0,0,0,0,1
36,No,No,58.0,SOHC (Single Over Head Cam),Electronic fuel injection,1.0,Engine,Multi plate,Black,11.0,...,0,0,0,0,0,0,0,0,1,0
64,No,No,73.0,DOHC (Double Over Head Cam),Electronic fuel injection,1.0,Engine,Multi plate,White,10.463877,...,0,0,0,0,0,0,0,0,0,1
106,No,No,58.0,SOHC (Single Over Head Cam),Electronic fuel injection,1.0,Engine,Multi plate,Red,11.0,...,0,0,0,0,0,0,0,0,0,1


In [66]:
sport_df = sport.select_dtypes(include='number')

target = 'Price'
features = list(set(sport_df.columns) - set([target]))
#features = ['Age']

sc = StandardScaler()
sc.fit(sport_df)
s = sc.transform(sport_df)
scaleSport = pd.DataFrame(s,columns=sport_df.columns)

X = scaleSport[features]
y = scaleSport[target]


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [67]:
features

['Bike_F4 Claudio MY19 Road Manual 6sp',
 'Model_Gsx r1000 limited edition motogp',
 'Model_Rsv1000r mille',
 'Bike_VFR400 Road Manual 6sp 400cc',
 'Compression_Ratio',
 'Model_Tt600',
 'Model_Yzf r15',
 'Model_Rsv1000r',
 'Bike_CBR600RR MY11 Road Manual 6sp',
 'Bike_YZF750R',
 'Bike_Panigale V4 Road Manual 6sp 1100cc',
 'Bike_Daytona 675 MY12 Road Manual 6sp',
 'Bike_YZF-R125 Road Manual 6sp 125cc',
 'Bike_1199 Panigale R ABS Road Manual 1198cc',
 'Model_Cb750f super sports',
 'Bike_RS4 125  Road Manual 6sp 125cc',
 'Bike_RSV1000R Mille Road Manual 6sp 1000cc',
 'Bike_CBR600RR MY09 Road Manual 6sp',
 'Model_Rsv4 factory aprc abs',
 'Bike_F4 1000 Road Manual 6sp 1000cc',
 'Bike_Ninja ZX-6R MY11 Road Manual 6sp',
 'Bike_500 V DUE Road Manual 6sp 500cc',
 'Bike_Daytona 675 R MY12 Road Manual 6sp',
 'Bike_R nine T Racer Spezial MY19 Road Manual 6sp',
 'Bike_1000 Le Mans Road Manual 5sp 1000cc',
 'Bike_Ninja 300 MY14 Road Manual 6sp',
 'Bike_Hayabusa MY17 Road Manual 6sp',
 'Bike_999S Road

In [74]:
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.linear_model import PassiveAggressiveRegressor, RidgeCV


def run_model(scaleSport, features, target):
    X = scaleSport[features]
    y = scaleSport[target]


    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


    models = []

    models.append(("LassoCV",LassoCV(cv=5)))
    models.append(("LinearRegression",LinearRegression()))
    models.append(("PassiveAggressiveRegressor",PassiveAggressiveRegressor()))
    models.append(("RidgeCV",RidgeCV(cv=5)))

    results = []
    names = []
    for name, model in models:
        result = cross_val_score(model, X_train, y_train,  cv=5)
        names.append(name)
        results.append(result)

    for i in range(len(names)):
        print(names[i],results[i].mean())

    

In [75]:
bike_cols = [col for col in sport_df.columns if 'Bike' in col]
brand_cols = [col for col in sport_df.columns if 'Brand' in col]
model_cols = [col for col in sport_df.columns if 'Model' in col]


In [76]:
run_model(scaleSport, bike_cols, 'Price')

LassoCV 0.5377973935459325
LinearRegression -2.1761776253045095e+28
PassiveAggressiveRegressor 0.5208203122866684
RidgeCV 0.5535146732964771


In [77]:
run_model(scaleSport, brand_cols, 'Price')

LassoCV 0.32660155489403786
LinearRegression -4.661776790246086e+25
PassiveAggressiveRegressor -0.5912960882457965
RidgeCV 0.3263656536993283


In [78]:
run_model(scaleSport, model_cols, 'Price')

LassoCV 0.6090985419474653
LinearRegression -1.1913756065802226e+28
PassiveAggressiveRegressor 0.5572967393362116
RidgeCV 0.6173495706391856
