# BikeSales Modelling

In [56]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

sns.set()
import warnings
warnings.filterwarnings('ignore')



In [57]:
#Load the data and infer the date format for the date features

df = pd.read_csv('clean_sport.csv', 
                  parse_dates=['First_Seen','Last_Seen','Last_Modified'], 
                  infer_datetime_format=True)
df.head()
sport = df

In [58]:
sport.drop(['Body','Drive_Type','Fuel_Type','Gear_Change_Method','Gear_Type',
            'Network_ID','Registration_Plate','URL','Windscreen'], axis=1, inplace=True)

# SELECT INDIVIDUAL FEATURES - we want to simplify the model

In [59]:
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Regression metrics
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.metrics import mean_absolute_error, r2_score

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [60]:
def add_dummy_variables(df, categorical_features):
    
    additional = pd.get_dummies(df[categorical_features])
    df[additional.columns] = additional
    return df.drop(categorical_features, axis=1)

def feature_plot(model,X,top=10):
    
    feats_imp = pd.DataFrame(model.feature_importances_, index=X.columns, columns=['FeatureImportance'])
    feats_imp = feats_imp.sort_values('FeatureImportance', ascending=False)[:top]

    feats_imp.plot(kind='barh', figsize=(12,6), legend=False)
    plt.title('Feature Importance from XgBoost Classifier')
    sns.despine(left=True, bottom=True)
    plt.gca().invert_yaxis()
    
def plot_accuracy_predictions(y_test, pred):
    fig, ax = plt.subplots(figsize=(12,10))
    ax.scatter(y_test,pred)
    ax.plot([y_test(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=0.8)
    ax.set_xlabel('Measured');
    ax.set_ylabel('Predicted');
    ax.set_title('Accuracy of Predictions');
    

In [61]:
print (sport.shape)
sport_numbers = sport.select_dtypes(include='number')
print (sport_numbers.shape)



(2177, 80)
(2177, 34)


In [62]:
sport.head()

Unnamed: 0,Bore,Compression_Ratio,Cylinders,Dry_Weight,Engine_Stroke,Engine_Voltage,Front_Brake_Diameter,Front_Suspension_Travel,Fuel_Reserve_Capacity,Fuel_Tank_Capacity,...,State,Suburb,Swingarm_Type,Traction_Control,Warranty_KMs,Wheel_Type,Windscreen,Seller,Brand,Model
0,62.0,11.0,2.0,152.0,4.0,12.0,290.0,120.0,3.459387,18.0,...,QLD,Ashgrove,Dual Sided,No,No,Spoke,Standard,Private,Kawasaki,Ninja 250r ex250j
1,62.0,11.0,2.0,129.0,4.0,12.0,290.0,120.0,3.459387,18.0,...,WA,South perth,Dual Sided,No,No,Spoke,Standard,Private,Kawasaki,Ninja 250r special edition
2,62.0,11.0,2.0,152.0,4.0,12.0,290.0,120.0,3.459387,18.0,...,QLD,Arana hills,Dual Sided,No,No,Spoke,Standard,Private,Kawasaki,Ninja 250r ex250j
3,62.0,11.0,2.0,152.0,4.0,12.0,290.0,120.0,3.459387,18.0,...,WA,none,Dual Sided,No,No,Spoke,Standard,Dealer,Kawasaki,Ninja 250r ex250j
4,48.6,10.0,4.0,129.0,4.0,12.0,296.0,118.0,3.459387,13.0,...,WA,none,Dual Sided,No,No,Spoke,Standard,Dealer,Honda,Cbr250r


In [63]:
numerical_features = list(df.select_dtypes(include='number').columns)
categorical_features = list(df.select_dtypes(exclude='number').columns)

# categorical_features = ['Bike', 'Body', 'Colour', 'Cooling', 'Country_of_Origin', 
#                         'Engine_Config', 'Fuel_Type', 'State', 'Suburb', 'Seller', 
#                         'Brand', 'Model'] 

#categorical_features = ['Bike', 'Body', 'Brand', 'Model', 'Seller']

In [64]:
sport = add_dummy_variables(sport, categorical_features)

In [65]:
sport.head()

Unnamed: 0,Bore,Compression_Ratio,Cylinders,Dry_Weight,Engine_Stroke,Engine_Voltage,Front_Brake_Diameter,Front_Suspension_Travel,Fuel_Reserve_Capacity,Fuel_Tank_Capacity,...,Model_f4 1000 abs,Model_f4 1000s corse,Model_f4 312 r 1000,Model_f4 312 r 1000 11,Model_f4 750 s,Model_f4 750 s f4s 11,Model_f4 r,Model_f4 rr,Model_f4 rr 1000,Model_f4ago 1000
0,62.0,11.0,2.0,152.0,4.0,12.0,290.0,120.0,3.459387,18.0,...,0,0,0,0,0,0,0,0,0,0
1,62.0,11.0,2.0,129.0,4.0,12.0,290.0,120.0,3.459387,18.0,...,0,0,0,0,0,0,0,0,0,0
2,62.0,11.0,2.0,152.0,4.0,12.0,290.0,120.0,3.459387,18.0,...,0,0,0,0,0,0,0,0,0,0
3,62.0,11.0,2.0,152.0,4.0,12.0,290.0,120.0,3.459387,18.0,...,0,0,0,0,0,0,0,0,0,0
4,48.6,10.0,4.0,129.0,4.0,12.0,296.0,118.0,3.459387,13.0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
sport_df = sport.select_dtypes(include='number')

target = 'Price'
features = list(set(sport_df.columns) - set([target]))
#features = ['Age']

sc = StandardScaler()
sc.fit(sport_df)
s = sc.transform(sport_df)
scaleSport = pd.DataFrame(s,columns=sport_df.columns)

X = scaleSport[features]
y = scaleSport[target]


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [67]:
features

['Suburb_Cairns north',
 'Suburb_Clayfield',
 'Suburb_Derrimut',
 'Suburb_Campbellfield',
 'Colour_black',
 'Suburb_Karalee',
 'Model_Ninja zx 6r zx600r',
 'Suburb_Main beach',
 'Suburb_Amamoor',
 'Registration_Expiry_01/02/2019',
 'Suburb_The junction',
 'Suburb_Robina',
 'Suburb_Cheltenham',
 'Suburb_Airlie beach',
 'Colour_green',
 'Bike_Ninja ZX-10R ABS SE  Road Manual 1000cc',
 'Windscreen_Standard',
 'Bike_YZF-R15 MY15 Road Manual 6sp',
 'Suburb_Barmera',
 'Suburb_Engadine',
 'Suburb_Bonnyrigg',
 'Suburb_Strathpine',
 'Suburb_Golden beach',
 'Suburb_Newcastle',
 'Bike_MY04 Road Manual 6sp',
 'Model_Zx 12r',
 'Suburb_Coorparoo',
 'Suburb_Allenstown',
 'Bike_CBR600RR Telefonica Road Manual 6sp 599cc',
 'Model_Rsv1000r mille',
 'Bike_Ninja ZX-10R KRT ABS Replica MY16 Road Manual 6sp',
 'Engine_Config_Forward-Inclined',
 'Bike_RSV1000R Mille Road Manual 6sp 1000cc',
 'Suburb_Stuart park',
 'Suburb_Wagga wagga',
 'Suburb_Tuggerah',
 'Model_996',
 'Suburb_Frenchs forest',
 'Suburb_Yoki

In [68]:
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.linear_model import PassiveAggressiveRegressor, RidgeCV


def run_model(scaleSport, features, target):
    X = scaleSport[features]
    y = scaleSport[target]


    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


    models = []

    models.append(("LassoCV",LassoCV(cv=5)))
    models.append(("LinearRegression",LinearRegression()))
    models.append(("PassiveAggressiveRegressor",PassiveAggressiveRegressor()))
    models.append(("RidgeCV",RidgeCV(cv=5)))

    results = []
    names = []
    for name, model in models:
        result = cross_val_score(model, X_train, y_train,  cv=5)
        names.append(name)
        results.append(result)

    for i in range(len(names)):
        print(names[i],results[i].mean())

    

In [69]:
bike_cols = [col for col in sport_df.columns if 'Bike' in col]
brand_cols = [col for col in sport_df.columns if 'Brand' in col]
model_cols = [col for col in sport_df.columns if 'Model' in col]
niave_feature = ['Kilometers','Age']

In [70]:
run_model(scaleSport, niave_feature, 'Price')

LassoCV 0.11998115911682602
LinearRegression 0.11974530672945663
PassiveAggressiveRegressor -1.4276443968065005
RidgeCV 0.11980789006148237


In [71]:
run_model(scaleSport, bike_cols, 'Price')

LassoCV 0.7171432066369668
LinearRegression -2.2217769940804915e+28
PassiveAggressiveRegressor 0.665813277808301
RidgeCV 0.7205659635489007


In [72]:
run_model(scaleSport, brand_cols, 'Price')

LassoCV 0.3387452676578575
LinearRegression -5.476875865223057e+23
PassiveAggressiveRegressor -0.2353987015663368
RidgeCV 0.33864325401493073


In [73]:
run_model(scaleSport, model_cols, 'Price')

LassoCV 0.725378471610332
LinearRegression -1.1988333866221114e+28
PassiveAggressiveRegressor 0.577371175882079
RidgeCV 0.7237746949167747
