# BikeSales Modelling

In [2]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

sns.set()
import warnings
warnings.filterwarnings('ignore')



In [4]:
#Load the data and infer the date format for the date features

df = pd.read_csv('Bikesales_Features.csv', 
                  parse_dates=['First_Seen','Last_Seen','Last_Modified'], 
                  infer_datetime_format=True)
df.head()

Unnamed: 0,ABS,Adjustable_Seat,Bike,Body,Bore,Cam_Type,Carburettor,Carburettor(s),Charging_Method,Clutch_Type,...,Wheel_Type,Wheelbase,Width,Windscreen,Release_Year,Seller,Brand,Model,Age,Time_to_Sale
0,No,No,Kuda Pro 250,Adventure Touring,62.0,SOHC (Single Over Head Cam),Electronic fuel injection,1.0,Alternator & Regulator-Rectifier,Multi plate,...,Other,1480.0,860.0,Standard,2009,Private,Atomik,Kuda pro 250,10,43.0
1,No,No,250 R Road Manual 5sp 250cc,Super Sport,77.0,SOHC (Single Over Head Cam),Electronic fuel injection,1.0,Engine,Multi plate,...,9-Spoke,1350.0,818.262,Standard,2013,Private,Megelli,250 r,6,43.0
2,No,No,GN250,Naked,72.0,SOHC (Single Over Head Cam),Electronic fuel injection,1.0,Engine,Multi plate,...,Laced,1504.0,818.5,Standard,1991,Private,Suzuki,Gn250,28,43.0
3,No,No,GSX1100 ESD Road Manual 5sp 1100cc,Sport Touring,74.319,DOHC (Double Over Head Cam),Constant velocity,4.0,Engine,Multi plate,...,Laced,1439.0,769.659,Standard,1984,Private,Suzuki,Gsx1100 esd,35,43.0
4,No,No,TS250 Trail Manual 5sp 250cc,Trail,81.5,Reed Valve,Electronic fuel injection,1.0,Engine,Multi plate,...,Laced,1400.0,818.262,Standard,1976,Private,Suzuki,Ts250,43,0.0


In [13]:
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Regression metrics
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.metrics import mean_absolute_error, r2_score

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [14]:
def add_dummy_variables(df, categorical_features):
    
    additional = pd.get_dummies(df[categorical_features])
    df[additional.columns] = additional
    return df.drop(categorical_features, axis=1)

def feature_plot(model,X,top=10):
    
    feats_imp = pd.DataFrame(model.feature_importances_, index=X.columns, columns=['FeatureImportance'])
    feats_imp = feats_imp.sort_values('FeatureImportance', ascending=False)[:top]

    feats_imp.plot(kind='barh', figsize=(12,6), legend=False)
    plt.title('Feature Importance from XgBoost Classifier')
    sns.despine(left=True, bottom=True)
    plt.gca().invert_yaxis()
    
def plot_accuracy_predictions(y_test, pred):
    fig, ax = plt.subplots(figsize=(12,10))
    ax.scatter(y_test,pred)
    ax.plot([y_test(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=0.8)
    ax.set_xlabel('Measured');
    ax.set_ylabel('Predicted');
    ax.set_title('Accuracy of Predictions');
    

In [15]:
sport = df[df['Body'].str.contains('Sport') == True]
sport_numbers = sport.select_dtypes(include='number')
sport_numbers.shape


(5343, 40)

In [16]:
numerical_features = list(df.select_dtypes(include='number').columns)
#categorical_features = list(df.select_dtypes(exclude='number').columns)

# categorical_features = ['Bike', 'Body', 'Colour', 'Cooling', 'Country_of_Origin', 
#                         'Engine_Config', 'Fuel_Type', 'State', 'Suburb', 'Seller', 
#                         'Brand', 'Model'] 

categorical_features = ['Bike', 'Body', 'Brand', 'Model', 'Seller']

In [17]:
#sport = add_dummy_variables(sport, categorical_features)

In [18]:

target = 'Price'
features = list(set(sport_numbers.columns) - set([target]))

sc = StandardScaler()
sc.fit(sport_numbers)
s = sc.transform(sport_numbers)
scaleSport = pd.DataFrame(s,columns=sport_numbers.columns)

X = scaleSport[features]
y = scaleSport[target]


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [20]:
model = LassoCV()
model.fit(X_train, y_train)

pred = model.predict(X_test)


In [21]:
print ('Traininng Score:',model.score(X_train,y_train))
print ('Testing Score:',model.score(X_test,y_test))
print ('Data Score',model.score(X,y))
print ('Mean Squared Error:',mean_squared_error(y_test,pred))




Traininng Score: 0.5535575578342427
Testing Score: 0.5137986321554682
Data Score 0.5434195394878047
Mean Squared Error: 0.4958080597069517


In [24]:
model

LassoCV(alphas=None, copy_X=True, cv='warn', eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
    positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False)