## Using Lasso Regression

In [1]:
import pandas as pd
import numpy as np
from scipy import stats


import matplotlib.pyplot as plt
import seaborn as sns 
plt.rcParams["figure.figsize"] = (15,9)
plt.style.use("fivethirtyeight")

# prepare data
from sklearn.model_selection import train_test_split

# model
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("data/vehicles.csv")


plt.rcParams["figure.figsize"] = (15,9)
plt.style.use("fivethirtyeight")

# prepare data
from sklearn.model_selection import train_test_split

# model
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("data/vehicles.csv")


# Too much memory was being used so the datafranme is reduced
df = df.take(np.random.permutation(len(df))[:5000])

# REMOVE OUTLIERS
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]

# Limit the size for quicker Python run
df = df.take(np.random.permutation(len(df))[:1000])

string_cols = ['transmission','model','odometer', 'condition', 'manufacturer']
numeric_cols = ['price', 'odometer', 'year']

final_list = string_cols + numeric_cols
final_list

df_all = df[final_list].copy()

df_all_dummied = pd.get_dummies(df_all, drop_first = True)

# Fill in the median values at NA
df_all_dummied = df_all_dummied.fillna(df_all_dummied.median())

# DO THE SPLIT
X = df_all_dummied.drop(columns = 'price')
y = df_all_dummied['price']

alphas = np.logspace(-4, -0.5, 10)

tuned_parameters = [{"alpha": alphas}]
n_folds = 5

##############################################################################################

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 66)
lasso = Lasso()
lasso.fit(X_train, y_train)

#model = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=False)
#model.fit(X_train, y_train)
#scores = clf.cv_results_["mean_test_score"]
#scores_std = clf.cv_results_["std_test_score"]
print("Lasso Train RMSE:", np.round(np.sqrt(metrics.mean_squared_error(y_train, lasso.predict(X_train))), 5))
print("Lasso Test RMSE:", np.round(np.sqrt(metrics.mean_squared_error(y_test, lasso.predict(X_test))), 5))

Lasso Train RMSE: 5251.44603
Lasso Test RMSE: 11027.97486


## Using Ridge Regression

In [8]:
import pandas as pd
import numpy as np
from scipy import stats


import matplotlib.pyplot as plt
import seaborn as sns 
plt.rcParams["figure.figsize"] = (15,9)
plt.style.use("fivethirtyeight")

# prepare data
from sklearn.model_selection import train_test_split

# model
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("data/vehicles.csv")


plt.rcParams["figure.figsize"] = (15,9)
plt.style.use("fivethirtyeight")


# Too much memory was being used so the datafranme is reduced
df = df.take(np.random.permutation(len(df))[:5000])

# REMOVE OUTLIERS
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]


string_cols = ['transmission','model','odometer', 'condition', 'manufacturer']
numeric_cols = ['price', 'odometer', 'year']

final_list = string_cols + numeric_cols
final_list

df_all = df[final_list].copy()

df_all_dummied = pd.get_dummies(df_all, drop_first = True)

# Fill in the median values at NA
df_all_dummied = df_all_dummied.fillna(df_all_dummied.median())

# DO THE SPLIT
X = df_all_dummied.drop(columns = 'price')
y = df_all_dummied['price']

alphas = np.logspace(-4, -0.5, 10)

tuned_parameters = [{"alpha": alphas}]
n_folds = 5

##############################################################################################

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 66)

# Building a Ridge Regression Model

ridge = Ridge()

# Fitting model with x and y train sets

ridge.fit(X_train, y_train)

# Calculate Ridge Train and Test Set RMSE

print("Ridge Train RMSE:", np.round(np.sqrt(metrics.mean_squared_error(y_train, ridge.predict(X_train))), 5))
print("Ridge Test RMSE:", np.round(np.sqrt(metrics.mean_squared_error(y_test, ridge.predict(X_test))), 5))

feat_importances = pd.Series(ridge.feature_names_in_)
feat_importances.head(50)


Ridge Train RMSE: 6614.52657
Ridge Test RMSE: 9424.39974


0                               odometer
1                               odometer
2                                   year
3                    transmission_manual
4                     transmission_other
5                        model_01 MALIBU
6           model_1 series 128i coupe 2d
7     model_1 series 135i convertible 2d
8                        model_100 wagon
9                       model_124 spider
10             model_124 spider classica
11                     model_128 i coupe
12                            model_1500
13                        model_1500 4wd
14                        model_1500 4x4
15                   model_1500 big horn
16      model_1500 big horn truck low mi
17                    model_1500 classic
18           model_1500 classic crew cab
19           model_1500 classic quad cab
20        model_1500 classic regular cab
21    model_1500 classic slt crewcab 4x4
22                   model_1500 crew cab
23          model_1500 crew cab big horn
24    model_1500

## Using Lasso with GridsearchCV

In [9]:
#Using Lassso and GridSearchCV
import pandas as pd
import numpy as np
from scipy import stats


import matplotlib.pyplot as plt
import seaborn as sns 
plt.rcParams["figure.figsize"] = (15,9)
plt.style.use("fivethirtyeight")

# prepare data
from sklearn.model_selection import train_test_split

# model
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("data/vehicles.csv")


plt.rcParams["figure.figsize"] = (15,9)
plt.style.use("fivethirtyeight")


# Too much memory was being used so the datafranme is reduced
df = df.take(np.random.permutation(len(df))[:5000])

# REMOVE OUTLIERS
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]


string_cols = ['transmission','model','odometer', 'condition', 'manufacturer']
numeric_cols = ['price', 'odometer', 'year']

final_list = string_cols + numeric_cols
final_list

df_all = df[final_list].copy()

df_all_dummied = pd.get_dummies(df_all, drop_first = True)

# Fill in the median values at NA
df_all_dummied = df_all_dummied.fillna(df_all_dummied.median())

# DO THE SPLIT
X = df_all_dummied.drop(columns = 'price')
y = df_all_dummied['price']

alphas = np.logspace(-4, -0.5, 10)

tuned_parameters = [{"alpha": alphas}]
n_folds = 5

##############################################################################################

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 66)
lasso = Lasso()
lasso.fit(X_train, y_train)

model = GridSearchCV(lasso, tuned_parameters, cv=5, refit=True)
model.fit(X_train, y_train)
#scores = clf.cv_results_["mean_test_score"]
#scores_std = clf.cv_results_["std_test_score"]
print("Lasso Train with GridsearchCV RMSE:", np.round(np.sqrt(metrics.mean_squared_error(y_train, lasso.predict(X_train))), 5))
print("Lasso Test GridsearchCV RMSE:", np.round(np.sqrt(metrics.mean_squared_error(y_test, lasso.predict(X_test))), 5))



Lasso Train with GridsearchCV RMSE: 6369.4888
Lasso Test GridsearchCV RMSE: 10480.74084


## Using KFOLD

In [10]:
#Using Lassso and GridSearchCV
import pandas as pd
import numpy as np
from scipy import stats


import matplotlib.pyplot as plt
import seaborn as sns 
plt.rcParams["figure.figsize"] = (15,9)
plt.style.use("fivethirtyeight")

# prepare data
from sklearn.model_selection import train_test_split

# model
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from numpy import mean
from numpy import absolute
from numpy import sqrt

import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("data/vehicles.csv")


plt.rcParams["figure.figsize"] = (15,9)
plt.style.use("fivethirtyeight")


# Too much memory was being used so the datafranme is reduced
df = df.take(np.random.permutation(len(df))[:5000])

# REMOVE OUTLIERS TO IMPROVE MODEL
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]

# LIMIT the CATGORIES
string_cols = ['transmission','model','odometer', 'condition', 'manufacturer']
numeric_cols = ['price', 'odometer', 'year']

final_list = string_cols + numeric_cols
final_list

df_all = df[final_list].copy()

df_all_dummied = pd.get_dummies(df_all, drop_first = True)

# Fill in the median values at NA
df_all_dummied = df_all_dummied.fillna(df_all_dummied.median())

# DO THE SPLIT
X = df_all_dummied.drop(columns = 'price')
y = df_all_dummied['price']

alphas = np.logspace(-4, -0.5, 10)

tuned_parameters = [{"alpha": alphas}]
n_folds = 5

#############################################################################################


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 66)

# Define the cross-validation metod to use
cv = KFold(n_splits=10,random_state=1,shuffle=True)

# Build multiple linear regression model
model = LinearRegression()

#use k-fold CV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

print("K-Fold RMSE:")
sqrt(mean(absolute(scores)))



K-Fold RMSE:


7208.350612649905

In [11]:
# HEATMAP CORRELATION
#corr = df_all_dummied.corr()
#sns.heatmap(corr, annot = True);

## Random Forest Algorithm

In [12]:
#Using Linear Regression
import pandas as pd
import numpy as np
from scipy import stats


import matplotlib.pyplot as plt
import seaborn as sns 
plt.rcParams["figure.figsize"] = (15,9)
plt.style.use("fivethirtyeight")

# prepare data
from sklearn.model_selection import train_test_split

# model
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from numpy import mean
from numpy import absolute
from numpy import sqrt

import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("data/vehicles.csv")


plt.rcParams["figure.figsize"] = (15,9)
plt.style.use("fivethirtyeight")


# Too much memory was being used so the datafranme is reduced
df = df.take(np.random.permutation(len(df))[:5000])

# REMOVE OUTLIERS TO IMPROVE MODEL
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]

# LIMIT the CATGORIES
string_cols = ['transmission','model','odometer', 'condition', 'manufacturer']
numeric_cols = ['price', 'odometer', 'year']

final_list = string_cols + numeric_cols
final_list

df_all = df[final_list].copy()

df_all_dummied = pd.get_dummies(df_all, drop_first = True)

# Fill in the median values at NA
df_all_dummied = df_all_dummied.fillna(df_all_dummied.median())

# DO THE SPLIT
X = df_all_dummied.drop(columns = 'price')
y = df_all_dummied['price']

alphas = np.logspace(-4, -0.5, 10)

tuned_parameters = [{"alpha": alphas}]
n_folds = 5


################################################################
# RFE with n features
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn import linear_model

classifiers = dict(
    ols=linear_model.LinearRegression(), ridge=linear_model.Ridge(alpha=0.1)
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 66)

#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

In [13]:
# Model Accuracy
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.09032258064516129
