In [1]:
#https://www.kaggle.com/austinreese/craigslist-carstrucks-data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import neighbors
import statsmodels.api as sm

In [3]:
#importing data
df = pd.read_csv('craigslistcars.csv')

# Data prep and cleaning

In [4]:
df.shape

(550313, 22)

In [5]:
#dropping columns that we will not use in this comparison
df.drop(['url', 'city', 'city_url', 'VIN', 'image_url', 'desc', 'lat', 'long', 'paint_color', 'cylinders', 'fuel'], axis=1, inplace=True)

In [6]:
df.columns

Index(['price', 'year', 'manufacturer', 'make', 'condition', 'odometer',
       'title_status', 'transmission', 'drive', 'size', 'type'],
      dtype='object')

In [7]:
#due to the size of our dataset we will drop rows with null values
#for our purposes this should not impact our results
df.dropna(inplace=True)

As we can see below, our dataset is too rich. We will limit our analysis to a few popular car brands to avoid using all of our 

In [8]:
df.head()

Unnamed: 0,price,year,manufacturer,make,condition,odometer,title_status,transmission,drive,size,type
0,9000,2009.0,chevrolet,suburban lt2,good,217743.0,clean,automatic,rwd,full-size,SUV
4,37000,2012.0,chevrolet,3500,excellent,178000.0,clean,automatic,4wd,full-size,pickup
12,9700,2010.0,cadillac,srx luxury collection,good,140000.0,clean,automatic,fwd,mid-size,SUV
13,2500,2001.0,chevrolet,silverado 1500,fair,220000.0,clean,automatic,rwd,full-size,pickup
14,24575,2016.0,ford,expedition el platinum,excellent,149000.0,clean,automatic,rwd,full-size,SUV


In [9]:
len(df['make'].unique())

12715

In [10]:
manufac_list = ['honda', 'toyota', 'subaru']
df = df.loc[df['manufacturer'].isin(manufac_list)]

In [11]:
len(df['make'].unique())

1558

In [12]:
df = pd.get_dummies(df, drop_first=True)

In [13]:
df.shape

(18383, 1590)

In [14]:
#examining which  columns are the most correlated with our group column
abs(df.corr()['price']).sort_values(ascending=False)

price                           1.000000
make_forrester                  0.447157
condition_fair                  0.034700
type_hatchback                  0.027360
manufacturer_subaru             0.018852
                                  ...   
make_tundra tro pro             0.000042
make_tundra crew max 4wd        0.000042
make_sienna xle minivan         0.000041
make_6 in. lifted 4wd tundra    0.000040
make_gr supra 3.0 premium       0.000033
Name: price, Length: 1590, dtype: float64

In [15]:
#defining predictive features
X = df.drop('price', axis=1)

#defining target variable
Y = df['price']

#test train split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.2, random_state=42)

# OLS Regression Model

In [16]:
#adding constant
X_train = sm.add_constant(X_train)

#model
OLS_model = sm.OLS(y_train,X_train).fit()

OLS_model.summary()

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,price,R-squared:,0.253
Model:,OLS,Adj. R-squared:,0.174
Method:,Least Squares,F-statistic:,3.203
Date:,"Tue, 15 Oct 2019",Prob (F-statistic):,3.73e-258
Time:,10:15:53,Log-Likelihood:,-265850.0
No. Observations:,14706,AIC:,534500.0
Df Residuals:,13296,BIC:,545200.0
Df Model:,1409,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.9e+07,7.99e+07,0.613,0.540,-1.08e+08,2.06e+08
year,-2.617e+04,3.75e+04,-0.697,0.486,-9.98e+04,4.74e+04
odometer,-0.6163,1.554,-0.397,0.692,-3.663,2.430
manufacturer_subaru,3.142e+06,2.57e+07,0.122,0.903,-4.72e+07,5.35e+07
manufacturer_toyota,-1.057e+06,1.83e+07,-0.058,0.954,-3.7e+07,3.49e+07
make_*rav4 7 seater,3.936e+06,2.56e+07,0.154,0.878,-4.62e+07,5.41e+07
make_- pilot,2.8e+06,3.15e+07,0.089,0.929,-5.89e+07,6.45e+07
make_./ m.r. 2,-1.973e+05,2.56e+07,-0.008,0.994,-5.04e+07,5e+07
make_2 door,-1.397e+06,2.56e+07,-0.055,0.956,-5.15e+07,4.87e+07

0,1,2,3
Omnibus:,52049.087,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,44240469939.08
Skew:,69.526,Prob(JB):,0.0
Kurtosis:,8498.913,Cond. No.,1.59e+23


In [17]:
#adding constant
X_test = sm.add_constant(X_test)

#fitting linear regression model to data
OLS_model_test = sm.OLS(y_test, X_test).fit()

#summarizing model stats
OLS_model_test.summary()

  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


0,1,2,3
Dep. Variable:,price,R-squared:,0.61
Model:,OLS,Adj. R-squared:,0.522
Method:,Least Squares,F-statistic:,6.909
Date:,"Tue, 15 Oct 2019",Prob (F-statistic):,1.29e-306
Time:,10:15:58,Log-Likelihood:,-36233.0
No. Observations:,3677,AIC:,73820.0
Df Residuals:,2998,BIC:,78040.0
Df Model:,678,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.268e+06,4.3e+04,-29.524,0.000,-1.35e+06,-1.18e+06
year,638.5777,21.472,29.741,0.000,596.477,680.678
odometer,-0.0046,0.001,-5.979,0.000,-0.006,-0.003
manufacturer_subaru,-6001.9679,672.183,-8.929,0.000,-7319.954,-4683.981
manufacturer_toyota,-2807.0605,580.746,-4.834,0.000,-3945.761,-1668.360
make_*rav4 7 seater,2.019e-06,1.8e-06,1.122,0.262,-1.51e-06,5.55e-06
make_- pilot,1.811e-07,1e-06,0.181,0.856,-1.78e-06,2.14e-06
make_./ m.r. 2,-2.176e-07,5.73e-07,-0.380,0.704,-1.34e-06,9.05e-07
make_2 door,8.826e-07,1.19e-06,0.740,0.459,-1.45e-06,3.22e-06

0,1,2,3
Omnibus:,576.89,Durbin-Watson:,2.02
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10052.955
Skew:,0.069,Prob(JB):,0.0
Kurtosis:,11.099,Cond. No.,6.7e+22


# KNN Regression Model

In [18]:
df2 = np.array(X_test, y_test)

In [41]:
#Building model
knn = neighbors.KNeighborsRegressor(n_neighbors=11, weights='distance')

knn.fit(X_train, y_train)

y_pred = knn.predict(df2)

In [20]:
y_test.shape

(3677,)

In [21]:
y_pred.shape

(3677,)

In [42]:
knn.score(X_test, y_test)

-8.050512210411675

# Discussion

As we can see, both models performed rather poorly on this dataset. Technically when used on the test set, the OLS model performed better than the KNN model. It is possible that this data set is not great for prediction. There could be a lot of reasons for this. One major issue is the size, because we have so many categorical variables, when we start one-hot encoding the number of features we hvae grows drastically. In order to make this data easier to work with significant portions of the data are dropped. It is possible that we are training and testing our models on data that is too diluted because we have dropped the incorrect features. Its possible that our features also vary too wildly as they are from a source where they are being sold by individuals who might not be pricing their vehicles in the most accurate way. 

The regression model works better here because the data is likely parametric and there is some linear relationship between the input and prediction. KNN regression would be better suited for a situation where we had non-parametric data or where there was not a linear relationship between the input and output. 