In [46]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
%matplotlib inline

from sklearn.model_selection import train_test_split

In [4]:
data = pd.read_csv('data_test/cars.csv')
data.head()

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,Subaru,Outback,automatic,silver,190000,2010,gasoline,False,gasoline,2.5,...,True,True,True,False,True,False,True,True,True,16
1,Subaru,Outback,automatic,blue,290000,2002,gasoline,False,gasoline,3.0,...,True,False,False,True,True,False,False,False,True,83
2,Subaru,Forester,automatic,red,402000,2001,gasoline,False,gasoline,2.5,...,True,False,False,False,False,False,False,True,True,151
3,Subaru,Impreza,mechanical,blue,10000,1999,gasoline,False,gasoline,3.0,...,False,False,False,False,False,False,False,False,False,86
4,Subaru,Legacy,automatic,black,280000,2001,gasoline,False,gasoline,2.5,...,True,False,True,True,False,False,False,False,True,7


In [7]:
data.columns

Index(['manufacturer_name', 'model_name', 'transmission', 'color',
       'odometer_value', 'year_produced', 'engine_fuel', 'engine_has_gas',
       'engine_type', 'engine_capacity', 'body_type', 'has_warranty', 'state',
       'drivetrain', 'price_usd', 'is_exchangeable', 'location_region',
       'number_of_photos', 'up_counter', 'feature_0', 'feature_1', 'feature_2',
       'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7',
       'feature_8', 'feature_9', 'duration_listed'],
      dtype='object')

In [33]:
data_summary = data.loc[:,['manufacturer_name', 'model_name', 'transmission','odometer_value','year_produced','engine_capacity','has_warranty','price_usd']]
data_summary

Unnamed: 0,manufacturer_name,model_name,transmission,odometer_value,year_produced,engine_capacity,has_warranty,price_usd
0,Subaru,Outback,automatic,190000,2010,2.5,False,10900.00
1,Subaru,Outback,automatic,290000,2002,3.0,False,5000.00
2,Subaru,Forester,automatic,402000,2001,2.5,False,2800.00
3,Subaru,Impreza,mechanical,10000,1999,3.0,False,9999.00
4,Subaru,Legacy,automatic,280000,2001,2.5,False,2134.11
...,...,...,...,...,...,...,...,...
38526,Chrysler,300,automatic,290000,2000,3.5,False,2750.00
38527,Chrysler,PT Cruiser,mechanical,321000,2004,2.2,False,4800.00
38528,Chrysler,300,automatic,777957,2000,3.5,False,4300.00
38529,Chrysler,PT Cruiser,mechanical,20000,2001,2.0,False,4000.00


In [34]:
data_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38531 entries, 0 to 38530
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   manufacturer_name  38531 non-null  object 
 1   model_name         38531 non-null  object 
 2   transmission       38531 non-null  object 
 3   odometer_value     38531 non-null  int64  
 4   year_produced      38531 non-null  int64  
 5   engine_capacity    38521 non-null  float64
 6   has_warranty       38531 non-null  bool   
 7   price_usd          38531 non-null  float64
dtypes: bool(1), float64(2), int64(2), object(3)
memory usage: 2.1+ MB


In [35]:
data_summary['has_warranty'].loc[data_summary['has_warranty'] == True] = 1
data_summary['has_warranty'].loc[data_summary['has_warranty'] == False] = 0
data_summary.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,manufacturer_name,model_name,transmission,odometer_value,year_produced,engine_capacity,has_warranty,price_usd
0,Subaru,Outback,automatic,190000,2010,2.5,0,10900.0
1,Subaru,Outback,automatic,290000,2002,3.0,0,5000.0
2,Subaru,Forester,automatic,402000,2001,2.5,0,2800.0
3,Subaru,Impreza,mechanical,10000,1999,3.0,0,9999.0
4,Subaru,Legacy,automatic,280000,2001,2.5,0,2134.11


In [36]:
data_summary.transmission.unique()

array(['automatic', 'mechanical'], dtype=object)

In [37]:
data_summary['transmission'].loc[data_summary['transmission'] == 'automatic'] = 1
data_summary['transmission'].loc[data_summary['transmission'] == 'mechanical'] = 0
data_summary.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,manufacturer_name,model_name,transmission,odometer_value,year_produced,engine_capacity,has_warranty,price_usd
0,Subaru,Outback,1,190000,2010,2.5,0,10900.0
1,Subaru,Outback,1,290000,2002,3.0,0,5000.0
2,Subaru,Forester,1,402000,2001,2.5,0,2800.0
3,Subaru,Impreza,0,10000,1999,3.0,0,9999.0
4,Subaru,Legacy,1,280000,2001,2.5,0,2134.11


In [38]:
data_summary['year_produced'] = 2020-data_summary['year_produced'] 
data_summary

Unnamed: 0,manufacturer_name,model_name,transmission,odometer_value,year_produced,engine_capacity,has_warranty,price_usd
0,Subaru,Outback,1,190000,10,2.5,0,10900.00
1,Subaru,Outback,1,290000,18,3.0,0,5000.00
2,Subaru,Forester,1,402000,19,2.5,0,2800.00
3,Subaru,Impreza,0,10000,21,3.0,0,9999.00
4,Subaru,Legacy,1,280000,19,2.5,0,2134.11
...,...,...,...,...,...,...,...,...
38526,Chrysler,300,1,290000,20,3.5,0,2750.00
38527,Chrysler,PT Cruiser,0,321000,16,2.2,0,4800.00
38528,Chrysler,300,1,777957,20,3.5,0,4300.00
38529,Chrysler,PT Cruiser,0,20000,19,2.0,0,4000.00


In [39]:
#dont wanna write datasummary anymore. lets create df
df = data_summary
df

Unnamed: 0,manufacturer_name,model_name,transmission,odometer_value,year_produced,engine_capacity,has_warranty,price_usd
0,Subaru,Outback,1,190000,10,2.5,0,10900.00
1,Subaru,Outback,1,290000,18,3.0,0,5000.00
2,Subaru,Forester,1,402000,19,2.5,0,2800.00
3,Subaru,Impreza,0,10000,21,3.0,0,9999.00
4,Subaru,Legacy,1,280000,19,2.5,0,2134.11
...,...,...,...,...,...,...,...,...
38526,Chrysler,300,1,290000,20,3.5,0,2750.00
38527,Chrysler,PT Cruiser,0,321000,16,2.2,0,4800.00
38528,Chrysler,300,1,777957,20,3.5,0,4300.00
38529,Chrysler,PT Cruiser,0,20000,19,2.0,0,4000.00


In [40]:
df.columns

Index(['manufacturer_name', 'model_name', 'transmission', 'odometer_value',
       'year_produced', 'engine_capacity', 'has_warranty', 'price_usd'],
      dtype='object')

In [41]:
df2 = df.loc[:,['transmission', 'odometer_value',       'year_produced', 'engine_capacity', 'has_warranty','price_usd']]
df2

Unnamed: 0,transmission,odometer_value,year_produced,engine_capacity,has_warranty,price_usd
0,1,190000,10,2.5,0,10900.00
1,1,290000,18,3.0,0,5000.00
2,1,402000,19,2.5,0,2800.00
3,0,10000,21,3.0,0,9999.00
4,1,280000,19,2.5,0,2134.11
...,...,...,...,...,...,...
38526,1,290000,20,3.5,0,2750.00
38527,0,321000,16,2.2,0,4800.00
38528,1,777957,20,3.5,0,4300.00
38529,0,20000,19,2.0,0,4000.00


In [44]:
 df2.corr()

Unnamed: 0,odometer_value,year_produced,engine_capacity,price_usd
odometer_value,1.0,0.488679,0.105704,-0.421204
year_produced,0.488679,1.0,-0.005059,-0.705511
engine_capacity,0.105704,-0.005059,1.0,0.296597
price_usd,-0.421204,-0.705511,0.296597,1.0


In [47]:
lm1 = smf.ols('price_usd ~ odometer_value + year_produced + engine_capacity', data=df2)
fit1 = lm1.fit()
fit1.summary()

0,1,2,3
Dep. Variable:,price_usd,R-squared:,0.599
Model:,OLS,Adj. R-squared:,0.599
Method:,Least Squares,F-statistic:,19190.0
Date:,"Wed, 30 Dec 2020",Prob (F-statistic):,0.0
Time:,08:50:55,Log-Likelihood:,-374810.0
No. Observations:,38521,AIC:,749600.0
Df Residuals:,38517,BIC:,749700.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.087e+04,80.825,134.477,0.000,1.07e+04,1.1e+04
odometer_value,-0.0068,0.000,-38.695,0.000,-0.007,-0.006
year_produced,-504.6729,2.952,-170.964,0.000,-510.459,-498.887
engine_capacity,2954.5468,31.125,94.925,0.000,2893.541,3015.553

0,1,2,3
Omnibus:,23486.427,Durbin-Watson:,1.441
Prob(Omnibus):,0.0,Jarque-Bera (JB):,339113.613
Skew:,2.7,Prob(JB):,0.0
Kurtosis:,16.495,Cond. No.,1160000.0


In [49]:
df = pd.get_dummies(df, columns=['manufacturer_name'])

In [56]:
df = df.rename(columns={"manufacturer_name_Alfa Romeo": "manufacturer_name_Alfa_Romeo" }, errors="raise")

In [57]:
lm2 = smf.ols('price_usd ~ odometer_value + year_produced + engine_capacity + manufacturer_name_Acura+manufacturer_name_Alfa_Romeo+manufacturer_name_Audi +manufacturer_name_Subaru+manufacturer_name_Suzuki+manufacturer_name_Toyota+manufacturer_name_Volkswagen+manufacturer_name_Volvo+manufacturer_name_ВАЗ+manufacturer_name_ГАЗ+manufacturer_name_ЗАЗ+manufacturer_name_Москвич+manufacturer_name_УАЗ', data=df )
fit2 = lm2.fit()
fit2.summary()

0,1,2,3
Dep. Variable:,price_usd,R-squared:,0.614
Model:,OLS,Adj. R-squared:,0.614
Method:,Least Squares,F-statistic:,3823.0
Date:,"Wed, 30 Dec 2020",Prob (F-statistic):,0.0
Time:,08:56:33,Log-Likelihood:,-374090.0
No. Observations:,38521,AIC:,748200.0
Df Residuals:,38504,BIC:,748400.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.075e+04,80.928,132.788,0.000,1.06e+04,1.09e+04
odometer_value,-0.0064,0.000,-36.092,0.000,-0.007,-0.006
year_produced,-522.6289,3.064,-170.559,0.000,-528.635,-516.623
engine_capacity,2939.1155,31.175,94.277,0.000,2878.011,3000.220
manufacturer_name_Acura,998.4927,493.348,2.024,0.043,31.519,1965.467
manufacturer_name_Alfa_Romeo,-1210.0143,278.868,-4.339,0.000,-1756.602,-663.426
manufacturer_name_Audi,1853.2576,84.661,21.890,0.000,1687.321,2019.195
manufacturer_name_Subaru,-448.5155,235.520,-1.904,0.057,-910.141,13.110
manufacturer_name_Suzuki,-859.3108,262.464,-3.274,0.001,-1373.747,-344.875

0,1,2,3
Omnibus:,23957.969,Durbin-Watson:,1.498
Prob(Omnibus):,0.0,Jarque-Bera (JB):,375290.618
Skew:,2.744,Prob(JB):,0.0
Kurtosis:,17.272,Cond. No.,8630000.0


In [60]:
df['price_usd'] = np.log(df['price_usd'])

In [62]:
lm2 = smf.ols('price_usd ~ odometer_value + year_produced + engine_capacity + manufacturer_name_Acura+manufacturer_name_Alfa_Romeo+manufacturer_name_Audi +manufacturer_name_Subaru+manufacturer_name_Suzuki+manufacturer_name_Toyota+manufacturer_name_Volkswagen+manufacturer_name_Volvo+manufacturer_name_ВАЗ+manufacturer_name_ГАЗ+manufacturer_name_ЗАЗ+manufacturer_name_Москвич+manufacturer_name_УАЗ', data=df )
fit2 = lm2.fit()
fit2.summary()

0,1,2,3
Dep. Variable:,price_usd,R-squared:,0.763
Model:,OLS,Adj. R-squared:,0.763
Method:,Least Squares,F-statistic:,7743.0
Date:,"Wed, 30 Dec 2020",Prob (F-statistic):,0.0
Time:,19:17:19,Log-Likelihood:,-27974.0
No. Observations:,38521,AIC:,55980.0
Df Residuals:,38504,BIC:,56130.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,9.1844,0.010,906.014,0.000,9.164,9.204
odometer_value,-2.087e-07,2.24e-08,-9.336,0.000,-2.52e-07,-1.65e-07
year_produced,-0.1028,0.000,-267.919,0.000,-0.104,-0.102
engine_capacity,0.4405,0.004,112.792,0.000,0.433,0.448
manufacturer_name_Acura,0.1301,0.062,2.105,0.035,0.009,0.251
manufacturer_name_Alfa_Romeo,-0.2563,0.035,-7.338,0.000,-0.325,-0.188
manufacturer_name_Audi,0.3644,0.011,34.361,0.000,0.344,0.385
manufacturer_name_Subaru,0.0942,0.030,3.193,0.001,0.036,0.152
manufacturer_name_Suzuki,-0.0354,0.033,-1.077,0.282,-0.100,0.029

0,1,2,3
Omnibus:,9031.377,Durbin-Watson:,1.703
Prob(Omnibus):,0.0,Jarque-Bera (JB):,387741.817
Skew:,-0.326,Prob(JB):,0.0
Kurtosis:,18.529,Cond. No.,8630000.0
