In [1]:
import pandas as pd
import numpy as np
from warnings import filterwarnings
#filterwarnings('ignore')

In [2]:
# Load the raw listings table from the CSV file in the working directory.
DATA_PATH = "vehicles.csv"
df = pd.read_csv(DATA_PATH)


In [3]:
# Drop the columns that are excluded from the modelling below.
# This cell reassigns `df` from itself, so a plain drop raises KeyError when the
# cell is executed a second time (the labels are already gone). errors='ignore'
# skips labels that are no longer present, which keeps the cell re-runnable.
df = df.drop(
    columns=['id', 'vin', 'url', 'region_url', 'image_url', 'description',
             'county', 'lat', 'long', 'size', 'paint_color', 'type'],
    errors='ignore',
)

In [4]:
# Display the column labels currently present in df.
df.columns

Index(['region', 'price', 'year', 'manufacturer', 'model', 'condition',
       'cylinders', 'fuel', 'odometer', 'title_status', 'transmission',
       'drive', 'state'],
      dtype='object')

### Task 2

In [12]:
# One boolean per column of df: True where the column's dtype is object.
categorical = df.dtypes.eq(object)
categorical

region           True
price           False
year            False
manufacturer     True
model            True
condition        True
cylinders        True
fuel             True
odometer        False
title_status     True
transmission     True
drive            True
size             True
type             True
paint_color      True
state            True
dtype: bool

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [30]:
# Split into train/test sets: every column except 'price' is a feature,
# 'price' is the target. random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='price'),
    df['price'],
    random_state=1,
)

In [31]:
# Remaining (non-object) columns: impute with SimpleImputer's default strategy,
# then standardize.
cont_preprocessing = make_pipeline(SimpleImputer(), StandardScaler())

# Object-dtype columns: fill missing values with a constant, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cat_preprocessing = make_pipeline(
    SimpleImputer(strategy='constant'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Send object columns through the categorical pipeline and every other column
# through the continuous one.
preprocess = make_column_transformer(
    (cat_preprocessing, make_column_selector(dtype_include='object')),
    remainder=cont_preprocessing,
)


In [32]:
# Preprocessing followed by a Ridge regressor, scored with 5-fold
# cross-validation on the training split; the cell displays the mean fold score.
pipe_lr = make_pipeline(preprocess, Ridge())
scores = cross_val_score(estimator=pipe_lr, X=X_train, y=y_train, cv=5)
scores.mean()

-3.725155200002958

The linear model we picked is Ridge regression. Its cross-validated $R^2$ score is strongly negative, so we conclude that a linear model fits this dataset very poorly and that we should explore other, more complex models for this dataset.

### Task 3

We first clean the dataset by dropping rows with a price equal to 0, as well as rows with a price of 300,000 or more (treated as outliers), for the purpose of this homework.

In [5]:
from sklearn.preprocessing import PolynomialFeatures

In [6]:
#removing outliers
df_clean = df[df['price'] != 0]
df_clean = df_clean[df_clean['price'] < 300000]

In [29]:
# Re-split using the cleaned frame: every column except 'price' is a feature,
# 'price' is the target. random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_clean.drop(columns='price'),
    df_clean['price'],
    random_state=1,
)

In [51]:
# Object-dtype columns: fill missing values with a constant, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cat_oh_preprocessing = make_pipeline(
    SimpleImputer(strategy='constant'),
    OneHotEncoder(handle_unknown='ignore'))

# Remaining (non-object) columns: median imputation, standardization, then a
# degree-2 polynomial expansion of the scaled features.
cont_preprocessing = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
    PolynomialFeatures(degree=2))

# Use the categorical pipeline defined in this cell. Referring to the older
# `cat_preprocessing` global instead would leave `cat_oh_preprocessing` unused
# and make this cell depend on a previously executed cell.
preprocess = make_column_transformer(
    (cat_oh_preprocessing, make_column_selector(dtype_include='object')),
    remainder=cont_preprocessing)


In [None]:
# Preprocessing followed by a Ridge regressor, scored with 5-fold
# cross-validation on the training split; the cell displays the mean fold score.
pipe_lr = make_pipeline(preprocess, Ridge())
scores = cross_val_score(estimator=pipe_lr, X=X_train, y=y_train, cv=5)
scores.mean()

By cleaning the data (removing rows with a price of 0 or a price of 300,000 or more) and adjusting the preprocessing of the continuous variables — 1. changing the imputation strategy to 'median' and 2. adding PolynomialFeatures of degree 2 after the StandardScaler step — the $R^2$ score from cross-validation increased significantly, to close to 0. This is still a very poor score for a linear model, but it shows that more careful preprocessing can improve the model.