In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from itertools import combinations

In [2]:
data=pd.read_csv('cardata.csv',na_values="?",index_col=0)
data=data.dropna(how='any',axis=0)
data

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115.0,5500.0,18,22,17450.0
6,1,158.0,audi,gas,std,four,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.40,8.5,110.0,5500.0,19,25,17710.0
8,1,158.0,audi,gas,turbo,four,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.40,8.3,140.0,5500.0,17,20,23875.0
10,2,192.0,bmw,gas,std,two,sedan,rwd,front,101.2,...,108,mpfi,3.50,2.80,8.8,101.0,5800.0,23,29,16430.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114.0,5400.0,23,28,16845.0
201,-1,95.0,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160.0,5300.0,19,25,19045.0
202,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134.0,5500.0,18,23,21485.0
203,-1,95.0,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106.0,4800.0,26,27,22470.0


In [3]:
data.astype({"symboling":'int64','normalized-losses':'float64','city-mpg':'float64','highway-mpg':'float64','engine-size':'float64','curb-weight':'float64'}).dtypes

symboling              int64
normalized-losses    float64
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight          float64
engine-type           object
num-of-cylinders      object
engine-size          float64
fuel-system           object
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
city-mpg             float64
highway-mpg          float64
price                float64
dtype: object

In [4]:
def onehot_encoder(df, feature):
    result = pd.DataFrame()
    if feature in df.columns:
        # The following line is important, refer to Assignment 2
        result = pd.get_dummies(df, columns=[feature])
        return result
    else:
        return print("Please select a feature in this df!")

In [5]:
data.columns

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')

In [6]:
features_numerical=['normalized-losses','wheel-base', 'length', 'width', 'height', 'curb-weight',
                    'engine-size', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg']
features_categorical=['symboling','make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
                      'engine-type','num-of-cylinders', 'fuel-system']
target=['price']
features = features_categorical + features_numerical
data_p = data[features_numerical + features_categorical + target]
data_p

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,...,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system,price
3,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.40,10.0,...,gas,std,four,sedan,fwd,front,ohc,four,mpfi,13950.0
4,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.40,8.0,...,gas,std,four,sedan,4wd,front,ohc,five,mpfi,17450.0
6,158.0,105.8,192.7,71.4,55.7,2844,136,3.19,3.40,8.5,...,gas,std,four,sedan,fwd,front,ohc,five,mpfi,17710.0
8,158.0,105.8,192.7,71.4,55.9,3086,131,3.13,3.40,8.3,...,gas,turbo,four,sedan,fwd,front,ohc,five,mpfi,23875.0
10,192.0,101.2,176.8,64.8,54.3,2395,108,3.50,2.80,8.8,...,gas,std,two,sedan,rwd,front,ohc,four,mpfi,16430.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,95.0,109.1,188.8,68.9,55.5,2952,141,3.78,3.15,9.5,...,gas,std,four,sedan,rwd,front,ohc,four,mpfi,16845.0
201,95.0,109.1,188.8,68.8,55.5,3049,141,3.78,3.15,8.7,...,gas,turbo,four,sedan,rwd,front,ohc,four,mpfi,19045.0
202,95.0,109.1,188.8,68.9,55.5,3012,173,3.58,2.87,8.8,...,gas,std,four,sedan,rwd,front,ohcv,six,mpfi,21485.0
203,95.0,109.1,188.8,68.9,55.5,3217,145,3.01,3.40,23.0,...,diesel,turbo,four,sedan,rwd,front,ohc,six,idi,22470.0


In [7]:
def kfold_cv(data, target, n):
    # We need a vector to record mse from k-fold
    MSE = np.array([]) 
    
    # KFold is a build-in function in Scikit-learn
    #    it can help us cut data into n pieces 
    #    (compare with simple cross validation)
    kf = KFold(n_splits = n, random_state=0,shuffle=True)
    
    for train_index, validation_index in kf.split(data):

        # obtain the train and validation part
        train, valid = data.loc[train_index,:], data.loc[validation_index,:]
        
        # extract X and Y to be fit in a model
        X_train = train.drop(target, axis = 1)
        Y_train = train[target]        
        X_valid = valid.drop(target, axis = 1)
        Y_valid = valid[target]

        # build linear regression model
        model = linear_model.LinearRegression()
        
        # fit model using training data
        model.fit(X_train,Y_train)
        
        # predict using validation data
        Y_valid_fit = model.predict(X_valid)
        
        # Calculate MSE
        MSE_temp = mean_squared_error(Y_valid_fit, Y_valid)
        # Add MSE to the list
        MSE = np.append(MSE, MSE_temp)
        
    return MSE.mean()

In [8]:
train_valid, test = train_test_split(data_p, test_size = 0.2, random_state = 20190227)
train_valid = train_valid.reset_index(drop = True)
test = test.reset_index(drop = True)


In [9]:
greedy_select = []

# and a numpy array to save their model MSE
MSE_greedy_algo = np.array([])

for i in range(len(features)):
    MSE = np.array([])
    features_left = list(set(features) - set(greedy_select))
    
    for new in features_left:
        features_new = greedy_select + [new]
        train_valid_sub = train_valid[features_new + target]
        
        # get all categorical features in sub
        categorical_sub = list(set(features_new) & set(features_categorical))
        
         # if there really are categorical features, 
        # we need to do onthot encoding.
        if len(categorical_sub) != 0:
            for i in categorical_sub:
                # Again, this line is important. Refer to Assignment 2
                train_valid_sub = onehot_encoder(train_valid_sub, i)   
            
        # CrossValidation, compute the mse and save it into MSE_sub
        MSE_sub = kfold_cv(train_valid_sub, 'price', 5)
        MSE = np.append(MSE, MSE_sub)
        
    # pick the features that gives the smallest MSE
    # and add it into our features list
    # meanwhile, save the corresponding MSE
    greedy_select += [features_left[MSE.argmin()]]
    MSE_greedy_algo = np.append(MSE_greedy_algo, MSE.min())

In [10]:
MSE_greedy_algo.argmin()

12

In [11]:
features_greedy = greedy_select[:(MSE_greedy_algo.argmin()+1)]
features_greedy

['curb-weight',
 'make',
 'horsepower',
 'body-style',
 'symboling',
 'drive-wheels',
 'peak-rpm',
 'engine-location',
 'width',
 'length',
 'wheel-base',
 'fuel-system',
 'fuel-type']

In [12]:
data_greedy = data[features_greedy + target]
data_greedy

Unnamed: 0,curb-weight,make,horsepower,body-style,symboling,drive-wheels,peak-rpm,engine-location,width,length,wheel-base,fuel-system,fuel-type,price
3,2337,audi,102.0,sedan,2,fwd,5500.0,front,66.2,176.6,99.8,mpfi,gas,13950.0
4,2824,audi,115.0,sedan,2,4wd,5500.0,front,66.4,176.6,99.4,mpfi,gas,17450.0
6,2844,audi,110.0,sedan,1,fwd,5500.0,front,71.4,192.7,105.8,mpfi,gas,17710.0
8,3086,audi,140.0,sedan,1,fwd,5500.0,front,71.4,192.7,105.8,mpfi,gas,23875.0
10,2395,bmw,101.0,sedan,2,rwd,5800.0,front,64.8,176.8,101.2,mpfi,gas,16430.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,2952,volvo,114.0,sedan,-1,rwd,5400.0,front,68.9,188.8,109.1,mpfi,gas,16845.0
201,3049,volvo,160.0,sedan,-1,rwd,5300.0,front,68.8,188.8,109.1,mpfi,gas,19045.0
202,3012,volvo,134.0,sedan,-1,rwd,5500.0,front,68.9,188.8,109.1,mpfi,gas,21485.0
203,3217,volvo,106.0,sedan,-1,rwd,4800.0,front,68.9,188.8,109.1,idi,diesel,22470.0


In [13]:
categorical_cv = list(set(features_greedy) & set(features_categorical))

if len(categorical_cv) != 0:
    for i in categorical_cv:
        data_greedy = onehot_encoder(data_greedy, i)

In [14]:
data_greedy

Unnamed: 0,curb-weight,horsepower,peak-rpm,width,length,wheel-base,price,fuel-system_1bbl,fuel-system_2bbl,fuel-system_idi,...,make_plymouth,make_porsche,make_saab,make_subaru,make_toyota,make_volkswagen,make_volvo,drive-wheels_4wd,drive-wheels_fwd,drive-wheels_rwd
3,2337,102.0,5500.0,66.2,176.6,99.8,13950.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,2824,115.0,5500.0,66.4,176.6,99.4,17450.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,2844,110.0,5500.0,71.4,192.7,105.8,17710.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,3086,140.0,5500.0,71.4,192.7,105.8,23875.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
10,2395,101.0,5800.0,64.8,176.8,101.2,16430.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,2952,114.0,5400.0,68.9,188.8,109.1,16845.0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
201,3049,160.0,5300.0,68.8,188.8,109.1,19045.0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
202,3012,134.0,5500.0,68.9,188.8,109.1,21485.0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
203,3217,106.0,4800.0,68.9,188.8,109.1,22470.0,0,0,1,...,0,0,0,0,0,0,1,0,0,1


In [15]:
train_valid_greedy, test_greedy = train_test_split(data_greedy, test_size = 0.2, random_state = 20190227)

In [18]:

model_greedy = linear_model.LinearRegression()

# features traget split
X_greedy = train_valid_greedy.drop(target, axis = 1)
Y_greedy = train_valid_greedy[target]

# fit model
model_greedy.fit(X_greedy, Y_greedy)

# Use model
X_test_greedy = test_greedy.drop(target, axis = 1)
Y_test_greedy = test_greedy[target]

Y_test_greedy_fit = model_greedy.predict(X_test_greedy)
mean_squared_error(Y_test_greedy_fit, Y_test_greedy)

3610374.4223495927

In [19]:
print(r2_score(Y_test_greedy_fit, Y_test_greedy))

0.8620774463359875


In [20]:
np.sqrt(mean_squared_error(Y_test_greedy_fit, Y_test_greedy))

1900.0985296425006