# Exploratory Data Analysis

### Import necessary packages

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

### Read and preprocess dataset

In [2]:
def preprocess(df):
    features_df = pd.DataFrame()
    target_ser = pd.Series()
    df.dropna(subset=['price'], axis='index', inplace=True)
    numeric_cols_to_use = ['engine-size', 'curb-weight', 'highway-mpg', 'horsepower', 'width',
       'length', 'normalized-losses', 'compression-ratio', 'city-mpg',
       'wheel-base', 'peak-rpm', 'height', 'stroke', 'bore']
    categorical_cols_to_use = ['make_bmw']
    for col in numeric_cols_to_use:
        if skew(np.abs(df[col]) > 1.0):
            features_df[col] = np.log(1 + df[col])
        else:
            features_df[col] = df[col]
    features_df.fillna(features_df.median(), inplace=True)
    features_df['make_bmw'] = df['make'].apply(lambda x: 1 if x == 'bmw' else 0)
    target_ser = df['price']
    return features_df, target_ser

In [3]:
df = pd.read_csv('Auto1-DS-TestData.csv', na_values=['?'])
X, y = preprocess(df)

### Split into training and test sets

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Ridge Regression

In [5]:
pipe = Pipeline(steps=[
    ('polynomial_features', PolynomialFeatures()),
    ('scaler', StandardScaler()),
    ('regressor', Ridge())
])
param_grid = {
    'polynomial_features__degree': [2],
    'regressor__alpha': np.logspace(2, 3, 10)

}
model = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5, verbose=1)
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.best_score_)
print(model.score(X_test, y_test))
print(model.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
0.8748974765235651
0.8085186941663377
0.8234390768609023
{'polynomial_features__degree': 2, 'regressor__alpha': 100.0}


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.7s finished


### Support Vector Machine

In [6]:
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', SVR())
])
param_grid = {
    'regressor__kernel': ['rbf', 'poly'],
    'regressor__degree': [2],
    'regressor__C': np.logspace(-3, 3, 10)

}
model = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5, verbose=1)
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.best_score_)
print(model.score(X_test, y_test))
print(model.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
0.5351661113420898
0.4969566583275798
0.3520884176022876
{'regressor__C': 1000.0, 'regressor__degree': 2, 'regressor__kernel': 'rbf'}


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished


### Random Forest

In [7]:
pipe = Pipeline(steps=[
    ('polynomial_features', PolynomialFeatures()),
    ('regressor', RandomForestRegressor())
])
param_grid = {
    'polynomial_features__degree': [2],
    'regressor__n_estimators': [200],
    'regressor__max_depth': np.arange(1, 6)

}
model = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5, verbose=1)
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.best_score_)
print(model.score(X_test, y_test))
print(model.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    1.9s finished


0.9681720286267168
0.8366349745119124
0.9098164830482945
{'polynomial_features__degree': 2, 'regressor__max_depth': 5, 'regressor__n_estimators': 200}


### Gradient Boosted Trees

In [8]:
pipe = Pipeline(steps=[
    ('polynomial_features', PolynomialFeatures()),
    ('regressor', XGBRegressor())
])
param_grid = {
    'polynomial_features__degree': [2],
    'regressor__n_estimators': [100],
    'regressor__max_depth': np.arange(1, 6),
    'regressor__learning_rate': np.logspace(-3, -1, 10)
}
model = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5, verbose=1)
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.best_score_)
print(model.score(X_test, y_test))
print(model.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  36 tasks      | elapsed:    0.8s


0.9853375053922139
0.8517978549097149
0.943920560702983
{'polynomial_features__degree': 2, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 2, 'regressor__n_estimators': 100}


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    2.9s finished


### Model Tuning to reduce overfitting

In [12]:
pipe = Pipeline(steps=[
    ('polynomial_features', PolynomialFeatures()),
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('regressor', XGBRegressor())
])
param_grid = {
    'polynomial_features__degree': [2],
    'pca__n_components': [0.9],
    'regressor__n_estimators': [200],
    'regressor__max_depth': np.arange(1, 3),
    'regressor__learning_rate': np.logspace(-3, -1, 10)
}
model = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5, verbose=1)
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.best_score_)
print(model.score(X_test, y_test))
print(model.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.2s


0.9528275179876677
0.8392798679625612
0.8903113641600432
{'pca__n_components': 0.9, 'polynomial_features__degree': 2, 'regressor__learning_rate': 0.03593813663804626, 'regressor__max_depth': 2, 'regressor__n_estimators': 200}


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.9s finished
