In [1]:
import pickle

with open('../data/unprocessed_data.pkl', 'rb') as f:
    X_train, y_train, X_test, y_test = pickle.load(f)
    
X_train

Unnamed: 0,Location,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Brand,Model,Age
2952,Mumbai,22000,Petrol,Manual,First,16.47,1198.0,74.0,5.0,Volkswagen,Polo,5
1647,Pune,69000,Diesel,Automatic,First,,2987.0,165.0,5.0,Mercedes-Benz,M-Class,5
5301,Coimbatore,20026,Petrol,Automatic,First,19.00,1199.0,88.7,5.0,Honda,Jazz,2
657,Hyderabad,13000,Petrol,Automatic,First,22.00,1197.0,81.8,5.0,Maruti,Dzire,3
2273,Mumbai,122000,Diesel,Manual,Second,11.50,2982.0,171.0,7.0,Toyota,Fortuner,10
...,...,...,...,...,...,...,...,...,...,...,...,...
3772,Delhi,70000,Petrol,Manual,First,19.00,998.0,66.1,5.0,Maruti,A-Star,10
5191,Kolkata,28000,Petrol,Manual,First,18.90,1197.0,82.0,5.0,Hyundai,Grand,7
5226,Chennai,123000,Diesel,Manual,Second,12.55,2982.0,168.5,7.0,Toyota,Fortuner,7
5390,Hyderabad,78000,Petrol,Manual,Second,20.92,998.0,67.1,5.0,Maruti,Alto,9


In [2]:
with open('../data/preprocessed_data.pkl', 'rb') as f:
    X_train_prep, _, X_test_prep, _ = pickle.load(f)

X_train_prep

array([[-1.08423424, -0.4720888 , -0.72760357, ...,  0.        ,
         1.        ,  3.        ],
       [ 0.4521476 ,  0.02363623,  2.62788967, ...,  1.        ,
         0.        ,  3.        ],
       [-1.14876227,  0.14573599, -0.72572795, ...,  1.        ,
         1.        ,  3.        ],
       ...,
       [ 2.21735226, -1.42935092,  2.61851154, ...,  0.        ,
         1.        ,  2.        ],
       [ 0.74634838,  0.61459907, -1.10272859, ...,  0.        ,
         0.        ,  2.        ],
       [-0.72465551, -0.14730344, -0.72760357, ...,  1.        ,
         1.        ,  3.        ]])

# Linear Regression


In [3]:
from sklearn.linear_model import LinearRegression

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_prep, y_train)   # Learn Weights from training data Using OLS (Normal Equation)


print('Intercept:', lr.intercept_)
print('Coefficients:', lr.coef_)

Intercept: 10.088285094548091
Coefficients: [-0.8788273  -0.23932939  1.394484    5.13364552 -2.69583179 -0.30241251
 -0.82917759  0.99944076  0.84449496 -0.44357373  0.40923293  0.09343068
 -1.56538317  0.85226665 -0.85226665  6.52082861 -0.12007836 -1.34716553
  0.3222884  -2.68695667  1.5630123   1.28358729 -1.27600301  0.0320822
  0.04448995 -1.12593053 -0.60769188  0.77858218  0.68489547]


In [4]:
# Predictions
y_pred = lr.predict(X_test_prep)

# Model Evaluation
print('Training Accuracy:', lr.score(X_train_prep, y_train))
print('Testing Accuracy:', lr.score(X_test_prep, y_test))

Training Accuracy: 0.7403918928031255
Testing Accuracy: 0.7582095006214017


In [5]:
# Model Evaluation
from sklearn.metrics import root_mean_squared_error, r2_score


print('RMSE:', root_mean_squared_error(y_test, y_pred))
print('R2:', r2_score(y_test, y_pred))

RMSE: 4.665008097180856
R2: 0.7582095006214017


> Evaluation on Test Set is not Right. 
* We should do Cross Validation on Training Set to get the best model and then evaluate it on Test Set.

# Polynomial Regression


In [6]:
num_cols = ['Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Age', 'Seats']  # Impute Missing Values and Scale
nom_cat_cols = ['Location', 'Fuel_Type', 'Transmission', 'Brand', 'Model']  # USE Binary ENCODING
ord_cat_cols = ['Owner_Type']   # USE ORDINAL ENCODING

In [7]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from  category_encoders import BinaryEncoder


numerical_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='median')),
                                         ('poly', PolynomialFeatures(degree=2)),
                                         ('scaler', StandardScaler())])

# Preprocessing for categorical data
nominal_categorical_transformer = Pipeline(steps=[('binary', BinaryEncoder())])
ordinal_categorical_transformer = Pipeline(steps=[('ordinal', OrdinalEncoder(categories=[['Fourth & Above', 'Third', 'Second', 'First']]))])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(transformers=[('num_prep', numerical_transformer, num_cols),
                                                ('nom_prep', nominal_categorical_transformer, nom_cat_cols),
                                                ('ord_prep', ordinal_categorical_transformer, ord_cat_cols)])

X_train_poly = preprocessor.fit_transform(X_train)
X_test_poly = preprocessor.transform(X_test)

In [8]:
X_test_poly.shape

(1124, 51)

In [9]:
lr_poly = LinearRegression()

lr_poly.fit(X_train_poly, y_train)

# Accuracy
print('Training Accuracy', lr_poly.score(X_train_poly, y_train))
print('Testing Accuracy', lr_poly.score(X_test_poly, y_test))


Training Accuracy 0.8643384764640017
Testing Accuracy 0.8718695948596691


In [10]:
# Ridge Regression
from sklearn.linear_model import Ridge

ridge = Ridge()
ridge.fit(X_train_prep, y_train)

# Predictions
y_pred = ridge.predict(X_test_prep)

# Model Evaluation
print('RMSE:', root_mean_squared_error(y_test, y_pred))
print('R2:', r2_score(y_test, y_pred))

RMSE: 4.664805592944952
R2: 0.7582304920265913


In [11]:
# Ridge Regression with polynomial features
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=0.1)
ridge.fit(X_train_poly, y_train)

# Predictions
y_pred = ridge.predict(X_test_poly)

# Model Evaluation
print('RMSE:', root_mean_squared_error(y_test, y_pred))
print('R2:', r2_score(y_test, y_pred))

RMSE: 3.3960322746306404
R2: 0.8718619689648504


In [12]:
# Lasso Regression
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.01, max_iter=10000)
lasso.fit(X_train_poly, y_train)

# Predictions
y_pred = lasso.predict(X_test_poly)

# Model Evaluation
print('RMSE:', root_mean_squared_error(y_test, y_pred))
print('R2:', r2_score(y_test, y_pred))

RMSE: 3.4051631016863215
R2: 0.871171999618801


In [13]:
for alpha in [0, 0.01, 0.1, 1, 10, 100]:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_poly, y_train)
    y_pred = ridge.predict(X_test_poly)
    print('alpha:', alpha)
    print('RMSE:', root_mean_squared_error(y_test, y_pred))
    print('R2:', r2_score(y_test, y_pred))
    print('-----------------------------')

alpha: 0
RMSE: 3.399460304271883
R2: 0.8716031477094215
-----------------------------
alpha: 0.01
RMSE: 3.3959398850096654
R2: 0.871868940902763
-----------------------------
alpha: 0.1
RMSE: 3.3960322746306404
R2: 0.8718619689648504
-----------------------------
alpha: 1
RMSE: 3.397313168743842
R2: 0.8717652901412402
-----------------------------
alpha: 10
RMSE: 3.396873830009929
R2: 0.8717984544857936
-----------------------------
alpha: 100
RMSE: 3.454468337901782
R2: 0.8674142464094409
-----------------------------


In [14]:
def prep(poly_degree):
    numerical_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='median')),
                                         ('poly', PolynomialFeatures(degree=poly_degree)),
                                         ('scaler', StandardScaler())])

    # Preprocessing for categorical data
    nominal_categorical_transformer = Pipeline(steps=[('binary', BinaryEncoder())])
    ordinal_categorical_transformer = Pipeline(steps=[('ordinal', OrdinalEncoder(categories=[['Fourth & Above', 'Third', 'Second', 'First']]))])

    # Bundle preprocessing for numerical and categorical data
    prep = ColumnTransformer(transformers=[('num_prep', numerical_transformer, num_cols),
                                                    ('nom_prep', nominal_categorical_transformer, nom_cat_cols),
                                                    ('ord_prep', ordinal_categorical_transformer, ord_cat_cols)])

    return prep

In [15]:
for degree in [2, 3, 4, 5]:
    ridge = Ridge(alpha=0.1)
    preprocessor = prep(degree)
    X_train_poly = preprocessor.fit_transform(X_train)
    X_test_poly = preprocessor.transform(X_test)
    ridge.fit(X_train_poly, y_train)
    y_pred = ridge.predict(X_test_poly)
    print('degree:', degree)
    print('RMSE:', root_mean_squared_error(y_test, y_pred))
    print('R2:', r2_score(y_test, y_pred))
    print('-----------------------------')

degree: 2
RMSE: 3.3960322746306404
R2: 0.8718619689648504
-----------------------------
degree: 3
RMSE: 3.307184600437261
R2: 0.8784790090068351
-----------------------------
degree: 4
RMSE: 3.285812236560358
R2: 0.8800445695076108
-----------------------------
degree: 5
RMSE: 3.330267907095806
R2: 0.876776717720078
-----------------------------


In [16]:
for degree in [2, 3, 4, 5]:
    for alpha in [0.01, 0.1, 1, 10]:
         ridge = Ridge(alpha=alpha)
         preprocessor = prep(degree)
         X_train_poly = preprocessor.fit_transform(X_train)
         X_test_poly = preprocessor.transform(X_test)
         ridge.fit(X_train_poly, y_train)
         y_pred = ridge.predict(X_test_poly)
         print('degree:', degree, 'alpha:', alpha)
         print('RMSE:', root_mean_squared_error(y_test, y_pred))
         print('R2:', r2_score(y_test, y_pred))
         print('-----------------------------')

degree: 2 alpha: 0.01
RMSE: 3.3959398850096654
R2: 0.871868940902763
-----------------------------
degree: 2 alpha: 0.1
RMSE: 3.3960322746306404
R2: 0.8718619689648504
-----------------------------
degree: 2 alpha: 1
RMSE: 3.397313168743842
R2: 0.8717652901412402
-----------------------------
degree: 2 alpha: 10
RMSE: 3.396873830009929
R2: 0.8717984544857936
-----------------------------
degree: 3 alpha: 0.01
RMSE: 3.310767086467055
R2: 0.8782155927790078
-----------------------------
degree: 3 alpha: 0.1
RMSE: 3.307184600437261
R2: 0.8784790090068351
-----------------------------
degree: 3 alpha: 1
RMSE: 3.3018724651809586
R2: 0.8788690794597677
-----------------------------
degree: 3 alpha: 10
RMSE: 3.319657531019332
R2: 0.8775606561985052
-----------------------------
degree: 4 alpha: 0.01
RMSE: 3.310323793958581
R2: 0.8782482030471096
-----------------------------
degree: 4 alpha: 0.1
RMSE: 3.285812236560358
R2: 0.8800445695076108
-----------------------------
degree: 4 alpha: 1
RM

In [17]:
# Final Model

model = Pipeline(steps=[('preprocessor', prep(2)),
                        ('model', LinearRegression())])

model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Model Evaluation
print('R2:', model.score(X_test, y_test))

R2: 0.8718695948596691


In [18]:
def prep_and_fit(poly_degree, alpha):
    ridge = Ridge(alpha=alpha)
    model = Pipeline(steps=[('preprocessor', prep(poly_degree)),
                        ('model', Ridge(alpha=alpha))])
    model.fit(X_train, y_train)
    test_score = r2_score(y_test, model.predict(X_test))
    return test_score

In [19]:
prep_and_fit(3, 0.1)

0.8784790090068351