# Import all required packages

In [12]:
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge
    import statsmodels.formula.api as sm
    import matplotlib.pylab as plt
    from dmba import regressionSummary, exhaustive_search
    from dmba import backward_elimination, forward_selection, stepwise_selection
    from dmba import adjusted_r2_score, AIC_score, BIC_score

# Read the data, use only the first 1000 records and select columns for regression analysis

In [2]:
car_df = pd.read_csv('TrainingDataToyotaCorollaa.csv')  
car_df = car_df.iloc[0:1000]   
predictors = ['Age_08_04','KM','Fuel_Type','HP','Automatic','Doors','Quarterly_Tax','Mfr_Guarantee','Guarantee_Period', 
              'Airco','Automatic_airco','CD_Player','Powered_Windows','Sport_Model','Tow_Bar']
outcome = 'Price'
car_df

Unnamed: 0,Id,Model,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,Fuel_Type_Dis,Fuel_Type_Petr,...,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Parking_Assistant,Tow_Bar
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,1,0,...,1,1,0,0,0,1,0,0,0,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,Diesel,1,0,...,0,1,0,0,0,1,0,0,0,0
2,3,�TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13950,24,9,2002,41711,Diesel,1,0,...,0,1,0,0,0,1,0,0,0,0
3,4,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,14950,26,7,2002,48000,Diesel,1,0,...,0,1,0,0,0,1,0,0,0,0
4,5,TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors,13750,30,3,2002,38500,Diesel,1,0,...,1,1,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1000,TOYOTA Corolla 1.6 16V HATCHB LINEA TERRA 2/3-...,9950,68,1,1999,42750,Petrol,0,0,...,0,1,0,0,0,1,0,0,0,0
996,1001,TOYOTA Corolla 1.6 16V LIFTB LINEA LUNA 4/5-Doors,9950,67,2,1999,42102,Petrol,0,0,...,1,1,0,1,0,1,1,0,0,0
997,1002,TOYOTA Corolla 1.6 LB Linea Terra 4/5-Doors,9950,63,6,1999,41586,Petrol,0,0,...,0,1,0,0,0,0,0,0,0,0
998,1003,TOYOTA Corolla 1.6 4/5-Doors,9900,64,5,1999,41200,Petrol,0,0,...,0,1,0,0,0,0,0,0,0,0


# Partition data into predictors (x) and output (y)

In [3]:
X = pd.get_dummies(car_df[predictors], drop_first=True)
y = car_df[outcome]


# Split the data into training and validation

In [4]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

# Built the Linear Model based on the training data

In [5]:
car_lm = LinearRegression()
car_lm.fit(train_X, train_y)

LinearRegression()

# Print Intecept and coefficients 

In [6]:
print('intercept ', car_lm.intercept_)
print(pd.DataFrame({'Predictor': X.columns, 'coefficient': car_lm.coef_}))

intercept  9744.974290680371
           Predictor  coefficient
0          Age_08_04  -119.971261
1                 KM    -0.020107
2                 HP    39.985589
3          Automatic   434.849618
4              Doors   179.677145
5      Quarterly_Tax    16.016163
6      Mfr_Guarantee   214.116842
7   Guarantee_Period    77.794604
8              Airco    61.585158
9    Automatic_airco  2646.771233
10         CD_Player   109.675454
11   Powered_Windows   619.771884
12       Sport_Model   587.474471
13           Tow_Bar  -204.504593
14  Fuel_Type_Diesel  1800.153047
15  Fuel_Type_Petrol  1638.701742


#  Print performance measures

In [7]:
regressionSummary(train_y, car_lm.predict(train_X))


Regression statistics

                      Mean Error (ME) : 0.0000
       Root Mean Squared Error (RMSE) : 1243.6231
            Mean Absolute Error (MAE) : 933.9162
          Mean Percentage Error (MPE) : -0.9085
Mean Absolute Percentage Error (MAPE) : 8.3497


# Use the developed model (car_lm) to predict validation data and  Use predict() to make predictions on a new set

In [8]:
car_lm_pred = car_lm.predict(valid_X)
result = pd.DataFrame({'Predicted': car_lm_pred, 'Actual': valid_y,'Residual': valid_y - car_lm_pred})
print(result.head(20))

        Predicted  Actual     Residual
507  10267.242476   11500  1232.757524
818  10492.746598    8950 -1542.746598
452  10795.033480   11450   654.966520
368  13857.501236   11450 -2407.501236
242  12010.436101   11950   -60.436101
929   9574.954432    9995   420.045568
262  13074.916963   13500   425.083037
810   9247.787071    7950 -1297.787071
318  11571.172007    9900 -1671.172007
49   20898.309711   21950  1051.690289
446  11179.666620   11950   770.333380
142  20524.149171   19950  -574.149171
968  10914.071674    9950  -964.071674
345  13787.083560   14950  1162.916440
971   8780.079982   10495  1714.920018
133  17313.019213   15950 -1363.019213
104  18459.969902   19450   990.030098
6    15406.879890   16900  1493.120110
600  12462.412526   11250 -1212.412526
496  11301.278038   11750   448.721962


# Compute common accuracy measures

In [9]:
regressionSummary(train_y, car_lm.predict(train_X))


Regression statistics

                      Mean Error (ME) : 0.0000
       Root Mean Squared Error (RMSE) : 1243.6231
            Mean Absolute Error (MAE) : 933.9162
          Mean Percentage Error (MPE) : -0.9085
Mean Absolute Percentage Error (MAPE) : 8.3497


# Forward selection The initial model is the constant model - this requires special handling in train_model and score_model

In [10]:
def train_model(variables):
    if len(variables) == 0:
        return None
    model = LinearRegression()
    model.fit(train_X[variables], train_y)
    return model
   
def score_model(model, variables):
    if len(variables) == 0:
        return AIC_score(train_y, [train_y.mean()] * len(train_y), model, df=1)
    return AIC_score(train_y, model.predict(train_X[variables]), model)

best_model, best_variables = forward_selection(train_X.columns, train_model, score_model, verbose=True)


print(best_variables) 

Variables: Age_08_04, KM, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, Airco, Automatic_airco, CD_Player, Powered_Windows, Sport_Model, Tow_Bar, Fuel_Type_Diesel, Fuel_Type_Petrol
Start: score=11565.07, constant
Step: score=10689.71, add Age_08_04
Step: score=10538.84, add Automatic_airco
Step: score=10463.29, add HP
Step: score=10436.29, add Quarterly_Tax
Step: score=10366.91, add KM
Step: score=10339.07, add Powered_Windows
Step: score=10326.83, add Sport_Model
Step: score=10309.96, add Guarantee_Period
Step: score=10302.13, add Doors
Step: score=10300.94, add Mfr_Guarantee
Step: score=10298.68, add Fuel_Type_Diesel
Step: score=10289.26, add Fuel_Type_Petrol
Step: score=10287.32, add Automatic
Step: score=10286.64, add Tow_Bar
Step: score=10286.64, add None
['Age_08_04', 'Automatic_airco', 'HP', 'Quarterly_Tax', 'KM', 'Powered_Windows', 'Sport_Model', 'Guarantee_Period', 'Doors', 'Mfr_Guarantee', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol', 'Automatic', 'Tow_Bar']

# Backward elimination

In [11]:
def train_model(variables):
    model = LinearRegression()
    model.fit(train_X[variables], train_y)
    return model
    
def score_model(model, variables):
    return AIC_score(train_y, model.predict(train_X[variables]), model)
    
best_model, best_variables = backward_elimination(train_X.columns, train_model, score_model, verbose=True)
  
print(best_variables)

Variables: Age_08_04, KM, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, Airco, Automatic_airco, CD_Player, Powered_Windows, Sport_Model, Tow_Bar, Fuel_Type_Diesel, Fuel_Type_Petrol
Start: score=10289.67
Step: score=10287.88, remove Airco
Step: score=10286.64, remove CD_Player
Step: score=10286.64, remove None
['Age_08_04', 'KM', 'HP', 'Automatic', 'Doors', 'Quarterly_Tax', 'Mfr_Guarantee', 'Guarantee_Period', 'Automatic_airco', 'Powered_Windows', 'Sport_Model', 'Tow_Bar', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol']
