In [10]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_excel('car_fuel_efficiency.xlsx')
df.head()

Unnamed: 0,Car Weight (kg),Engine Size (L),Number of Cylinders,Horsepower,Car Age (years),Fuel Efficiency (mpg)
0,855,3.593258,7,433,16,10.039439
1,1627,3.390346,5,229,5,17.948654
2,1487,2.820005,11,328,3,30.304928
3,3764,3.801203,6,368,1,30.903905
4,3585,2.239086,11,372,10,23.808264


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Car Weight (kg)        100 non-null    int64  
 1   Engine Size (L)        100 non-null    float64
 2   Number of Cylinders    100 non-null    int64  
 3   Horsepower             100 non-null    int64  
 4   Car Age (years)        100 non-null    int64  
 5   Fuel Efficiency (mpg)  100 non-null    float64
dtypes: float64(2), int64(4)
memory usage: 4.8 KB


In [5]:
df.describe()

Unnamed: 0,Car Weight (kg),Engine Size (L),Number of Cylinders,Horsepower,Car Age (years),Fuel Efficiency (mpg)
count,100.0,100.0,100.0,100.0,100.0,100.0
mean,2512.95,3.147257,7.07,281.89,9.04,24.511267
std,899.861071,1.087315,2.610217,131.609377,5.519332,8.200815
min,811.0,1.137102,3.0,54.0,1.0,10.039439
25%,1764.75,2.229272,5.0,186.0,4.75,17.521594
50%,2514.0,3.440423,7.0,279.0,8.0,24.379989
75%,3340.75,4.040964,10.0,402.0,14.0,30.856605
max,3993.0,4.902996,11.0,487.0,19.0,38.864366


In [6]:
df.isnull().sum()

Car Weight (kg)          0
Engine Size (L)          0
Number of Cylinders      0
Horsepower               0
Car Age (years)          0
Fuel Efficiency (mpg)    0
dtype: int64

In [7]:
corr_matrix = df.corr()
corr_matrix

Unnamed: 0,Car Weight (kg),Engine Size (L),Number of Cylinders,Horsepower,Car Age (years),Fuel Efficiency (mpg)
Car Weight (kg),1.0,-0.067402,0.147115,-0.115735,-0.195907,-0.06008
Engine Size (L),-0.067402,1.0,-0.123948,0.055295,-0.132809,0.105577
Number of Cylinders,0.147115,-0.123948,1.0,-0.034144,0.07763,-0.056425
Horsepower,-0.115735,0.055295,-0.034144,1.0,-0.035064,-0.118598
Car Age (years),-0.195907,-0.132809,0.07763,-0.035064,1.0,-0.113195
Fuel Efficiency (mpg),-0.06008,0.105577,-0.056425,-0.118598,-0.113195,1.0


In [8]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

print(X)
print(y)

    Car Weight (kg)  Engine Size (L)  Number of Cylinders  Horsepower  \
0               855         3.593258                    7         433   
1              1627         3.390346                    5         229   
2              1487         2.820005                   11         328   
3              3764         3.801203                    6         368   
4              3585         2.239086                   11         372   
..              ...              ...                  ...         ...   
95             1768         3.919993                    8         461   
96             2711         3.574219                    7         402   
97             2977         1.966789                    5         208   
98             2212         2.378990                   11          82   
99             2774         2.199827                    5         357   

    Car Age (years)  
0                16  
1                 5  
2                 3  
3                 1  
4            

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

In [12]:
lr_model = LinearRegression()
dt_model = DecisionTreeRegressor()
rf_model = RandomForestRegressor()

In [13]:
lr_model.fit(X_train, y_train)
dt_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

In [15]:
lr_pred = lr_model.predict(X_test)
dt_pred = dt_model.predict(X_test)
rf_pred = rf_model.predict(X_test)

In [16]:
print(lr_pred)

[25.86960024 25.93743991 24.68205535 28.26671813 29.34097589 25.50588126
 22.7526404  21.17409102 23.64278407 24.84216255 25.34791857 21.0064313
 22.69230307 22.88104393 26.1962871  21.88656526 21.23668581 26.74113672
 22.53350769 27.31650404 25.84418779 25.00093091 24.41582456 25.34483352
 28.26133274 26.12861229 24.29915676 23.41451619 22.85336923 27.67497516]


In [17]:
print(dt_pred)

[27.53492741 12.96400295 29.53085463 28.22922982 26.66467301 12.96400295
 28.1634109  11.54332625 37.92592192 23.06455403 10.64214363 21.24462254
 24.24803183 26.66467301 28.22922982 24.24803183 24.4881326  26.66467301
 16.36498745 16.36498745 11.86006608 12.96400295 11.54332625 37.67930145
 15.14864435 26.66467301 34.25993578 23.06455403 15.87472258 29.53085463]


In [18]:
print(rf_pred)

[25.92787017 26.07066817 27.66228999 29.01227675 27.1025268  26.41911418
 29.14605367 15.75353087 25.92359017 23.04877371 21.1652869  21.05300248
 24.2704694  23.24810273 24.54717867 26.78867799 20.52894714 27.67183294
 18.72501263 24.724536   28.21988933 26.0365036  20.33662669 25.4773326
 29.966906   23.3184759  25.47049507 28.70022522 21.31083748 27.24499967]


In [19]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

print('Linear Regression', mean_squared_error(y_test, lr_pred))
print('Decision Tree Regressor', mean_squared_error(y_test, dt_pred))
print('Random Forest Regressor', mean_squared_error(y_test, rf_pred))

print('\n')

print('Linear Regression', r2_score(y_test, lr_pred))
print('Decision Tree Regressor', r2_score(y_test, dt_pred))
print('Random Forest Regressor', r2_score(y_test, rf_pred))

Linear Regression 75.71152451238781
Decision Tree Regressor 176.26879107789202
Random Forest Regressor 88.58318797644795


Linear Regression -0.08809123818406173
Decision Tree Regressor -1.5332540636633163
Random Forest Regressor -0.2730768705075286


In [20]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

ridge = Ridge()
lasso = Lasso()

param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

ridge_cv = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
lasso_cv = GridSearchCV(lasso, param_grid, cv=5, scoring='neg_mean_squared_error')

ridge_cv.fit(X_train, y_train)
lasso_cv.fit(X_train, y_train)

print("Best alpha for Ridge:", ridge_cv.best_params_)
print("Best cross-validated score (MSE) for Ridge:", ridge_cv.best_score_)
print('\n')
print("Best alpha for Lasso:", lasso_cv.best_params_)
print("Best cross-validated score (MSE) for Lasso:", lasso_cv.best_score_)


Best alpha for Ridge: {'alpha': 100}
Best cross-validated score (MSE) for Ridge: -76.00492035861232


Best alpha for Lasso: {'alpha': 100}
Best cross-validated score (MSE) for Lasso: -72.1224712307799


In [21]:
rid_pred = ridge_cv.predict(X_test)
lasso_pred = lasso_cv.predict(X_test)

print('Ridge', mean_squared_error(y_test, rid_pred))
print('Lasso', mean_squared_error(y_test, lasso_pred))

print('\n')

print('Ridge', r2_score(y_test, rid_pred))
print('Lasso', r2_score(y_test, lasso_pred))

Ridge 72.76703777337912
Lasso 72.27325530407238


Ridge -0.04577442786623065
Lasso -0.03867801312776331


In [22]:
parameter = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],  # These are valid for regressor
    'splitter': ['best', 'random'],
    'max_depth': [1, 2, 3, 4, 5],
    'max_features': ['sqrt', 'log2', None]  # 'auto' is not valid, use None for all features
}

cv = GridSearchCV(dt_model, param_grid=parameter, cv=5, scoring='neg_mean_squared_error')  # For regression, use 'neg_mean_squared_error'

cv.fit(X_train, y_train)

# Best parameters and score
print("Best parameters:", cv.best_params_)
print("Best score (negative MSE):", cv.best_score_)

Best parameters: {'criterion': 'poisson', 'max_depth': 4, 'max_features': 'log2', 'splitter': 'random'}
Best score (negative MSE): -57.84795687597521


In [23]:
pre_dt_pred = cv.predict(X_test)

print('Preprunning', mean_squared_error(y_test, pre_dt_pred))

print('\n')

print('Preprunning', r2_score(y_test, pre_dt_pred))


Preprunning 85.72829518038412


Preprunning -0.23204766316613412
