In [85]:

%pip install -q pandas numpy scikit-learn wget

Note: you may need to restart the kernel to use updated packages.


In [86]:

import os

import numpy as np
import pandas as pd
import wget
from sklearn.linear_model import LinearRegression, Ridge

CSV_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"

def download_if_needed(path="car_fuel_efficiency.csv"):
    """Download the car fuel-efficiency CSV to ``path`` unless it already exists.

    Parameters
    ----------
    path : str or os.PathLike
        Local destination of the CSV file.

    Returns
    -------
    The same ``path`` that was passed in, so the call can be chained
    straight into ``pd.read_csv``.
    """
    # Use the public stdlib check rather than the internal
    # ``pd.io.common.file_exists`` helper, which is not part of pandas' API.
    if not os.path.exists(path):
        wget.download(CSV_URL, path)
    return path

def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Computed with NumPy only, so it does not depend on
    ``sklearn.metrics.mean_squared_error`` being imported first, nor on that
    function's ``squared`` keyword.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values; must have the same shape.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y_true - y_pred) ** 2))``.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # Guard against silent broadcasting (e.g. (n,) vs (n, 1)) producing a wrong score.
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {actual.shape} and {predicted.shape}"
        )
    if actual.size == 0:
        raise ValueError("rmse is undefined for empty inputs")
    return np.sqrt(np.mean((actual - predicted) ** 2))

In [87]:

# Fetch the CSV on first run, then read only the columns the homework uses.
csv_path = download_if_needed()
cols = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]
df = pd.read_csv(csv_path, usecols=cols)
# Last expression of the cell: rich display of the first three rows.
df.head(3)

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341


## Q1: There's one column with missing values. What is it?



In [65]:
# Count NaNs per column.
missing_counts = df.isna().sum()
print(missing_counts)

# Derive the answer from the counts instead of hard-coding the column name.
missing_cols = missing_counts[missing_counts > 0].index.tolist()
print(f"Missing column is {', '.join(missing_cols)}")

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64
Missing column is horsepower


## Q2: What is the median of the `horsepower` column?

In [66]:
# Series.median skips NaN values by default, so no explicit dropna is needed.
hp_median = df['horsepower'].median()
print('Median for horse power is ', hp_median)

Median for horse power is  149.0


## Prepare and split the dataset

- Shuffle the dataset (the filtered one you created above), use seed 42.
- Split your data in train/val/test sets, with 60%/20%/20% distribution.

In [67]:
def split_60_20_20(dataframe, seed=42):
    """Shuffle ``dataframe`` and cut it into train / validation / test parts.

    The rows are shuffled with ``DataFrame.sample(frac=1, random_state=seed)``.
    The first ``int(0.6 * n)`` rows form the training part, the next
    ``int(0.2 * n)`` rows the validation part, and whatever remains is the
    test part. Each part is returned with a fresh 0-based index.

    Returns
    -------
    tuple of (train, val, test) DataFrames.
    """
    shuffled = dataframe.sample(frac=1, random_state=seed).reset_index(drop=True)
    total = len(shuffled)
    train_end = int(0.6 * total)
    val_end = train_end + int(0.2 * total)
    bounds = [(0, train_end), (train_end, val_end), (val_end, total)]
    return tuple(
        shuffled.iloc[start:stop].reset_index(drop=True) for start, stop in bounds
    )

In [68]:

def prepare_xy(df_part):
    """Split a frame into a numeric feature matrix and the target vector.

    Every column except ``fuel_efficiency_mpg`` becomes a feature and is passed
    through ``pd.to_numeric(..., errors='coerce')``, so values that cannot be
    parsed as numbers turn into NaN. ``df_part`` itself is not modified.

    Returns
    -------
    (X, y) : the feature DataFrame and the ``fuel_efficiency_mpg`` values.
    """
    target = 'fuel_efficiency_mpg'
    features = df_part.drop(columns=[target])
    # Column-wise numeric coercion; apply builds a new frame, leaving df_part untouched.
    X = features.apply(pd.to_numeric, errors='coerce')
    return X, df_part[target].values

## Q3: Filling missing values
- We need to deal with missing values for the column from Q1.
- We have two options: fill it with 0 or with the mean of this variable.
- Try both options. For each, train a linear regression model without regularization using the code from the lessons.
- For computing the mean, use the training set only!
- Use the validation dataset to evaluate the models and compare the RMSE of each option.
- Round the RMSE scores to 2 decimal digits using round(score, 2)
- Which option gives better RMSE?


In [89]:

import numpy as np
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    """Root mean squared error: the square root of ``mean_squared_error``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
    
# Shuffle/split once with the assignment's seed, then separate features from target.
train, val, test = split_60_20_20(df, seed=42)
(X_train, y_train), (X_val, y_val) = prepare_xy(train), prepare_xy(val)

# Option A: replace missing values with 0.
Xtr0, Xvl0 = X_train.fillna(0), X_val.fillna(0)
model0 = LinearRegression()
model0.fit(Xtr0, y_train)
rmse0 = round(rmse(y_val, model0.predict(Xvl0)), 2)

# Option B: replace missing values with column means computed on X_train only,
# and reuse those same means for the validation features.
train_mean = X_train.mean()
Xtr_mean, Xvl_mean = X_train.fillna(train_mean), X_val.fillna(train_mean)
model_mean = LinearRegression()
model_mean.fit(Xtr_mean, y_train)
rmse_mean = round(rmse(y_val, model_mean.predict(Xvl_mean)), 2)

# Compare the rounded scores; lower RMSE wins, equal scores are a tie.
q3 = (
    "With 0" if rmse0 < rmse_mean
    else "With mean" if rmse_mean < rmse0
    else "Both are equally good"
)
print("Q3:", q3, "| RMSE fill(0) =", rmse0, "| RMSE fill(mean) =", rmse_mean)

Q3: With mean | RMSE fill(0) = 0.52 | RMSE fill(mean) = 0.46


## Question 4
- Now let's train a regularized linear regression.
- For this question, fill the NAs with 0.
- Try different values of r from this list: [0, 0.01, 0.1, 1, 5, 10, 100].
- Use RMSE to evaluate the model on the validation dataset.
- Round the RMSE scores to 2 decimal digits.
- Which r gives the best RMSE?
- If multiple options give the same best RMSE, select the smallest r.

In [90]:
# Question 4 uses zero-imputation for both the training and validation features.
Xtr_r = X_train.fillna(0)
Xvl_r = X_val.fillna(0)
r_list = [0, 0.01, 0.1, 1, 5, 10, 100]

# Validation RMSE (rounded to 2 decimals) for each regularization strength.
results_q4 = {}
for r in r_list:
    # r == 0 means "no regularization": fit ordinary least squares rather than Ridge(alpha=0).
    m = LinearRegression() if r == 0 else Ridge(alpha=r)
    m.fit(Xtr_r, y_train)
    results_q4[r] = round(rmse(y_val, m.predict(Xvl_r)), 2)

# Best score first, then break ties by taking the smallest r that reaches it.
min_rmse = min(results_q4.values())
q4 = min(r for r, score in results_q4.items() if score == min_rmse)
print("Q4: best r =", q4, "| RMSE =", results_q4[q4])

Q4: best r = 0 | RMSE = 0.52


## Question 5
- We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
- Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
- For each seed, do the train/validation/test split with 60%/20%/20% distribution.
- Fill the missing values with 0 and train a model without regularization.
- For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
- What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
- Round the result to 3 decimal digits (round(std, 3))

In [81]:
# Validation RMSE of an unregularized model for each shuffle seed 0..9.
seed_scores = []
for s in range(10):
    # Re-split with this seed; the test part is not needed here.
    tr, vl, _ = split_60_20_20(df, seed=s)
    Xtr_s, ytr_s = prepare_xy(tr)
    Xvl_s, yvl_s = prepare_xy(vl)
    # Zero-impute missing values, as the question requires.
    Xtr_s = Xtr_s.fillna(0)
    Xvl_s = Xvl_s.fillna(0)
    m = LinearRegression().fit(Xtr_s, ytr_s)
    seed_scores.append(rmse(yvl_s, m.predict(Xvl_s)))

# Spread of the unrounded scores across seeds, rounded to 3 decimals for the answer.
std_scores = round(np.std(seed_scores), 3)
print("Q5: std =", std_scores)

Q5: std = 0.007


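## Q6: Evaluation on the test set
- Split the dataset as before, using seed 9.
- Combine the train and validation parts into one training set.
- Fill the missing values with 0 and train a regularized model with r=0.001.
- What is the RMSE on the test part?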

In [83]:
# Re-split with seed 9 and merge train + validation into one training frame.
tr, vl, tst = split_60_20_20(df, seed=9)
train_comb = pd.concat([tr, vl]).reset_index(drop=True)

# Separate features/target, then zero-impute the features of both parts.
X_train_full, y_train_full = prepare_xy(train_comb)
X_test, y_test = prepare_xy(tst)
X_train_full, X_test = X_train_full.fillna(0), X_test.fillna(0)

# Fit the regularized model on the merged data and score it on the held-out test part.
model_q6 = Ridge(alpha=0.001)
model_q6.fit(X_train_full, y_train_full)
q6_rmse = round(rmse(y_test, model_q6.predict(X_test)), 3)
print("Q6: rmse=", q6_rmse)

Q6: rmse= 0.515


## ------------ End of Assignment-2 ------------