# Machine Learning Zoomcamp

## Homework 1

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
# Getting the data
# Read the car fuel-efficiency CSV over HTTP from the alexeygrigorev/datasets
# GitHub repository into the working frame `df` used by the cells below.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
df = pd.read_csv(url)

# Preview the first rows. A bare last expression uses the notebook's table
# display, whereas print() wraps the wide frame into several plain-text chunks.
df.head()

   engine_displacement  num_cylinders  horsepower  vehicle_weight  \
0                  170            3.0       159.0     3413.433759   
1                  130            5.0        97.0     3149.664934   
2                  170            NaN        78.0     3079.038997   
3                  220            4.0         NaN     2542.392402   
4                  210            1.0       140.0     3460.870990   

   acceleration  model_year  origin fuel_type         drivetrain  num_doors  \
0          17.7        2003  Europe  Gasoline    All-wheel drive        0.0   
1          17.8        2007     USA  Gasoline  Front-wheel drive        0.0   
2          15.1        2018  Europe  Gasoline  Front-wheel drive        0.0   
3          20.2        2009     USA    Diesel    All-wheel drive        2.0   
4          14.4        2009  Europe  Gasoline    All-wheel drive        2.0   

   fuel_efficiency_mpg  
0            13.231729  
1            13.688217  
2            14.246341  
3         

###  Question 1. Missing values 

In [34]:
# Number of missing entries per column. Recorded answer: 'horsepower'.
# NOTE(review): the output lists NaNs in several columns, not only horsepower;
# presumably horsepower is the one among the homework's answer options - confirm.
df.isna().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

### Question 2. Median for horsepower

In [35]:
# Median of the horsepower column. Answer: 149
df.loc[:, 'horsepower'].median()

np.float64(149.0)

### Question 3. Filling NAs

In [36]:
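# Missing numeric values are filled with the column mean inside the modelling
# pipeline (SimpleImputer(strategy="mean") in the Question 4 cell), so the
# frame is deliberately left untouched here.
# Whole-frame alternative, kept for reference only:
# df = df.fillna(df.mean(numeric_only=True))
# df.isnull().sum()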

### Question 4. Best regularization

In [37]:
# Target vector: the fuel-efficiency column.
y = df.loc[:, 'fuel_efficiency_mpg']
# Feature matrix: every remaining column (df itself is not modified).
X = df.drop('fuel_efficiency_mpg', axis='columns')

In [38]:
from sklearn.model_selection import train_test_split, GridSearchCV , cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

# Hold out 20% of the rows for the final check; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Column groups routed to the two preprocessing branches.
cat_cols = ['origin', 'fuel_type', 'drivetrain']
num_cols = [
    'engine_displacement',
    'num_cylinders',
    'horsepower',
    'vehicle_weight',
    'acceleration',
    'model_year',
    'num_doors',
]

# Numeric branch: fill NaNs with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode, dropping the first level of each feature.
# NOTE(review): with drop='first', a category ignored as unknown is encoded as all
# zeros, which looks the same as the dropped baseline level - verify this is acceptable.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', drop='first')

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

# Preprocessing and the Ridge regressor are chained so that every CV fold
# fits the imputer, scaler and encoder on its own training part only.
model = Ridge()
pipe = Pipeline([('preprocessor', preprocessor), ('model', model)])

# Candidate regularization strengths, addressed through the 'model' step.
# NOTE(review): alpha=0 is an unregularized fit; scikit-learn's Ridge docs advise
# using LinearRegression for that case - confirm 0 is wanted in the grid.
param_grid = {'model__alpha': [0, 0.01, 1, 10, 100]}

# 5-fold grid search on the training split, ranked by R^2.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='r2', cv=5)
grid.fit(X_train, y_train)

print("Best alpha (regularization):", grid.best_params_)
print("Best CV R^2 score:", grid.best_score_)
print("Test R^2 score:", grid.score(X_test, y_test))

Best alpha (regularization): {'model__alpha': 0.01}
Best CV R^2 score: 0.9759925722369773
Test R^2 score: 0.9757064038178345


### Question 5. RMSE standard deviation and Question 6. Evaluation on the test set

In [None]:
# Question 5: spread of the cross-validated RMSE
# The best pipeline found by the grid search is re-scored with 5-fold
# cross-validation on the training data.
best_pipe = grid.best_estimator_
cv_mse_scores = cross_val_score(
    estimator=best_pipe,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
# The scorer yields negated MSE values, so flip the sign before the square root.
cv_rmse_scores = np.sqrt(np.negative(cv_mse_scores))
print("Question 5 - RMSE std:", np.std(cv_rmse_scores))
# NOTE(review): hard-coded answer; the recorded run printed ~0.0071 -
# presumably the nearest homework option, confirm.
print("Answer: 0.006")

# Question 6: mean squared error on the held-out test split
y_test_pred = grid.predict(X_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
print("\nQuestion 6 - Test MSE:", test_mse)
# NOTE(review): hard-coded answer; the recorded run printed ~0.156 - confirm.
print("Answer: 0.15")

Question 5 - RMSE std: 0.007125934824150676
Answer: 0.006

Question 6 - Test MSE: 0.15643959513865643
Answer: 0.15
