<a href="https://colab.research.google.com/github/arbeeorlar/data_science/blob/main/ml_zoom_camp_week_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
# to load and manipulate data
import pandas as pd
import numpy as np

# to visualize data
import matplotlib.pyplot as plt
import seaborn as sns

# to split the data into train and test sets
from sklearn.model_selection import train_test_split

# to build a linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

# to check a regression model's performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [4]:
# Location of the car fuel-efficiency CSV; pandas reads it directly over HTTP.
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
original_data = pd.read_csv(DATA_URL)

In [5]:
# Work on an independent (deep) copy so the downloaded frame stays untouched.
data = original_data.copy(deep=True)

In [7]:
# Dimensions of the working frame as (rows, columns).
data.shape

(9704, 11)

In [8]:
# Preview the first rows of the working frame.
data.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [9]:
# Preview the last rows of the working frame.
data.tail()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.52739,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551
9703,270,3.0,140.0,2908.043477,14.7,2005,Asia,Diesel,All-wheel drive,-1.0,14.884467


In [12]:
# Keep only the four predictors used for modelling plus the target column.
selected_columns = [
    "engine_displacement",
    "horsepower",
    "vehicle_weight",
    "model_year",
    "fuel_efficiency_mpg",
]
data = data[selected_columns]

In [13]:
# Per-column dtypes and non-null counts; used here to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   horsepower           8996 non-null   float64
 2   vehicle_weight       9704 non-null   float64
 3   model_year           9704 non-null   int64  
 4   fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(3), int64(2)
memory usage: 379.2 KB


In [14]:
# Summary statistics, transposed so that each column of `data` becomes one row.
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
engine_displacement,9704.0,199.708368,49.455319,10.0,170.0,200.0,230.0,380.0
horsepower,8996.0,149.657292,29.879555,37.0,130.0,149.0,170.0,271.0
vehicle_weight,9704.0,3001.280993,497.89486,952.681761,2666.248985,2993.226296,3334.957039,4739.077089
model_year,9704.0,2011.484027,6.659808,2000.0,2006.0,2012.0,2017.0,2023.0
fuel_efficiency_mpg,9704.0,14.985243,2.556468,6.200971,13.267459,15.006037,16.707965,25.967222


In [15]:
#data = data.sample(frac=1, random_state=42).reset_index(drop=True)
# Since we are using the scikit-learn library, we do not need a manual shuffle (train_test_split shuffles by default).

In [20]:
# Separate the regression target from the predictor columns.
target_column = "fuel_efficiency_mpg"
y = data[target_column]
X = data.drop(columns=[target_column])

In [19]:
# 60% / 20% / 20% train / validation / test split, done in two passes:
#   1. hold out 20% of all rows as the test set;
#   2. take 25% of the remaining 80% (= 20% overall) as the validation set.
split_seed = 42
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=split_seed
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=split_seed
)

In [26]:
# Option 1: impute missing horsepower with 0.
# `assign` returns new frames, so X_train / X_val themselves are left unchanged.
X_train_0 = X_train.assign(horsepower=X_train["horsepower"].fillna(0))
X_val_0 = X_val.assign(horsepower=X_val["horsepower"].fillna(0))

# Fit ordinary least squares on the zero-imputed training features.
model_0 = LinearRegression().fit(X_train_0, y_train)

# Score the fitted model on the validation split.
y_pred_0 = model_0.predict(X_val_0)
mse_0 = mean_squared_error(y_val, y_pred_0)
print("MSE (filled with 0):", mse_0)

MSE (filled with 0): 0.28478968453018466


In [28]:
# Option 2: impute missing horsepower with the mean.
# The mean comes from the training split only, and the same value is reused for validation.
mean_value = X_train["horsepower"].mean()

# `assign` returns new frames, so X_train / X_val themselves are left unchanged.
X_train_mean = X_train.assign(horsepower=X_train["horsepower"].fillna(mean_value))
X_val_mean = X_val.assign(horsepower=X_val["horsepower"].fillna(mean_value))

# Fit ordinary least squares on the mean-imputed training features.
model_mean = LinearRegression().fit(X_train_mean, y_train)

# Score the fitted model on the validation split.
y_pred_mean = model_mean.predict(X_val_mean)
mse_mean = mean_squared_error(y_val, y_pred_mean)
print("MSE (filled with mean):", mse_mean)

MSE (filled with mean): 0.2177709510550316


In [29]:

# Validation RMSE for each imputation strategy, rounded to two decimals:
# zero-filled horsepower first, then mean-filled horsepower.
rmse_0 = round(np.sqrt(mean_squared_error(y_val, y_pred_0)), 2)
rmse_mean = round(np.sqrt(mean_squared_error(y_val, y_pred_mean)), 2)

print("RMSE (filled with 0):", rmse_0)
print("RMSE (filled with mean):", rmse_mean)

RMSE (filled with 0): 0.53
RMSE (filled with mean): 0.47


In [36]:
# Zero-fill every NaN in the train / validation features for the cells that follow.
# NOTE: this rebinds X_train / X_val, so an earlier cell that imputes from these
# splits (e.g. the mean-fill cell) will see already-filled data if re-run afterwards.
X_train, X_val = X_train.fillna(0), X_val.fillna(0)

In [38]:
# Sweep the regularization strength r and record the validation RMSE for each value.
# Expects X_train / X_val to contain no NaNs (neither estimator used here accepts them).
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_scores = {}

for r in r_values:
    # scikit-learn advises against Ridge(alpha=0) for numerical reasons: with no
    # penalty the objective is plain least squares, so use LinearRegression there.
    model = LinearRegression() if r == 0 else Ridge(alpha=r)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    # Scores are stored already rounded to two decimals.
    rmse_scores[r] = round(rmse, 2)

# Display results
for r, score in rmse_scores.items():
    print(f"r = {r}: RMSE = {score}")

r = 0: RMSE = 0.53
r = 0.01: RMSE = 0.53
r = 0.1: RMSE = 0.53
r = 1: RMSE = 0.53
r = 5: RMSE = 0.53
r = 10: RMSE = 0.53
r = 100: RMSE = 0.53
