In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Define the URL for the dataset
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"

# 1. Load the dataset
print("Loading data from:", url)
df = pd.read_csv(url)

# 2. Select only the specified columns
required_columns = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg'
]
df_subset = df[required_columns]

print("\nDataset Head (Subset):")
print(df_subset.head())
print("\nDataset Info (Subset):")
print(df_subset.info())

# 3. EDA on 'fuel_efficiency_mpg'
target_variable = 'fuel_efficiency_mpg'
print(f"\n--- EDA for Target Variable: {target_variable} ---")

# Calculate and print basic statistics and skewness
target_stats = df_subset[target_variable].describe()
print(target_stats)

skewness = df_subset[target_variable].skew()
print(f"\nSkewness of '{target_variable}': {skewness:.3f}")

# Visualize the distribution using a histogram
plt.figure(figsize=(10, 6))
sns.histplot(df_subset[target_variable], kde=True, bins=30, color='indigo')
plt.title(f'Distribution of {target_variable}', fontsize=16)
plt.xlabel('Fuel Efficiency (MPG)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.grid(axis='y', alpha=0.5)
plt.show()

# Conclusion based on the skewness value and plot
if skewness > 0.5:
    print("\nCONCLUSION: The 'fuel_efficiency_mpg' distribution is positively (right) skewed, indicating a **long tail**.")
    print("This suggests that a logarithmic transformation (log(MPG)) might be beneficial for the regression model.")
elif skewness < -0.5:
    print("\nCONCLUSION: The 'fuel_efficiency_mpg' distribution is negatively (left) skewed.")
else:
    print("\nCONCLUSION: The 'fuel_efficiency_mpg' distribution is fairly symmetric.")
%matplotlib inline

In [None]:
# Count the missing values in each column of the selected subset
print("\n--- Missing Value Count per Column ---")
missing_values = df_subset.isnull().sum()
print(missing_values)

# 4. Identify every column that has at least one missing value
missing_column = missing_values[missing_values > 0].index.tolist()

print("\n--- Result ---")
if len(missing_column) == 1:
    print(f"The column with missing values is: '{missing_column[0]}'.")
elif missing_column:
    # Several columns are incomplete: name all of them rather than silently
    # reporting only the first one.
    listed = ", ".join(f"'{col}'" for col in missing_column)
    print(f"The columns with missing values are: {listed}.")
else:
    print("No missing values found in the selected columns.")


In [None]:
import numpy as np

SEED = 42

# Work on an independent copy of the selected columns. Splitting the full `df`
# would carry any other CSV columns into the feature frames, and filling it in
# place would modify the frame that the earlier cells loaded and displayed.
df_model = df_subset.copy()

# Impute missing 'horsepower' values with the column median
# NOTE(review): the median is taken over all rows before splitting, so
# validation/test rows contribute to it -- confirm this is intended.
median_horsepower = df_model['horsepower'].median()
print("Median horsepower for imputation:", median_horsepower)

df_model['horsepower'] = df_model['horsepower'].fillna(median_horsepower)

# 2. Shuffle the dataset
np.random.seed(SEED)
n = len(df_model)
n_val = int(0.2 * n)  # 20% for validation
n_test = int(0.2 * n) # 20% for testing
n_train = n - n_val - n_test # remainder (about 60%) for training

# Create shuffled row positions
idx = np.arange(n)
np.random.shuffle(idx)

# Apply shuffled positions to the DataFrame
df_shuffled = df_model.iloc[idx]

# 3. Split the data into consecutive slices of the shuffled frame
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
df_test = df_shuffled.iloc[n_train + n_val:].copy()

# The three parts must account for every row exactly once.
assert len(df_train) + len(df_val) + len(df_test) == n

# 4. Create the target variable arrays for the train/val/test sets
# The raw target is kept here; a log transformation, if wanted, would be
# applied to y_train / y_val / y_test.
y_train = df_train['fuel_efficiency_mpg'].values
y_val = df_val['fuel_efficiency_mpg'].values
y_test = df_test['fuel_efficiency_mpg'].values

# Remove the target variable from the feature sets (X)
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

# 5. Output confirmation
print("--- Dataset Preparation Summary ---")
print(f"Total rows: {n}")
print(f"Train set size (60%): {len(df_train)}")
print(f"Validation set size (20%): {len(df_val)}")
print(f"Test set size (20%): {len(df_test)}")
print("-" * 35)

# Verify no missing values remain in the imputed training column
print(f"'horsepower' missing values in train set after imputation: {df_train['horsepower'].isnull().sum()}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# --- Configuration ---
# CSV location, seed for the shuffle, and the columns this cell works with.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
SEED = 42
REQUIRED_COLUMNS = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']
# Column predicted by the regression; it is also the last entry above.
TARGET = 'fuel_efficiency_mpg'

# --- Linear Regression Functions from Lectures ---

def train_linear_regression(X, y):
    """Fit a least-squares linear model with the normal equation.

    A column of ones is prepended to ``X`` so that the first fitted
    coefficient plays the role of the intercept.

    Args:
        X: Feature array; ``X.shape[0]`` is taken as the number of rows.
        y: Target values, one per row of ``X``.

    Returns:
        Tuple ``(w0, w)``: the intercept and the remaining coefficients, in
        the column order of ``X``.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    # w = (D^T D)^-1 D^T y, evaluated left to right
    gram = design.T @ design
    coefficients = (np.linalg.inv(gram) @ design.T) @ y

    intercept, weights = coefficients[0], coefficients[1:]
    return intercept, weights

def rmse(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    Args:
        y: Observed target values.
        y_pred: Predicted values, subtracted element-wise from ``y``.

    Returns:
        Square root of the mean of the squared differences ``y - y_pred``.
    """
    residuals = y - y_pred
    return np.sqrt((residuals ** 2).mean())

# --- Data Preparation (Reload and Split) ---

# Reload the CSV and keep only the configured columns, so this cell does not
# rely on frames left behind by earlier cells.
df = pd.read_csv(url)
df = df[REQUIRED_COLUMNS]

# Split sizes: 20% validation, 20% test, the remainder for training
n = len(df)
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - n_val - n_test

# Seeded shuffle of the row positions
np.random.seed(SEED)
idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx].reset_index(drop=True)

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
df_test = df_shuffled.iloc[n_train + n_val:].copy()

# Extract target variables (y)
y_train = df_train[TARGET].values
y_val = df_val[TARGET].values
y_test = df_test[TARGET].values

# Drop the target from feature dataframes (X)
del df_train[TARGET]
del df_val[TARGET]
del df_test[TARGET]

# Column to impute; its mean is computed on the training part only, so the
# validation rows do not influence the fill value.
MISSING_COL = 'horsepower'
train_mean = df_train[MISSING_COL].mean()

print(f"Calculated training mean for 'horsepower': {round(train_mean, 2)}\n")

# --- Imputation Strategy 1: Fill with 0 ---

print("--- Strategy 1: Impute with 0 ---")
X_train_0 = df_train.fillna(0).values
X_val_0 = df_val.fillna(0).values

# Train the model
w0_0, w_0 = train_linear_regression(X_train_0, y_train)

# Predict on validation set
y_pred_0 = w0_0 + X_val_0.dot(w_0)

# Evaluate RMSE
rmse_0 = rmse(y_val, y_pred_0)
print(f"RMSE (Impute with 0): {rmse_0:.4f}")
print(f"Rounded RMSE (Impute with 0): {round(rmse_0, 2)}")

# --- Imputation Strategy 2: Fill with Training Mean ---

print("\n--- Strategy 2: Impute with Training Mean ---")
# Restrict the fill to MISSING_COL: a frame-wide fillna(train_mean) would write
# the horsepower mean into missing cells of every other column as well.
X_train_mean = df_train.fillna({MISSING_COL: train_mean}).values
X_val_mean = df_val.fillna({MISSING_COL: train_mean}).values

# Train the model
w0_mean, w_mean = train_linear_regression(X_train_mean, y_train)

# Predict on validation set
y_pred_mean = w0_mean + X_val_mean.dot(w_mean)

# Evaluate RMSE
rmse_mean = rmse(y_val, y_pred_mean)
print(f"RMSE (Impute with Mean): {rmse_mean:.4f}")
print(f"Rounded RMSE (Impute with Mean): {round(rmse_mean, 2)}")

# The strategies are compared on the RMSE rounded to two decimals, so
# differences below that precision count as a tie.
print("\n--- Final Comparison ---")
if round(rmse_0, 2) < round(rmse_mean, 2):
    print("Option 'With 0' gives better RMSE.")
elif round(rmse_mean, 2) < round(rmse_0, 2):
    print("Option 'With mean' gives better RMSE.")
else:
    print("Both options are equally good.")


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# --- Configuration ---
# CSV location, shuffle seed, working columns, and the ridge strengths to try.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
SEED = 42
REQUIRED_COLUMNS = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]
TARGET = 'fuel_efficiency_mpg'
# Candidate values for r, listed from weakest to strongest regularization.
REGULARIZATION_PARAMS = [
    0,
    0.01,
    0.1,
    1,
    5,
    10,
    100,
]

# --- Linear Regression Functions ---

def train_linear_regression_reg(X, y, r):
    """Fit a ridge-style linear model with the normal equation.

    A column of ones is prepended to ``X`` for the intercept. ``r`` is added
    to every diagonal entry of ``D^T D`` except the intercept's, so the
    intercept itself is not penalised.

    Args:
        X: Feature array; ``X.shape[0]`` is taken as the number of rows.
        y: Target values, one per row of ``X``.
        r: Regularization strength added to the diagonal.

    Returns:
        Tuple ``(w0, w)``: the intercept and the remaining coefficients.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])
    gram = design.T @ design

    # r on the diagonal, with the intercept position zeroed out
    penalty = np.eye(gram.shape[0]) * r
    penalty[0, 0] = 0

    # w = (D^T D + penalty)^-1 D^T y, evaluated left to right
    coefficients = (np.linalg.inv(gram + penalty) @ design.T) @ y

    intercept, weights = coefficients[0], coefficients[1:]
    return intercept, weights

def rmse(y, y_pred):
    """Compute the root mean squared error of ``y_pred`` against ``y``.

    Args:
        y: Observed target values.
        y_pred: Predicted values, subtracted element-wise from ``y``.

    Returns:
        ``sqrt(mean((y - y_pred) ** 2))``.
    """
    squared_diff = (y - y_pred) ** 2
    return np.sqrt(squared_diff.mean())

# --- Data Preparation (Load, Split, Impute) ---

# Read the CSV and restrict it to the configured columns
df = pd.read_csv(url)[REQUIRED_COLUMNS]

# Partition sizes: 20% validation, 20% test, the remainder for training
n = len(df)
n_val = n_test = int(0.2 * n)
n_train = n - n_val - n_test

# Seeded shuffle of the row positions
np.random.seed(SEED)
idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx].reset_index(drop=True)

# Consecutive slices of the shuffled frame: train | validation | test
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
df_test = df_shuffled.iloc[n_train + n_val:].copy()

# pop() hands back the target column and removes it from the feature frame
y_train = df_train.pop(TARGET).values
y_val = df_val.pop(TARGET).values
y_test = df_test.pop(TARGET).values

# Replace every missing feature value with 0
df_train, df_val, df_test = df_train.fillna(0), df_val.fillna(0), df_test.fillna(0)

# Plain numpy arrays for the training routine
X_train, X_val = df_train.values, df_val.values

# --- Tuning the Regularization Parameter (r) ---

results = []
best_rmse, best_r = float('inf'), -1

print("--- Ridge Regression Tuning Results ---")
for r in REGULARIZATION_PARAMS:
    # Fit on the training part, score on the validation part
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    rounded_score = round(score, 2)

    results.append({'r': r, 'rmse': rounded_score})
    print(f"r = {r:<5}: RMSE = {rounded_score}")

    # Lexicographic comparison: a lower rounded RMSE wins, and on an exact
    # tie the smaller r wins.
    if (rounded_score, r) < (best_rmse, best_r):
        best_rmse, best_r = rounded_score, r

print("-" * 40)
print(f"The best RMSE is {best_rmse}, achieved with the smallest r of: {best_r}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# --- Configuration ---
# CSV location, the shuffle seeds to compare, and the working columns.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
SEEDS = list(range(10))  # 0 through 9
REQUIRED_COLUMNS = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]
TARGET = 'fuel_efficiency_mpg'

# --- Linear Regression Functions ---

def train_linear_regression(X, y):
    """Solve unregularized least squares through the normal equation.

    Args:
        X: Feature array; a leading column of ones is added internally for
            the intercept.
        y: Target values, one per row of ``X``.

    Returns:
        Tuple ``(w0, w)``: the intercept followed by the per-feature
        coefficients.
    """
    with_bias = np.column_stack([np.ones(X.shape[0]), X])

    # w = (A^T A)^-1 A^T y, evaluated left to right
    normal_matrix = with_bias.T @ with_bias
    solution = (np.linalg.inv(normal_matrix) @ with_bias.T) @ y

    return solution[0], solution[1:]

def rmse(y, y_pred):
    """Root mean squared error between observed and predicted values.

    Args:
        y: Observed target values.
        y_pred: Predicted values, subtracted element-wise from ``y``.

    Returns:
        Square root of the mean squared difference.
    """
    diff = y - y_pred
    mean_sq = (diff ** 2).mean()
    return np.sqrt(mean_sq)

# --- Main Analysis Loop ---

# Load once; every seed below reshuffles a fresh copy of this frame
df_original = pd.read_csv(url)[REQUIRED_COLUMNS]

scores = []

print("--- RMSE Scores by Seed ---")
for seed in SEEDS:
    # Fresh copy and split sizes for this run
    df = df_original.copy()
    n = len(df)
    n_val = n_test = int(0.2 * n)
    n_train = n - n_val - n_test

    # Seed-specific permutation of the row positions
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)
    df_shuffled = df.iloc[idx].reset_index(drop=True)

    # Only the train and validation slices are needed here
    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()

    # pop() hands back the target column and removes it from the features
    y_train = df_train.pop(TARGET).values
    y_val = df_val.pop(TARGET).values

    # Missing feature values become 0
    X_train = df_train.fillna(0).values
    X_val = df_val.fillna(0).values

    # Fit without regularization and score on the validation slice
    w0, w = train_linear_regression(X_train, y_train)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)

    scores.append(score)
    print(f"Seed {seed}: RMSE = {score:.4f}")

# --- Final Calculation ---

# Spread of the validation RMSE across the seeds
std_score = np.asarray(scores).std()
rounded_std = round(std_score, 3)

print("-" * 30)
print(f"RMSE Scores: {np.round(scores, 4)}")
print(f"Standard Deviation (STD) of RMSE scores: {std_score:.5f}")
print(f"Rounded STD: {rounded_std}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# --- Configuration ---
# CSV location, shuffle seed, ridge strength, and the working columns.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
SEED = 9
REGULARIZATION_R = 0.001
REQUIRED_COLUMNS = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]
TARGET = 'fuel_efficiency_mpg'

# --- Linear Regression Functions ---

def train_linear_regression_reg(X, y, r):
    """Train a linear model with an L2 penalty on the feature weights.

    The intercept is handled by prepending a column of ones to ``X``; its
    diagonal entry in the penalty matrix is set to zero, so only the feature
    weights are shrunk.

    Args:
        X: Feature array; ``X.shape[0]`` is taken as the number of rows.
        y: Target values, one per row of ``X``.
        r: Regularization strength added to the diagonal.

    Returns:
        Tuple ``(w0, w)``: the intercept and the per-feature coefficients.
    """
    with_bias = np.column_stack([np.ones(X.shape[0]), X])
    normal_matrix = with_bias.T @ with_bias

    # Diagonal penalty of r everywhere except the intercept position
    ridge_term = np.eye(normal_matrix.shape[0]) * r
    ridge_term[0, 0] = 0

    # w = (A^T A + ridge_term)^-1 A^T y, evaluated left to right
    solution = (np.linalg.inv(normal_matrix + ridge_term) @ with_bias.T) @ y

    return solution[0], solution[1:]

def rmse(y, y_pred):
    """Measure prediction error as the root of the mean squared difference.

    Args:
        y: Observed target values.
        y_pred: Predicted values, subtracted element-wise from ``y``.

    Returns:
        ``sqrt(mean((y - y_pred) ** 2))``.
    """
    deviation = y - y_pred
    return np.sqrt((deviation ** 2).mean())

# --- Data Preparation (Load, Split, Combine) ---

# Read the CSV and restrict it to the configured columns
df = pd.read_csv(url)[REQUIRED_COLUMNS]

# Partition sizes: 20% validation, 20% test, the remainder for training
n = len(df)
n_val = n_test = int(0.2 * n)
n_train = n - n_val - n_test

# 1. Seeded shuffle of the row positions
np.random.seed(SEED)
idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx].reset_index(drop=True)

# Train and validation rows are kept together; the tail is the test set
df_train_val = df_shuffled.iloc[:n_train + n_val].copy()
df_test = df_shuffled.iloc[n_train + n_val:].copy()

# pop() hands back the target column and removes it from the feature frame
y_train_val = df_train_val.pop(TARGET).values
y_test = df_test.pop(TARGET).values

# 2. Missing feature values become 0
X_train_val = df_train_val.fillna(0).values
X_test = df_test.fillna(0).values

# --- Training and Evaluation ---

print(f"--- Final Evaluation (Seed {SEED}, r={REGULARIZATION_R}) ---")

# Fit on train + validation, then score once on the held-out test rows
w0, w = train_linear_regression_reg(X_train_val, y_train_val, r=REGULARIZATION_R)
y_pred_test = w0 + X_test.dot(w)

rmse_test = rmse(y_test, y_pred_test)
rounded_rmse = round(rmse_test, 2)

print(f"RMSE on Test Dataset: {rmse_test:.4f}")
print(f"Rounded RMSE on Test Dataset: {rounded_rmse}")
