# Multiple Linear Regression.

In [None]:
# Importing Libraries
# For Data Handling.
import numpy as np
import pandas as pd
# For Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# For Modeling, Evaluation and Preprocessing.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## EDA.

Loading the Dataset

In [None]:
# Load the CSV (relative path, resolved against the working directory)
# and preview the first rows to confirm the read succeeded.
csv_path = "ToyotaCorolla - MLR.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head()

Dataset Information

In [None]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()

Exploratory Visualizations.

In [None]:
# Histogram (30 bins) of the target column `Price` with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(data=df, x='Price', bins=30, kde=True, color='steelblue', ax=ax)
ax.set_title('Distribution of Car Prices')
ax.set_xlabel('Price')
plt.show()

In [None]:
# Box plot of Price grouped by fuel type.
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue`; confirm the installed version before changing the call.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x='Fuel_Type', y='Price', palette='Set2', ax=ax)
ax.set_title('Price by Fuel Type')
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
corr_matrix = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Give the age column a shorter name; later cells refer to it as "Age".
df = df.rename(columns={"Age_08_04": "Age"})

In [None]:
# Pairwise scatter plots (histograms on the diagonal) for the price and four predictors.
pairplot_columns = ["Price", "Age", "KM", "HP", "Weight"]
sns.pairplot(df.loc[:, pairplot_columns])
plt.show()

Data Pre-processing

In [None]:
# Fix impossible engine sizes: any `cc` above the bound is overwritten
# with the replacement value.
cc_upper_bound = 3000
cc_replacement = 1600
implausible_cc = df['cc'] > cc_upper_bound
df.loc[implausible_cc, 'cc'] = cc_replacement

In [None]:
# Separate the target column from the predictors.
target_column = 'Price'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Preprocessing: standardize the numeric columns and one-hot encode the fuel type
# (the first category is dropped to avoid a redundant dummy column).
numeric_features = ['Age', 'KM', 'Weight', 'HP', 'Automatic', 'cc', 'Doors']
categorical_features = ['Fuel_Type']

scaling_step = ('num', StandardScaler(), numeric_features)
encoding_step = ('cat', OneHotEncoder(drop='first'), categorical_features)
preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

## Train-Test Split.

Splitting the data into 80% training and 20% test sets.

In [None]:
# Hold out a fixed fraction of the rows for testing; the seed makes the split repeatable.
test_fraction = 0.2
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_fraction,
    random_state=split_seed,
)

## Model Building.

Model 1: Basic Linear Regression

In [None]:
# Baseline: preprocessing followed by ordinary least squares.
linear_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model1 = Pipeline(steps=linear_steps)
model1.fit(X_train, y_train)

Model 2: Feature Engineering

In [None]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Add an ``Age`` x ``KM`` interaction column to a DataFrame.

    Inheriting from ``BaseEstimator`` and ``TransformerMixin`` supplies
    ``get_params``/``set_params`` and ``fit_transform``, so a pipeline that
    contains this step can be cloned, cross-validated and grid-searched like
    any other scikit-learn estimator.
    """

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with an ``Age_KM_interaction`` column appended.

        ``X`` must support ``.copy()`` and column access for ``'Age'`` and
        ``'KM'``. The input itself is left unmodified.
        """
        X = X.copy()  # never mutate the caller's frame
        X['Age_KM_interaction'] = X['Age'] * X['KM']
        return X

In [None]:
# The shared `preprocessor` selects only `numeric_features` and
# `categorical_features`, and ColumnTransformer drops every other column by
# default (remainder='drop'). Feeding it the engineered frame would therefore
# discard `Age_KM_interaction` and make this model identical to model1.
# Use a dedicated preprocessor that also scales the interaction column.
interaction_numeric_features = numeric_features + ['Age_KM_interaction']
preprocessor_interaction = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), interaction_numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ])

model2 = Pipeline([
    ('engineer', FeatureEngineer()),
    ('preprocessor', preprocessor_interaction),
    ('regressor', LinearRegression())
])
model2.fit(X_train, y_train)

## Ridge Regression

Model 3: Regularized Regression (Ridge)

In [None]:
# Ridge regression; RidgeCV picks the penalty strength from the candidate alphas.
ridge_alphas = [0.1, 1.0, 10.0]
ridge_steps = [
    ('preprocessor', preprocessor),
    ('regressor', RidgeCV(alphas=ridge_alphas)),
]
model3 = Pipeline(steps=ridge_steps)
model3.fit(X_train, y_train)

## Model Evaluation.

Evaluation Metrics

In [None]:
# Display name -> fitted pipeline, in the order they appear in the comparison table.
models = dict([
    ('Linear Regression', model1),
    ('With Interaction', model2),
    ('Ridge Regression', model3),
])

In [None]:
# Score every model on the held-out test set and collect one record per model.
results = []
for name, model in models.items():
    y_pred = model.predict(X_test)
    results.append({
        'Model': name,
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R2': r2_score(y_test, y_pred),
    })

results_df = pd.DataFrame(results, columns=['Model', 'MAE', 'RMSE', 'R2'])
print("\n=== MODEL COMPARISON ===")
print(results_df.round(2))

## Lasso Regression.

In [None]:
# Lasso regression; LassoCV selects the penalty strength with 5-fold cross-validation.
lasso_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LassoCV(cv=5, random_state=42)),
]
lasso_model = Pipeline(steps=lasso_steps)
lasso_model.fit(X_train, y_train)

In [None]:
# Test-set error and fit quality of the Lasso pipeline.
y_pred_lasso = lasso_model.predict(X_test)
lasso_mae = mean_absolute_error(y_test, y_pred_lasso)
lasso_r2 = r2_score(y_test, y_pred_lasso)
print(f"\nLasso MAE: {lasso_mae:.0f}")
print(f"Lasso R2: {lasso_r2:.3f}")

## Interpretation.

Showing coefficients for best model

In [None]:
best_model = model3  # Ridge is often most stable
# Take the column names from the fitted preprocessor instead of hard-coding them.
# OneHotEncoder(drop='first') sorts the categories and drops the first one, so a
# hand-written list can easily name the wrong dummy columns (or the wrong order)
# and mislabel the coefficients.
transformed_names = best_model.named_steps['preprocessor'].get_feature_names_out()
# ColumnTransformer prefixes each name with "<transformer>__"; strip it for display.
feature_names = [name.split('__', 1)[-1] for name in transformed_names]

In [None]:
# Tabulate the fitted coefficients, largest absolute value first.
regressor = best_model.named_steps['regressor']
if hasattr(regressor, 'coef_'):
    coef_values = regressor.coef_
    coefficients = pd.DataFrame({
        'feature': feature_names[:len(coef_values)],
        'coefficient': coef_values,
    })
    print("\n=== COEFFICIENTS ===")
    ranked = coefficients.sort_values('coefficient', key=abs, ascending=False)
    print(ranked)

## Interview Questions

1. Normalization vs Standardization:

 * Normalization: Scales data to [0,1] range - good for neural networks.
 * Standardization: Centers data (mean=0, std=1) - strongly recommended for regularized regression, because the penalty is applied to all coefficients on the same scale.

2. Handling Multicollinearity:

* Remove highly correlated features.
* Use regularization (Lasso/Ridge).
* Principal Component Analysis (PCA).
* Combine correlated variables into single feature.

## Key Assumptions

* Linear relationship between X and y
* No multicollinearity (addressed with regularization)
* Homoscedasticity (constant variance of errors)
* Normal distribution of errors
* No autocorrelation