# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# 1. Load the dataset
# The CSV path is hard-coded; adjust it if 'ToyotaCorolla - MLR.csv' lives elsewhere.
df = pd.read_csv('/content/ToyotaCorolla - MLR.csv')

# Show basic dataset info and summary statistics
print(df.head())      # Display the first few rows of the data
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()             # Info on data types and non-null counts
print(df.describe())  # Summary statistics for numerical features

# 2. Exploratory Data Analysis (EDA)

# One histogram per numerical column to inspect each distribution
df.hist(figsize=(10, 10), bins=20)
plt.show()

# Pairwise plots of the raw (not yet encoded) frame
sns.pairplot(data=df)
plt.show()

# One-hot encode the categorical columns, dropping the first level of each.
# NOTE: this rebinds `df`; everything below works on the encoded frame.
df = pd.get_dummies(df, drop_first=True)

# Pairwise correlations across all columns of the encoded frame
corr_matrix = df.corr()

# Annotated heatmap of the correlation matrix to spot linear relationships
plt.figure(figsize=(10, 8))
sns.heatmap(data=corr_matrix, cmap='coolwarm', annot=True)
plt.show()

# 3. Preprocess the dataset
# Separate the target column from the predictors
y = df['Price']               # target variable
X = df.drop(columns='Price')  # every remaining column is a feature

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Build a Multiple Linear Regression Model

# Create the estimator and fit it on the training split
# (fit() returns the fitted estimator, so construction and fitting can be chained)
lr = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = lr.predict(X_test)

# Score the predictions against the true test targets
r2 = r2_score(y_test, y_pred)             # R² score
mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error

# Report the metrics
print(f"Linear Regression Mean Squared Error: {mse}")
print(f"Linear Regression R² Score: {r2}")

# Tabulate the fitted coefficient next to the feature it belongs to
coefficients = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Coefficient': lr.coef_,
    }
)
print(coefficients)

# 5. Apply Regularization Techniques: Lasso and Ridge
# Both models are fitted on the unscaled features here; alpha is the
# regularization strength and can be tuned.

# --- Lasso Regression (L1 penalty) ---
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)

# Test-set metrics for Lasso
r2_lasso = r2_score(y_test, y_pred_lasso)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print(f"Lasso Regression Mean Squared Error: {mse_lasso}")
print(f"Lasso Regression R² Score: {r2_lasso}")

# --- Ridge Regression (L2 penalty) ---
ridge = Ridge(alpha=0.1)
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)

# Test-set metrics for Ridge
r2_ridge = r2_score(y_test, y_pred_ridge)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
print(f"Ridge Regression Mean Squared Error: {mse_ridge}")
print(f"Ridge Regression R² Score: {r2_ridge}")

# 6. Optional: Standardization (for models like Lasso and Ridge to perform better)

# The scaler is fitted on the training split only and then applied unchanged
# to the test split, so no test-set statistics leak into the fit.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Lasso on the standardized features ---
lasso_scaled = Lasso(alpha=0.1)
lasso_scaled.fit(X_train_scaled, y_train)
y_pred_lasso_scaled = lasso_scaled.predict(X_test_scaled)

# Test-set metrics for standardized Lasso
r2_lasso_scaled = r2_score(y_test, y_pred_lasso_scaled)
mse_lasso_scaled = mean_squared_error(y_test, y_pred_lasso_scaled)
print(f"Standardized Lasso Regression Mean Squared Error: {mse_lasso_scaled}")
print(f"Standardized Lasso Regression R² Score: {r2_lasso_scaled}")

# --- Ridge on the standardized features ---
ridge_scaled = Ridge(alpha=0.1)
ridge_scaled.fit(X_train_scaled, y_train)
y_pred_ridge_scaled = ridge_scaled.predict(X_test_scaled)

# Test-set metrics for standardized Ridge
r2_ridge_scaled = r2_score(y_test, y_pred_ridge_scaled)
mse_ridge_scaled = mean_squared_error(y_test, y_pred_ridge_scaled)
print(f"Standardized Ridge Regression Mean Squared Error: {mse_ridge_scaled}")
print(f"Standardized Ridge Regression R² Score: {r2_ridge_scaled}")

# Interview Questions
# 1. What is Normalization & Standardization and how is it helpful?
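# Normalization and standardization are both techniques for rescaling numeric features. Normalization (min-max scaling) maps each feature to a common range, usually between 0 and 1; standardization rescales each feature to zero mean and unit variance, so its values are not confined to a fixed range.
# Both prevent features with large ranges from dominating the model. This is helpful because many machine learning algorithms are sensitive to the scale of the data, and scaling the data can improve the performance of the model.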

# 2. What techniques can be used to address multicollinearity in multiple linear regression?
# Techniques to address multicollinearity include:
# 1. Dropping one of the correlated variables
# 2. Using dimensionality reduction techniques, such as PCA
# 3. Using regularization techniques, such as Lasso or Ridge regression
# 4. Using a different model, such as a generalized linear model or a decision tree model

# Assumptions made during the analysis:
# 1. Linearity: The relationship between the features and the target variable is assumed to be linear.
