In [4]:
#Q1
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression

# Generate synthetic dataset
X, y = make_regression(n_samples=200, n_features=7, noise=10, random_state=42)
X = (X - X.mean(axis=0)) / X.std(axis=0)  # feature scaling
y = y.reshape(-1, 1)

# Add bias column
X = np.c_[np.ones(X.shape[0]), X]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def ridge_regression_gd(X, y, lr=0.01, alpha=0.1, epochs=1000):
    m, n = X.shape
    W = np.zeros((n, 1))
    for epoch in range(epochs):
        y_pred = X @ W
        gradient = (1/m) * (X.T @ (y_pred - y) + alpha * W)
        W -= lr * gradient
    return W

learning_rates = [0.0001, 0.001, 0.01, 0.1, 1.0]
alphas = [1e-15, 1e-10, 1e-5, 10, 100]

best_r2 = -np.inf
best_params = None

for lr in learning_rates:
    for alpha in alphas:
        W = ridge_regression_gd(X_train, y_train, lr, alpha, epochs=1000)
        y_pred = X_test @ W
        r2 = r2_score(y_test, y_pred)
        if r2 > best_r2:
            best_r2 = r2
            best_params = (lr, alpha)

print("âœ… Best parameters:")
print("Learning Rate:", best_params[0], " | Alpha:", best_params[1])
print("Maximum R2 Score:", best_r2)


âœ… Best parameters:
Learning Rate: 0.1  | Alpha: 1e-05
Maximum R2 Score: 0.9843988688748493


In [19]:
# Q2
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

url = "https://raw.githubusercontent.com/JWarmenhoven/ISLR-python/master/Notebooks/Data/Hitters.csv"
df = pd.read_csv(url)

print("âœ… Dataset loaded successfully!")
print(df.head(), "\n")

# 1. Drop rows with missing values
df = df.dropna()

# 2. Convert categorical columns to numeric (One-Hot Encoding)
df = pd.get_dummies(df, drop_first=True)

X = df.drop("Salary", axis=1)
y = df["Salary"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)

# Ridge Regression
ridge_reg = Ridge(alpha=0.5748)
ridge_reg.fit(X_train, y_train)
y_pred_ridge = ridge_reg.predict(X_test)

# Lasso Regression
lasso_reg = Lasso(alpha=0.5748)
lasso_reg.fit(X_train, y_train)
y_pred_lasso = lasso_reg.predict(X_test)


def evaluate_model(name, y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{name}:\n  MSE = {mse:.2f},  RÂ² = {r2:.4f}\n")

evaluate_model("Linear Regression", y_test, y_pred_lin)
evaluate_model("Ridge Regression", y_test, y_pred_ridge)
evaluate_model("Lasso Regression", y_test, y_pred_lasso)


âœ… Dataset loaded successfully!
          Unnamed: 0  AtBat  Hits  HmRun  Runs  RBI  Walks  Years  CAtBat  \
0     -Andy Allanson    293    66      1    30   29     14      1     293   
1        -Alan Ashby    315    81      7    24   38     39     14    3449   
2       -Alvin Davis    479   130     18    66   72     76      3    1624   
3      -Andre Dawson    496   141     20    65   78     37     11    5628   
4  -Andres Galarraga    321    87     10    39   42     30      2     396   

   CHits  ...  CRuns  CRBI  CWalks  League Division PutOuts  Assists  Errors  \
0     66  ...     30    29      14       A        E     446       33      20   
1    835  ...    321   414     375       N        W     632       43      10   
2    457  ...    224   266     263       A        W     880       82      14   
3   1575  ...    828   838     354       N        E     200       11       3   
4    101  ...     48    46      33       N        E     805       40       4   

   Salary  NewLeague  


In [18]:
# Q3. RidgeCV and LassoCV on Housing Dataset
import pandas as pd
import numpy as np
import urllib.request
import tarfile
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# --- Step 1: Download and extract dataset ---
url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz"
urllib.request.urlretrieve(url, "housing.tgz")

with tarfile.open("housing.tgz") as housing_tgz:
    housing_tgz.extractall(path="housing")

# Load the CSV file
df = pd.read_csv("housing/housing.csv")
print("âœ… Dataset loaded successfully!")
print(df.head())

# --- Step 2: Data Preprocessing ---
# Handle missing values
df = df.dropna()

# Convert categorical column 'ocean_proximity' to numeric using one-hot encoding
df = pd.get_dummies(df, drop_first=True)

# Separate features and target
X = df.drop("median_house_value", axis=1)
y = df["median_house_value"]

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- Step 3: Split into training and testing data ---
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# --- Step 4: RidgeCV Implementation ---
ridge = RidgeCV(alphas=np.logspace(-3, 3, 7), cv=5)
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)

print("\nðŸ”¹ RidgeCV Results:")
print("Best alpha:", ridge.alpha_)
print("RÂ² score:", r2_score(y_test, y_pred_ridge))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_ridge)))

# --- Step 5: LassoCV Implementation ---
lasso = LassoCV(alphas=np.logspace(-3, 3, 7), cv=5, max_iter=10000)
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)

print("\nðŸ”¹ LassoCV Results:")
print("Best alpha:", lasso.alpha_)
print("RÂ² score:", r2_score(y_test, y_pred_lasso))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_lasso)))



  housing_tgz.extractall(path="housing")


âœ… Dataset loaded successfully!
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  

ðŸ”¹ RidgeCV Results:
Best alpha: 10.0
RÂ

In [13]:
Q4
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load data
iris = load_iris()
X, y = iris.data, iris.target

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-vs-Rest Logistic Regression
model = LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9666666666666667

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30





# Task
Explain the error in the selected code, fix it, and incorporate the changes into the existing code.

## Download the dataset

### Subtask:
Download the gzipped file from the provided URL.


**Reasoning**:
The first step is to download the gzipped file from the provided URL using the `requests` library and save it to a local file.



In [8]:
import requests

url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz"
response = requests.get(url)

# Save the downloaded content to a local file
with open("housing.tgz", "wb") as f:
    f.write(response.content)

## Extract the dataset

### Subtask:
Extract the CSV file from the downloaded gzipped file.


**Reasoning**:
Extract the CSV file from the downloaded gzipped file.



In [10]:
import tarfile

# Open the downloaded gzipped file in read mode
with tarfile.open("housing.tgz", "r:gz") as tar:
    # Extract all files from the archive
    tar.extractall()

  tar.extractall()


## Load and preprocess the data

### Subtask:
Load the extracted CSV file into a pandas DataFrame, handle missing values, and perform one-hot encoding.


**Reasoning**:
Load the 'housing.csv' file into a pandas DataFrame, handle missing values by dropping rows with any missing values, and perform one-hot encoding on the categorical columns, dropping the first category to avoid multicollinearity.



In [11]:
import pandas as pd

# Load dataset
df = pd.read_csv('housing.csv')

# Data preprocessing
df = df.dropna()
df = pd.get_dummies(df, drop_first=True)

# Display the first few rows of the processed DataFrame
display(df.head())

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,False,False,True,False
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,False,False,True,False
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,False,False,True,False
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,False,False,True,False
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,False,False,True,False


## Split and scale the data

### Subtask:
Separate features and target, split the data into training and testing sets, and scale the features.


**Reasoning**:
Separate features and target, split the data, and scale the features as per the instructions.



In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate input/output
X = df.drop('median_house_value', axis=1)
y = df['median_house_value']

# Split and scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Train and evaluate models

### Subtask:
Train Linear, Ridge, and Lasso regression models and evaluate their R2 scores.


**Reasoning**:
Instantiate and train Linear, Ridge, and Lasso regression models, then evaluate and print their R2 scores on the test set.



In [14]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score

# Instantiate models
lin = LinearRegression()
ridge = Ridge(alpha=0.5748)
lasso = Lasso(alpha=0.5748)

# Train models
lin.fit(X_train, y_train)
ridge.fit(X_train, y_train)
lasso.fit(X_train, y_test)

# Evaluate
models = {'Linear': lin, 'Ridge': ridge, 'Lasso': lasso}
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"{name} R2 Score: {r2_score(y_test, y_pred):.4f}")

ValueError: Found input variables with inconsistent numbers of samples: [120, 30]

**Reasoning**:
The traceback indicates a ValueError because the Lasso model was trained with X_train (120 samples) and y_test (30 samples), which have inconsistent numbers of samples. The Lasso model should be trained with X_train and y_train. I will fix the code to train the Lasso model correctly.



In [15]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score

# Instantiate models
lin = LinearRegression()
ridge = Ridge(alpha=0.5748)
lasso = Lasso(alpha=0.5748)

# Train models
lin.fit(X_train, y_train)
ridge.fit(X_train, y_train)
lasso.fit(X_train, y_train) # Corrected: train with y_train instead of y_test

# Evaluate
models = {'Linear': lin, 'Ridge': ridge, 'Lasso': lasso}
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"{name} R2 Score: {r2_score(y_test, y_pred):.4f}")

Linear R2 Score: 0.9469
Ridge R2 Score: 0.9452
Lasso R2 Score: 0.7433


## Summary:

### Q&A
*   What was the error in the selected code?
    The error was a `ValueError` that occurred because the Lasso model was incorrectly trained using `X_train` and `y_test`, which have different numbers of samples.

### Data Analysis Key Findings
*   The initial code for training the Lasso model attempted to use `y_test` for training, which caused a `ValueError` because `X_train` and `y_test` have different numbers of samples.
*   The error was fixed by changing `lasso.fit(X_train, y_test)` to `lasso.fit(X_train, y_train)`.
*   After fixing the error, all three models (Linear, Ridge, and Lasso) were successfully trained and evaluated.
*   The R2 scores on the test set were: Linear: 0.9469, Ridge: 0.9452, and Lasso: 0.7433.

### Insights or Next Steps
*   The Linear and Ridge regression models performed significantly better than the Lasso model based on the R2 scores.
*   Further analysis could involve tuning the hyperparameters of the Ridge and Lasso models (e.g., the `alpha` value) to potentially improve their performance.
