# **Machine Learning Assignment – 3 (Linear & Logistic Regression)**


## **Part I: Linear Regression**
### **Task 1: Simple Linear Regression**

In [None]:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load the California Housing dataset (features in data.data, target in data.target)
data = fetch_california_housing()

# Select the feature by name rather than by a hard-coded position: column 3 of
# this dataset is 'AveBedrms', not the 'AveRooms' feature that this task and
# the x-axis label below refer to.
rooms_idx = list(data.feature_names).index("AveRooms")
X = data.data[:, [rooms_idx]]  # list index keeps X two-dimensional: (n_samples, 1)
y = data.target

# Train-test split: hold out 20% of the rows, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the single-feature linear model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = model.predict(X_test)

# Plot the held-out points together with the fitted line. The predictions are
# collinear, so drawing them in unsorted order still traces one straight line.
plt.scatter(X_test, y_test, color="blue", label="Actual Data")
plt.plot(X_test, y_pred, color="red", linewidth=2, label="Regression Line")
plt.xlabel("Average Rooms")
plt.ylabel("House Price")
plt.legend()
plt.show()


### **Task 2: Multiple Linear Regression**

In [None]:

from sklearn.metrics import r2_score, mean_squared_error
import math

# Feature matrix is the first four columns of the dataset; the target is unchanged
X, y = data.data[:, :4], data.target

# 80/20 split with a fixed seed so the run is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation on the held-out rows; RMSE is the square root of the MSE
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)

print(f"R-squared: {r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
# One coefficient per input column, in column order
print('Coefficients:', model.coef_)


### **Task 3: Feature Scaling and Normalization**

In [None]:

from sklearn.preprocessing import StandardScaler

# Baseline: fit on the raw, unscaled features from the current split
model = LinearRegression().fit(X_train, y_train)
r2_raw = r2_score(y_test, model.predict(X_test))
print("R2 without scaling:", r2_raw)

# Standardize: learn mean/std from the training split only, then apply the
# same transformation to both splits so no test information leaks into the fit
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Same model, now trained and scored on the standardized features
model_scaled = LinearRegression().fit(X_train_scaled, y_train)
r2_scaled = r2_score(y_test, model_scaled.predict(X_test_scaled))
print("R2 with scaling:", r2_scaled)


### **Task 4: Model Interpretation**

In [None]:

import pandas as pd
import seaborn as sns

# Tabular view of the dataset: one column per feature plus the target as 'Price'
df = pd.DataFrame(data.data, columns=data.feature_names).assign(Price=data.target)

# Pairwise correlations, computed once and reused for both the plot and the ranking
corr_matrix = df.corr()

plt.figure(figsize=(10,6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Matrix Heatmap")
plt.show()

# Correlation of every column with the target, strongest positive first
# ('Price' itself appears at the top with a correlation of 1)
price_corr = corr_matrix['Price'].sort_values(ascending=False)
print(price_corr)


## **Part II: Logistic Regression**
### **Task 5: Binary Classification**

In [None]:

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

# Breast-cancer dataset: binary target
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# 80/20 split with a fixed seed so the run is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# A generous iteration cap is passed to the solver because the features are
# used unscaled here
log_model = LogisticRegression(max_iter=5000).fit(X_train, y_train)
y_pred = log_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# ROC curve: built from the predicted probability of class 1 (second column)
y_prob = log_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc_value = roc_auc_score(y_test, y_prob)

plt.plot(fpr, tpr, label=f"AUC = {auc_value:.2f}")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()


### **Task 6: Threshold Tuning and Probability Interpretation**

In [None]:

# Imported here because this is the only cell that needs it
from sklearn.metrics import f1_score

# Candidate decision thresholds applied to the class-1 probabilities in y_prob
thresholds = [0.3, 0.5, 0.7]
for t in thresholds:
    # Predict class 1 whenever its probability reaches the threshold
    y_pred_thr = (y_prob >= t).astype(int)
    print(f"\nThreshold: {t}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_thr))
    # Support-weighted average of the per-class F1 scores (not the F1 of the
    # positive class alone). Calling f1_score directly avoids building a full
    # classification report just to read one number from it.
    print("F1-score:", float(f1_score(y_test, y_pred_thr, average="weighted")))


### **Task 7: Multiclass Classification (Optional)**

In [None]:

from sklearn.datasets import load_iris

# Iris dataset: three-class target
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the rows so the reported metrics measure generalization
# rather than fit to the training data. The split is stratified because the
# dataset is small and each class should be represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn releases, and the lbfgs solver already fits a multinomial
# (softmax) model when the target has more than two classes.
log_model = LogisticRegression(solver='lbfgs', max_iter=1000)
log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



## **Part III: General Questions**
1. **What are the assumptions of linear regression?**  
   - Linear relationship between features and target.  
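   - Errors have zero mean and constant variance (homoscedasticity), and are normally distributed; normality matters mainly for inference such as confidence intervals and p-values.  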
   - Independence of observations.  
   - No multicollinearity among features.

2. **When should you use logistic regression instead of linear regression?**  
   - When the target variable is categorical (e.g., 0/1). Logistic regression models the probability of class membership, bounded between 0 and 1, whereas linear regression predicts an unbounded continuous value.

3. **What is the interpretation of coefficients in logistic regression?**  
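   - Each coefficient represents the change in the log-odds of the outcome for a one-unit increase in that feature, holding the other features constant; exponentiating the coefficient gives the corresponding odds ratio.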

4. **What is the difference between sigmoid and softmax functions?**  
   - Sigmoid maps a single score to a probability between 0 and 1 and is used for binary classification, while softmax generalizes it to multiple classes by mapping a vector of scores to probabilities that sum to 1.

5. **Why is R-squared not suitable for evaluating logistic regression models?**  
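   - R-squared measures the proportion of variance explained in a continuous target, so it is not meaningful when the outcome is a class label. For classification, metrics such as AUC, accuracy, precision, and recall are used instead.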
