# Task 1: Simple Linear Regression

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Fetch the California housing data: features go into a DataFrame, the target stays an array
housing = fetch_california_housing()
X = pd.DataFrame(housing.data, columns=housing.feature_names)
y = housing.target

# Single-predictor design matrix (selecting with a list keeps it 2-D), then an 80/20 split
X_rooms = X[['AveRooms']]
X_train, X_test, y_train, y_test = train_test_split(
    X_rooms,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the linear model on the training portion only
model = LinearRegression().fit(X_train, y_train)

# Held-out points with the fitted line drawn over them
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, color='blue', label='Actual')
ax.plot(X_test, model.predict(X_test), color='red', linewidth=2, label='Regression Line')
ax.set_xlabel('Average Rooms')
ax.set_ylabel('House Value')
ax.set_title('Simple Linear Regression')
ax.legend()
plt.show()


# Task 2: Multiple Linear Regression

In [None]:

from sklearn.metrics import mean_squared_error, r2_score

# Predictors for the multi-feature model
features = ['MedInc', 'HouseAge', 'AveRooms', 'AveOccup']
X_multi = X[features]
X_train, X_test, y_train, y_test = train_test_split(
    X_multi,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training rows, then predict the held-out rows
model_multi = LinearRegression().fit(X_train, y_train)
y_pred = model_multi.predict(X_test)

# Held-out metrics; RMSE is the square root of MSE, so it is in the target's units
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("R-squared:", r2)
print("MSE:", mse)
print("RMSE:", rmse)
# One coefficient per entry in `features`, in the same order
print("Coefficients:", model_multi.coef_)


# Task 3: Feature Scaling and Normalization

In [None]:

from sklearn.preprocessing import StandardScaler

# Split BEFORE scaling so the scaler never sees the held-out rows. Fitting it on
# the full dataset would leak test-set means/variances into preprocessing.
# The same random_state as Task 2 yields the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(X_multi, y, test_size=0.2, random_state=42)

# Learn mean and standard deviation from the training rows only, then apply
# that same transformation to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model_scaled = LinearRegression()
model_scaled.fit(X_train, y_train)
y_pred_scaled = model_scaled.predict(X_test)

# Same held-out metrics as the unscaled model, for a direct comparison.
# Standardizing mainly makes the coefficients comparable across features.
r2_scaled = r2_score(y_test, y_pred_scaled)
rmse_scaled = np.sqrt(mean_squared_error(y_test, y_pred_scaled))

print("R-squared (scaled):", r2_scaled)
print("RMSE (scaled):", rmse_scaled)


# Task 4: Model Interpretation

In [None]:

import seaborn as sns

# Pairwise correlations between the predictors used in the multi-feature model
corr_matrix = X[features].corr()

# Annotated heatmap drawn on an explicit Axes
fig, ax = plt.subplots()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

# Discuss multicollinearity
print("Features with high correlation may indicate multicollinearity which can distort model coefficients.")


# Task 5: Binary Classification with Logistic Regression

In [None]:

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

# Load the breast-cancer dataset.
# NOTE: this rebinds X and y, which previously held the housing data, so the
# regression cells above must not be re-run after this one without re-running Task 1.
data = load_breast_cancer()
X = data.data
y = data.target

# 80/20 split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# A large iteration budget is passed to the solver via max_iter
clf = LogisticRegression(max_iter=10000)
clf.fit(X_train, y_train)

# Hard class predictions on the held-out rows, then the standard summaries.
# The "\n" escape puts each multi-line report on its own line; a literal line
# break inside a single-quoted string is a syntax error.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:

# Second column of predict_proba: the score passed to roc_curve and roc_auc_score
y_probs = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_probs)

# ROC curve plus the dashed diagonal as a reference line
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve')
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()

print("ROC AUC Score:", roc_auc_score(y_test, y_probs))


# Task 6: Threshold Tuning and Probability Interpretation

In [None]:

from sklearn.metrics import f1_score

# Re-label the held-out samples at several probability cut-offs and compare.
# Relies on y_probs and y_test from the breast-cancer cells above.
for threshold in [0.3, 0.5, 0.7]:
    # Label 1 when the predicted probability reaches the cut-off, else 0
    y_thresh = (y_probs >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    # "\n" escape instead of a literal line break, which is a syntax error
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_thresh))
    print("F1 Score:", f1_score(y_test, y_thresh))


# Task 7: Multiclass Classification (Optional)

In [None]:

from sklearn.datasets import load_iris
from sklearn.multiclass import OneVsRestClassifier

# Load the iris dataset. NOTE: this rebinds X and y again, replacing the
# breast-cancer arrays from Task 5.
iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-vs-rest through the OneVsRestClassifier wrapper: LogisticRegression's own
# `multi_class` argument is deprecated since scikit-learn 1.5, and the wrapper
# is the documented way to keep the OvR strategy.
clf_multi = OneVsRestClassifier(LogisticRegression(max_iter=10000))
clf_multi.fit(X_train, y_train)

y_pred = clf_multi.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# "\n" escape instead of a literal line break, which is a syntax error
print("Classification Report:\n", classification_report(y_test, y_pred))


# Part III: General Questions


**1. What are the assumptions of linear regression?**  
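Linear regression assumes a linear relationship between the features and the target, independent errors, homoscedasticity (constant variance of the errors), approximately normally distributed errors (needed for valid confidence intervals and hypothesis tests), and no severe multicollinearity among the predictors.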

**2. When should you use logistic regression instead of linear regression?**  
Use logistic regression when the target variable is categorical (especially binary), such as yes/no or 0/1.

**3. What is the interpretation of coefficients in logistic regression?**  
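Each coefficient represents the change in the log-odds of the positive class for a one-unit increase in that predictor, holding the other predictors constant. Exponentiating a coefficient gives the corresponding odds ratio.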

**4. What is the difference between sigmoid and softmax functions?**  
Sigmoid maps values to a probability between 0 and 1 for binary classification. Softmax generalizes this to multi-class classification, ensuring outputs sum to 1.

**5. Why is R-squared not suitable for evaluating logistic regression models?**  
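R-squared measures the proportion of variance in a continuous target that the model explains, so it suits regression rather than classification. Logistic regression predicts class probabilities and is evaluated with measures such as accuracy, AUC, or log loss instead.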
