 # Supervised Learning: More Regression

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#Please read the sklearn documentation if you need to figure out the inputs and outputs of these functions
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error


 ## PART 1: SIMPLE LINEAR REGRESSION



 We'll analyze the relationship between study hours and exam scores using three different methods.

In [None]:
# Study-hours dataset: position i of each list describes the same student.
hours_studied = [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0]
exam_scores = [45, 51, 58, 62, 68, 73, 78, 84, 88, 93]

# Column names are the keys used by every later cell (note the underscores).
data = dict(Hours_Studied=hours_studied, Exam_Score=exam_scores)
df = pd.DataFrame(data)

# Heading and frame in one call; the newline separator keeps them on separate lines.
print("Dataset:", df, sep="\n")

# Scatter plot of the raw observations, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df['Hours_Studied'], df['Exam_Score'], color='blue', s=100)
ax.set_xlabel('Hours Studied')
ax.set_ylabel('Exam Score')
ax.set_title('Study Hours vs Exam Scores')
ax.grid(True, alpha=0.3)  # faint grid so the points stay prominent
plt.show()


 ### TASK 1: Manual Least Squares Calculation



 Calculate the slope (m) and intercept (b) for y = mx + b using the **FIRST 5 DATA POINTS ONLY**.



 **Formulas:**

 - Slope: $m = \frac{n\sum xy - \sum x \sum y}{n\sum x^2 - (\sum x)^2}$

 - Intercept: $b = \frac{\sum y - m\sum x}{n}$

In [None]:
# Task 1: fit y = m*x + b by hand, using only the first five observations.
x_manual = df['Hours_Studied'].iloc[:5].to_numpy()
y_manual = df['Exam_Score'].iloc[:5].to_numpy()

n = x_manual.size

# Building blocks of the closed-form least-squares solution.
sum_x = x_manual.sum()
sum_y = y_manual.sum()
sum_xy = (x_manual * y_manual).sum()
sum_x_squared = np.square(x_manual).sum()

# Slope first: the intercept formula needs it.
slope_numerator = n * sum_xy - sum_x * sum_y
slope_denominator = n * sum_x_squared - sum_x ** 2
m_manual = slope_numerator / slope_denominator
b_manual = (sum_y - m_manual * sum_x) / n

print("TASK 1 - Manual Calculation (first 5 points):")
print(f"Slope (m): {m_manual}")
print(f"Intercept (b): {b_manual}")
print(f"Equation: y = {m_manual:.2f}x + {b_manual:.2f}\n")

# In-sample errors: the line is scored on the same five points it was fitted to.
y_pred_manual = b_manual + m_manual * x_manual
mae_manual = mean_absolute_error(y_manual, y_pred_manual)
mse_manual = mean_squared_error(y_manual, y_pred_manual)

print(f"MAE: {mae_manual:.2f}")
print(f"MSE: {mse_manual:.2f}")


 ### TASK 2: NumPy lstsq (Full Dataset)



 Use `np.linalg.lstsq` to find the best fit line for all 10 data points.



 #### Why Add a Column of Ones?



 When we use `np.linalg.lstsq`, we're solving the matrix equation: **Xβ = y**



 For a linear regression equation **y = mx + b**, we need to find both:

 - **m** (slope)

 - **b** (intercept)



 The design matrix must be structured so that matrix multiplication gives us: **b·1 + m·x**



 ```

 Design Matrix (X):        Coefficients (β):     Result (y):

 [1  x₁]                   [b]                   [b·1 + m·x₁]

 [1  x₂]           ×       [m]          =        [b·1 + m·x₂]

 [1  x₃]                                         [b·1 + m·x₃]

 ...

 ```



 **Key Points:**

 - The column of ones multiplies with the intercept **b** to add that constant term to each prediction

 - The column of x values multiplies with the slope **m** to add the variable component

 - **Without the ones column:** We'd only fit lines through the origin (b=0): y = mx

 - **With the ones column:** We can fit any line: y = mx + b

In [None]:
# Task 2: least-squares fit on all 10 points with np.linalg.lstsq.

# Design matrix: column 0 is all ones (pairs with the intercept b),
# column 1 holds the hours studied (pairs with the slope m).
X_numpy = np.column_stack([np.ones(len(df)), df['Hours_Studied'].values])
y_numpy = df['Exam_Score'].values

# rcond=None selects NumPy's current default cutoff for small singular values.
solution, residuals, rank, s = np.linalg.lstsq(X_numpy, y_numpy, rcond=None)

# The coefficient order follows the column order of the design matrix: [b, m].
b_numpy = solution[0]
m_numpy = solution[1]

print("TASK 2 - NumPy lstsq (all 10 points):")
print(f"Slope (m): {m_numpy}")
print(f"Intercept (b): {b_numpy}")
print(f"Equation: y = {m_numpy:.2f}x + {b_numpy:.2f}\n")

# Predictions are the matrix product X @ [b, m], i.e. b*1 + m*x for every row.
y_pred_numpy = X_numpy @ solution
mae_numpy = mean_absolute_error(y_numpy, y_pred_numpy)
mse_numpy = mean_squared_error(y_numpy, y_pred_numpy)

print(f"MAE: {mae_numpy:.2f}")
print(f"MSE: {mse_numpy:.2f}")


 ### TASK 3: Scikit-learn LinearRegression (Full Dataset)



 Use sklearn's `LinearRegression` class.



 **Note:** sklearn automatically handles the intercept internally (with `fit_intercept=True` by default), so we don't need to add a column of ones!

In [None]:
# Task 3: the same fit on all 10 points, using sklearn's LinearRegression.

# sklearn expects a 2D feature array of shape (n_samples, n_features);
# reshape(-1, 1) turns the 1D hours vector into a single-column matrix.
X_sklearn = df['Hours_Studied'].values.reshape(-1, 1)
y_sklearn = df['Exam_Score'].values

# No column of ones here: the model fits the intercept itself.
model = LinearRegression()
model.fit(X_sklearn, y_sklearn)

# coef_ has one entry per feature; there is only one feature, so take index 0.
m_sklearn = model.coef_[0]
b_sklearn = model.intercept_

print("TASK 3 - Sklearn LinearRegression (all 10 points):")
print(f"Slope (m): {m_sklearn}")
print(f"Intercept (b): {b_sklearn}")
print(f"Equation: y = {m_sklearn:.2f}x + {b_sklearn:.2f}\n")

# In-sample errors on the same 10 points used for fitting.
y_pred_sklearn = model.predict(X_sklearn)
mae_sklearn = mean_absolute_error(y_sklearn, y_pred_sklearn)
mse_sklearn = mean_squared_error(y_sklearn, y_pred_sklearn)

print(f"MAE: {mae_sklearn:.2f}")
print(f"MSE: {mse_sklearn:.2f}")


 ### TASK 4: Comparison and Discussion

In [None]:
# Side-by-side summary of the three fits from Tasks 1-3.
_rule_width = 70
_comparison_rows = [
    ('Manual (5 points)', m_manual, b_manual, mae_manual, mse_manual),
    ('NumPy (10 points)', m_numpy, b_numpy, mae_numpy, mse_numpy),
    ('Sklearn (10 points)', m_sklearn, b_sklearn, mae_sklearn, mse_sklearn),
]

print("=" * _rule_width)
print("COMPARISON OF ALL THREE METHODS:")
print("=" * _rule_width)
print(f"{'Method':<30} {'Slope':<12} {'Intercept':<12} {'MAE':<10} {'MSE':<10}")
print("-" * _rule_width)
# One formatted line per method; the column widths match the header above.
for _label, _m, _b, _mae, _mse in _comparison_rows:
    print(f"{_label:<30} {_m:<12.4f} {_b:<12.4f} {_mae:<10.2f} {_mse:<10.2f}")


In [None]:
# Overlay the three fitted lines on the raw data.
plt.figure(figsize=(10, 6))
# zorder=3 keeps the data points drawn on top of the regression lines.
plt.scatter(df['Hours_Studied'], df['Exam_Score'], color='black', s=100, label='Data', zorder=3)

# Evenly spaced x values spanning the observed range of study hours.
x_range = np.linspace(df['Hours_Studied'].min(), df['Hours_Studied'].max(), 100)

# Distinct line styles so overlapping lines (NumPy and sklearn) stay distinguishable;
# the dotted sklearn line is thicker so it shows through the solid NumPy line.
plt.plot(x_range, m_manual * x_range + b_manual, '--', label="Manual (5 pts)", linewidth=2)
plt.plot(x_range, m_numpy * x_range + b_numpy, '-', label="NumPy lstsq (10 pts)", linewidth=2)
plt.plot(x_range, m_sklearn * x_range + b_sklearn, ':', label="Sklearn (10 pts)", linewidth=3)

plt.xlabel('Hours Studied')
plt.ylabel('Exam Score')
plt.title('Comparison of Regression Methods')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()


 ### Discussion Questions



 **1. Why do the manual calculations differ from NumPy and sklearn?**



 The manual calculation uses only the first five points, so it is fitted to a smaller dataset. NumPy and sklearn use all 10 points, giving a more reliable estimate of the overall relationship.



 ---



 **2. Why are NumPy and sklearn results nearly identical?**



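 They both solve the same problem: ordinary least squares on the same 10 data points, i.e. finding the slope and intercept that minimize the sum of squared residuals. That minimizer is unique for this data, so any difference between the two results is only floating-point rounding.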



 ---



 **3. Which method produced the lowest error? Why?**



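 NumPy and sklearn give the best fit to the full dataset because they used all 10 data points; by construction, their line has the smallest squared error over those 10 points. The manual line was fitted only to the first five points, so it could not capture the full trend. Note that the errors in the comparison table are not measured on the same data: the manual MAE/MSE are computed on the 5 points the line was fitted to, while the NumPy and sklearn errors are computed on all 10 points. For a like-for-like comparison, all three lines should be scored on the same 10 points.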



 ---

 ## PART 2: MULTIPLE LINEAR REGRESSION



 Now we'll predict house prices using multiple features.

In [None]:
# House-price dataset: 20 listings, one list per column; position i of each
# list describes the same house. Prices are in thousands of dollars.
house_data = dict(
    Size_sqft=[
        1200, 1500, 1800, 2000, 2200, 2500, 2800, 3000, 3200, 3500,
        1300, 1600, 1900, 2100, 2400, 2600, 2900, 3100, 3300, 3600,
    ],
    Bedrooms=[
        2, 3, 3, 3, 4, 4, 4, 4, 5, 5,
        2, 3, 3, 4, 4, 4, 4, 5, 5, 5,
    ],
    Age_years=[
        15, 20, 10, 5, 8, 3, 12, 6, 15, 2,
        18, 12, 8, 10, 5, 7, 4, 9, 11, 1,
    ],
    Price_1000s=[
        180, 210, 250, 280, 310, 350, 360, 400, 420, 480,
        190, 230, 270, 300, 340, 370, 390, 430, 450, 510,
    ],
)
house_df = pd.DataFrame(house_data)

# Heading and frame in one call; the newline separator keeps them on separate lines.
print("House Price Dataset:", house_df, sep="\n")


 ### TASK 5: Justify Multiple Linear Regression



 Calculate correlations to understand relationships between features and price.

In [None]:
# Task 5: correlation (Series.corr) of each candidate feature with the sale price.
price = house_df['Price_1000s']
feature_cols = ('Size_sqft', 'Bedrooms', 'Age_years')
correlations = {name: house_df[name].corr(price) for name in feature_cols}

print("Correlations with Price:")
for col, corr in correlations.items():
    print(f"{col}: {corr:.3f}")


 **Why should we use multiple linear regression instead of simple linear regression?**



 Because house price depends on several factors at once. Multiple linear regression can use size, number of bedrooms, and age together, so it should predict price more accurately than a model built on any single feature.



 ---

 ### TASK 6: Implement Multiple Linear Regression

In [None]:
# Task 6: predict price from size, bedroom count and age together.
X_multi = house_df[['Size_sqft', 'Bedrooms', 'Age_years']]
y_multi = house_df['Price_1000s']

# 80/20 train/test split. The held-out test rows are never seen during fitting,
# so they simulate "new" houses and give an honest estimate of real-world error.
# random_state pins the shuffle so a fresh Restart & Run All reproduces the same
# split, and therefore the same coefficients and error metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42
)

# Fit ordinary least squares on the training rows only.
mlr_model = LinearRegression()
mlr_model.fit(X_train, y_train)

# Predictions for both subsets, so training and test error can be compared later.
y_train_pred = mlr_model.predict(X_train)
y_test_pred = mlr_model.predict(X_test)

# coef_ is ordered like the columns of X_multi.
coefficients = mlr_model.coef_
intercept = mlr_model.intercept_

print("Model Coefficients:")
print(f"Intercept: {intercept:.2f}")
for col, coef in zip(X_multi.columns, coefficients):
    print(f"{col}: {coef:.2f}")

print("\nModel Equation:")
# The f prefix is required; without it the placeholders are printed literally.
print(f"Price = {intercept:.2f} + {coefficients[0]:.2f}*Size + {coefficients[1]:.2f}*Bedrooms + {coefficients[2]:.2f}*Age")


 ### TASK 7: Calculate and Interpret Errors

In [None]:
# Task 7: how far predictions are from the real prices, on each subset.

# Training errors (rows the model was fitted on).
train_mae = mean_absolute_error(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)  # back in the same units as price ($1000s)

# Testing errors (held-out rows the model has never seen).
test_mae = mean_absolute_error(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)

# Placeholders must use braces; "$" is a literal dollar sign in front of each value.
print("Training Set Performance:")
print(f"MAE: ${train_mae:.2f}k")
print(f"MSE: ${train_mse:.2f}k²")
print(f"RMSE: ${train_rmse:.2f}k")

print("\nTest Set Performance:")
print(f"MAE: ${test_mae:.2f}k")
print(f"MSE: ${test_mse:.2f}k²")
print(f"RMSE: ${test_rmse:.2f}k")


 ### TASK 8: Visualize Results

In [None]:
# Task 8: actual vs predicted price, one panel per subset.
# Points on the dashed red diagonal (y = x) are perfect predictions.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Training set
axes[0].scatter(y_train, y_train_pred, alpha=0.6, s=100, color='blue')
axes[0].plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'r--', lw=2)
axes[0].set_xlabel('Actual Price ($1000s)')
axes[0].set_ylabel('Predicted Price ($1000s)')
axes[0].set_title('Training Set: Actual vs Predicted')
axes[0].grid(True, alpha=0.3)

# Testing set: the diagonal spans the test prices, so it always covers the
# points shown in this panel regardless of how the split fell.
axes[1].scatter(y_test, y_test_pred, alpha=0.6, s=100, color='green')
axes[1].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
axes[1].set_xlabel('Actual Price ($1000s)')
axes[1].set_ylabel('Predicted Price ($1000s)')
axes[1].set_title('Test Set: Actual vs Predicted')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


 ### Final Discussion Questions



 **1. How do the training and test errors compare? What does this tell you? (Recall our discussion on overfitting and underfitting from last week)**


 Good - training and test errors are similar and low;
 Overfitting - very low training error and high test error;
 Underfitting - high training and test errors;



 ---



 **2. Which feature has the strongest effect on house price? How can you tell?**



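 Size appears to have the strongest effect: larger houses sell for more. One way to tell is to compare the absolute values of the coefficients, but because the features are measured in different units (square feet, rooms, years), raw coefficients are only directly comparable after the features are standardized. The correlations from Task 5 give a unit-free check.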



 ---



 **3. What is one limitation of this model?**



 It assumes a linear relationship between features and target. If the relationship is nonlinear it will either over or under estimate.



 ---