In [None]:
# Linear Regression Example - Google Colab Ready
# This example demonstrates linear regression with a health scenario

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Synthetic dataset: patient age (years) versus blood pressure.
# The legacy global NumPy seed is fixed so every run draws the same numbers.
np.random.seed(42)
n_samples = 100

# Ages are drawn from a normal distribution (mean 50, std 15) and then
# clamped into the 20-80 range.
ages = np.random.normal(loc=50, scale=15, size=n_samples).clip(20, 80)

# Blood pressure is a linear trend in age (offset 90, slope 0.8) plus normal
# noise with std 8, clamped into the 100-180 range. The noise is drawn after
# the ages, so the order of the two random calls must not be swapped.
blood_pressure = (
    90 + 0.8 * ages + np.random.normal(loc=0, scale=8, size=n_samples)
).clip(100, 180)

# Bundle both columns into a DataFrame for easier handling downstream.
df = pd.DataFrame({'Age': ages, 'Blood_Pressure': blood_pressure})

# Quick look at the first ten rows and the overall dimensions.
print("Sample of our data:", df.head(10), sep="\n")
print(f"\nDataset shape: {df.shape}")

# Prepare data for sklearn. Selecting with a list keeps X as a one-column
# DataFrame (estimators expect 2-D features); the target y is a 1-D Series.
X = df[['Age']]
y = df['Blood_Pressure']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the linear regression model and train it on the training split.
# fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X_train, y_train)

# Predict blood pressure for the held-out ages.
y_pred = model.predict(X_test)

# Report the fitted line: the intercept and the single slope coefficient.
intercept = model.intercept_
slope = model.coef_[0]
print("\n".join([
    "\nModel Results:",
    f"Intercept (baseline blood pressure): {intercept:.2f}",
    f"Coefficient (BP increase per year): {slope:.2f}",
    f"Equation: Blood Pressure = {intercept:.2f} + {slope:.2f} × Age",
]))

# Score the predictions on the held-out test split.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n".join([
    "\nModel Performance:",
    f"Mean Squared Error: {mse:.2f}",
    f"R² Score: {r2:.2f}",
    f"This model explains {r2*100:.1f}% of the variance in blood pressure",
]))

# Two side-by-side panels on one figure, drawn through the Axes objects
# rather than the implicit pyplot "current axes".
fig, (ax_fit, ax_cmp) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: training points with the fitted regression line on top.
ax_fit.scatter(X_train, y_train, alpha=0.6, color='blue', label='Training Data')
ax_fit.plot(X_train, model.predict(X_train), color='red', linewidth=2, label='Fitted Line')
ax_fit.set_xlabel('Age')
ax_fit.set_ylabel('Blood Pressure')
ax_fit.set_title('Linear Regression: Age vs Blood Pressure')
ax_fit.legend()
ax_fit.grid(True, alpha=0.3)

# Right panel: predicted against actual test values. The dashed diagonal is
# the y = x reference line on which perfect predictions would fall.
bp_range = [y_test.min(), y_test.max()]
ax_cmp.scatter(y_test, y_pred, alpha=0.6, color='green')
ax_cmp.plot(bp_range, bp_range, 'r--', linewidth=2)
ax_cmp.set_xlabel('Actual Blood Pressure')
ax_cmp.set_ylabel('Predicted Blood Pressure')
ax_cmp.set_title('Actual vs Predicted Values')
ax_cmp.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

# Make some predictions for new data
print("\nPredictions for new patients:")
new_ages = np.array([[25], [45], [65]])

# The model was fitted on a DataFrame whose only column is 'Age'. Passing a
# bare NumPy array to predict() makes scikit-learn warn that X does not have
# valid feature names, so wrap the ages in a frame with the same column name.
# new_ages itself stays an ndarray of shape (3, 1), as before.
new_predictions = model.predict(pd.DataFrame(new_ages, columns=['Age']))

# One output line per patient: the age and its predicted blood pressure.
for age, bp in zip(new_ages.ravel(), new_predictions):
    print(f"Age {age}: Predicted BP = {bp:.1f}")

# Put the fitted slope into words: the change in predicted blood pressure
# per year of age, and scaled up to a 20-year age gap.
slope = model.coef_[0]
print("\nInterpretation:")
print(f"For every 1 year increase in age, blood pressure increases by {slope:.2f} units")
print(f"A 20-year difference in age corresponds to {20 * slope:.1f} units difference in BP")

# Residuals are the actual test values minus the model's predictions.
residuals = y_test - y_pred

# Two diagnostic panels on one figure, drawn through the Axes objects.
fig, (ax_scatter, ax_hist) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: residuals against predictions, with a dashed zero line as the
# reference for "no error".
ax_scatter.scatter(y_pred, residuals, alpha=0.6)
ax_scatter.axhline(y=0, color='red', linestyle='--')
ax_scatter.set_xlabel('Predicted Values')
ax_scatter.set_ylabel('Residuals')
ax_scatter.set_title('Residual Plot')
ax_scatter.grid(True, alpha=0.3)

# Right panel: histogram of the residuals in 15 bins.
ax_hist.hist(residuals, bins=15, alpha=0.7, color='skyblue')
ax_hist.set_xlabel('Residuals')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Residuals')
ax_hist.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

# Numeric summary of the residuals shown above.
print("\nResidual Analysis:")
print(f"Mean of residuals: {np.mean(residuals):.3f} (should be close to 0)")
print(f"Standard deviation of residuals: {np.std(residuals):.2f}")