In [1]:
# Besides neural networks, classical statistical models also work well.

# Regressions

# Key problem: optimal hyperparameters to use

# In the context of regression models, a hyperparameter is a configuration that is external 
# to the model and whose value is set before the learning process begins. 
# Hyperparameters are not derived from the data during the model's training; instead, 
# they influence the training process itself or the structure of the model. 
# Their settings can greatly affect the performance of the model on the training data and 
# its ability to generalize to new, unseen data.

In [2]:
# 1. Linear Regression

# Goal: find the equation or distribution that best fits a given dataset.

# Data modeling involves creating a model that uses features (columns) to predict an output with minimal error.

# Linear regression: y = a*x1 + b*x2 + c*x3 + d,
# where x1, x2, x3 come from the columns (features) that we believe can predict y.
# d is the bias or intercept: the constant value the model predicts when x1, x2, x3 are all equal to zero.
# It acts as a baseline (and equals the average outcome only when the features are mean-centered).


In [3]:
# Least-squares regression: the goal is to minimize the sum of squared residuals
# (a residual is how far a prediction is from the actual value).

import numpy as np
from sklearn.linear_model import LinearRegression


In [4]:
# One tuple per observed day:
# (number of customers, average temperature in °C, daily earnings in dollars).
cafe_observations = [
    (75, 22, 300),
    (92, 25, 350),
    (65, 20, 280),
    (120, 30, 400),
    (80, 22, 310),
    (98, 28, 360),
]

# Features: [number of customers, average temperature in Celsius]
cafe_data = np.array(
    [[customers, temperature] for customers, temperature, _ in cafe_observations]
)

# Target: Daily earnings in dollars
daily_earnings = np.array([earnings for _, _, earnings in cafe_observations])

In [5]:
# Least-squares fit of daily earnings on the two cafe features.
reg = LinearRegression()
reg.fit(X=cafe_data, y=daily_earnings)


In [6]:
# Days to predict earnings for, one [number of customers, temperature in °C] row each:
#   - 85 customers at 24°C
#   - 110 customers at 29°C
new_data = np.array([[85, 24], [110, 29]])

In [7]:
# Run the fitted cafe model on the new days and show the estimated earnings.
predicted_earnings = reg.predict(X=new_data)
print("Predicted earnings: ", predicted_earnings)

Predicted earnings:  [326.25873968 383.46474807]


In [8]:
# Evaluate model:
# Coefficients indicate the influence of each feature (number of customers and temperature) on daily earnings.
# Intercept is the baseline earnings when both features are zero (not realistic, but part of the linear model).
# The R² score (the coefficient of determination) tells us how well our model explains the variability in earnings.
# A higher R² score means a better model fit.


In [9]:
# Read the learned parameters off the fitted cafe model, then report them:
# the per-feature coefficients first, followed by the intercept.
cafe_coefficients, cafe_intercept = reg.coef_, reg.intercept_
print("Coeffcients: ", cafe_coefficients)
print("Intercept: ", cafe_intercept)

Coeffcients:  [1.62479138 3.3172448 ]
Intercept:  108.53759754612295


In [10]:
# R² of the cafe model, measured on the same rows it was fitted on.
# The value is stored as `cafe_r2` rather than `r2_score` so that it does not
# share a name with the `sklearn.metrics.r2_score` function imported later in
# this notebook: re-running this cell after that import would otherwise rebind
# `r2_score` to a float and break every later `r2_score(...)` call.
cafe_r2 = reg.score(cafe_data, daily_earnings)
print("R2 score: ", cafe_r2)

# Note: this is a toy example; a meaningful R² score must be calculated on unseen data.


R2 score:  0.989388410747095


In [12]:
# Case study: California Housing dataset

import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [13]:
# Load the California Housing dataset and unpack its feature matrix (X)
# and target vector (y) in one step.
housing = fetch_california_housing()
X, y = housing.data, housing.target

In [14]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [15]:
# Fit the housing regression on the training split only, so the test split
# stays unseen for evaluation.
model = LinearRegression()
model.fit(X=X_train, y=y_train)


In [16]:
# Predicted targets for the held-out test rows.
y_pred = model.predict(X=X_test)

In [17]:
# R² on the held-out test rows: compares the true targets with the predictions.
score = r2_score(y_true=y_test, y_pred=y_pred)
print("R² score on test data:", score)

R² score on test data: 0.5757877060324524


In [19]:
# Score the model on the rows it was trained on, for comparison with the
# held-out result. Note that `score` is reassigned here, so from this point on
# it holds the training R².
y_train_pred = model.predict(X=X_train)
score = r2_score(y_true=y_train, y_pred=y_train_pred)
print("R² score on training data:", score)

R² score on training data: 0.6125511913966952


In [21]:
# Read the learned parameters off the fitted housing model, then report them:
# the per-feature coefficients first, followed by the intercept.
housing_coefficients, housing_intercept = model.coef_, model.intercept_
print("Coeffcients: ", housing_coefficients)
print("Intercept: ", housing_intercept)

Coeffcients:  [ 4.48674910e-01  9.72425752e-03 -1.23323343e-01  7.83144907e-01
 -2.02962058e-06 -3.52631849e-03 -4.19792487e-01 -4.33708065e-01]
Intercept:  -37.02327770606391
