In [1]:
# LinearRegression
# Examples
import numpy as np
from sklearn.linear_model import LinearRegression
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
# y = 1 * x_0 + 2 * x_1 + 3

y = np.dot(X, np.array([1, 2])) + 3
reg = LinearRegression().fit(X, y)
reg.score(X, y)
reg.coef_       # slope (estimated)
reg.intercept_  # y intercept (estimated)
reg.predict(np.array([[3, 5]]))
# This predicts the y value for a new input [3, 5]


"""
This code performs Linear Regression from scratch using numpy.
The dot fucntion in numpy does matrix multiplication or sum of products. 

Here, X is the input data, also called features.
Each row is a data points: [x0, x1]
So, there are 4 examples and each has 2 features

'y = np.dot(X, np.array([1, 2])) + 3'
    - np.array([1, 2]) represents the weights for the two features: 
        --> x0 is multiplied by 1, and x1 is multiplied by 2
    - np.dot() performs a dot product of each row of X with [1, 2]
    
What np.dot() does here:
    - it computes the sum of the products of corresponding elements.
    - Example for first row [1, 1]:
        1*1 + 1*2 = 1 + 2 = 3
    
So:
    y = [ (1*1 + 1*2) + 3,
      (1*1 + 2*2) + 3,
      (2*1 + 2*2) + 3,
      (2*1 + 3*2) + 3 ]
  = [6, 8, 9, 11]

So y becomes:
    y = np.array([6, 8, 9, 11])
"""

array([16.])

In [2]:
print(X)

[[1 1]
 [1 2]
 [2 2]
 [2 3]]


In [13]:
# Advanced Linear Regression With statsmodels TUTORIAL starts now
# STEP 1: Import packages
import numpy as np
import statsmodels.api as sm

# STEP 2: Provide data and transform inputs
x = np.array([
  [0, 1], [5, 1], [15, 2], [25, 5], [35, 11], [45, 15], [55, 34], [60, 35]
])
y = np.array([4, 5, 20, 14, 32, 22, 38, 43])

# The input and output arrays are created but not done yet
# Add the column of ones to the onputs if you want statsmodels to calculate the intercept.
x = sm.add_constant(x)
# We do this because statsmodels doesnt automatically include the intercepy
# What x is now:
# array([[ 1.,  0.,  1.],
#        [ 1.,  5.,  1.],
#        [ 1., 15.,  2.],
#        [ 1., 25.,  5.],
#        [ 1., 35., 11.],
#        [ 1., 45., 15.],
#        [ 1., 55., 34.],
#        [ 1., 60., 35.]])
# You can see that the modifies x has three columns:
# the first column of ones, corresponding to b0 and replacing the intercept
# as well as two columns of the original features.

# STEP 3: Create a model and fit it
# The regression model based on OLS squares is an instance of the class:
# statsmodels.regression.linear_model.OLS
# This is how you can obtain one
model = sm.OLS(y, x) # NOTICE first argument is output followed by the input

# Now that the model is created, you can apply .fit() on it:
results = model.fit()
"""
By calling .fit(), you obtain the variable results, which is an instance of the class 
statsmodels.regression.linear_model.RegressionResultsWrapper. 
    - This object holds a lot of information about the regression model.
"""

# STEP 4: Get results
# Use .summary() to get the table with the results of linear regression
print(results.summary())

"""
This table is very comprehensive. You can find many statistical values associated with
linear regression, including r2, b0, b1, and b1.
"""

# You can extract any of the values from the table above.

print("r2 score:", results.rsquared)

print("r2 score adjusted:", results.rsquared_adj)

print("regression coafficients:", results.params)

"""
1. .rsquared holds r2

2. .rsquared_adj represents adjusted r2 - that is, r2 corrected according to the number of
   input features.
   
3. .params refers to the array with b0, b1, and b2
"""

# STEP 5: Predict response
# use .fittedvalued or .predict() with the input array as the argument

# Predicted response using .fittedvalues:
print("Predicted response:", results.fittedvalues)

# Predicted response using .predict()
print("Predicted response:", results.predict(x))


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.862
Model:                            OLS   Adj. R-squared:                  0.806
Method:                 Least Squares   F-statistic:                     15.56
Date:                Tue, 22 Apr 2025   Prob (F-statistic):            0.00713
Time:                        16:39:43   Log-Likelihood:                -24.316
No. Observations:                   8   AIC:                             54.63
Df Residuals:                       5   BIC:                             54.87
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          5.5226      4.431      1.246      0.2

  res = hypotest_fun_out(*samples, **kwds)


In [22]:
# CONTINUATION of previous clock of code

# Predicting response with new input features
# You can also apply .predict with new data as the argument
x_new = np.arange(10).reshape((-1, 2))
x_new = sm.add_constant(x_new)
x_new

y_pred = results.predict(x_new)
y_pred

array([ 5.77760476,  7.18179502,  8.58598528,  9.99017554, 11.3943658 ])

In [15]:
# Multiple Linear Regression with scikit-learn Tutorial starts now
# STEP 1: Import packages and classes
import numpy as np
from sklearn.linear_model import LinearRegression

# STEP 2: Provide data (known inputs and outputs)

X = np.array([
  [0, 1], [5, 1], [15, 2], [25, 5], [35, 11], [45, 15], [55, 34], [60, 35]
])

# X is a two dimesnional array with two columns

y = np.array([4, 5, 20, 14, 32, 22, 38, 43])
# One dimesnional array

# STEP 3: Create a model and fit it
# The next step is to create the regression model as an instance of LinearRegression and fit it
model = LinearRegression().fit(X, y)

# The result of this statement is the variable model referring o the object type LinearRegression
# It represents the regression model fitted with existing data

# STEP 4: Get Results
r2 = model.score(X, y)
print("r2 score", r2)

intercept = model.intercept_
print("Intercept", intercept)

slope = model.coef_
print("Slope", slope)

# In this example, intercept is 5.52, this is the value of the predicted response when x1 = x2 = 0. 
# An increase of x1 by 1 yields a rise of the predicted response by 0.45. 
# Similarly, when x2 grows by 1, the response rises by 0.26.

# STEP 5: Predict response
y_pred = model.predict(X)
print("Predicted response:", y_pred)

"""
How the predicted y values are calculated:
The model predicts:
  y = b + w1*x1 + w2*x2
  
  - b = intercept (5.52)
  - w1 = slope for x1 (0.45)
  - w2 = slope for x2 (0.26)

Why is the prediction for the first y value 5.77 when X = [0, 1]?
  1. First Input: X[0] = [0, 1]
    - x1 = 0 and x1 = 1
  2. Prediction Calculation:
    - y = 5.52 + (0.45 * 0) + (0.26*1)
        = 5.52 + 0 + 0.26
"""


r2 score 0.8615939258756776
Intercept 5.522579275198183
Slope [0.44706965 0.25502548]
Predicted response: [ 5.77760476  8.012953   12.73867497 17.9744479  23.97529728 29.4660957
 38.78227633 41.27265006]


In [44]:
# CONTINUATION or previous code block
# Predict y values for new X inputs/features
x_new = np.arange(10).reshape((-1, 2))
print(x_new)

y_new = model.predict(x_new)
print(y_new)

[[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]]
[ 5.77760476  7.18179502  8.58598528  9.99017554 11.3943658 ]


In [43]:
print(X)
print(y)

[[ 0  1]
 [ 5  1]
 [15  2]
 [25  5]
 [35 11]
 [45 15]
 [55 34]
 [60 35]]
[ 4  5 20 14 32 22 38 43]


In [61]:
# Polynomial Regression With scikit-learn Tutorial starts now
# STEP 1: Import packages and classes
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# STEP 2a: Provide data
# Define the inputs and outputs
X = np.array([5, 15, 25, 35, 45, 55]).reshape((-1, 1)) # 2D array, one column, 6 rows
y = np.array([15, 11, 2, 8, 25, 32])

# X, or the input, is a two-dimensional array

# STEP 2b: Transform the input data
transformer = PolynomialFeatures(degree=2, include_bias=False)

"""
Explanation of 'transformer = PolynomialFeatures(degree=2, include_bias=False)'

The variable 'transformer' refers to an instance of PolynomailFeatures that you can use to
transform the input X.

You can provide several optional parameters to PolynomialFeatures:
    - degree: is an integer (2 by default) that represents the degree of the polynomial
      regression function.
    - interaction_only: is Boolean (False by default) that decides whether to include only
      interaction features (True) or all features (False).
    - include_bias: is a Boolean (True by default) that decides whether to include the bias,
      or interceptm column of 1 values (True) or not (False).
      
This example uses the default values of all parameters except include_bias.
You'll sometimes want to experiement with the degree of the function, and it can be beneficial
for readability to provide this argument anayway.
"""

# Before applying transformer, you need to fit it with .fit():
transformer.fit(X)

# Once transformer is fitted, then it's ready to create a new, modified input array
# Apply .transform() to do that
x_ = transformer.transform(X)

# transform() takes the input array as the argument and returns the modified array.
# Use .fit_transform() to replace the three previous statements with only one:

x_ = PolynomialFeatures(degree=2, include_bias=False).fit_transform(X)

"""
With .fit_transform(), you're fitting and transforming the input array in one statement.
This method also takes the inout array and effectively does the same thing as .fit() and
.transform() called in that order.
It also returns the modifies array.

This is how the new input array looks:
[[   5.   25.]
 [  15.  225.]
 [  25.  625.]
 [  35. 1225.]
 [  45. 2025.]
 [  55. 3025.]]
 
The modified array ocntains two columns: one with the original inputs and the other with
their squares.
"""

# STEP 3: Create a model and fit it
model = LinearRegression().fit(x_, y)

# The regression model is now created and fitted.
# It's ready for application.
# Keep in mind that that the first argument of .fit() is the modified array x_ and not original x

# STEP 4: Get results
# Obtain properties of the model (r2 score, intercept, slope)
r2 = model.score(x_, y)
print("r2 score:", r2)

intercept = model.intercept_
print("intercept:", intercept)

slope = model.coef_
print("Slope:", slope)

# Here, ,intercept_ represents b0, while .coef_ references the array that contains b1 and b2.

# STEP 5: Predict Response
# Use .predict(), but remember that the argument should be the modified input array x_
y_pred = model.predict(x_)
print("Predicted response:", y_pred)

r2 score: 0.8908516262498563
intercept: 21.372321428571453
Slope: [-1.32357143  0.02839286]
Predicted response: [15.46428571  7.90714286  6.02857143  9.82857143 19.30714286 34.46428571]


In [59]:
# The same regression as above but with several input variables
# Step 1: Import packages and classes
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Step 2a: Provide data
x = np.array([
  [0, 1], [5, 1], [15, 2], [25, 5], [35, 11], [45, 15], [55, 34], [60, 35]
])
y = np.array([4, 5, 20, 14, 32, 22, 38, 43])

# Step 2b: Transform input data
x_ = PolynomialFeatures(degree=2, include_bias=False).fit_transform(x)

# Step 3: Create a model and fit it
model = LinearRegression().fit(x_, y)

# Step 4: Get results
r2 = model.score(x_, y)
intercept, coefficients = model.intercept_, model.coef_

# Step 5: Predict response
y_pred = model.predict(x_)

print("coefficient of determination:", r2)

print("intercept:", intercept)

print(f"coefficients:\n{coefficients}")

print(f"predicted response:\n{y_pred}")

"""
In this case there are six regression coefficients, including the intercept, as shown in the
estimated regression function: ùëì(ùë•‚ÇÅ, ùë•‚ÇÇ) = ùëè‚ÇÄ + ùëè‚ÇÅùë•‚ÇÅ + ùëè‚ÇÇùë•‚ÇÇ + ùëè‚ÇÉùë•‚ÇÅ¬≤ + ùëè‚ÇÑùë•‚ÇÅùë•‚ÇÇ + ùëè‚ÇÖùë•‚ÇÇ¬≤.

You can also notice that polynomial regression yielded a higher coefficient pf determination (r2)
than multiple linear regression for the same problem.

At first, you could think that obtaining such a large ùëÖ¬≤ is an excellent result. It might be.

However, in real-world situations, having a complex model and ùëÖ¬≤ very close to one might also
be a sign of overfitting. To check the performance of a model, you should test it with new 
data‚Äîthat is, with observations not used to fit, or train, the model.
"""

coefficient of determination: 0.9453701449127823
intercept: 0.8430556452397582
coefficients:
[ 2.44828275  0.16160353 -0.15259677  0.47928683 -0.4641851 ]
predicted response:
[ 0.54047408 11.36340283 16.07809622 15.79139    29.73858619 23.50834636
 39.05631386 41.92339046]


In [None]:
# New PolynomialFeatures Example:
# Shows how changin interaction_only works
# Transforming an X input to a new modified array

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X = np.arange(6).reshape((3, 2))  # 2D array with 2 columns and 3 rows
print("X:")
print(X)

print("\n")

poly = PolynomialFeatures(2)
x_ = poly.fit_transform(X)  # New transformed array
print("x_:")
print(x_)

print("\n")

poly = PolynomialFeatures(interaction_only=True)
x__ = poly.fit_transform(X)
print("x__:")
print(x__)

"""
Explanation:

This code demonstrates how to use scikit-learns PolynomialFeatures to transform input data
into polynomial features.

'X = np.arange(6).reshape((3, 2))'

    - Creates a 2D array with 3 rows and 2 columns
    - Represents 3 data points, each with 2 features
    
'poly = PolynomialFeatures(2)
x_ = poly.fit_transform(X)'

    - PolynomialFeatures(2) creates polynomial deatures up to degree 2
    
'poly = PolynomialFeatures(interaction_only=True)
x__ = poly.fit_transform(X)'

    - This is the second transformation (interaction only)
    - interaction_only=True means we only include interaction terms (products of different features)
    - Excludes pure squared terms
    
    
Output Explnation:
For input [a, b], the output becomes:
[1, a, b, a¬≤, ab, b¬≤]
The first output (x_) will show:
    - A column of 1's (bias term, column 0)
    - Original features (columns 1-2)
    - Squared terms (columns 3 and 5)
    - Interaction term (ab, column 4)

For input [a, b], the output becomes:
[1, a, b, ab] (No squared terms)
The second output (x__) will show:
    - A column of 1's (bias term)
    - Original features (columns 1-2)
    - Only the interaction term (column 3)
"""

X:
[[0 1]
 [2 3]
 [4 5]]


x_:
[[ 1.  0.  1.  0.  0.  1.]
 [ 1.  2.  3.  4.  6.  9.]
 [ 1.  4.  5. 16. 20. 25.]]


x__:
[[ 1.  0.  1.  0.]
 [ 1.  2.  3.  6.]
 [ 1.  4.  5. 20.]]


"\nExplanation:\n\nThis code demonstrates how to use scikit-learns PolynomialFeatures to transform input data\ninto polynomial features.\n\n'X = np.arange(6).reshape((3, 2))'\n\n    - Creates a 2D array with 3 rows and 2 columns\n    - Represents 3 data points, each with 2 features\n    \n'poly = PolynomialFeatures(2)\nx_ = poly.fit_transform(X)'\n\n    - PolynomialFeatures(2) creates polynomial deatures up to degree 2\n    \n'poly = PolynomialFeatures(interaction_only=True)\nx__ = poly.fit_transform(X)'\n\n    - This is the second transformation (interaction only)\n    - interaction_only=True means we only include interaction terms (products of different features)\n    - Excludes pure squared terms\n    \nOutput Explnation:\nThe first output (x_) will show:\n    - A column of 1's (bias term)\n    - Original features (columns 1-2)\n    - Squared terms (columns 3-4)\n    - Interaction term (column 5)\n\nThe second output (x__) will show:\n    - A column of 1's (bias term)\n    - Origina