In [1]:
# Imports section
# Every dependency of the notebook is gathered in this single cell.
# NOTE(review): mean_squared_error, r2_score, plt and linear_model are not referenced
# in the cells visible here — confirm whether later cells need them.
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.metrics import mean_squared_error,r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn import linear_model
import warnings
# NOTE(review): 'ignore' with no category filter suppresses every warning for the rest
# of the session, including ones that could point at real problems.
warnings.filterwarnings('ignore')

### Part 1. Loading the dataset

In [2]:
# Read the CSV straight from its raw GitHub URL (remote load, no local copy needed)
data = pd.read_csv(
    "https://raw.githubusercontent.com/profmcnich/example_notebook/main/science_data_large.csv"
)

# Preview the first 15 rows; as the last expression this becomes the cell's output
data.head(n=15)

Unnamed: 0,Temperature °C,Mols KCL,Size nm^3
0,469,647,624474.3
1,403,694,577961.0
2,302,975,619684.7
3,779,916,1460449.0
4,901,18,43257.26
5,545,637,712463.4
6,660,519,700696.0
7,143,869,271826.0
8,89,461,89198.03
9,294,776,477021.0


In [3]:
# Display a summary of the table information (number of datapoints, etc.)
# info() prints the index range plus each column's non-null count and dtype.
data.info()
# describe() is the cell's last expression, so its statistics table
# (count, mean, std, min, quartiles, max) is rendered as the cell output.
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Temperature °C  1000 non-null   int64  
 1   Mols KCL        1000 non-null   int64  
 2   Size nm^3       1000 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 23.6 KB


Unnamed: 0,Temperature °C,Mols KCL,Size nm^3
count,1000.0,1000.0,1000.0
mean,500.5,471.53,508611.1
std,288.819436,288.482872,447483.8
min,1.0,1.0,16.11429
25%,250.75,226.75,129826.7
50%,500.5,459.5,382718.2
75%,750.25,710.25,760321.1
max,1000.0,1000.0,1972127.0


### Part 2. Splitting the dataset

In [4]:
# Separate the table into model inputs (X) and the regression target (y).
# Column order reported by data.info(): Temperature °C, Mols KCL, Size nm^3.
features = data.iloc[:, :2].to_numpy()  # first two columns: temperature and mols
label = data.iloc[:, -1].to_numpy()     # last column: size

# Hold out 10% of the rows for testing (90% train); the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.1,
    random_state=42,
)

### Part 3. Perform a Linear Regression

In [5]:
# Fit a linear regression on the training split (fit() returns the estimator itself)
lin_reg = LinearRegression().fit(X_train, y_train)

# Build one hand-made datapoint, ordered like the feature columns (temperature, mols),
# and predict its size with the trained model
sample = np.array([[566, 75]])
print("Sample predicition: ",lin_reg.predict(sample))

# Score the model on the held-out test split; the meaning of the score is
# explained in the markdown that follows this cell
score = lin_reg.score(X_test, y_test)
print("Score: ", score)

# Pull out the fitted coefficients and intercept, which are used to write h(x) in LaTeX
coef, intercept = lin_reg.coef_, lin_reg.intercept_

print("Coefficients: ",coef)
print("Y-intercept: ",intercept)

Sample predicition:  [158299.52037153]
Score:  0.8552472077276096
Coefficients:  [ 866.14641337 1032.69506649]
Y-intercept:  -409391.47958340833


\begin{equation}
h(x) = -409391.47958340833 + 866.14641337\,a + 1032.69506649\,b
\end{equation}

where $a$ is the temperature (°C) and $b$ is the amount of KCl (mols).

The linear regression model has a score of about 0.86 (86%). This score is the R² value, also known as the coefficient of determination: it is calculated by dividing the sum of squared residuals by the total sum of squares and subtracting this value from 1.
An R² value of 0.86 means the model explains roughly 86% of the variance in the test-set sizes, so the differences between the actual data and the fitted values are relatively small.
The higher the R² value (up to a maximum of 1), the better the model fits the data points.

### Part 4. Use Cross Validation

In [6]:
# Re-run the experiment with cross_val_score on the full dataset using its default
# splitter (no cv argument is passed; the recorded output shows one score per fold, five in total)
cross_val = cross_val_score(lin_reg, X=features, y=label)

print("Cross Val Score :", cross_val)
print("Mean : ", np.mean(cross_val))
print("Standard Deviation : ", np.std(cross_val))
# The findings and their significance are reported in the markdown after the K-fold cell

Cross Val Score : [0.83918826 0.87051239 0.85871066 0.87202623 0.84364641]
Mean :  0.8568167899144437
Standard Deviation :  0.013466307372096071


In [7]:
# Same evaluation, but with an explicit 5-fold splitter that shuffles the rows
# (seeded so the folds are reproducible) and an explicit R^2 scorer
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf_cross_val = cross_val_score(lin_reg, features, label, scoring="r2", cv=kf)

print("KFold Cross Val Score :", kf_cross_val)
print("Mean : ", kf_cross_val.mean())
print("Standard Deviation : ", kf_cross_val.std())

KFold Cross Val Score : [0.86151889 0.82742341 0.87195173 0.88166206 0.85609101]
Mean :  0.8597294202684644
Standard Deviation :  0.01838773713930643


Cross validation checks how accurate the model is on multiple, different subsets of the data, which shows how well it generalizes to data it was not trained on. It helps us identify overfitting, where a model performs well on the training set but poorly on the test set. It also helps reveal underfitting, where the model does not capture enough of the pattern in the data.

The scores from the cross validation are between 0.83 and 0.88. The cross validation method splits the dataset into 5 folds. For each fold, it holds that fold out as a test set, trains the model on the remaining data, and records the score. The function returns one score per fold, 5 scores in total. These scores are consistent with the score in Part 3; both are within the same range. For the shuffled K-fold run, the mean is 0.86 and the standard deviation is 0.018, which means that regardless of which subset of the data we select for training, the model returns a similar score.

### Part 5. Using Polynomial Regression

In [9]:
# Augment the two features with all degree-2 polynomial terms and fit a
# BayesianRidge regressor on the augmented training data
poly_reg = make_pipeline(PolynomialFeatures(degree=2), BayesianRidge())
poly_reg.fit(X_train, y_train)

# Score on the held-out test split, reported the same way as in Part 3
score = poly_reg.score(X_test, y_test)
print("Score: ",score)

# The regressor is looked up by its pipeline step name to read the fitted parameters
ridge_step = poly_reg.named_steps['bayesianridge']
coef, intercept = ridge_step.coef_, ridge_step.intercept_

print("Coefficients: ",coef)
print("Y-intercept: ",intercept)

Score:  1.0
Coefficients:  [ 0.00000000e+00  1.20000000e+01 -1.29217523e-07  1.22288846e-11
  2.00000000e+00  2.85714287e-02]
Y-intercept:  2.0834035240113735e-05


\begin{equation}
h(x) = 0.000020834035240113735 + (0 \cdot 1) + (12 \cdot a) + (-0.000000129217523 \cdot b) + (0.0000000000122288846 \cdot a^2)
+ (2 \cdot ab) + (0.0285714287 \cdot b^2)
\end{equation}

where $a$ is the temperature (°C) and $b$ is the amount of KCl (mols).

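Using PolynomialFeatures of degree 2 with BayesianRidge regression, the score is 1.0, i.e. an R² of 1.0 on the test set: the model's predictions match the test data essentially exactly. The polynomial regression performed better than the linear regression because the degree-2 features add squared and interaction terms ($a^2$, $ab$, $b^2$), which let the model fit curvature in the data that a purely linear model cannot represent.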