## Imports section

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from IPython.display import Markdown as md

## Part 1. Loading the dataset

In [2]:
#Use read_csv function to remotely read the csv and store it into a variable named df
df = pd.read_csv("https://raw.githubusercontent.com/profmcnich/example_notebook/main/science_data_large.csv")

#### First 15 rows

In [3]:
print(df.head(15))

    Temperature °C  Mols KCL     Size nm^3
0              469       647  6.244743e+05
1              403       694  5.779610e+05
2              302       975  6.196847e+05
3              779       916  1.460449e+06
4              901        18  4.325726e+04
5              545       637  7.124634e+05
6              660       519  7.006960e+05
7              143       869  2.718260e+05
8               89       461  8.919803e+04
9              294       776  4.770210e+05
10             991       117  2.441771e+05
11             307       781  5.006455e+05
12             206        70  3.145200e+04
13             437       599  5.390215e+05
14             566        75  9.185271e+04


#### Display summary of the table

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
Temperature °C    1000 non-null int64
Mols KCL          1000 non-null int64
Size nm^3         1000 non-null float64
dtypes: float64(1), int64(2)
memory usage: 23.5 KB


In [5]:
df.describe()

Unnamed: 0,Temperature °C,Mols KCL,Size nm^3
count,1000.0,1000.0,1000.0
mean,500.5,471.53,508611.1
std,288.819436,288.482872,447483.8
min,1.0,1.0,16.11429
25%,250.75,226.75,129826.7
50%,500.5,459.5,382718.2
75%,750.25,710.25,760321.1
max,1000.0,1000.0,1972127.0


## Part 2. Splitting the dataset

In [6]:
print("Columns : ", df.columns)

Columns :  Index(['Temperature °C', 'Mols KCL', 'Size nm^3'], dtype='object')


In [7]:
#features (X)
X = df[["Temperature °C", "Mols KCL"]]

#label (y)
y = df["Size nm^3"]

In [8]:
# Use the train_test_split function
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

## Part 3. Perform a Linear Regression

#### Make an object of the class LinearRegression followed by fit method which fits the regressor to the training data

In [9]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

#### Call predict method on the actual dataset

In [10]:
#Predict on the test data: y_pred
y_pred = model.predict(X_test)

#### Make a dataframe to compare the actual vs predicted

In [11]:
compare_actual_predicted = pd.DataFrame({"Actual" : y_test, 'Predicted' : y_pred})
compare_actual_predicted.head()

Unnamed: 0,Actual,Predicted
303,91638.31429,144219.872694
403,63319.11429,40338.36821
157,34530.42857,332978.389261
406,268353.7143,433060.229102
870,313985.2571,328727.129459


#### Use score() method which will use X_test to predict values above and compare with y_test values to tell the accuracy

In [12]:
score = model.score(X_test, y_test)
print("Score = ", score)

Score =  0.8502123283329874


#### What does the score here mean?
The scoring method for this linear regression is R squared. This metric quantifies the amount of variance in the target variable that is predicted from the feature variables.

In [13]:
score_percentage = score*100
md("In this case the R squared is %0.1f percent"%(score_percentage))

In this case the R squared is 85.0 percent

#### Coefficients and Intercept

In [14]:
temp_coef = model.coef_[0]
mols_coef = model.coef_[1]
intercept = model.intercept_
print("Temperature Coefficient = ", temp_coef)
print("Mols Coefficient        = ", mols_coef)
print("Intercept = ", intercept)
coeff_df = pd.DataFrame(model.coef_, X.columns, columns=['Coefficient'])
coeff_df

Temperature Coefficient =  893.7107657427094
Mols Coefficient        =  1016.4897016116258
Intercept =  -416899.20570338075


Unnamed: 0,Coefficient
Temperature °C,893.710766
Mols KCL,1016.489702


#### Equation

In [15]:
md("$y = %i$ + %i*temp$ + %i*mol$"%(intercept, temp_coef, mols_coef))

$y = -416899$ + 893*temp$ + 1016*mol$

## Part 4. Use Cross Validation

In [16]:
#I will choose to use cv = 10 that means 10 folds of data
cv_results = cross_val_score(model, X_train, y_train,cv=10)
cv_results_mean, cv_results_std = cv_results.mean(), cv_results.std()
print("%0.2f accuracy with a standard deviation of %0.2f" % (cv_results_mean, cv_results_std))

0.86 accuracy with a standard deviation of 0.01


#### Significance of using cross validation
When computing R squared on the test set, the R squared is dependent on the way you split up the data. The data points in the test set may have anomolies that which implies that the R-squared computed is not representative of the model's ability to generalize the unseen data. Cross validation here split the dataset into 10 folds/groups. This essentially took the first fold as a test set and fit the model on the remaining 9 folds. Then it predictedon the test set. This repeats for total 10 times. Eventually giving us an array of cross-validation scores. Therefore giving us a more accurate score value

## Part 5. Using Polynomial Regression

In [17]:
poly = PolynomialFeatures(degree=2)
X_test = poly.fit_transform(X_test)
X_train = poly.fit_transform(X_train)
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [18]:
#Predict on the test data: y_pred
y_pred_new = model.predict(X_test)
compare_actual_predicted = pd.DataFrame({"Actual" : y_test, 'Predicted' : y_pred_new})
compare_actual_predicted.head()

Unnamed: 0,Actual,Predicted
303,91638.31429,91638.314292
403,63319.11429,63319.114274
157,34530.42857,34530.428582
406,268353.7143,268353.714281
870,313985.2571,313985.257133


In [19]:
score = model.score(X_test, y_test)
print("Score = ", score)

Score =  1.0


In [20]:
coef = model.coef_
print("Coefficients = ",coef)
print("Intercepts = ", model.intercept_)

Coefficients =  [ 0.00000000e+00  1.20000000e+01 -1.28759629e-07 -2.31074870e-12
  2.00000000e+00  2.85714287e-02]
Intercepts =  1.7481332179158926e-05


In [22]:
md("$y = %0.2f$ + %0.2fa$ + %0.2fb + %0.2fa^2 + %0.2fab$ + %0.2fb^2"%(coef[0], coef[1],coef[2], coef[3], coef[4], coef[5]))

$y = 0.00$ + 12.00a$ + -0.00b + -0.00a^2 + 2.00ab$ + 0.03b^2