## Imports section

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from IPython.display import Markdown as md

## Part 1. Loading the dataset

In [2]:
#Use read_csv function to remotely read the csv and store it into a variable named df
df = pd.read_csv("https://raw.githubusercontent.com/profmcnich/example_notebook/main/science_data_large.csv")

#### First 15 rows

In [3]:
print(df.head(15))

    Temperature °C  Mols KCL     Size nm^3
0              469       647  6.244743e+05
1              403       694  5.779610e+05
2              302       975  6.196847e+05
3              779       916  1.460449e+06
4              901        18  4.325726e+04
5              545       637  7.124634e+05
6              660       519  7.006960e+05
7              143       869  2.718260e+05
8               89       461  8.919803e+04
9              294       776  4.770210e+05
10             991       117  2.441771e+05
11             307       781  5.006455e+05
12             206        70  3.145200e+04
13             437       599  5.390215e+05
14             566        75  9.185271e+04


#### Display summary of the table

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
Temperature °C    1000 non-null int64
Mols KCL          1000 non-null int64
Size nm^3         1000 non-null float64
dtypes: float64(1), int64(2)
memory usage: 23.5 KB


In [5]:
df.describe()

Unnamed: 0,Temperature °C,Mols KCL,Size nm^3
count,1000.0,1000.0,1000.0
mean,500.5,471.53,508611.1
std,288.819436,288.482872,447483.8
min,1.0,1.0,16.11429
25%,250.75,226.75,129826.7
50%,500.5,459.5,382718.2
75%,750.25,710.25,760321.1
max,1000.0,1000.0,1972127.0


## Part 2. Splitting the dataset

In [6]:
print("Columns : ", df.columns)

Columns :  Index(['Temperature °C', 'Mols KCL', 'Size nm^3'], dtype='object')


In [7]:
#features (X)
X = df[["Temperature °C", "Mols KCL"]]

#label (y)
y = df["Size nm^3"]

In [8]:
# Use the train_test_split function
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

In [9]:
print("Train: ",X_train.shape)
print("Test : ",X_test.shape)

print("Train: ",y_train.shape)
print("Test : ",y_test.shape)

Train:  (900, 2)
Test :  (100, 2)
Train:  (900,)
Test :  (100,)


## Part 3. Perform a Linear Regression

#### Make an object of the class LinearRegression followed by fit method which trains the model

In [10]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

#### Call predict method on the actual dataset

In [11]:
y_pred = model.predict(X_test)
y_pred

array([ 369686.35246253,  719204.34216601,  299668.31259342,
        330785.83121152,  114993.78083378, 1286902.33553064,
        296610.8259866 ,  761122.38511225,  -86604.34541716,
        494824.81372822,  785053.45501707,  395950.36337021,
        189756.45038235, 1048908.63410223,  806077.63667214,
        875526.92429924,  576149.57639279,  837671.24121002,
        -45504.10547184,  838438.65367022,  507427.73631429,
       1206568.54527744,  825294.78051977,  -68803.29609492,
        183729.5217511 ,  572550.56595653,  432464.66894267,
        864662.69966631, -184951.74604961, -285641.9222393 ,
       1013479.13160315, 1050377.43003824,  580635.89402403,
        431555.9372244 ,  742909.54124974,   70640.86927315,
        654841.97456456,  601767.21610519,  627665.18346383,
        119759.26177147,  343550.92431389,  373657.78360217,
        761734.57747555,  617217.96618572,  614436.16767564,
        483760.7645373 ,  862309.49847995,  116642.12333462,
        530629.0478006 ,

In [12]:
compare_actual_predicted = pd.DataFrame({"Actual" : y_test, 'Predicted' : y_pred})
compare_actual_predicted.head()

Unnamed: 0,Actual,Predicted
946,347416.4571,369686.352463
178,723400.3143,719204.342166
974,114028.1143,299668.312593
483,317104.1143,330785.831212
182,121228.4286,114993.780834


#### Use score() method which will use X_test to predict values above and compare with y_test values to tell the accuracy

In [13]:
model.score(X_test, y_test)

0.7983137890489602

In [14]:
model.score(X_train, y_train)


""" don't forget to use either one, and explain what score means"""

" don't forget to use either one, and explain what score means"

#### Coefficients and Intercept

In [15]:
temp_coef = model.coef_[0]
mols_coef = model.coef_[1]

print("Temperature Coefficient = ", temp_coef)
print("Mols Coefficient        = ", mols_coef)
coeff_df = pd.DataFrame(model.coef_, X.columns, columns=['Coefficient'])
coeff_df

Temperature Coefficient =  891.3556697511474
Mols Coefficient        =  1039.6253472343424


Unnamed: 0,Coefficient
Temperature °C,891.35567
Mols KCL,1039.625347


In [16]:
intercept = model.intercept_
print("Intercept = ", intercept)

Intercept =  -425874.351325789


#### Equation

In [17]:
md("$y = %i$ + %i$ + %iy$"%(intercept, temp_coef, mols_coef))

$y = -425874$ + 891$ + 1039y$

## Part 4. Use Cross Validation

In [18]:
scores = cross_val_score(model, X_train, y_train,cv=10)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.86 accuracy with a standard deviation of 0.01


## Part 5. Using Polynomial Regression

In [19]:
poly = PolynomialFeatures(degree=2)
X_test = poly.fit_transform(X_test)
X_train = poly.fit_transform(X_train)
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [20]:
model.score(X_train, y_train)

1.0