<!-- <a href="https://colab.research.google.com/github/profmcnich/example_notebook/blob/main/a3_sample_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

\(\^Be sure to update this button to point to your notebook instead of the sample notebook\) -->

In [75]:
# Imports section

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression

## Part 1. Loading the dataset

In [2]:
# Load the dataset with pandas directly from its remote location (no local copy).
DATA_URL = "https://raw.githubusercontent.com/profmcnich/example_notebook/main/science_data_large.csv"

df_science = pd.read_csv(DATA_URL)

In [3]:
# Preview the first 15 rows of the data to sanity-check the columns and values

df_science.head(n=15)

Unnamed: 0,Temperature °C,Mols KCL,Size nm^3
0,469,647,624474.3
1,403,694,577961.0
2,302,975,619684.7
3,779,916,1460449.0
4,901,18,43257.26
5,545,637,712463.4
6,660,519,700696.0
7,143,869,271826.0
8,89,461,89198.03
9,294,776,477021.0


In [4]:
# Display a summary of the table information: number of datapoints (rows),
# column names, non-null counts per column, dtypes and memory usage.

df_science.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Temperature °C  1000 non-null   int64  
 1   Mols KCL        1000 non-null   int64  
 2   Size nm^3       1000 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 23.6 KB


## Part 2. Splitting the dataset

In [5]:
# Split the pandas dataset into features (X) and label (y) -- this cell builds the label.

# label (y): the 'Size nm^3' column (the third column), selected by name so it
# mirrors how the features cell excludes it; returned as a 1-D NumPy array.

y = df_science['Size nm^3'].values
y

array([6.24474257e+05, 5.77961029e+05, 6.19684714e+05, 1.46044903e+06,
       4.32572571e+04, 7.12463400e+05, 7.00696029e+05, 2.71826029e+05,
       8.91980286e+04, 4.77021029e+05, 2.44177114e+05, 5.00645457e+05,
       3.14520000e+04, 5.39021457e+05, 9.18527143e+04, 3.95288286e+04,
       5.38421457e+05, 1.14843143e+04, 1.48585029e+05, 4.16308457e+05,
       1.31596457e+05, 4.82433257e+05, 1.16136540e+06, 1.36031143e+04,
       4.24489114e+05, 1.97787143e+04, 8.03035857e+05, 3.21295000e+05,
       6.95233029e+05, 2.23961400e+05, 1.10432926e+06, 1.92627283e+06,
       5.21373600e+05, 7.91715314e+05, 4.53954314e+05, 5.11930286e+04,
       5.94753143e+04, 4.40629714e+05, 4.60782857e+05, 7.88616000e+04,
       8.03208600e+05, 2.28364457e+05, 2.41597829e+05, 1.04578046e+06,
       6.59932571e+04, 6.18540286e+04, 3.97636457e+05, 4.93009714e+05,
       1.18457029e+05, 2.42666829e+05, 1.26718971e+06, 1.28496257e+05,
       6.10293600e+05, 5.76091143e+04, 9.13729029e+05, 1.41796260e+06,
      

In [6]:
# Split the pandas dataset into features (X) and label (y) -- this cell builds the features.

# features (X): every column except the 'Size nm^3' label, as a 2-D NumPy array
# whose columns are, in order, 'Temperature °C' and 'Mols KCL'.

X = df_science.drop(columns=['Size nm^3']).values
X

array([[469, 647],
       [403, 694],
       [302, 975],
       ...,
       [791, 213],
       [769, 553],
       [919, 452]], dtype=int64)

In [7]:
# Use sklearn to split the features and labels into a training/test set (90% train, 10% test).

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.9,
    test_size=0.1,
    shuffle=True,
    random_state=1,  # fixed seed so the same split is produced on every run
)

## Part 3. Perform a Linear Regression

In [10]:
# Train a linear regression model on the training set with sklearn.

reg = LinearRegression()
reg = reg.fit(X_train, y_train)  # fit() returns the fitted estimator itself; assigning keeps the cell output-free

In [72]:
# Create a sample datapoint and predict the output of that sample with the trained model

# Let's create a sample point with Temperature °C: 28 and Mols KCL: 600
# (same column order as X).
# The nested list builds the 2-D (1 sample, 2 features) array directly.
X_sample_point = np.array([[28, 600]])

sample_temperature, sample_mols = X_sample_point[0]
print('Sample Point: ', sample_temperature, ',', sample_mols)

predict = reg.predict(X_sample_point)

print('Prediction: ', predict[0])

Sample Point:  28 , 600
Prediction:  233100.8901880661


In [40]:
# Report on the score for that model; the meaning of the score is explained in
# your own words in the markdown that follows (markdown, not code).
# NOTE: for a regressor, score() returns R^2 (the coefficient of determination),
# not a classification accuracy, despite the labels printed below.

train_accuray = reg.score(X_train, y_train)
test_accuray = reg.score(X_test, y_test)

for label, r2 in (('Training Accuracy: ', train_accuray), ('Testing Accuracy: ', test_accuray)):
    print(label, r2)

Training Accuracy:  0.8608840241280852
Testing Accuracy:  0.8580918842450161


## Report on the score of the model

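### What is the score function?
For a regression model, `score(X, y)` returns the coefficient of determination $R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}$. A value of 1.0 is a perfect fit, and 0.0 is no better than always predicting the mean of $y$.

### What does the score mean in our model?
The model explains about 86% of the variance in `Size nm^3` on both the training set ($R^2 \approx 0.861$) and the test set ($R^2 \approx 0.858$). Because the two scores are nearly equal, the model is not overfitting, but a purely linear fit leaves roughly 14% of the variance unexplained.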

In [54]:
# Extract the coefficients and intercept from the model and write an equation for your h(x) using LaTeX

reg_intercept = reg.intercept_  # b0: the constant term of h(x)
reg_coef = reg.coef_  # one coefficient per feature column of X, in column order

print('Intercept:')
print('b0:', f'{reg_intercept:0.5f}')
print('Coefficient:')
# Coefficients are labelled b1, b2, ... to match the equation for h(x).
for idx, weight in enumerate(reg_coef, start=1):
    print(f'b{idx}: ', f'{weight:0.5f}')

Intercept:
b0: -413045.20674
Coefficient:
b1:  861.67892
b2:  1036.69848


## Equation for your h(x)

### Linear Regression Equation

##  <center> $ h(x) = b_{0} + b_{1}\cdot x_{1} + b_{2}\cdot x_{2} + \ldots + b_{n}\cdot x_{n} $ </center>
where $b_0$ is the intercept, $b_1, \ldots, b_n$ are the coefficients, and $n$ is the number of features (here $n = 2$: $x_1$ is Temperature °C and $x_2$ is Mols KCL).

##  <center> $ h(x) = -413045.20674 + 861.67892 \cdot x_{1} + 1036.69848 \cdot x_{2}$ </center>

## Part 4. Use Cross Validation

## Part 5. Using Polynomial Regression

In [None]:
# Using the PolynomialFeatures library perform another regression on an augmented dataset of degree 2

# Report on the metrics and output the resultant equation as you did in Part 3.

# TODO: this cell contains no code yet. PolynomialFeatures is not among the
# imports in the notebook's imports cell and must be imported before it is used.