In [27]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

In [28]:
# Load the raw measurements from the CSV file that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer="science_data_large.csv")

In [29]:
# Preview the first 15 rows to sanity-check the columns and value ranges.
df.head(15)


Unnamed: 0,Temperature °C,Mols KCL,Size nm^3
0,469,647,624474.3
1,403,694,577961.0
2,302,975,619684.7
3,779,916,1460449.0
4,901,18,43257.26
5,545,637,712463.4
6,660,519,700696.0
7,143,869,271826.0
8,89,461,89198.03
9,294,776,477021.0


In [30]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(1000, 3)

In [31]:
# Column labels; these exact strings are used to select features and target below.
df.columns

Index(['Temperature °C', 'Mols KCL', 'Size nm^3'], dtype='object')

# Part 2. Splitting the dataset

In [32]:
# Single-feature setup: predict `Size nm^3` from `Mols KCL` alone.
# Both are 1-D Series, so X has to be reshaped to a column before it is passed to scikit-learn.
X, y = df['Mols KCL'], df['Size nm^3']

In [33]:
# Hold out 10% of the samples for testing. A fixed random_state makes the split,
# and therefore the scores and coefficients derived from it, reproducible after
# a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [34]:
# Size of the training split (1-D because X is a single column).
X_train.shape

(900,)

In [35]:
# Size of the held-out test split.
X_test.shape

(100,)

# Part 3. Linear Regression

In [36]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
lr_model = LinearRegression()

In [37]:
# scikit-learn expects a 2-D feature matrix, so the 1-D feature is turned into
# a single column of shape (n_samples, 1) before fitting.
lr_model.fit(X=X_train.to_numpy().reshape(-1, 1), y=y_train)

LinearRegression()

In [38]:
# For a regressor, score() is the coefficient of determination (R^2), here on the test split.
lr_model.score(X=X_test.to_numpy().reshape(-1, 1), y=y_test)

0.502763085219885

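###### The linear regression model achieves an R² score of about 0.50 on the held-out test set, meaning that `Mols KCL` alone explains roughly 50% of the variance in `Size nm^3`. Note that for a regressor `score` returns R² (the coefficient of determination), not a classification accuracy.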

In [39]:
# Predicted sizes for the held-out samples (feature reshaped to a single column).
y_pred = lr_model.predict(X_test.to_numpy().reshape(-1, 1))

In [40]:
# Slope (one entry per feature) on the first line, intercept on the second.
print(lr_model.coef_, lr_model.intercept_, sep="\n")

[1160.75217481]
-33727.15332718915


###### Linear Regression Equation
$y = h(x) = -33727.15 + 1160.75 X_i$

# Part 4. Cross Validation

In [41]:
# Cross-validate on the training split only, so the test split stays untouched.
# One score is returned per fold.
cross_val_score(estimator=lr_model, X=X_train.to_numpy().reshape(-1, 1), y=y_train)

array([0.58613951, 0.5158616 , 0.60543157, 0.49842543, 0.54208702])

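###### The `cross_val_score` function returns one score per fold: for each fold, the model is refit on the remaining folds and evaluated on the held-out fold. Here that gives five R² scores, one for each of the 5 folds.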

# Part 5. Polynomial Regression

In [42]:
from sklearn.preprocessing import PolynomialFeatures

# Use every column except the target as a feature (`Temperature °C` and `Mols KCL`).
X = df.drop('Size nm^3', axis=1)
y = df['Size nm^3']

# Fixed random_state so the split and the reported score are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Expand the two features into all terms up to degree 2:
# 1, x0, x1, x0^2, x0*x1, x1^2 (the order of the fitted coefficients).
pr = PolynomialFeatures(degree=2)
X_poly_train = pr.fit_transform(X_train)
# Only transform the test split: the transformer is fitted on training data alone.
X_poly_test = pr.transform(X_test)

polynomial_model = LinearRegression()
polynomial_model.fit(X_poly_train, y_train)

# score() returns R^2, the fraction of the target's variance explained by the model.
print("The model explains {:.2f}% of the variance in size on the test set.".format(polynomial_model.score(X_poly_test, y_test)*100))

The model explains 100.00% of the average price.


In [43]:
# Reproduce the model's predictions by hand: dot product of the fitted
# coefficients with each row of polynomial features, plus the intercept.
a, b = polynomial_model.coef_, polynomial_model.intercept_

predictions = np.dot(a, X_poly_test.T) + b

In [44]:
# Intercept on the first line, then the coefficients rounded to two decimals,
# in the same order as the polynomial feature columns.
print(
    polynomial_model.intercept_,
    [f"{value:.2f}" for value in polynomial_model.coef_],
    sep="\n",
)

1.3691140338778496e-05
['0.00', '12.00', '-0.00', '-0.00', '2.00', '0.03']


##### Polynomial Regression Equation

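$y = h(x) = 1.37 \times 10^{-5} + 12.00\,T + 2.00\,T M + 0.03\,M^2$

where $T$ is `Temperature °C` and $M$ is `Mols KCL`. The degree-2 feature order is $1, T, M, T^2, TM, M^2$, so the coefficients of $M$ and $T^2$ are approximately zero and the intercept is negligible.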