In [None]:
import matplotlib.pyplot as plt

# Sample data: one high-school GPA and one university GPA per student,
# paired by position in the two lists.
x = [3.2, 3.5, 3.8, 3.0, 3.7, 3.9, 3.5, 3.2, 3.6, 3.4]   # high_school_gpas
y = [3.5, 3.8, 3.9, 3.2, 4.0, 4.1, 3.7, 3.4, 3.6, 3.3]   # university_gpas

# Draw the observations as black dots ('k.').
plt.plot(x, y, 'k.')

# Annotate the figure so it can be read on its own.
plt.xlabel('High School GPAs')
plt.ylabel('University GPAs')
plt.title('High School GPAs vs University GPAs')

# Kept last: grid() returns None, so the cell echoes no object repr.
plt.grid(True)


In [None]:
import numpy as np

# Closed-form simple linear regression, y ≈ α + β·x:
#   β = cov(x, y) / var(x)
#   α = mean(y) − β·mean(x)

# variance of the high_school_gpas; ddof=1 applies Bessel's correction,
# i.e. this is the sample variance
variance = np.var(x, ddof=1)

# co-variance of the high_school_gpas and university_gpas:
# the off-diagonal entry of the matrix returned by np.cov
covariance = np.cov(x, y)[0, 1]

# slope of the fitted line (β)
beta = covariance / variance
print("β: ", beta)

# intercept of the fitted line (α)
alpha = np.mean(y) - beta * np.mean(x)
print("α: ", alpha)


In [None]:
# Redraw the observations, then overlay the fitted line y = α + β·x
# (alpha and beta come from the closed-form cell) and a single prediction.
plt.title('High School GPAs vs University GPAs')
plt.ylabel('University GPAs')
plt.xlabel('High School GPAs')

plt.plot(x, y,  'k.')  # k. means black colored dot
plt.grid(True)

# plot best-fit line
# np.linspace fixes the number of points explicitly; with a floating-point
# step, np.arange can gain or lose the end point through rounding.
_x = np.linspace(2.8, 4.5, 171)   # 2.80, 2.81, ..., 4.50
_y = alpha + beta * _x            # vectorised: evaluated for the whole grid at once
plt.plot(_x, _y)


# making prediction
high_school_gpa = 3.9
predicted_university_gpa = alpha + (beta * high_school_gpa)
print(predicted_university_gpa)  # 4.049456521739129

# plotting the predicted university GPA as a red star
plt.plot(high_school_gpa, predicted_university_gpa, marker="*", c='r')


In [None]:
!pip install scikit-learn -U

In [None]:
# using Scikit-learn
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np

x = [3.2,3.5,3.8,3.0,3.7,3.9,3.5,3.2,3.6,3.4]   # high_school_gpas
y = [3.5,3.8,3.9,3.2,4.0,4.1,3.7,3.4,3.6,3.3]   # university_gpas

# reshape the feature into a single-column 2-D array; this is the layout
# handed to fit() and predict() below
x = np.array(x).reshape(-1,1)

# create and fit the model
model = LinearRegression()
model.fit(x, y)

# plot the points
plt.plot(x, y, 'k.')
plt.grid(True)

plt.title('High School GPAs vs University GPAs')
plt.ylabel('University GPAs')
plt.xlabel('High School GPAs')

# plot the regression line (model output at the training inputs)
plt.plot(x, model.predict(x), color='b')

#---make prediction---
# The query value is defined here rather than borrowed from an earlier cell,
# so the number fed to predict() and the x-position of the marker cannot
# drift apart and the cell does not depend on leftover kernel state.
high_school_gpa = 3.9
predicted_university_gpa = model.predict([[high_school_gpa]])[0]
print(predicted_university_gpa)
plt.plot(high_school_gpa, predicted_university_gpa, marker="*", c='r')


In [None]:
# Show both fitted parameters. A cell only echoes its last expression, so the
# two values are combined into one tuple instead of two separate statements
# (otherwise the intercept would never be displayed).
# Expected: (0.34021739130434625, array([0.95108696]))  ->  (α, β)
model.intercept_, model.coef_

In [None]:
# Sum of the squared differences between the observed y and the model's
# predictions for x, printed with two decimals.
print('Residual sum of squares: %.2f' % np.square(y - model.predict(x)).sum())

In [None]:
# GPAs of a second group of students (paired by position); used below to
# evaluate the model on data it was not fitted on.
x_test = [3.1, 3.4, 3.5, 3.7, 3.8]   # high_school_gpas
y_test = [3.3, 3.6, 3.6, 3.9, 3.9]   # university_gpas


In [None]:
# Score the fitted model on the second group of students:
#   R² = 1 − SS_res / SS_total

x_test = np.array(x_test).reshape(-1,1)   # single feature column for predict()
y_test = np.array(y_test).reshape(-1)     # flat vector of observed targets

# total sum of squares: spread of the observed values around their mean
y_test_mean = y_test.mean()
ss_total = np.sum((y_test - y_test_mean) ** 2)

# residual sum of squares: what the model's predictions leave unexplained
ss_res = np.sum((y_test - model.predict(x_test)) ** 2)

# R-Squared
r_squared = 1 - (ss_res / ss_total)

print("ss_total: %.2f" % ss_total)
print("ss_res: %.2f" % ss_res)
print("R-squared: %.2f" % r_squared)


In [None]:
from sklearn.datasets import make_regression

# Synthetic regression data: 100 samples, one feature, noise level 2.3.
# random_state pins the generator so Restart & Run All reproduces the same
# points instead of a fresh random draw on every execution.
x, y = make_regression(n_samples=100, n_features=1, noise=2.3, random_state=0)


In [None]:
import matplotlib.pyplot as plt
# Visualise the current x / y pair. Axis labels are set so the figure can be
# read without looking at the code; scatter() stays last so the cell output
# is unchanged.
plt.xlabel('x')
plt.ylabel('y')
plt.scatter(x, y)


In [None]:
# Same kind of synthetic data as before, but with a much larger noise level (26).
# random_state pins the generator so the result is reproducible across runs.
x, y = make_regression(n_samples=100, n_features=1, noise=26, random_state=0)

In [None]:
import pandas as pd
# Load the insurance data set from the working directory.
df = pd.read_csv('insurance.csv')
print(df.shape)   # (number of rows, number of columns)
df.head()         # preview the first rows instead of echoing the whole table

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.linear_model import LinearRegression


def plot_bmi_vs_charges(ax, data, title, color=None):
    """Scatter charges against bmi on `ax` and overlay a fitted straight line.

    Parameters
    ----------
    ax : matplotlib Axes
        Panel to draw on.
    data : pandas.DataFrame
        Rows to plot; the 'bmi' and 'charges' columns are read.
    title : str
        Title of the panel.
    color : optional
        Colour of the scatter points; None keeps matplotlib's default.

    Returns
    -------
    LinearRegression
        The model fitted with bmi as the single feature and charges as target.
    """
    bmi = data['bmi'].values.reshape(-1,1)   # single feature column
    charges = data['charges']

    ax.scatter(bmi, charges, color=color)
    ax.set_title(title)
    ax.set_xlabel('bmi')
    ax.set_ylabel('charges')

    fitted = LinearRegression()
    fitted.fit(bmi, charges)
    ax.plot(bmi, fitted.predict(bmi), color='red')
    return fitted


# Read the file once and split it by smoking status, instead of re-reading
# the CSV for every panel.
df = pd.read_csv('insurance.csv')

fig, axes = plt.subplots(1,2)

# first plot: smokers; second plot: non-smokers
model1 = plot_bmi_vs_charges(axes[0], df.query('smoker == "yes"'), 'Smoker')
model2 = plot_bmi_vs_charges(axes[1], df.query('smoker == "no"'), 'Non Smoker',
                             color='orange')

fig.set_size_inches(12,4)
fig.tight_layout()

In [None]:
# Synthetic regression data with two features (1000 samples, noise level 3).
# random_state pins the generator so the printed values are reproducible.
X, y = make_regression(n_samples=1000, n_features=2, noise=3, random_state=0)
print(X)

In [None]:
from sklearn.datasets import make_regression
import matplotlib.pyplot as plt

# Two-feature synthetic data (1000 samples, noise level 1). random_state pins
# the generator so the figure is the same on every run.
X, y = make_regression(n_samples=1000, n_features=2, noise=1, random_state=0)

fig = plt.figure(figsize=(13,13))
ax = plt.axes(projection='3d')

# The two feature columns span the horizontal plane, the target is the
# height; points are also coloured by the target value (c=y).
ax.scatter3D(X[:,0],
             X[:,1],
             y,
             c=y,
             cmap='Greens')

ax.set_xlabel('X[0]')
ax.set_ylabel('X[1]')
ax.set_zlabel('y')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# 3-D view of the smokers in the insurance data:
# age and bmi on the horizontal axes, charges as the height.
df = pd.read_csv('insurance.csv').query('smoker == "yes"')

fig = plt.figure(figsize=(13,13))
ax = fig.add_subplot(projection='3d')

ax.scatter3D(df['age'], df['bmi'], df['charges'], c='Green')

ax.set_zlabel('charges')
ax.set_ylabel('bmi')
ax.set_xlabel('age')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Smokers only: 3-D scatter of (age, bmi, charges) with the fitted
# regression plane drawn on top.
df = pd.read_csv('insurance.csv')
df = df.query('smoker == "yes"')
fig = plt.figure(figsize=(13,13))
ax = plt.axes(projection='3d')
ax.scatter3D(df['age'],
             df['bmi'],
             df['charges'],
             c='Green')
ax.set_xlabel('age')
ax.set_ylabel('bmi')
ax.set_zlabel('charges')

# create a meshgrid of all the values for age and bmi
import numpy as np

x_surf = np.arange(15, 70, 1)   #---for age---
y_surf = np.arange(10, 60, 1)   #---for bmi---
x_surf, y_surf = np.meshgrid(x_surf, y_surf)

from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(df[['age','bmi']],    # independent variables
          df['charges'])        # dependent variable


# calculate z based on the model
def z(x1, x2):
    """Height of the fitted plane at age `x1` and bmi `x2`.

    Evaluates intercept + coef[0]*x1 + coef[1]*x2 using the global `model`
    as it is at call time. The arithmetic is element-wise, so the meshgrid
    arrays can be passed directly.
    """
    return model.intercept_ + model.coef_[0] * x1 + model.coef_[1] * x2


ax.plot_surface(x_surf,
                y_surf,
                z(x_surf,y_surf),
                rstride=1,
                cstride=1,
                color='yellow',
                alpha = 0.4)   # translucent, so the points stay visible

plt.show()


In [None]:
# Coefficients of whichever `model` was fitted last. When the notebook is run
# top to bottom that is the smoker fit on df[['age','bmi']], so the two values
# are the age and bmi coefficients, in that order.
print(model.coef_)

In [None]:
# Intercept of whichever `model` was fitted last (the constant term of the
# fitted plane used in z above when the notebook is run top to bottom).
print(model.intercept_)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Non-smokers only: 3-D scatter of (age, bmi, charges) with the fitted
# regression plane drawn on top.
df = pd.read_csv('insurance.csv')
df = df.query('smoker == "no"')
fig = plt.figure(figsize=(13,13))
ax = plt.axes(projection='3d')
ax.scatter3D(df['age'],
             df['bmi'],
             df['charges'],
             c='Orange')
ax.set_xlabel('age')
ax.set_ylabel('bmi')
ax.set_zlabel('charges')

# create a meshgrid of all the values for age and bmi
import numpy as np

x_surf = np.arange(15, 70, 1)   #---for age---
y_surf = np.arange(10, 60, 1)   #---for bmi---
x_surf, y_surf = np.meshgrid(x_surf, y_surf)

from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(df[['age','bmi']],    # independent variables
          df['charges'])        # dependent variable


# calculate z based on the model
def z(x1, x2):
    """Height of the fitted plane at age `x1` and bmi `x2`.

    Evaluates intercept + coef[0]*x1 + coef[1]*x2 using the global `model`
    as it is at call time. The arithmetic is element-wise, so the meshgrid
    arrays can be passed directly.
    """
    return model.intercept_ + model.coef_[0] * x1 + model.coef_[1] * x2


ax.plot_surface(x_surf,
                y_surf,
                z(x_surf,y_surf),
                rstride=1,
                cstride=1,
                color='Yellow',
                alpha = 0.4)   # translucent, so the points stay visible

plt.show()


In [None]:
# Predicted charges for one person (age 45, bmi 40) from the current `model`.
# The input is a one-row frame whose column names match the fitted features.
model.predict(pd.DataFrame({'age': [45], 'bmi': [40]}))
# array([9998.83299315])

In [None]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Polynomial regression (degree 5) on the GPA sample.
x = [3.2, 3.5, 3.8, 3.0, 3.7, 3.9, 3.5, 3.2, 3.6, 3.4]   # high_school_gpas
y = [3.5, 3.8, 3.9, 3.2, 4.0, 4.1, 3.7, 3.4, 3.6, 3.3]   # university_gpas

# plot the points
plt.plot(x, y, 'k.')
plt.grid(True)

df = pd.DataFrame({'x':x, 'y':y})

# Order the samples by x so the fitted curve is drawn from left to right.
sorted_indices = df['x'].values.argsort()
x1 = df['x'].values[sorted_indices].reshape(-1,1)   # single feature column
y1 = df['y'].values[sorted_indices]

# Expand the single feature into its polynomial terms up to `degree`.
from sklearn.preprocessing import PolynomialFeatures
degree = 5
poly_features = PolynomialFeatures(degree=degree)
x_poly = poly_features.fit_transform(x1)

# An ordinary linear model on the expanded features is a polynomial in x.
model = LinearRegression()
model.fit(x_poly, y1)

# Evaluate the fit at the (sorted) training inputs and draw it in red.
y_poly_predict = model.predict(x_poly)
plt.plot(x1.ravel(), y_poly_predict, color='red')

In [None]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Polynomial regression (degree 2) of charges on bmi, smokers only.
df = pd.read_csv('insurance.csv').query('smoker == "yes"')

x1 = df['bmi'].values.reshape(-1,1)
y1 = df['charges']

# first plot: raw observations
fig, axes = plt.subplots()
axes.scatter(x1, y1)
axes.set_xlabel('bmi')
axes.set_ylabel('charges')
axes.set_title('Smoker')

# Re-order the samples by bmi so the fitted curve is drawn left to right.
sorted_indices = df['bmi'].values.argsort()
x1 = df['bmi'].values[sorted_indices].reshape(-1,1)
y1 = df['charges'].values[sorted_indices]

# Expand bmi into its polynomial terms up to `degree`.
from sklearn.preprocessing import PolynomialFeatures
degree = 2
poly_features = PolynomialFeatures(degree=degree)
x_poly = poly_features.fit_transform(x1)

# Linear model on the expanded features.
model = LinearRegression()
model.fit(x_poly, y1)

# Evaluate at the sorted training inputs and overlay the curve in red.
y_poly_predict = model.predict(x_poly)
axes.plot(x1.ravel(), y_poly_predict, color='red')


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Smokers only: 3-D scatter of (age, bmi, charges) with a polynomial
# regression surface drawn on top.
df = pd.read_csv('insurance.csv')
df = df.query('smoker == "yes"')

fig = plt.figure(figsize=(13,13))
ax = plt.axes(projection='3d')
ax.scatter3D(df['age'],
             df['bmi'],
             df['charges'],
             c='Green')
ax.set_xlabel('age')
ax.set_ylabel('bmi')
ax.set_zlabel('charges')

# create a meshgrid of all the values for age and bmi
import numpy as np

x_surf = np.arange(15, 70, 1)   #---for age---
y_surf = np.arange(10, 60, 1)   #---for bmi---
x_surf, y_surf = np.meshgrid(x_surf, y_surf)

#---use a polynomial function of degree 2---
from sklearn.preprocessing import PolynomialFeatures
degree = 2
polynomial_features= PolynomialFeatures(degree = degree)
x_poly = polynomial_features.fit_transform(df[['age','bmi']])

from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_poly, df['charges'])


# calculate z based on the model
def z(x1, x2):
    """Height of the fitted polynomial surface at age `x1` and bmi `x2`.

    The inputs are flattened, expanded with the already-fitted
    `polynomial_features` transformer and passed to `model.predict`; the
    result is reshaped back to the shape of `x1`. Because the expansion is
    delegated to the transformer, no coefficient indices are hard-coded and
    the surface stays correct if `degree` is changed.
    """
    grid = pd.DataFrame({'age': np.ravel(x1), 'bmi': np.ravel(x2)})
    heights = model.predict(polynomial_features.transform(grid))
    return heights.reshape(np.shape(x1))


ax.plot_surface(x_surf,
                y_surf,
                z(x_surf,y_surf),
                rstride=1,
                cstride=1,
                color='yellow',
                alpha = 0.4)   # translucent, so the points stay visible

print(model.intercept_)  # -31373.674559076117
print(model.coef_)       # [ 0.00000000e+00 -1.61089934e+00
                         #   2.36856002e+03  3.97486813e-01
                         #   7.43342505e+00 -1.90741154e+01]

plt.show()
