# 1.0 Setup

## 1.1 References

https://www.deeplearning.ai/

## 1.2 Install and import dependencies

In [1]:
# install dependencies 
!pip install plotly==4.0.0

Collecting plotly==4.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/58/f3/a49d3281cc7275164ecf89ad3497556b11d9661faa119becdf7f9d3b2125/plotly-4.0.0-py2.py3-none-any.whl (6.8MB)
[K     |████████████████████████████████| 6.8MB 2.8MB/s 
Installing collected packages: plotly
  Found existing installation: plotly 3.6.1
    Uninstalling plotly-3.6.1:
      Successfully uninstalled plotly-3.6.1
Successfully installed plotly-4.0.0


In [0]:
# import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.express as px
import plotly
import plotly.offline as pyo
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
import scipy.optimize as opt 
from sklearn.linear_model import LogisticRegression

# 2.0 Example 01

## 2.1 Get the data

In [8]:
# the dataset contains two tests result of microchips in a 
# factory and we are going to use the test results to predict
# whether the microchips should be accepted or rejected

data = pd.read_csv("lesson5data1.txt",
                   names=["X1","X2","Admitted"],
                   dtype={"X1":np.float64,"X2":np.float64, "Admitted": "object"}
                   )
data.head()

Unnamed: 0,X1,X2,Admitted
0,34.62366,78.024693,0
1,30.286711,43.894998,0
2,35.847409,72.902198,0
3,60.182599,86.308552,1
4,79.032736,75.344376,1


In [7]:
# fisrt contact with the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
X1          100 non-null float64
X2          100 non-null float64
Admitted    100 non-null object
dtypes: float64(2), object(1)
memory usage: 2.4+ KB


In [9]:
# first contact with the data
data.describe()

Unnamed: 0,X1,X2
count,100.0,100.0
mean,65.644274,66.221998
std,19.458222,18.582783
min,30.058822,30.603263
25%,50.919511,48.179205
50%,67.032988,67.682381
75%,80.212529,79.360605
max,99.827858,98.869436


In [10]:
# see all markers available
# https://plot.ly/python/reference/#scatter-marker

fig = px.scatter(data, 
                 x="X1",
                 y="X2",
                 color_discrete_sequence=["red","green"],
                 symbol_sequence=["cross","circle"],
                 color="Admitted",
                 width=500,
                 height=500,
                 title="Admitted vs not Admitted",symbol="Admitted",
                 )
#fig.update_layout({'legend_orientation':'h'})
fig.show()

In [0]:
# normalize using z-score
scaler = StandardScaler()
scaled_values =  scaler.fit_transform(data.drop(axis=1,labels="Admitted"))

In [0]:
data["X1_scaled"] = scaled_values[:,0]
data["X2_scaled"] = scaled_values[:,1]

## 2.2 Logistic/Sigmoid Function



$
\displaystyle sigmoid(x) = \frac{1}{1+e^{-x}}
$

In [0]:
# Logistic Function or Sigmoid Function
def logistic(z):
    # np.exp(x) raises x to the exponential power, ie e^x. e ~= 2.71828
    return 1 / (1 + np.exp(-z)) 

## 2.3 Cost Function

$
\displaystyle J(\theta) = - \frac{1}{m} \sum_{i=1}^{m}\left[ y^{(i)}\log(h_{\theta}(x^{(i)})) + (1-y)(\log(1-h_{\theta}(x^{(i)})\right]
$

In [0]:
def cost_function(theta, X, y):
  thetaX = logistic(np.matmul(X, theta))
  return -1/len(y) * np.sum(y*np.log(thetaX) + (1-y)*np.log(1 - thetaX))

## 2.4 Gradient Descent

In [0]:
def gradient_descent_multi(theta_, X, y, alpha, iterations):
    m = len(X)
    theta = theta_.copy()
    cost_history = []
    for i in range(iterations):
        gradient = (1/m) * np.dot(X.T, logistic(np.dot(X, theta)) - y)
        theta = theta - (alpha * gradient)
        cost_history.append(cost_function(theta,X, y))
    return theta, cost_history

## 2.5 Learning parameters using fmin_tnc

Instead of taking the gradient descent steps, we will use a built-in function [fmin_tnc](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.fmin_tnc.html) from **scipy** library. **fmin_tnc** is an optimization solver that finds the minimum of an unconstrained function. For logistic regression, you want to optimize the cost function with the parameters **theta**.

Concretely, you are going to use **fmin_tnc** to find the best or optimal parameters **theta** for the logistic regression **cost function**, given a fixed dataset (of X and y values). You will pass to **fmin_tnc** the following inputs:

- the initial values of the parameters we are trying to optimize.
- a function that (gradient(theta,X,y) in this case), when given the training set and a particular **theta**, computes the logistic regression cost and gradient with respect to **theta** for the dataset (X, y).

In [0]:
def gradient(theta,X,y):
  m = len(X)
  return (1/m) * np.dot(X.T, logistic(np.dot(X, theta)) - y)

## 2.6 Main

In [0]:
# define X and y
X = np.column_stack((np.ones(data.shape[0]),data[["X1_scaled","X2_scaled"]]))
y = data.Admitted.astype(np.int64).values.reshape(-1,1)

# define m and n
m,n = X.shape

# guess an initial value for theta
theta = np.zeros((n,1))

In [0]:
# using gradient descent
# theta, X, y, alpha, iterations
theta_batch, cost_history = gradient_descent_multi(theta,X,y,1,400)

In [19]:
# value of theta
theta_batch

array([[1.65947664],
       [3.8670477 ],
       [3.60347302]])

In [24]:
# using the swiss army knife: fmin_tnc
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.fmin_tnc.html
# see the return codes
result = opt.fmin_tnc(func = cost_function, 
                      x0 = theta.flatten(), 
                      fprime = gradient,
                      args = (X, y.flatten()))

# the output of above function is a tuple whose first element 
# contains the optimized values of theta
theta_opt = result[0].reshape(-1,1)
theta_opt

array([[1.71787865],
       [3.99150583],
       [3.72363971]])

In [25]:
fig = go.Figure(data=go.Scatter(x=np.arange(400), 
                                y=cost_history,
                                name="Cost Function"))
fig.update_layout(width=600,
                  height=400,
                  yaxis=go.layout.YAxis(title_text="Cost Function"),
                  xaxis=go.layout.XAxis(title_text="Iterations"),
                  title="Cost Function vs #iterations (using gradient descent)",
                  showlegend=True
                  )
fig.show()

## 2.7 Plotting the decision boundary

$
\theta_0 + \theta_1 x_1 + \theta_2 x_2 = 0
$ is the decision boundary

Since we plot $x_1$ against $x_2$ the boundary line will be the equation:

$
\displaystyle x_2 = \frac{-(\theta_0 + \theta_1 x_1)}{\theta_2}
$




In [26]:
fig = px.scatter(data, 
                 x="X1_scaled",
                 y="X2_scaled",
                 color_discrete_sequence=["red","green"],
                 symbol_sequence=["cross","circle"],
                 color="Admitted",
                 width=800,
                 height=500,
                 title="Admitted vs not Admitted",symbol="Admitted",
                 )
fig.add_scatter(x=data.X1_scaled, 
                y=-(theta_batch[0]+theta_batch[1]*data.X1_scaled)/theta_batch[2],
                name="Decision Boundary BGD")

fig.add_scatter(x=data.X1_scaled, 
                y=-(theta_opt[0]+theta_opt[1]*data.X1_scaled)/theta_opt[2],
                name="Decision Boundary fmin_tnc")
fig.show()

## 2.8 Prediction

In [0]:
# if z >=0 class 1
def prediction(X,theta):
  return np.dot(X,theta) >= 0

## 2.9 Accuracy on training set

In [28]:
p = prediction(X,theta_batch)
print("Train Accuracy BGD: {}".format(sum(p==y)[0]/len(y)))

Train Accuracy BGD: 0.89


In [29]:
p = prediction(X,theta_opt)
print("Train Accuracy fmin_tnc: {}".format(sum(p==y)[0]/len(y)))

Train Accuracy fmin_tnc: 0.89


## 2.10 A single test

In [32]:
x_test = np.array([40,81])
x_test = (x_test - scaler.mean_)/scaler.scale_
x_test = np.append(np.ones(1),x_test)
prob_batch = logistic(np.dot(x_test,theta_batch))
prob_fmin = logistic(np.dot(x_test,theta_opt))
print("For a student with scores 40 and 81, we predict an admission probability g(h_theta(X))\n"""
"Prob(batch): {0}\n Prob(fmin): {1}. \nIn other words, he or she was not admitted g(h_theta(X) < 0.5).".format(prob_batch,prob_fmin))


For a student with scores 40 and 81, we predict an admission probability g(h_theta(X))
Prob(batch): [0.35835075]
 Prob(fmin): [0.35596176]. 
In other words, he or she was not admitted g(h_theta(X) < 0.5).


# 3.0 Example 02

## 3.1 Get the data

In [33]:
# the dataset contains two tests result of microchips in a 
# factory and we are going to use the test results to predict
# whether the microchips should be accepted or rejected

data2 = pd.read_csv("lesson5data2.txt",
                   names=["X1","X2","Admitted"],
                   dtype={"X1":np.float64,"X2":np.float64, "Admitted": "object"}
                   )
data2.head()

Unnamed: 0,X1,X2,Admitted
0,0.051267,0.69956,1
1,-0.092742,0.68494,1
2,-0.21371,0.69225,1
3,-0.375,0.50219,1
4,-0.51325,0.46564,1


In [34]:
# fisrt contact with the data
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 3 columns):
X1          118 non-null float64
X2          118 non-null float64
Admitted    118 non-null object
dtypes: float64(2), object(1)
memory usage: 2.8+ KB


In [35]:
# first contact with the data
data2.describe()

Unnamed: 0,X1,X2
count,118.0,118.0
mean,0.054779,0.183102
std,0.496654,0.519743
min,-0.83007,-0.76974
25%,-0.37212,-0.254385
50%,-0.006336,0.213455
75%,0.47897,0.646562
max,1.0709,1.1089


In [36]:
# see all markers available
# https://plot.ly/python/reference/#scatter-marker

fig = px.scatter(data2, 
                 x="X1",
                 y="X2",
                 color_discrete_sequence=["green","red"],
                 symbol_sequence=["cross","circle"],
                 color="Admitted",
                 width=500,
                 height=500,
                 title="Admitted vs not Admitted",symbol="Admitted",
                 )
fig.show()

## 3.2 Polynomial Features

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html



In [37]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# n = 2 (x1, x2)
a = np.arange(6).reshape(3, 2)
print(a)
# d = 2 (degree is 2)
poly = PolynomialFeatures(2)
poly.fit_transform(a)

[[0 1]
 [2 3]
 [4 5]]


array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

$
\displaystyle terms = \binom{n+d}{d} = \binom{2+2}{2} = 6\\
\displaystyle \left[1, x_1, x_2, x_1^2, x_1 x_2, x_2^2 \right]\\
\displaystyle \theta_0 + \theta_1 x_1 + \theta_2 x_2 + \theta_3 x_1^2 + \theta_4 x_1 x_2 + \theta_5 x_2^2 = 0 \\
$


In [38]:
poly2 = PolynomialFeatures(3)
poly2.fit_transform(a)

array([[  1.,   0.,   1.,   0.,   0.,   1.,   0.,   0.,   0.,   1.],
       [  1.,   2.,   3.,   4.,   6.,   9.,   8.,  12.,  18.,  27.],
       [  1.,   4.,   5.,  16.,  20.,  25.,  64.,  80., 100., 125.]])

$
\displaystyle terms = \binom{n+d}{d} = \binom{2+3}{3} = 10\\
\displaystyle \left[1, x_1, x_2, x_1^2, x_1 x_2, x_2^2, x_1^3, x_1^2x_2, x_1x_2^2, x_2^3 \right]\\
\displaystyle \theta_0 + \theta_1 x_1 + \theta_2 x_2 + \theta_3 x_1^2 + \theta_4 x_1 x_2 + \theta_5 x_2^2 + \theta_6 x_1^3 + \theta_7 x_1^2x_2 + \theta_8 x_1x_2^2 + \theta_9 x_2^3 = 0 \\
$

In [39]:
poly = PolynomialFeatures(6)

# C(2+6,6) = 28 columns
X = poly.fit_transform(data2[["X1","X2"]])
y = data2.Admitted.astype(np.int64).values.reshape(-1,1)

# define m and n
m,n = X.shape

print(m,n)

# guess an initial value for theta
theta = np.zeros((n,1))

118 28


In [40]:
y.shape

(118, 1)

In [0]:
# using gradient descent
theta_batch, cost_history = gradient_descent_multi(theta,X,y,1,1000)

In [0]:
# using the swiss army knife: fmin_tnc
result = opt.fmin_tnc(func = cost_function, 
                      x0 = theta.flatten(), 
                      fprime = gradient,
                      #approx_grad = True,
                      args = (X, y.flatten())
                      )

#the output of above function is a tuple whose first element 
#contains the optimized values of theta
theta_opt = result[0].reshape(-1,1)

In [51]:
np.hstack((theta_batch,theta_opt))

array([[ 3.25330946e+00,  1.44841523e+01],
       [ 2.30689087e+00,  1.89467688e+01],
       [ 3.56981732e+00,  6.82056303e+00],
       [-5.02948176e+00, -1.54843730e+02],
       [-4.11214084e+00, -8.62129674e+01],
       [-4.98400183e+00, -5.56361782e+01],
       [ 1.02490787e+00, -1.53010127e+02],
       [-1.50909822e+00, -1.38851420e+02],
       [-9.49121021e-01, -7.27707860e+01],
       [-3.32284247e-01,  7.85309773e+00],
       [-4.10630911e+00,  5.56708720e+02],
       [ 2.87498386e-01,  5.19020470e+02],
       [-2.18883217e+00,  6.09923082e+02],
       [-1.54724591e+00,  2.83177923e+02],
       [-3.37455596e+00,  6.42501849e+01],
       [-6.08714253e-01,  2.76743854e+02],
       [-7.08618930e-01,  3.95139827e+02],
       [ 4.26069185e-01,  4.52428164e+02],
       [-1.29159280e+00,  2.99585929e+02],
       [-1.49618215e+00,  1.24060240e+02],
       [-2.45932430e-01, -8.86598211e+00],
       [-3.25985502e+00, -6.56270557e+02],
       [ 3.73145291e-01, -8.54347243e+02],
       [-9.

In [52]:
fig = go.Figure(data=go.Scatter(x=np.arange(1000), 
                                y=cost_history,
                                name="Cost Function"))
fig.update_layout(width=600,
                  height=400,
                  yaxis=go.layout.YAxis(title_text="Cost Function"),
                  xaxis=go.layout.XAxis(title_text="Iterations"),
                  title="Cost Function vs #iterations (gradient descent)",
                  showlegend=True
                  )
fig.show()

In [53]:
p = prediction(X,theta_batch)
print("Train Accuracy BGD: {}".format(sum(p==y)[0]/len(y)))

Train Accuracy BGD: 0.8389830508474576


In [54]:
p = prediction(X,theta_opt)
print("Train Accuracy fmin: {}".format(sum(p==y)[0]/len(y)))

Train Accuracy fmin: 0.864406779661017


## 3.3 Plot Decision Boundary

In [0]:
# For that, we will assign a color to each
x_ = np.linspace(-1,1.5,50)
y_ = np.linspace(-1,1.5,50)
xx, yy = np.meshgrid(x_, y_)

In [0]:
poly6 = PolynomialFeatures(6)
# manual predict
# transform X1 and X2 using a polynomial of higher degree
# z = x_transformed * theta

# change to theta_batch in order to see other decision boundary
Z = np.dot(poly6.fit_transform(np.c_[xx.ravel(), yy.ravel()]),theta_opt)
Z = Z.reshape(xx.shape)

In [63]:
Z.shape

(50, 50)

In [64]:
fig = px.scatter(data2, 
                 x="X1",
                 y="X2",
                 color_discrete_sequence=["green","red"],
                 symbol_sequence=["cross","circle"],
                 color="Admitted",
                 width=500,
                 height=500,
                 title="Admitted vs not Admitted",symbol="Admitted",
                 )
fig.add_contour(z=Z,
                x=x_,
                y=y_,
                ncontours=1,
                showscale=False,
                colorscale='haline')
fig.show()

# other colorscale
# https://plot.ly/python/v3/cmocean-colorscales/

## 3.4 Cost Function with Regularization (L2 - Ridge Regression)

**Reference**

https://machinelearningmastery.com/vector-norms-machine-learning/


**Ridge regression** adds ''squared magnitude'' of coefficient as penalty term to the loss function. Here this  part
```python
lambda_/(2*m) * np.sum(theta[1:]**2)
```
represents L2 regularization element.

$
\displaystyle J(\theta) = - \frac{1}{m} \sum_{i=1}^{m}\left[ y^{(i)}\log(h_{\theta}(x^{(i)})) + (1-y)(\log(1-h_{\theta}(x^{(i)})\right] + \frac{\lambda}{2m} \sum_{j=1}^{n}\theta_j^2\\
\theta_i: i = 0 \ldots n 
$

In [0]:
def cost_function_reg_l2(theta, X, y, lambda_):
  m = len(y)
  thetaX = logistic(np.matmul(X, theta))
  regularization = lambda_/(2*m) * np.sum(theta[1:]**2)
  return -1/m * np.sum(y*np.log(thetaX) + (1-y)*np.log(1 - thetaX)) + regularization

## 3.5 Cost Function with Regularization (L1 - Lasso Regression)

**Lasso Regression** (Least Absolute Shrinkage and Selection Operator) adds ''absolute value of magnitude'' of coefficient as penalty term to the loss function.

Here this parte 
```python
lambda_/(2*m) * np.sum(np.abs(theta[1:]))
```

 represents L1 regularization element.

$
\displaystyle J(\theta) = - \frac{1}{m} \sum_{i=1}^{m}\left[ y^{(i)}\log(h_{\theta}(x^{(i)})) + (1-y)(\log(1-h_{\theta}(x^{(i)})\right] + \frac{\lambda}{2m} \sum_{j=1}^{n}|\theta_j|\\
\theta_i: i = 0 \ldots n 
$

In [0]:
def cost_function_reg_l1(theta, X, y, lambda_):
  m = len(y)
  thetaX = logistic(np.matmul(X, theta))
  regularization = lambda_/(2*m) * np.sum(np.absolute(theta[1:]))
  return -1/m * np.sum(y*np.log(thetaX) + (1-y)*np.log(1 - thetaX)) + regularization

## 3.6 Gradient Descent with Regularization (L1 and L2)

In [0]:
def gradient_descent_multi_reg(theta_, X, y, alpha, iterations,lambda_,penalty):
    m = len(X)
    theta = theta_.copy()
    cost_history = []
    for i in range(iterations):
        gradient = np.dot(X.T, logistic(np.dot(X, theta)) - y)
        theta[0] = theta[0] - (alpha/m * gradient[0])
        if penalty == 'l1':
          theta[1:] = theta[1:] - (alpha/m * (gradient[1:] + lambda_/2))
          cost_history.append(cost_function_reg_l1(theta, X, y,lambda_))
        else:
          theta[1:] = theta[1:]*(1-alpha*lambda_/m) - (alpha/m * gradient[1:])
          cost_history.append(cost_function_reg_l2(theta, X, y,lambda_))
    return theta, cost_history

In [0]:
# gradient for fmin_tnc
def gradient_reg(theta,X,y,lambda_):
  m = len(X)
  grad = (1/m) * np.dot(X.T, logistic(np.dot(X, theta)) - y)
  grad[1:] = grad[1:] + (lambda_/m) * theta[1:]
  return grad

## 3.6 Main

In [112]:
poly = PolynomialFeatures(6)

# C(2+6,6) = 28 columns
X = poly.fit_transform(data2[["X1","X2"]])
y = data2.Admitted.astype(np.int64).values.reshape(-1,1)

# define m and n
m,n = X.shape

print(m,n)

# guess an initial value for theta
theta = np.zeros((n,1))

lambda_  = 0.5

118 28


In [0]:
theta_batch_reg_l1, cost_history_reg_l1 = gradient_descent_multi_reg(theta, X,y,1,280,lambda_,'l1')

In [0]:
theta_batch_reg_l2, cost_history_reg_l2 = gradient_descent_multi_reg(theta,X,y,1,280,lambda_,'l2')

In [0]:
# using the swiss army knife: fmin_tnc
result = opt.fmin_tnc(func = cost_function_reg_l1, 
                      x0 = theta.flatten(), 
                      fprime = gradient_reg,
                      #approx_grad = True,
                      args = (X, y.flatten(),lambda_),
                      maxfun=1000
                      )

#the output of above function is a tuple whose first element 
#contains the optimized values of theta
theta_opt = result[0].reshape(-1,1)

In [116]:
clf = LogisticRegression(penalty="l1",C=1/lambda_)
clf.fit(X,y.ravel())





LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [117]:
print(np.hstack((theta_batch,theta_batch_reg_l1,theta_batch_reg_l2,theta_opt,clf.coef_.reshape(28,1))))

[[ 3.25330946  2.34552949  1.57887747  1.71035761  1.6792032 ]
 [ 2.30689087  1.70043432  0.89299092  0.9799437   1.55159374]
 [ 3.56981732  2.61008535  1.57446815  1.65796762  2.19590763]
 [-5.02948176 -3.70018168 -2.54548567 -2.73692178 -5.59486766]
 [-4.11214084 -2.13592389 -1.30264737 -1.49621204 -3.43494794]
 [-4.98400183 -2.75793823 -1.8243869  -2.11724597 -4.93160753]
 [ 1.02490787  0.30422358  0.23122962  0.27707024  0.        ]
 [-1.50909822 -1.15428515 -0.51927564 -0.56838261  0.        ]
 [-0.94912102 -1.04695799 -0.49573234 -0.53398159  0.        ]
 [-0.33228425 -0.32210283 -0.19116124 -0.18065132  0.        ]
 [-4.10630911 -2.99896953 -1.8826361  -1.99969841 -2.27593759]
 [ 0.28749839 -0.50694553 -0.0838596  -0.06286978  0.        ]
 [-2.18883217 -1.66998372 -0.81923709 -0.90867848  0.        ]
 [-1.54724591 -1.05583167 -0.4076183  -0.46218555  0.        ]
 [-3.37455596 -2.45539718 -1.54466524 -1.64007657 -2.11285209]
 [-0.60871425 -0.68857047 -0.28410407 -0.27745375  0.  

In [118]:
p = prediction(X,theta_batch_reg_l1)
print("Train Accuracy BGD L1: {}".format(sum(p==y)[0]/len(y)))

Train Accuracy BGD L1: 0.8305084745762712


In [119]:
p = prediction(X,theta_batch_reg_l2)
print("Train Accuracy: BGD L2 {}".format(sum(p==y)[0]/len(y)))

Train Accuracy: BGD L2 0.8220338983050848


In [120]:
p = prediction(X,theta_opt)
print("Train Accuracy fmin: {}".format(sum(p==y)[0]/len(y)))

Train Accuracy fmin: 0.8220338983050848


In [121]:
p = prediction(X,clf.coef_.reshape(28,1))
print("Train Accuracy scikit: {}".format(sum(p==y)[0]/len(y)))

Train Accuracy scikit: 0.7372881355932204


In [123]:
fig = go.Figure(data=go.Scatter(x=np.arange(1000), 
                                y=cost_history_reg_l1,
                                name="Cost Function"))
fig.update_layout(width=600,
                  height=400,
                  yaxis=go.layout.YAxis(title_text="Cost Function"),
                  xaxis=go.layout.XAxis(title_text="Iterations"),
                  title="Cost Function vs #iterations",
                  showlegend=True
                  )
fig.show()

In [0]:
# For that, we will assign a color to each
x_ = np.linspace(-1,1.5,50)
y_ = np.linspace(-1,1.5,50)
xx, yy = np.meshgrid(x_, y_)

In [0]:
poly6 = PolynomialFeatures(6)
# manual predict
# transform X1 and X2 using a polynomial of higher degree
# z = x_transformed * theta
Z = np.dot(poly6.fit_transform(np.c_[xx.ravel(), yy.ravel()]),theta_batch_reg_l2)
Z = Z.reshape(xx.shape)

In [631]:
fig = px.scatter(data2, 
                 x="X1",
                 y="X2",
                 color_discrete_sequence=["green","red"],
                 symbol_sequence=["cross","circle"],
                 color="Admitted",
                 width=800,
                 height=500,
                 title="Admitted vs not Admitted",symbol="Admitted",
                 )
fig.add_contour(z=Z,
                x=x_,
                y=y_,
                ncontours=1,
                showscale=False,
                colorscale='haline')
fig.show()

# other colorscale
# https://plot.ly/python/v3/cmocean-colorscales/