In [1]:
import numpy as np
import tensorflow as tf
np.random.seed(2042)

In [2]:
print(tf.__version__)

2.1.0


In [3]:
from sklearn import datasets

# Load the bundled iris data set and show which fields the returned Bunch exposes.
iris = datasets.load_iris()
list(iris)

['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename']

In [4]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [5]:
# Keep only columns 2 and 3 (petal length, petal width) as features.
X = iris.data[:, [2, 3]]
y = iris.target
print(X.shape[0])

150


### total 150 samples

In [6]:
# Cast the features to single precision and report the resulting dtype and shape.
X = X.astype(np.float32)
print(X.dtype, X.shape, sep="\n")

float32
(150, 2)


In [7]:
# Prepend a column of ones so the bias can be learned as the first row of Theta.
X_with_bias = np.hstack([np.ones((len(X), 1)), X])

In [8]:
X_with_bias[:5]

array([[1.        , 1.39999998, 0.2       ],
       [1.        , 1.39999998, 0.2       ],
       [1.        , 1.29999995, 0.2       ],
       [1.        , 1.5       , 0.2       ],
       [1.        , 1.39999998, 0.2       ]])

In [9]:
X[:5]

array([[1.4, 0.2],
       [1.4, 0.2],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.4, 0.2]], dtype=float32)

In [10]:
# Fractions of the data held out for testing and for validation.
test_ratio, validation_ratio = 0.2, 0.2
total_size = X_with_bias.shape[0]

# Whatever is not held out is used for training.
test_size = int(test_ratio * total_size)
validation_size = int(validation_ratio * total_size)
train_size = total_size - (test_size + validation_size)

# One shuffled index array, sliced later into the three splits.
rnd_indices = np.random.permutation(total_size)

### total=150, train=90, validation=30, test=30

In [11]:
print(train_size)
print(test_size)

90
30


In [12]:
# Slice the shuffled indices into three contiguous, non-overlapping ranges:
#   [0, train_size) -> train, [train_size, valid_end) -> validation, rest -> test.
# Explicit end positions are used instead of negative offsets because
# `[-test_size:]` / `[train_size:-test_size]` silently misbehave when
# test_size == 0 (the test set would become the whole array and the
# validation set would be empty).
valid_end = train_size + validation_size
train_idx = rnd_indices[:train_size]
valid_idx = rnd_indices[train_size:valid_end]
test_idx = rnd_indices[valid_end:]

X_train, y_train = X_with_bias[train_idx], y[train_idx]
X_valid, y_valid = X_with_bias[valid_idx], y[valid_idx]
X_test, y_test = X_with_bias[test_idx], y[test_idx]

In [13]:
# Training rows of the raw (float32, no bias column) features, in the same order as X_train.
X_train_wo_bias = np.take(X, rnd_indices[:train_size], axis=0)

In [14]:
X_train_wo_bias[:5]

array([[1.4, 0.2],
       [4.1, 1.3],
       [5.2, 2. ],
       [4. , 1.3],
       [4.1, 1.3]], dtype=float32)

In [15]:
X_train[:5]

array([[1.        , 1.39999998, 0.2       ],
       [1.        , 4.0999999 , 1.29999995],
       [1.        , 5.19999981, 2.        ],
       [1.        , 4.        , 1.29999995],
       [1.        , 4.0999999 , 1.29999995]])

In [16]:
def to_one_hot(y, n_classes=None):
    """Convert a 1-D array of integer class labels into a one-hot matrix.

    Parameters
    ----------
    y : array-like of int, shape (m,)
        Class indices, each in ``[0, n_classes)``.
    n_classes : int, optional
        Number of columns in the result. When omitted it is inferred as
        ``y.max() + 1``. Pass it explicitly when encoding several splits so
        that all of them get the same width even if the largest class
        happens to be missing from one split.

    Returns
    -------
    numpy.ndarray of float64, shape (m, n_classes)
        Row ``i`` is all zeros except for a 1 in column ``y[i]``.
    """
    y = np.asarray(y)
    m = len(y)
    if m == 0:
        # y.max() is undefined for empty input; return an empty matrix instead of raising.
        return np.zeros((0, 0 if n_classes is None else n_classes))
    if n_classes is None:
        n_classes = int(y.max()) + 1
    Y_one_hot = np.zeros((m, n_classes))
    # Fancy indexing sets exactly one entry per row.
    Y_one_hot[np.arange(m), y] = 1
    return Y_one_hot

In [17]:
y_train[:10]

array([0, 1, 2, 1, 1, 0, 1, 1, 1, 0])

In [18]:
to_one_hot(y_train[:7])

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [19]:
# One-hot encode the targets of every split.
# Note: each call infers its class count from that split's own largest label.
Y_train_one_hot, Y_valid_one_hot, Y_test_one_hot = map(
    to_one_hot, (y_train, y_valid, y_test)
)

In [20]:
Y_train_one_hot.shape

(90, 3)

In [21]:
Y_train_one_hot[:5]

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [22]:
def softmax(logits):
    """Row-wise softmax of a 2-D array of scores.

    Parameters
    ----------
    logits : array-like, shape (m, k)
        Unnormalised scores, one row per sample.

    Returns
    -------
    numpy.ndarray, shape (m, k)
        Non-negative values where every row sums to 1.
    """
    logits = np.asarray(logits)
    # Softmax is invariant to adding a constant per row; subtracting the row
    # maximum keeps np.exp from overflowing to inf (and the ratio from
    # becoming NaN) when the scores are large.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(shifted)
    exp_sums = np.sum(exps, axis=1, keepdims=True)
    return exps / exp_sums

In [23]:
# Model dimensions, read off the training data.
n_inputs = X_train.shape[1]            # == 3 (2 features plus the bias term)
n_outputs = np.unique(y_train).size    # == 3 (3 iris classes)
n_iterations = 5001

In [24]:
# Batch gradient descent for softmax regression on the bias-augmented features.
eta = 0.01            # learning rate
n_iterations = 5001
m = len(X_train)
epsilon = 1e-7        # added inside the log so log(0) can never occur

Theta = np.random.randn(n_inputs, n_outputs)
# Snapshot of the initial parameters, used further down to reproduce the
# first gradient with TensorFlow. Taken as a copy so that an in-place update
# of Theta could never silently change it.
old_Theta = Theta.copy()
for iteration in range(n_iterations):
    logits = X_train.dot(Theta)
    Y_proba = softmax(logits)
    # Mean cross-entropy between predicted probabilities and one-hot targets.
    loss = -np.mean(np.sum(Y_train_one_hot * np.log(Y_proba + epsilon), axis=1))
    error = Y_proba - Y_train_one_hot
    gradients = 1/m * X_train.T.dot(error)
    # Progress report every 500 steps: loss first, then the gradient matrix.
    if iteration % 500 == 0:
        print(iteration, loss)
        print(iteration, gradients)
    Theta = Theta - eta * gradients

0 5.446205779299409
0 [[ 0.59406399 -0.28615298 -0.30791101]
 [ 3.17973972 -1.34526175 -1.83447798]
 [ 1.10140275 -0.43091503 -0.67048771]]
500 0.8350062644113228
500 [[-0.13595679  0.02841431  0.10754247]
 [ 0.01794235 -0.01436509 -0.00357726]
 [ 0.05473265  0.01235083 -0.06708348]]
1000 0.6878801462436703
1000 [[-0.10042224  0.02081561  0.07960662]
 [ 0.01573514 -0.01335253 -0.00238261]
 [ 0.04034821  0.01283161 -0.05317983]]
1500 0.6012379159284251
1500 [[-0.07766894  0.01412547  0.06354347]
 [ 0.01331931 -0.01173709 -0.00158222]
 [ 0.03100568  0.01336514 -0.04437082]]
2000 0.5444496888309313
2000 [[-0.06296144  0.00926182  0.05369962]
 [ 0.01131264 -0.01020736 -0.00110528]
 [ 0.02493899  0.0134161  -0.03835509]]
2500 0.5038530211914799
2500 [[-0.05300457  0.00581147  0.0471931 ]
 [ 0.00974229 -0.00889482 -0.00084747]
 [ 0.0208332   0.01309477 -0.03392797]]
3000 0.4729229006455079
3000 [[-0.04593355  0.00333114  0.04260241]
 [ 0.0085186  -0.00778969 -0.00072892]
 [ 0.01792546  0.012

### Now let's use TensorFlow to check the gradient

In [25]:
# Split the initial parameters into a bias row and a weight matrix, in float32.
theta32 = old_Theta.astype('float32')
row0, row1, row2 = theta32
bias = row0[np.newaxis, :]       # shape (1, 3): first row of Theta multiplies the ones column
matrix = np.stack([row1, row2])  # shape (2, 3): rows that multiply the two petal features

In [26]:
bias

array([[ 0.11330361, -0.23452355, -0.20774285]], dtype=float32)

In [27]:
matrix

array([[ 0.43433246, -0.66647124, -0.71757054],
       [ 1.0188498 ,  0.41245225, -0.7501844 ]], dtype=float32)

In [28]:
# Trainable TensorFlow copies of the initial weights and bias.
w2, b = tf.Variable(matrix), tf.Variable(bias)

In [29]:
# Record the forward pass so TensorFlow can differentiate the loss w.r.t. b and w2.
with tf.GradientTape() as tape:
    # Linear scores from the features without the ones column; the bias is added separately via b.
    logit = tf.matmul(X_train_wo_bias, w2) + b
    # Per-sample softmax cross-entropy against the one-hot training targets.
    entropyy = tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=Y_train_one_hot, name='entropy')
    # Average over the samples to get a scalar loss.
    losss = tf.reduce_mean(entropyy, name='loss')
# Returned in the order requested: [d(loss)/d(b), d(loss)/d(w2)].
gradients=tape.gradient(losss,[b,w2])

In [30]:
gradients

[<tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[ 0.59406406, -0.28615296, -0.30791098]], dtype=float32)>,
 <tf.Tensor: shape=(2, 3), dtype=float32, numpy=
 array([[ 3.1797397 , -1.3452618 , -1.834478  ],
        [ 1.101403  , -0.43091506, -0.67048764]], dtype=float32)>]

In [31]:
# Gradient descent with TensorFlow autodiff, restarted from the initial weights/bias.
n_iterations = 250000
w2 = tf.Variable(matrix)
b = tf.Variable(bias)

for iteration in range(n_iterations):
    with tf.GradientTape() as tape:
        logit = tf.matmul(X_train_wo_bias, w2) + b
        entropyy = tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=Y_train_one_hot, name='entropy')
        losss = tf.reduce_mean(entropyy, name='loss')
    gradients = tape.gradient(losss, [b, w2])
    grad_b, grad_w2 = gradients
    # Update the variables in place. assign_sub accepts tensors directly, so
    # there is no need to convert each gradient to NumPy on every step.
    b.assign_sub(eta * grad_b)
    w2.assign_sub(eta * grad_w2)

In [32]:
b

<tf.Variable 'Variable:0' shape=(1, 3) dtype=float32, numpy=array([[ 14.029167,   4.02366 , -18.383087]], dtype=float32)>

In [44]:
w2

<tf.Variable 'Variable:0' shape=(2, 3) dtype=float32, numpy=
array([[-2.909397  , -0.16438407,  2.1241298 ],
       [-4.368392  , -0.90395266,  5.951257  ]], dtype=float32)>

In [34]:
losss

<tf.Tensor: shape=(), dtype=float32, numpy=0.08747757>

In [50]:
# Pull the current variable values out as NumPy arrays.
# Note: this rebinds `bias`, which previously held the initial bias row.
weight, bias = w2.numpy(), b.numpy()

In [51]:
b

<tf.Variable 'Variable:0' shape=(1, 3) dtype=float32, numpy=array([[ 14.029167,   4.02366 , -18.383087]], dtype=float32)>

In [52]:
weight

array([[-2.909397  , -0.16438407,  2.1241298 ],
       [-4.368392  , -0.90395266,  5.951257  ]], dtype=float32)

In [53]:
# A single sample as a (1, 2) row: two feature values, 5 and 2.
inp1 = np.array([5, 2]).reshape(1, -1)
print(inp1.shape)

(1, 2)


In [54]:
inp1.dot(weight)

array([[-23.28376842,  -2.62982565,  22.52316332]])

In [55]:
inp1.dot(weight)+bias

array([[-9.25460124,  1.39383453,  4.14007616]])

In [56]:
# Logits for the sample, wrapped as a TensorFlow constant.
sample_logits = inp1.dot(weight) + bias
z = tf.constant(sample_logits)

### An iris with petal length 5 cm and petal width 2 cm: 93.9% probability of being Iris-Virginica (class 2)

In [57]:
tf.nn.softmax(z)

<tf.Tensor: shape=(1, 3), dtype=float64, numpy=array([[1.43137842e-06, 6.02991738e-02, 9.39699395e-01]])>