In [1]:
import numpy as np
import tensorflow as tf
# Seed NumPy's global RNG so the shuffled split (np.random.permutation) and the
# random Theta initialisation (np.random.randn) further down are repeatable.
np.random.seed(2042)

In [2]:
from sklearn import datasets

# Load the bundled Iris dataset and show which fields the returned container exposes.
iris = datasets.load_iris()
list(iris)

['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename']

In [3]:
# Keep only feature columns 2 and 3: petal length and petal width.
X = iris.data[:, [2, 3]]
y = iris.target
print(X.shape[0])

150


### Total: 150 samples

In [4]:
# Store the features as float32; rows of X are later multiplied with float32
# TensorFlow variables.
X = X.astype(np.float32)
print(X.dtype, X.shape, sep="\n")

float32
(150, 2)


In [5]:
# Prepend a column of ones to every sample so the bias can be learned as the
# first row of the weight matrix.
X_with_bias = np.c_[np.ones((X.shape[0], 1)), X]

In [6]:
# Peek at the first rows. The float32 features were concatenated with a
# default-dtype (float64) ones column, so they are shown at float64 precision;
# that is why 1.4 is displayed as 1.39999998 here.
X_with_bias[:5]

array([[1.        , 1.39999998, 0.2       ],
       [1.        , 1.39999998, 0.2       ],
       [1.        , 1.29999995, 0.2       ],
       [1.        , 1.5       , 0.2       ],
       [1.        , 1.39999998, 0.2       ]])

In [8]:
X[:5]

array([[1.4, 0.2],
       [1.4, 0.2],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.4, 0.2]], dtype=float32)

In [9]:
# Fractions of the data held out for testing and validation; the rest is used for training.
test_ratio = validation_ratio = 0.2
total_size = X_with_bias.shape[0]

test_size, validation_size = (
    int(total_size * ratio) for ratio in (test_ratio, validation_ratio)
)
train_size = total_size - (test_size + validation_size)

# Random ordering of all sample indices, used to shuffle the data before it is split.
rnd_indices = np.random.permutation(total_size)

### total=150, train=90, validation=30, test=30

In [10]:
# Show the training and test set sizes, one per line.
print(train_size, test_size, sep="\n")

90
30


In [11]:
# Slice the shuffled indices into three consecutive, non-overlapping ranges:
#   [0, train_size)          -> training set
#   [train_size, valid_end)  -> validation set
#   [valid_end, total_size)  -> test set
# Explicit positive bounds are used instead of `-test_size`: with test_size == 0
# the slice `[-0:]` would select *every* sample for the test set and
# `[train_size:-0]` would leave the validation set empty.
valid_end = train_size + validation_size
X_train = X_with_bias[rnd_indices[:train_size]]
y_train = y[rnd_indices[:train_size]]
X_valid = X_with_bias[rnd_indices[train_size:valid_end]]
y_valid = y[rnd_indices[train_size:valid_end]]
X_test = X_with_bias[rnd_indices[valid_end:]]
y_test = y[rnd_indices[valid_end:]]

In [12]:
# The same training rows as X_train, taken from the bias-free float32 matrix X.
X_train_wo_bias = X[rnd_indices[:train_size], :]

In [13]:
X_train_wo_bias[:5]

array([[1.4, 0.2],
       [4.1, 1.3],
       [5.2, 2. ],
       [4. , 1.3],
       [4.1, 1.3]], dtype=float32)

In [14]:
X_train[:5]

array([[1.        , 1.39999998, 0.2       ],
       [1.        , 4.0999999 , 1.29999995],
       [1.        , 5.19999981, 2.        ],
       [1.        , 4.        , 1.29999995],
       [1.        , 4.0999999 , 1.29999995]])

In [15]:
X_train.shape

(90, 3)

In [16]:
# Check which class labels ended up in the training split.
np.unique(y_train)

array([0, 1, 2])

In [17]:
def to_one_hot(y, n_classes=None):
    """Convert a vector of integer class labels into a one-hot matrix.

    Parameters
    ----------
    y : array-like of int, shape (m,)
        Class indices, each in ``[0, n_classes)``.
    n_classes : int, optional
        Number of columns of the result. When omitted it is inferred as
        ``y.max() + 1``. Pass it explicitly when encoding a subset (for
        example a validation or test split) that might not contain the
        highest class; otherwise the result would have too few columns.

    Returns
    -------
    numpy.ndarray of float, shape (m, n_classes)
        Row ``i`` is all zeros except for a 1 in column ``y[i]``.

    Raises
    ------
    ValueError
        If ``y`` is empty and ``n_classes`` is not given, because the number
        of classes cannot be inferred from no labels.
    """
    y = np.asarray(y)
    if n_classes is None:
        if y.size == 0:
            raise ValueError("cannot infer n_classes from an empty label array")
        n_classes = int(y.max()) + 1
    m = len(y)
    Y_one_hot = np.zeros((m, n_classes))
    # Fancy indexing: set column y[i] of row i to 1 for every sample at once.
    Y_one_hot[np.arange(m), y] = 1
    return Y_one_hot

In [18]:
y_train[:10]

array([0, 1, 2, 1, 1, 0, 1, 1, 1, 0])

In [19]:
# Demo on the first 7 training labels. They include class 2, so the width
# inferred from the largest label is 3 columns.
to_one_hot(y_train[:7])

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [20]:
# One-hot encode the labels of each split.
# NOTE(review): each call infers its width from its own largest label, so the
# three matrices only agree in shape if every split contains class 2 -- confirm.
Y_train_one_hot, Y_valid_one_hot, Y_test_one_hot = (
    to_one_hot(labels) for labels in (y_train, y_valid, y_test)
)

In [21]:
Y_train_one_hot.shape

(90, 3)

In [22]:
Y_train_one_hot[:5]

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [23]:
def softmax(logits):
    """Row-wise softmax of a 2-D array of scores.

    Parameters
    ----------
    logits : numpy.ndarray, shape (m, k)
        Unnormalised scores, one row per sample.

    Returns
    -------
    numpy.ndarray, shape (m, k)
        Non-negative values where every row sums to 1.
    """
    # Subtracting the per-row maximum leaves the result mathematically unchanged
    # (the factor exp(-max) cancels in the ratio) but keeps every exponent <= 0,
    # so np.exp cannot overflow to inf and produce inf/inf = nan.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(shifted)
    exp_sums = np.sum(exps, axis=1, keepdims=True)
    return exps / exp_sums

In [24]:
# Model dimensions, read off the training data.
n_inputs = X_train.shape[-1]          # == 3 (2 features plus the bias term)
n_outputs = np.unique(y_train).size   # == 3 (3 iris classes)

In [39]:
# Batch gradient descent for softmax regression, implemented directly in NumPy.
eta = 0.01            # learning rate
n_iterations = 5001
m = len(X_train)      # number of training samples
epsilon = 1e-7        # added inside log() so a zero probability cannot yield -inf

Theta = np.random.randn(n_inputs, n_outputs)
# Snapshot of the initial weights, reused later to initialise the TensorFlow model.
# A copy (not an alias) keeps the snapshot valid even if the update below is ever
# rewritten as an in-place operation such as `Theta -= eta * gradients`.
old_Theta = Theta.copy()
for iteration in range(n_iterations):
    logits = X_train.dot(Theta)
    Y_proba = softmax(logits)
    # Mean cross-entropy between the predicted probabilities and the one-hot targets.
    loss = -np.mean(np.sum(Y_train_one_hot * np.log(Y_proba + epsilon), axis=1))
    error = Y_proba - Y_train_one_hot
    gradients = 1/m * X_train.T.dot(error)
    # Log the loss and the gradients (both for the current, not-yet-updated Theta).
    if iteration % 500 == 0:
        print(iteration, loss)
        print(iteration, gradients)
    Theta = Theta - eta * gradients

0 6.101695306378934
0 [[ 0.62218423 -0.30549306 -0.31669117]
 [ 3.2235978  -1.37633498 -1.84726282]
 [ 1.10899029 -0.43684904 -0.67214126]]
500 0.7495956023769575
500 [[-0.12787783  0.0406489   0.08722893]
 [ 0.01993732 -0.01230197 -0.00763535]
 [ 0.04558461 -0.00280025 -0.04278436]]
1000 0.6335867657621506
1000 [[-0.09639016  0.02726899  0.06912117]
 [ 0.01709969 -0.01066787 -0.00643182]
 [ 0.03428076  0.00079905 -0.03507981]]
1500 0.5627810020111176
1500 [[-0.07570861  0.01789321  0.0578154 ]
 [ 0.01439991 -0.00899943 -0.00540048]
 [ 0.02679658  0.00314605 -0.02994263]]
2000 0.5154218466212835
2000 [[-0.06195896  0.01155791  0.05040105]
 [ 0.0122263  -0.00759661 -0.00462969]
 [ 0.02180504  0.00449721 -0.02630225]]
2500 0.4811610196484237
2500 [[-0.05244436  0.00720676  0.0452376 ]
 [ 0.01053729 -0.00646217 -0.00407512]
 [ 0.01834933  0.00521195 -0.02356128]]
3000 0.4548332562566568
3000 [[-0.04558076  0.00413705  0.04144371]
 [ 0.00922135 -0.00554356 -0.00367779]
 [ 0.01585887  0.005

### Now let's use TensorFlow to check the gradients

In [40]:
# The saved initial NumPy weights are float64; their rows are cast to float32
# below before being used to initialise the TensorFlow variables.
old_Theta.dtype

dtype('float64')

In [41]:
old_Theta

array([[ 0.63844637,  0.61608862,  0.53739165],
       [ 2.16968434,  0.08167019, -0.04731371],
       [ 0.72490157,  1.06711956,  2.04506174]])

In [42]:
# Split the initial weights into the pieces the TensorFlow model needs, as float32:
# row 0 multiplies the ones column (the bias), rows 1-2 multiply the two features.
row0, row1, row2 = (old_Theta[i, :].astype('float32') for i in range(3))
bias = np.array([row0])            # shape (1, 3)
matrix = np.array([row1, row2])    # shape (2, 3)

In [43]:
bias

array([[0.6384464 , 0.6160886 , 0.53739166]], dtype=float32)

In [44]:
matrix

array([[ 2.1696844 ,  0.08167019, -0.04731371],
       [ 0.72490156,  1.0671196 ,  2.0450618 ]], dtype=float32)

In [45]:
# Build the TensorFlow model from the same initial weights as the NumPy run:
# `matrix` holds the two feature-weight rows and `bias` the bias row of old_Theta.
# NOTE(review): tf.get_variable presumably raises if "weight0"/"bias0" already exist
# in the default graph, so re-running this cell likely needs a graph reset -- confirm.
w2=tf.get_variable( "weight0" , initializer = tf.constant(matrix) )
b = tf.get_variable(name='bias0', initializer=tf.constant(bias) )
# Logits for the whole training set, computed from the bias-free features plus `b`.
logit = tf.matmul(X_train_wo_bias, w2) + b

In [46]:
# Softmax cross-entropy computed from the raw logits against the one-hot training labels.
entropyy = tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=Y_train_one_hot, name='entropy')
losss = tf.reduce_mean(entropyy, name='loss') # computes the mean over all the examples in the batch
# Learning rate 0.01 matches `eta` in the NumPy loop, so both runs take comparable steps.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)

In [47]:
# Gradient ops for the bias and the weights (in that order), fetched only for
# inspection and comparison with the NumPy gradients.
grads=optimizer.compute_gradients(losss,var_list=[b,w2])
# The training op is built from the loss directly; it is not derived from `grads`.
training_op = optimizer.minimize(losss)

In [48]:
# Run the same number of gradient-descent steps as the NumPy loop, printing the loss
# and the gradients every 2000 iterations for comparison with the NumPy printouts.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for iteration in range(n_iterations):
        # Loss and gradients are fetched before the update step below, so they
        # describe the weights as they are at the start of this iteration.
        loss_result=sess.run(losss)
        if iteration %2000 == 0:
            print(iteration, loss_result)
        grads_out=sess.run(grads)
        if iteration %2000 == 0:
            print(iteration, grads_out)
        # Apply one gradient-descent update to the variables.
        sess.run(training_op)

0 6.103397
0 [(array([[ 0.62218446, -0.30549312, -0.31669122]], dtype=float32), array([[0.6384464 , 0.6160886 , 0.53739166]], dtype=float32)), (array([[ 3.2235985, -1.3763347, -1.8472627],
       [ 1.1089904, -0.4368491, -0.6721413]], dtype=float32), array([[ 2.1696844 ,  0.08167019, -0.04731371],
       [ 0.72490156,  1.0671196 ,  2.0450618 ]], dtype=float32))]
2000 0.515422
2000 [(array([[-0.06195899,  0.01155792,  0.05040106]], dtype=float32), array([[ 2.4165616 ,  0.18589571, -0.8105329 ]], dtype=float32)), (array([[ 0.0122263 , -0.0075966 , -0.00462967],
       [ 0.02180503,  0.0044972 , -0.02630224]], dtype=float32), array([[ 0.59397566,  0.9629732 ,  0.6470912 ],
       [-0.414544  ,  1.2808546 ,  2.9707763 ]], dtype=float32))]
4000 0.4160437
4000 [(array([[-0.03647733,  0.00026889,  0.03620844]], dtype=float32), array([[ 3.3518915,  0.0915499, -1.6515211]], dtype=float32)), (array([[ 0.00734784, -0.00416588, -0.00318202],
       [ 0.01256456,  0.00556715, -0.01813171]], dtype=f

In [None]:
[[ 0.62218423 -0.30549306 -0.31669117]
 [ 3.2235978  -1.37633498 -1.84726282]
 [ 1.10899029 -0.43684904 -0.67214126]]

[[-0.06195896  0.01155791  0.05040105]
 [ 0.0122263  -0.00759661 -0.00462969]
 [ 0.02180504  0.00449721 -0.02630225]]