In [2]:
import numpy as np


In [3]:
import tensorflow as tf

In [4]:
from sklearn import datasets
# Load scikit-learn's bundled iris dataset and list the fields it exposes.
iris=datasets.load_iris()
list(iris.keys())

['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename']

In [6]:
# Use only the two petal measurements as features; labels are the class ids.
petal_columns = (2, 3)  # petal length, petal width
X = iris["data"][:, petal_columns]
y = iris["target"]
print(X.shape[0])

150


In [7]:
# Inspect the raw feature matrix: container type, shape and number of axes.
raw_features = iris.data
for summary in (type(raw_features), raw_features.shape, raw_features.ndim):
    print(summary)

<class 'numpy.ndarray'>
(150, 4)
2


In [8]:
# NOTE(review): this repeats the petal-column selection already done in an
# earlier cell; it resets X to the raw (uncast) feature columns.
X = iris["data"][:, (2, 3)]  # petal length, petal width

In [9]:
# Preview the first five rows of the petal features.
X[:5]

array([[1.4, 0.2],
       [1.4, 0.2],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.4, 0.2]])

In [11]:
# Cast the features to single precision; the TensorFlow section later
# multiplies rows of X with float32 weights.
X = X.astype(np.float32)
print(X.dtype, X.shape, sep="\n")

float32
(150, 2)


In [12]:
# Prepend a column of ones so the bias can be learned as an ordinary weight.
# np.ones defaults to float64, so the stacked result is float64 as well.
bias_column = np.ones((X.shape[0], 1))
X_with_bias = np.hstack([bias_column, X])

In [13]:
# Seed NumPy's global generator so the shuffled split and the random initial
# Theta drawn further down are reproducible.
np.random.seed(2042)

In [14]:
# Same preview as before; the displayed array now reports dtype=float32.
X[:5]

array([[1.4, 0.2],
       [1.4, 0.2],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.4, 0.2]], dtype=float32)

In [15]:
# First rows with the bias column in front. Values such as 1.39999998 appear
# because the float32 features were widened to float64 when stacked.
X_with_bias[:5]

array([[1.        , 1.39999998, 0.2       ],
       [1.        , 1.39999998, 0.2       ],
       [1.        , 1.29999995, 0.2       ],
       [1.        , 1.5       , 0.2       ],
       [1.        , 1.39999998, 0.2       ]])

In [16]:
# Hold out 20% of the rows for testing and 20% for validation;
# whatever remains is used for training.
test_ratio = validation_ratio = 0.2
total_size = X_with_bias.shape[0]

test_size, validation_size = (
    int(total_size * ratio) for ratio in (test_ratio, validation_ratio)
)
train_size = total_size - (test_size + validation_size)

# One shuffled row order shared by every split below.
rnd_indices = np.random.permutation(total_size)

In [17]:
# Report how many rows go to the training and test splits.
print(train_size, test_size, sep="\n")

90
30


In [18]:
# Cut the shuffled indices into three disjoint, contiguous ranges:
# [0, train_size) -> train, [train_size, valid_end) -> validation, rest -> test.
# Boundaries are counted from the front on purpose: negative offsets such as
# `[-test_size:]` select the wrong rows when test_size is 0, because -0 == 0
# (validation would be empty and "test" would be the whole dataset).
valid_end = train_size + validation_size
train_idx = rnd_indices[:train_size]
valid_idx = rnd_indices[train_size:valid_end]
test_idx = rnd_indices[valid_end:]

X_train, y_train = X_with_bias[train_idx], y[train_idx]
X_valid, y_valid = X_with_bias[valid_idx], y[valid_idx]
X_test, y_test = X_with_bias[test_idx], y[test_idx]

In [19]:
# Same training rows as X_train, but taken from the float32 features
# without the bias column (used as the TensorFlow input later on).
X_train_wo_bias = X.take(rnd_indices[:train_size], axis=0)



In [20]:
# First five training rows without the bias column.
X_train_wo_bias[:5]

array([[1.4, 0.2],
       [4.1, 1.3],
       [5.2, 2. ],
       [4. , 1.3],
       [4.1, 1.3]], dtype=float32)

In [21]:
# The same five training rows, this time with the leading bias column.
X_train[:5]

array([[1.        , 1.39999998, 0.2       ],
       [1.        , 4.0999999 , 1.29999995],
       [1.        , 5.19999981, 2.        ],
       [1.        , 4.        , 1.29999995],
       [1.        , 4.0999999 , 1.29999995]])

In [33]:
# Check the training matrix dimensions (rows, bias + two petal features).
X_train.shape

(90, 3)

In [22]:
# Which class ids occur in the training labels.
np.unique(y_train)

array([0, 1, 2])

In [23]:
def to_one_hot(y, n_classes=None):
    """Encode integer class labels as one-hot rows.

    Parameters
    ----------
    y : array-like of int, shape (m,)
        Class ids, each in ``[0, n_classes)``.
    n_classes : int, optional
        Number of columns in the result. When omitted it is inferred as
        ``y.max() + 1``. Pass it explicitly when ``y`` is a subset (for
        example a validation split) that may not contain the highest class;
        otherwise the result would have too few columns.

    Returns
    -------
    numpy.ndarray of float, shape (m, n_classes)
        Row ``i`` is all zeros except for a 1 in column ``y[i]``.
    """
    y = np.asarray(y)
    if n_classes is None:
        n_classes = y.max() + 1
    m = len(y)
    Y_one_hot = np.zeros((m, n_classes))
    # Fancy indexing sets exactly one entry per row: (row i, column y[i]).
    Y_one_hot[np.arange(m), y] = 1
    return Y_one_hot

In [24]:
# First ten training labels (integer class ids).
y_train[:10]

array([0, 1, 2, 1, 1, 0, 1, 1, 1, 0])

In [25]:
# Try the encoder on the first seven training labels.
to_one_hot(y_train[:7])

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [26]:
# One-hot encode the labels of each split.
Y_train_one_hot, Y_valid_one_hot, Y_test_one_hot = (
    to_one_hot(labels) for labels in (y_train, y_valid, y_test)
)

In [27]:
# One row per training instance, one column per class.
Y_train_one_hot.shape

(90, 3)

In [28]:
# First five encoded training labels.
Y_train_one_hot[:5]

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [29]:
def softmax(logits):
    """Row-wise softmax of a 2-D array of scores.

    Parameters
    ----------
    logits : numpy.ndarray, shape (m, k)
        Unnormalised class scores, one row per instance.

    Returns
    -------
    numpy.ndarray, shape (m, k)
        Non-negative values; each row sums to 1.
    """
    # Subtracting the per-row maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (and the division
    # from producing NaN) when the scores are large.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(shifted)
    exp_sums = np.sum(exps, axis=1, keepdims=True)
    return exps / exp_sums

In [30]:
# Model dimensions, read off the training data.
n_inputs = X_train.shape[-1]           # == 3 (2 features plus the bias term)
n_outputs = np.unique(y_train).size    # == 3 (3 iris classes)

In [31]:
# Batch gradient descent for softmax regression on the training split.
eta = 0.01            # learning rate
n_iterations = 5001
m = len(X_train)
epsilon = 1e-7        # added inside log() so a zero probability cannot give -inf

Theta = np.random.randn(n_inputs, n_outputs)
# Keep the initial parameters for the comparison cells further down.
# An explicit copy (rather than a second name for the same array) guarantees
# the snapshot survives even if the update below is ever made in place.
old_Theta = Theta.copy()
for iteration in range(n_iterations):
    logits = X_train.dot(Theta)
    Y_proba = softmax(logits)
    # Mean cross-entropy between the one-hot targets and the predictions.
    loss = -np.mean(np.sum(Y_train_one_hot * np.log(Y_proba + epsilon), axis=1))
    error = Y_proba - Y_train_one_hot
    if iteration % 500 == 0:
        print(iteration, loss)
    # Gradient of the mean cross-entropy with respect to Theta.
    gradients = 1/m * X_train.T.dot(error)
    Theta = Theta - eta * gradients

0 5.446205779299409
500 0.8350062644113228
1000 0.6878801462436703
1500 0.6012379159284251
2000 0.5444496888309313
2500 0.5038530211914799
3000 0.4729229006455079
3500 0.4482424456589629
4000 0.4278651134705443
4500 0.410600718643495
5000 0.39567804213167


In [32]:
# Scores produced by the initial (untrained) parameters.
lots = X_train @ old_Theta
print(lots[:5], lots.shape, sep="\n")

[[ 0.925139   -1.08509285 -1.36237848]
 [ 3.21857133 -2.43086775 -4.12502168]
 [ 4.4095319  -2.87526948 -5.43947832]
 [ 3.17513813 -2.36422069 -4.0532647 ]
 [ 3.21857133 -2.43086775 -4.12502168]]
(90, 3)


In [34]:
# Class probabilities under the initial parameters. This rebinds Y_proba,
# which the training loop also assigns.
Y_proba = softmax(lots)
print(Y_proba[:5])

[[8.09404776e-01 1.08425932e-01 8.21692919e-02]
 [9.95851072e-01 3.50488824e-03 6.44039452e-04]
 [9.99261861e-01 6.85378173e-04 5.27604536e-05]
 [9.95366842e-01 3.91084137e-03 7.22316789e-04]
 [9.95851072e-01 3.50488824e-03 6.44039452e-04]]


In [36]:
# Mean cross-entropy of the initial parameters, using the same formula
# (including the epsilon term) as the training loop.
true_class_log_proba = np.sum(Y_train_one_hot * np.log(Y_proba + epsilon), axis=1)
loss_a = -np.mean(true_class_log_proba)
print(loss_a)

5.446205779299409


### Now let's use TensorFlow to compute the logits and softmax

In [37]:
# Split the initial parameter matrix into its three rows, cast to float32.
# Row 0 multiplies the bias column; rows 1 and 2 multiply the two features.
row0, row1, row2 = (old_Theta[i, :].astype('float32') for i in range(3))

In [40]:
# Feature weights as a (2, 3) matrix and the bias as a (1, 3) row.
matrix = np.array([row1, row2])
bias = np.array([row0])
for part in (bias, matrix):
    print(part.shape)

(1, 3)
(2, 3)


In [41]:
# TensorFlow variables initialised from the NumPy weight matrix and bias row.
# NOTE(review): get_variable registers the names in the default graph, so
# running this cell twice in one kernel presumably fails with an
# "already exists" error unless the graph is reset first -- confirm.
wt = tf.get_variable(name="weight", initializer=tf.constant(matrix))
bs = tf.get_variable(name="bias", initializer=tf.constant(bias))

Instructions for updating:
Colocations handled automatically by placer.


In [42]:
# Same model as the NumPy version: scores = features x weights + bias.
logitts = tf.matmul(X_train_wo_bias, wt) + bs
# NOTE(review): this rebinds `y`, which held the iris label array until now;
# earlier cells that index `y` will see this tensor if they are re-run.
y = tf.nn.softmax(logitts)
# Per-example cross-entropy computed directly from the logits. The _v2
# function replaces the deprecated softmax_cross_entropy_with_logits; the
# labels here are a constant array, so no gradient flows into them either way.
entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
    labels=Y_train_one_hot, logits=logitts, name='entropy')
loss = tf.reduce_mean(entropy, name='loss')  # mean over all the examples in the batch

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [47]:
# Evaluate the graph once with the initial parameters and print the results
# next to the static tensor shapes.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Fetch logits, probabilities and mean loss in a single run.
    lgts, sftmx, loss_value = sess.run([logitts, y, loss])
    print('logits=', lgts[:5])
    print(loss_value)
    print(lgts.shape)
    print(y.shape)
    print('softmax', sftmx[:5])
    print(entropy.shape)

logits= [[ 0.925139  -1.0850929 -1.3623785]
 [ 3.2185714 -2.4308677 -4.125022 ]
 [ 4.409532  -2.8752694 -5.439478 ]
 [ 3.1751382 -2.3642206 -4.0532646]
 [ 3.2185714 -2.4308677 -4.125022 ]]
5.447902
(90, 3)
(90, 3)
softmax [[8.09404790e-01 1.08425915e-01 8.21692869e-02]
 [9.95851040e-01 3.50488885e-03 6.44039072e-04]
 [9.99261796e-01 6.85378036e-04 5.27604352e-05]
 [9.95366931e-01 3.91084049e-03 7.22316618e-04]
 [9.95851040e-01 3.50488885e-03 6.44039072e-04]]
(90,)


### Compare with the results from the plain NumPy formula:

In [44]:
# NOTE(review): repeats the earlier NumPy softmax cell so its output can be
# read next to the TensorFlow result.
Y_proba = softmax(lots)
print(Y_proba[:5])

[[8.09404776e-01 1.08425932e-01 8.21692919e-02]
 [9.95851072e-01 3.50488824e-03 6.44039452e-04]
 [9.99261861e-01 6.85378173e-04 5.27604536e-05]
 [9.95366842e-01 3.91084137e-03 7.22316789e-04]
 [9.95851072e-01 3.50488824e-03 6.44039452e-04]]


In [48]:
# NumPy loss for the initial parameters. It is slightly lower than the
# TensorFlow value printed above (5.447902); presumably because the NumPy
# formula adds epsilon inside the log -- TODO confirm.
loss_a

5.446205779299409