# Basic knowledge

## Bag-of-words model

ref:   
[link](https://www.bookstack.cn/read/duoergun0729-nlp/%E8%AF%8D%E8%A2%8B%E6%A8%A1%E5%9E%8B%E5%92%8CTFIDF%E6%A8%A1%E5%9E%8B.md)  
[sklearn text-feature-extraction](https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction)

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    'I like the first document.',
    'He likes the first document.',
]

vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus)

print(X.toarray())
vectorizer.vocabulary_


[[0 1 1 0 1 0 0 0 0 1 0 1]
 [0 1 0 0 1 0 0 0 2 1 0 1]
 [1 0 0 0 0 0 0 1 0 1 1 0]
 [0 1 1 0 1 0 0 0 0 1 0 1]
 [0 1 1 0 0 1 0 0 0 1 0 0]
 [0 1 1 1 0 0 1 0 0 1 0 0]]


{'this': 11,
 'is': 4,
 'the': 9,
 'first': 2,
 'document': 1,
 'second': 8,
 'and': 0,
 'third': 10,
 'one': 7,
 'like': 5,
 'he': 3,
 'likes': 6}

In [18]:
vectorizer2 = CountVectorizer(min_df=1, stop_words='english')
X = vectorizer2.fit_transform(corpus)

print(X.toarray())
print(vectorizer2.vocabulary_)
vectorizer2.get_feature_names()

[[1 0 0 0]
 [1 0 0 2]
 [0 0 0 0]
 [1 0 0 0]
 [1 1 0 0]
 [1 0 1 0]]
{'document': 0, 'second': 3, 'like': 1, 'likes': 2}


['document', 'like', 'likes', 'second']

In [21]:
!pip list

Package                  Version
------------------------ -----------
absl-py                  0.11.0
anyio                    2.1.0
appdirs                  1.4.4
appnope                  0.1.2
APScheduler              3.6.3
argon2-cffi              20.1.0
astunparse               1.6.3
async-generator          1.10
attrs                    20.3.0
Babel                    2.9.0
backcall                 0.2.0
backports.zoneinfo       0.2.1
black                    20.8b1
bleach                   3.3.0
boto3                    1.24.81
botocore                 1.27.81
CacheControl             0.12.6
cachetools               4.2.2
cachy                    0.3.0
certifi                  2020.12.5
cffi                     1.14.5
chardet                  4.0.0
cleo                     0.8.1
click                    7.1.2
clikit                   0.6.2
crashtest                0.3.1
cycler                   0.10.0
decorator                4.4.2
defusedxml               0.6.0
dill             

## tf-idf

In [22]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False)
transformer

TfidfTransformer(smooth_idf=False)

In [24]:
# Use the corpus from above again 
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    'I like the first document.',
    'He likes the first document.',
]

vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus)
term_counts = X.toarray()
term_counts

array([[0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 1, 0, 0, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1],
       [0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0]])

In [26]:
tfidf = transformer.fit_transform(term_counts)


<6x12 sparse matrix of type '<class 'numpy.float64'>'
	with 28 stored elements in Compressed Sparse Row format>

In [27]:
tfidf.toarray()

array([[0.        , 0.37190386, 0.44209453, 0.        , 0.53258605,
        0.        , 0.        , 0.        , 0.        , 0.3145539 ,
        0.        , 0.53258605],
       [0.        , 0.18858203, 0.        , 0.        , 0.27005947,
        0.        , 0.        , 0.        , 0.89057951, 0.15950148,
        0.        , 0.27005947],
       [0.56538652, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.56538652, 0.        , 0.20251978,
        0.56538652, 0.        ],
       [0.        , 0.37190386, 0.44209453, 0.        , 0.53258605,
        0.        , 0.        , 0.        , 0.        , 0.3145539 ,
        0.        , 0.53258605],
       [0.        , 0.3389545 , 0.40292653, 0.        , 0.        ,
        0.80035708, 0.        , 0.        , 0.        , 0.28668554,
        0.        , 0.        ],
       [0.        , 0.26463289, 0.31457796, 0.62486502, 0.        ,
        0.        , 0.62486502, 0.        , 0.        , 0.22382481,
        0.        ,

In [30]:
transformer.idf_

array([2.79175947, 1.18232156, 1.40546511, 2.79175947, 1.69314718,
       2.79175947, 2.79175947, 2.79175947, 2.79175947, 1.        ,
       2.79175947, 1.69314718])

In [None]:
# New in Pandas version 1.0
#transformer.feature_names_in_

---

# Test tensor

In [3]:
import tensorflow as tf

In [8]:
tf_con = tf.constant([[1,2,3]])
tf_con

<tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[1, 2, 3]], dtype=int32)>

In [9]:
print(tf_con.shape, tf_con.ndim, tf_con.dtype)
print(tf.shape(tf_con))

(1, 3) 2 <dtype: 'int32'>
tf.Tensor([1 3], shape=(2,), dtype=int32)


In [10]:
tf_con*2

<tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[2, 4, 6]], dtype=int32)>

In [11]:
tf_var = tf.Variable(tf_con)
tf_var

<tf.Variable 'Variable:0' shape=(1, 3) dtype=int32, numpy=array([[1, 2, 3]], dtype=int32)>

In [12]:
tf.constant([[1, 3],[2,0]])

<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[1, 3],
       [2, 0]], dtype=int32)>

In [18]:
tf.sequence_mask([[1, 3],[2,0]]).numpy()

array([[[ True, False, False],
        [ True,  True,  True]],

       [[ True,  True, False],
        [False, False, False]]])

In [22]:
a=tf.Variable([i for i in range(10)])
a

<tf.Variable 'Variable:0' shape=(10,) dtype=int32, numpy=array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)>

In [38]:
b=tf.sequence_mask(5,10)
# b = tf.cast(b,dtype=tf.int32)
b

<tf.Tensor: shape=(10,), dtype=bool, numpy=
array([ True,  True,  True,  True,  True, False, False, False, False,
       False])>

In [26]:
tf.bitwise.bitwise_and(a,b)

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([0, 1, 0, 1, 0, 0, 0, 0, 0, 0], dtype=int32)>

In [31]:
c = tf.Variable([100]*10)

In [41]:
tf.where(b,a,[0])

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([0, 1, 2, 3, 4, 0, 0, 0, 0, 0], dtype=int32)>

In [43]:
label = tf.Variable([i for i in range(10)])
label

<tf.Variable 'Variable:0' shape=(10,) dtype=int32, numpy=array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)>

In [66]:
tf.reshape(tf.Variable([[i for i in range(10)]]),[10,1])

<tf.Tensor: shape=(10, 1), dtype=int32, numpy=
array([[0],
       [1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7],
       [8],
       [9]], dtype=int32)>

In [67]:
output = tf.Variable([[i for i in range(10)]])
output = tf.broadcast_to(output, [10, 10])
output*=10
output+= tf.reshape(tf.Variable([[i for i in range(10)]]),[10,1])
output

<tf.Tensor: shape=(10, 10), dtype=int32, numpy=
array([[ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90],
       [ 1, 11, 21, 31, 41, 51, 61, 71, 81, 91],
       [ 2, 12, 22, 32, 42, 52, 62, 72, 82, 92],
       [ 3, 13, 23, 33, 43, 53, 63, 73, 83, 93],
       [ 4, 14, 24, 34, 44, 54, 64, 74, 84, 94],
       [ 5, 15, 25, 35, 45, 55, 65, 75, 85, 95],
       [ 6, 16, 26, 36, 46, 56, 66, 76, 86, 96],
       [ 7, 17, 27, 37, 47, 57, 67, 77, 87, 97],
       [ 8, 18, 28, 38, 48, 58, 68, 78, 88, 98],
       [ 9, 19, 29, 39, 49, 59, 69, 79, 89, 99]], dtype=int32)>

In [68]:
for row, value in enumerate(label):
    print(row, value, output[row][value])
    

0 tf.Tensor(0, shape=(), dtype=int32) tf.Tensor(0, shape=(), dtype=int32)
1 tf.Tensor(1, shape=(), dtype=int32) tf.Tensor(11, shape=(), dtype=int32)
2 tf.Tensor(2, shape=(), dtype=int32) tf.Tensor(22, shape=(), dtype=int32)
3 tf.Tensor(3, shape=(), dtype=int32) tf.Tensor(33, shape=(), dtype=int32)
4 tf.Tensor(4, shape=(), dtype=int32) tf.Tensor(44, shape=(), dtype=int32)
5 tf.Tensor(5, shape=(), dtype=int32) tf.Tensor(55, shape=(), dtype=int32)
6 tf.Tensor(6, shape=(), dtype=int32) tf.Tensor(66, shape=(), dtype=int32)
7 tf.Tensor(7, shape=(), dtype=int32) tf.Tensor(77, shape=(), dtype=int32)
8 tf.Tensor(8, shape=(), dtype=int32) tf.Tensor(88, shape=(), dtype=int32)
9 tf.Tensor(9, shape=(), dtype=int32) tf.Tensor(99, shape=(), dtype=int32)


In [84]:
-(np.log(0.95)+np.log(0.1))/2

1.176939193690798

In [81]:
np.log(1)

0.0

In [82]:
cce = tf.keras.losses.SparseCategoricalCrossentropy()

In [83]:
y_true = [1, 2]
y_pred = [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]
# Using 'auto'/'sum_over_batch_size' reduction type.
scce = tf.keras.losses.SparseCategoricalCrossentropy()
scce(y_true, y_pred).numpy()

1.1769392

In [85]:
a = tf.Variable([1])

In [86]:
for _ in range(4):
    a = a+1
    print(a)

tf.Tensor([2], shape=(1,), dtype=int32)
tf.Tensor([3], shape=(1,), dtype=int32)
tf.Tensor([4], shape=(1,), dtype=int32)
tf.Tensor([5], shape=(1,), dtype=int32)


In [87]:
# A trainable variable
x0 = tf.Variable(3.0, name='x0')
# Not trainable
x1 = tf.Variable(3.0, name='x1', trainable=False)
# Not a Variable: A variable + tensor returns a tensor.
x2 = tf.Variable(2.0, name='x2') + 1.0
# Not a variable
x3 = tf.constant(3.0, name='x3')

with tf.GradientTape() as tape:
  y = (x0**2) + (x1**2) + (x2**2)

grad = tape.gradient(y, [x0, x1, x2, x3])

for g in grad:
  print(g)

tf.Tensor(6.0, shape=(), dtype=float32)
None
None
None


In [88]:
[var.name for var in tape.watched_variables()]

['x0:0']

In [90]:
x = tf.constant(10,dtype=tf.float32)

with tf.GradientTape() as g:
  g.watch(x)
  y = x * x

print(g.gradient(y, x))

tf.Tensor(20.0, shape=(), dtype=float32)


In [104]:
x0 = tf.Variable(3.0)
x1 = tf.Variable(0.0)

with tf.GradientTape() as tape:
  # Update x1 = x1 + x0.
  # x1.assign_add(x0)
  # The tape starts recording from x1.
  # y = x1**2   # y = (x1 + x0)**2
  # y = (x1 + x0)**2
  y = x1**2 +x0**2 + 2*x0*x1

# This doesn't work.
print(tape.gradient(y, {"x1":x1, "x0":x0}))   #dy/dx0 = 2*(x1 + x0)

{'x1': <tf.Tensor: shape=(), dtype=float32, numpy=6.0>, 'x0': <tf.Tensor: shape=(), dtype=float32, numpy=6.0>}


In [99]:
x = tf.Variable(2.0)
y = tf.Variable(3.0)

with tf.GradientTape() as t:
  x_sq = x * x
  # with t.stop_recording():
  y_sq = y * y
  z = x_sq + y_sq

grad = t.gradient(z, {'x': x, 'y': y})

print(grad)
print('dz/dx:', grad['x'])  # 2*x => 4
print('dz/dy:', grad['y'])

{'x': <tf.Tensor: shape=(), dtype=float32, numpy=4.0>, 'y': <tf.Tensor: shape=(), dtype=float32, numpy=6.0>}
dz/dx: tf.Tensor(4.0, shape=(), dtype=float32)
dz/dy: tf.Tensor(6.0, shape=(), dtype=float32)


In [105]:
def simple_relu(x):
  if tf.greater(x, 0):
    return x
  else:
    return 0

# `tf_simple_relu` is a TensorFlow `Function` that wraps `simple_relu`.
tf_simple_relu = tf.function(simple_relu)

print("First branch, with graph:", tf_simple_relu(tf.constant(1)).numpy())
print("Second branch, with graph:", tf_simple_relu(tf.constant(-1)).numpy())

First branch, with graph: 1
Second branch, with graph: 0


In [106]:
print(tf.autograph.to_code(simple_relu))

def tf__simple_relu(x):
    with ag__.FunctionScope('simple_relu', 'fscope', ag__.ConversionOptions(recursive=True, user_requested=True, optional_features=(), internal_convert_user_code=True)) as fscope:
        do_return = False
        retval_ = ag__.UndefinedReturnValue()

        def get_state():
            return (do_return, retval_)

        def set_state(vars_):
            nonlocal do_return, retval_
            (do_return, retval_) = vars_

        def if_body():
            nonlocal do_return, retval_
            try:
                do_return = True
                retval_ = ag__.ld(x)
            except:
                do_return = False
                raise

        def else_body():
            nonlocal do_return, retval_
            try:
                do_return = True
                retval_ = 0
            except:
                do_return = False
                raise
        ag__.if_stmt(ag__.converted_call(ag__.ld(tf).greater, (ag__.ld(x), 0), None, fscope), if_bo

In [107]:
print(tf_simple_relu.get_concrete_function(tf.constant(1)).graph.as_graph_def())

node {
  name: "x"
  op: "Placeholder"
  attr {
    key: "shape"
    value {
      shape {
      }
    }
  }
  attr {
    key: "dtype"
    value {
      type: DT_INT32
    }
  }
  attr {
    key: "_user_specified_name"
    value {
      s: "x"
    }
  }
}
node {
  name: "Greater/y"
  op: "Const"
  attr {
    key: "value"
    value {
      tensor {
        dtype: DT_INT32
        tensor_shape {
        }
        int_val: 0
      }
    }
  }
  attr {
    key: "dtype"
    value {
      type: DT_INT32
    }
  }
}
node {
  name: "Greater"
  op: "Greater"
  input: "x"
  input: "Greater/y"
  attr {
    key: "T"
    value {
      type: DT_INT32
    }
  }
}
node {
  name: "cond"
  op: "StatelessIf"
  input: "Greater"
  input: "x"
  attr {
    key: "then_branch"
    value {
      func {
        name: "cond_true_1272"
      }
    }
  }
  attr {
    key: "output_shapes"
    value {
      list {
        shape {
        }
        shape {
        }
      }
    }
  }
  attr {
    key: "else_branch"
  

In [108]:
@tf.function
def get_MSE(y_true, y_pred):
  sq_diff = tf.pow(y_true - y_pred, 2)
  return tf.reduce_mean(sq_diff)

y_true = tf.random.uniform([5], maxval=10, dtype=tf.int32)
y_pred = tf.random.uniform([5], maxval=10, dtype=tf.int32)
print(y_true)
print(y_pred)

get_MSE(y_true, y_pred)

tf.Tensor([4 1 1 7 2], shape=(5,), dtype=int32)
tf.Tensor([9 2 8 0 0], shape=(5,), dtype=int32)


<tf.Tensor: shape=(), dtype=int32, numpy=25>

In [111]:
tf.config.run_functions_eagerly(True)
get_MSE(y_true, y_pred)

<tf.Tensor: shape=(), dtype=int32, numpy=25>

In [110]:
tf.config.run_functions_eagerly(False)

In [112]:
@tf.function
def get_MSE(y_true, y_pred):
  print("Calculating MSE!")
  sq_diff = tf.pow(y_true - y_pred, 2)
  return tf.reduce_mean(sq_diff)

error = get_MSE(y_true, y_pred)
error = get_MSE(y_true, y_pred)
error = get_MSE(y_true, y_pred)

Calculating MSE!


In [114]:
class SimpleModule(tf.Module):
  def __init__(self, name=None):
    super().__init__(name=name)
    self.a_variable = tf.Variable(5.0, name="train_me")
    self.non_trainable_variable = tf.Variable(5.0, trainable=False, name="do_not_train_me")
  def __call__(self, x):
    return self.a_variable * x + self.non_trainable_variable

simple_module = SimpleModule(name="simple")

simple_module(tf.constant(5.0))

<tf.Tensor: shape=(), dtype=float32, numpy=30.0>

In [115]:
class SimpleModule(tf.Module):
  def __init__(self, name=None):
    super().__init__(name=name)
    self.a_variable = tf.Variable(5.0, name="train_me")
    self.non_trainable_variable = tf.Variable(5.0, trainable=False, name="do_not_train_me")
  def __call__(self, x):
    return self.a_variable * x + self.non_trainable_variable

simple_module = SimpleModule(name="simple")

simple_module(tf.constant(5.0))

# tf.Module does not have model.summary()

<tf.Tensor: shape=(), dtype=float32, numpy=30.0>

In [117]:
simple_module.variables

(<tf.Variable 'train_me:0' shape=() dtype=float32, numpy=5.0>,
 <tf.Variable 'do_not_train_me:0' shape=() dtype=float32, numpy=5.0>)

In [118]:
simple_module.submodules

()

In [125]:
class Dense(tf.Module):
  def __init__(self, in_features, out_features, name=None):
    super().__init__(name=name)
    self.w = tf.Variable(
      tf.random.normal([in_features, out_features]), name='w')
    self.b = tf.Variable(tf.zeros([out_features]), name='b')
  def __call__(self, x):
    y = tf.matmul(x, self.w) + self.b
    return tf.nn.relu(y)

class SequentialModule(tf.Module):
  def __init__(self, name=None):
    super().__init__(name=name)

    self.dense_1 = Dense(in_features=3, out_features=3,name='d1')
    self.dense_2 = Dense(in_features=3, out_features=2,name='d2')

  def __call__(self, x):
    x = self.dense_1(x)
    return self.dense_2(x)

# You have made a model!
my_model = SequentialModule(name="the_model")

# Call it, with random results
print("Model results:", my_model(tf.constant([[2.0, 2.0, 2.0]])))

Model results: tf.Tensor([[0. 0.]], shape=(1, 2), dtype=float32)


In [126]:
my_model.submodules

(<__main__.Dense at 0x1607bc400>, <__main__.Dense at 0x1607bf4f0>)

In [127]:
my_model.variables

(<tf.Variable 'b:0' shape=(3,) dtype=float32, numpy=array([0., 0., 0.], dtype=float32)>,
 <tf.Variable 'w:0' shape=(3, 3) dtype=float32, numpy=
 array([[ 0.02736765, -0.17608088, -1.032676  ],
        [-0.4262823 , -1.1004285 , -0.7121168 ],
        [-0.6336593 , -2.1401153 , -1.1367048 ]], dtype=float32)>,
 <tf.Variable 'b:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>,
 <tf.Variable 'w:0' shape=(3, 2) dtype=float32, numpy=
 array([[ 2.3993342 , -1.4112239 ],
        [-0.18513879,  0.11712091],
        [ 0.10611385,  1.5401213 ]], dtype=float32)>)