In [1]:
import torch as t
import numpy as np
import torchtext
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
import collections

!pip install 'portalocker>=2.8.2'



# 1) Torch text datasets

In [2]:
''' According to this link, https://pytorch.org/text/stable/datasets.html pretty much every
    single torchtext (tt) dataset (ds) has root and split arguments.

    1) root - ??? Directory where ds are saved ???
    2) split - train or test for ag_news, but for a ds like SST2 it has train, test and DEV. '''
train, test = AG_NEWS()

In [3]:
''' Iter gives ability to of course loop (technically) over the object. As of now its type
    is "ShardingFilterIterDataPipe". After iter? It's
    "<generator object ShardingFilterIterDataPipe.__iter__ at 0x7d2b405bc660>" '''
x = iter(train)

x

<generator object ShardingFilterIterDataPipe.__iter__ at 0x78caf1f33ed0>

In [4]:
''' Just keeps getting the next value, can also use a default value if end is reached.
    https://stackoverflow.com/questions/76302971/question-in-pytorch-transformer-tutorial-about-nonetype-object-has-no-attribut '''
next(iter(train))

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [5]:
# Zip is good for a certain range, 2, 5, etc. n in general. Very useful.
# for i, x in zip(range(2), train):
#   print(f'Index {i}. x:\n{x}\n\n')

# Enumerate works fine as well.
for i, x in enumerate(train):
  print(f'Index {i}:\nx: {x}')
  break

Index 0:
x: (3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")




# 2) Tokenization

In [6]:
# Get list of sentences
s = []
for i, x in zip(range(3), train):
  s.append(x[1])

print(f'List of sentences to tokenize:\n\n{s}\nTotal sentences: {len(s)}')

List of sentences to tokenize:

["Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.', "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums."]
Total sentences: 3


In [7]:
tk = get_tokenizer('basic_english')

# ts = tokenized sentences. !!! do "tk(s[0])" for individual sentences !!!
ts = [tk(sent) for sent in s]

ts

[['wall',
  'st',
  '.',
  'bears',
  'claw',
  'back',
  'into',
  'the',
  'black',
  '(',
  'reuters',
  ')',
  'reuters',
  '-',
  'short-sellers',
  ',',
  'wall',
  'street',
  "'",
  's',
  'dwindling\\band',
  'of',
  'ultra-cynics',
  ',',
  'are',
  'seeing',
  'green',
  'again',
  '.'],
 ['carlyle',
  'looks',
  'toward',
  'commercial',
  'aerospace',
  '(',
  'reuters',
  ')',
  'reuters',
  '-',
  'private',
  'investment',
  'firm',
  'carlyle',
  'group',
  ',',
  '\\which',
  'has',
  'a',
  'reputation',
  'for',
  'making',
  'well-timed',
  'and',
  'occasionally\\controversial',
  'plays',
  'in',
  'the',
  'defense',
  'industry',
  ',',
  'has',
  'quietly',
  'placed\\its',
  'bets',
  'on',
  'another',
  'part',
  'of',
  'the',
  'market',
  '.'],
 ['oil',
  'and',
  'economy',
  'cloud',
  'stocks',
  "'",
  'outlook',
  '(',
  'reuters',
  ')',
  'reuters',
  '-',
  'soaring',
  'crude',
  'prices',
  'plus',
  'worries\\about',
  'the',
  'economy',
  'a

In [8]:
# counter just counts how many times its seen something.

# 1) Give first tokenized str
# counter = collections.Counter(ts[0])
# counter

# 2) Init default and update it with tokenized str
# counter = collections.Counter()
# counter.update(ts[0])
# counter

# 3) Use whole ds with it (part of it here)
# counter = collections.Counter()
# for i, x in zip(range(3), train):
#   counter.update(tk(x[1]))
# counter

# 4) The 3rd option used train directly, can also use the tokenized sentences in the list "ts".
counter = collections.Counter()
for sent in ts:
  counter.update(sent)
counter

Counter({'wall': 2,
         'st': 1,
         '.': 4,
         'bears': 1,
         'claw': 1,
         'back': 1,
         'into': 1,
         'the': 7,
         'black': 1,
         '(': 3,
         'reuters': 6,
         ')': 3,
         '-': 3,
         'short-sellers': 1,
         ',': 4,
         'street': 1,
         "'": 2,
         's': 1,
         'dwindling\\band': 1,
         'of': 3,
         'ultra-cynics': 1,
         'are': 2,
         'seeing': 1,
         'green': 1,
         'again': 1,
         'carlyle': 2,
         'looks': 1,
         'toward': 1,
         'commercial': 1,
         'aerospace': 1,
         'private': 1,
         'investment': 1,
         'firm': 1,
         'group': 1,
         '\\which': 1,
         'has': 2,
         'a': 1,
         'reputation': 1,
         'for': 2,
         'making': 1,
         'well-timed': 1,
         'and': 3,
         'occasionally\\controversial': 1,
         'plays': 1,
         'in': 1,
         'defense': 1,
     

In [9]:
# vocab is a dictionary
v = torchtext.vocab.vocab(counter, min_freq=1)

print(v)
print(v.vocab)
print(len(v.vocab))

Vocab()
<torchtext._torchtext.Vocab object at 0x78ca2b9a33b0>
75


In [10]:
# Returns dictionary of word/assigned number in key/value pairs.
v.get_stoi()

{'depth': 72,
 'during': 71,
 'next': 69,
 'over': 67,
 'to\\hang': 66,
 'earnings': 64,
 'worries\\about': 63,
 'prices': 61,
 'crude': 60,
 'dwindling\\band': 18,
 'green': 23,
 'group': 33,
 'has': 35,
 'private': 30,
 'of': 19,
 'carlyle': 25,
 'looks': 26,
 ',': 14,
 'street': 15,
 'plus': 62,
 'black': 8,
 'toward': 27,
 'bears': 3,
 'the\\summer': 73,
 'week': 70,
 'reuters': 10,
 'again': 24,
 ')': 11,
 '.': 2,
 '\\which': 34,
 'st': 1,
 'market': 53,
 'the': 7,
 'in': 44,
 'expected': 65,
 '(': 9,
 'are': 21,
 'part': 52,
 "'": 16,
 'claw': 4,
 'into': 6,
 'short-sellers': 13,
 '-': 12,
 'aerospace': 29,
 'commercial': 28,
 'investment': 31,
 'firm': 32,
 'seeing': 22,
 'making': 39,
 'for': 38,
 'another': 51,
 'cloud': 56,
 'back': 5,
 'outlook': 58,
 'a': 36,
 'on': 50,
 'reputation': 37,
 's': 17,
 'well-timed': 40,
 'and': 41,
 'wall': 0,
 'industry': 46,
 'soaring': 59,
 'stocks': 57,
 'occasionally\\controversial': 42,
 'stock': 68,
 'bets': 49,
 'doldrums': 74,
 'ultra

In [11]:
#  Returns list of all words. showing first 10
v.get_itos()[:10]

['wall', 'st', '.', 'bears', 'claw', 'back', 'into', 'the', 'black', '(']

In [12]:
# zip is cool.
for i, x in zip(range(5), v.get_itos()):
  print(f'Numerical value in dictionary: {i}. Assigned word: {x}')

Numerical value in dictionary: 0. Assigned word: wall
Numerical value in dictionary: 1. Assigned word: st
Numerical value in dictionary: 2. Assigned word: .
Numerical value in dictionary: 3. Assigned word: bears
Numerical value in dictionary: 4. Assigned word: claw


In [13]:
''' Convert sentence into nums. es = example sentence. Get the tokenized sentence.
    Loop through it, get_stoi() remember returns dictionary of str/num key/value pairs.
    So give it str (or token) to get num back. '''
es = ts[0]

numerical_sentence = [v.get_stoi()[cur_token] for cur_token in es]

print(f'Example sentence:\n{es}\n\nNumerical version of same sentence:\n{numerical_sentence}')

Example sentence:
['wall', 'st', '.', 'bears', 'claw', 'back', 'into', 'the', 'black', '(', 'reuters', ')', 'reuters', '-', 'short-sellers', ',', 'wall', 'street', "'", 's', 'dwindling\\band', 'of', 'ultra-cynics', ',', 'are', 'seeing', 'green', 'again', '.']

Numerical version of same sentence:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 12, 13, 14, 0, 15, 16, 17, 18, 19, 20, 14, 21, 22, 23, 24, 2]


In [14]:
# Create lists with tokens and their numerical conversions.
pairs = []
for cur_sent in ts:
  pairs.append([list((v.get_stoi()[cur_token], cur_token)) for cur_token in cur_sent])
pairs

[[[0, 'wall'],
  [1, 'st'],
  [2, '.'],
  [3, 'bears'],
  [4, 'claw'],
  [5, 'back'],
  [6, 'into'],
  [7, 'the'],
  [8, 'black'],
  [9, '('],
  [10, 'reuters'],
  [11, ')'],
  [10, 'reuters'],
  [12, '-'],
  [13, 'short-sellers'],
  [14, ','],
  [0, 'wall'],
  [15, 'street'],
  [16, "'"],
  [17, 's'],
  [18, 'dwindling\\band'],
  [19, 'of'],
  [20, 'ultra-cynics'],
  [14, ','],
  [21, 'are'],
  [22, 'seeing'],
  [23, 'green'],
  [24, 'again'],
  [2, '.']],
 [[25, 'carlyle'],
  [26, 'looks'],
  [27, 'toward'],
  [28, 'commercial'],
  [29, 'aerospace'],
  [9, '('],
  [10, 'reuters'],
  [11, ')'],
  [10, 'reuters'],
  [12, '-'],
  [30, 'private'],
  [31, 'investment'],
  [32, 'firm'],
  [25, 'carlyle'],
  [33, 'group'],
  [14, ','],
  [34, '\\which'],
  [35, 'has'],
  [36, 'a'],
  [37, 'reputation'],
  [38, 'for'],
  [39, 'making'],
  [40, 'well-timed'],
  [41, 'and'],
  [42, 'occasionally\\controversial'],
  [43, 'plays'],
  [44, 'in'],
  [7, 'the'],
  [45, 'defense'],
  [46, 'industry'

In [15]:
v.get_itos()[21]

'are'

In [16]:
def encode(str_to_encode):
  return [v.get_stoi()[token] for token in str_to_encode]

''' get_itos works for decoding because the vocab object already assigned numbers
    to words. So as of right now, v.get_stoi()[0] gets the word in the dictionary
    with assigned num 0. '''
def decode(nums_to_decode):
  return [v.get_itos()[num] for num in nums_to_decode]

# es = example str declared in earlier cell. already tokenized.
encoded_str = encode(es)
print(f'Example str:\n{es}\n\nEncoded str:\n{encoded_str}\n\n')

decoded_str = decode(encoded_str)
print(f'Decoded str:\n{decoded_str}')

Example str:
['wall', 'st', '.', 'bears', 'claw', 'back', 'into', 'the', 'black', '(', 'reuters', ')', 'reuters', '-', 'short-sellers', ',', 'wall', 'street', "'", 's', 'dwindling\\band', 'of', 'ultra-cynics', ',', 'are', 'seeing', 'green', 'again', '.']

Encoded str:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 12, 13, 14, 0, 15, 16, 17, 18, 19, 20, 14, 21, 22, 23, 24, 2]


Decoded str:
['wall', 'st', '.', 'bears', 'claw', 'back', 'into', 'the', 'black', '(', 'reuters', ')', 'reuters', '-', 'short-sellers', ',', 'wall', 'street', "'", 's', 'dwindling\\band', 'of', 'ultra-cynics', ',', 'are', 'seeing', 'green', 'again', '.']


# 3) N grams

In [17]:
from torchtext.data.utils import ngrams_iterator

''' Ngrams will help solve multiword expression issues like "hamburger" because "ham" and
    "burger" can be 2 separate words. Can't always represent both of those words with the same
    vector. The update func gets pairs because ngrams=2. If the goal was to be able to understand
    3 sets of words, ngrams will be 3. Downside is this WILL example the counter object
    by a lot. Specifically len(v) * 2/3/etc. '''

# nc = ngrams counter
nc = collections.Counter()
nc.update(ngrams_iterator(es, ngrams=2))
nc

Counter({'wall': 2,
         'st': 1,
         '.': 2,
         'bears': 1,
         'claw': 1,
         'back': 1,
         'into': 1,
         'the': 1,
         'black': 1,
         '(': 1,
         'reuters': 2,
         ')': 1,
         '-': 1,
         'short-sellers': 1,
         ',': 2,
         'street': 1,
         "'": 1,
         's': 1,
         'dwindling\\band': 1,
         'of': 1,
         'ultra-cynics': 1,
         'are': 1,
         'seeing': 1,
         'green': 1,
         'again': 1,
         'wall st': 1,
         'st .': 1,
         '. bears': 1,
         'bears claw': 1,
         'claw back': 1,
         'back into': 1,
         'into the': 1,
         'the black': 1,
         'black (': 1,
         '( reuters': 1,
         'reuters )': 1,
         ') reuters': 1,
         'reuters -': 1,
         '- short-sellers': 1,
         'short-sellers ,': 1,
         ', wall': 1,
         'wall street': 1,
         "street '": 1,
         "' s": 1,
         's dwindlin

In [18]:
for i, a in zip(range(3), nc):
  print(a)

wall
st
.


# 4) Bag of words (bow) & padding

In [19]:
# A bow is simply seeing how many times a word occurs. Get the first 3 sentences num representations here
num_sents = [encode(tk(a[1])) for i, a in zip(range(3), train)]

for x in num_sents:
  print(x)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 12, 13, 14, 0, 15, 16, 17, 18, 19, 20, 14, 21, 22, 23, 24, 2]
[25, 26, 27, 28, 29, 9, 10, 11, 10, 12, 30, 31, 32, 25, 33, 14, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 7, 45, 46, 14, 35, 47, 48, 49, 50, 51, 52, 19, 7, 53, 2]
[54, 41, 55, 56, 57, 16, 58, 9, 10, 11, 10, 12, 59, 60, 61, 62, 63, 7, 55, 41, 7, 58, 38, 64, 21, 65, 66, 67, 7, 68, 53, 69, 70, 71, 7, 72, 19, 73, 74, 2]


In [20]:
# Get tensor of length of vocab. With first 3 sentences in ag_news USED for vocab, this will be 75.
a = t.zeros(len(v))

# Loop over first num sentence seen in previous cell.
for num in num_sents[0]:
  ''' Ex: v.get_stoi() has "wall" paired with num 0. this will increase index 0 by 1. Keep in mind,
      we can access index 0 in this "a" tensor, and give that index to v.get_stoi()[] and it'll
      return the proper word '''
  a[num] = a[num] + 1

a

tensor([2., 1., 2., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 2., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.])

In [21]:
''' num_sents aren't the same size which is required for models like embedding, so padding will be
    demonstrated. map wil take the len func and apply it to every num sentence and list will get
    all the lengths. Then max just gets the biggest one. Quick and easy. '''
max_length = max(list(map(len, num_sents)))

''' Convert sent to tensor first, and 0, max length - len of current sentence just makes sure we get
    the CORRECT amount of 0s. '''
padded_num_sents = [t.nn.functional.pad(t.tensor(sent), (0, max_length - len(sent))) for sent in num_sents]

print(padded_num_sents)

for x in padded_num_sents:
  print(len(x))

[tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 10, 12, 13, 14,  0, 15,
        16, 17, 18, 19, 20, 14, 21, 22, 23, 24,  2,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0]), tensor([25, 26, 27, 28, 29,  9, 10, 11, 10, 12, 30, 31, 32, 25, 33, 14, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44,  7, 45, 46, 14, 35, 47, 48, 49, 50,
        51, 52, 19,  7, 53,  2]), tensor([54, 41, 55, 56, 57, 16, 58,  9, 10, 11, 10, 12, 59, 60, 61, 62, 63,  7,
        55, 41,  7, 58, 38, 64, 21, 65, 66, 67,  7, 68, 53, 69, 70, 71,  7, 72,
        19, 73, 74,  2,  0,  0])]
42
42
42


# 5) Dataset & Data Loader

In [22]:
from torch.utils.data import Dataset, DataLoader

''' Datasets are good because they can potentially load data only when necessary instead of all at once
    like with image processing tasks. Also a way to keep things organized. Must override len and getitem. '''
class TestDataset(Dataset):
  def __init__(self, data_to_use):
    self.data = data_to_use

  def __getitem__(self, index):
    return self.data[index]

  def __len__(self):
    return len(self.data)

# The collate_fn arg in DataLoader which extra things to the dataset before it's saved in data loader.
def sub(num):
  return t.tensor([cur_tensor_num.item() - 1 for cur_tensor_num in num])

# Get some generic data. "tensor([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])"
x = t.tensor([x * 2 for x in range(10)])
tds = TestDataset(x)


''' tdl = test data loader (dl) Very efficient way of loading data. Also when using the collate_fn arg, it
    seems like the DataLoader GIVES the data to whatever func as a list. Strange.
    Uncomment/comment whichever below DataLoader to see results.  '''
# tdl = DataLoader(tds, 1, shuffle=False, collate_fn=sub)
tdl = DataLoader(tds, 1, shuffle=False)

for x in tdl:
  print(x)

tensor([0])
tensor([2])
tensor([4])
tensor([6])
tensor([8])
tensor([10])
tensor([12])
tensor([14])
tensor([16])
tensor([18])


In [23]:
# ntd = new test dataset
ntd = TestDataset([5,4,3,2])

# ndl = new data loader
ndl = DataLoader(ntd, batch_size=2)

# Gets 2 because of batch size
print(len(ndl))

# Gets 4 in total. can also do len(dl.dataset)
print(len(ndl.dataset.data))

2
4


# 6) Neural Network related topics (Including Models built)

In [24]:
# Seeing inputs of a linear layer and what it'll output.
t.nn.Linear(len(v), 3)

Linear(in_features=75, out_features=3, bias=True)

In [25]:
x = t.tensor([[2,8,9],
              [5,1,3],
              [4,8,7]], dtype=t.float64)

''' ??? 1 is likely vertical (row by row) and 0 horizontal (column by column). ??? Softmax
    converts things to probabilities, so log softmax does the same but with log applied.
    Also logsoftmax is for classification
    https://www.baeldung.com/cs/softmax-vs-log-softmax '''
ls = t.nn.LogSoftmax(dim=1)

print(x)
print(ls(x))

tensor([[2., 8., 9.],
        [5., 1., 3.],
        [4., 8., 7.]], dtype=torch.float64)
tensor([[-7.3139, -1.3139, -0.3139],
        [-0.1429, -4.1429, -2.1429],
        [-4.3266, -0.3266, -1.3266]], dtype=torch.float64)


In [26]:
# Sequential is just layers in an order.
network = t.nn.Sequential(t.nn.Linear(len(v), 3),
                          t.nn.LogSoftmax(dim=1))

network

Sequential(
  (0): Linear(in_features=75, out_features=3, bias=True)
  (1): LogSoftmax(dim=1)
)

In [27]:
''' len(v) because model input is equal to vocab length.
    1) "TypeError: linear(): argument 'input' (position 1) must be Tensor, not list"
          I converted it to a tensor because of this.

    2) "RuntimeError: mat1 and mat2 must have the same dtype, but got Long and Float"
          I checked the input_test dtype and it was torch.int64. Apparently that
          qualifies as a long? A QUICK fix would be to change the input_test tensor
          to dtype=t.float32, but I also wanted to check models input.

          2.2) Checking model input.
                Help from link:
                https://discuss.pytorch.org/t/get-appropriate-model-in-output-type-programmatically/53742

                Doing:
                "z = t.nn.Linear(len(v), 3)
                list(z.parameters())" returns a list of tensors. First tensor is 2d
                because it has 3 inner tensors of 75 values, the vocab size.

                Ex:
                "tensor([[-0.0353, -0.0916,  0.0258,  0.0546, -0.0598,  0.0273, -0.0447, -0.0129,
                           0.0199, -0.0067,  0.0418,  0.1102, -0.0363, -0.0615, -0.1136,  0.0627],

                          [0.0922, -0.0013, -0.0645, -0.0640, -0.0426,  0.0972,  0.0951, -0.0859,
                            0.0891,  0.1113, -0.0431, -0.0607,  0.0699, -0.0706,  0.0240, -0.0716],

                          [-0.0369,  0.0732,  0.0284, -0.0249,  0.0937, -0.0938,  0.0555,  0.0908,
                           -0.0923, -0.0790,  0.0530,  0.0607,  0.1147,  0.0963, -0.0195, -0.1021]]"

                It's in 2d. Strange. I guess input must be in 2d as well.

          But to the point of this section, which is 2, the error can be fixed by changing the
          type of the input to dtype=t.float32.


    3) "IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)"
          Then this one came up because I BELIEVE the input tensor was not 2d. I used
          "t.unsqueeze(input_test, dim=0)" which ADDS a dimension horizontally so now its definitely
          2d. Shape is "torch.Size([1, 75])"

     '''
input_test = t.tensor([i * 2 for i in range(len(v))], dtype=t.float32)
# uit = updated input test
uit = t.unsqueeze(input_test, dim=0)
print(f'Input for model:\n{input_test}\n\nType: {input_test.dtype}\n\nNEW input for model:\n{uit}\n\n')

# Show model result.
network(uit)

Input for model:
tensor([  0.,   2.,   4.,   6.,   8.,  10.,  12.,  14.,  16.,  18.,  20.,  22.,
         24.,  26.,  28.,  30.,  32.,  34.,  36.,  38.,  40.,  42.,  44.,  46.,
         48.,  50.,  52.,  54.,  56.,  58.,  60.,  62.,  64.,  66.,  68.,  70.,
         72.,  74.,  76.,  78.,  80.,  82.,  84.,  86.,  88.,  90.,  92.,  94.,
         96.,  98., 100., 102., 104., 106., 108., 110., 112., 114., 116., 118.,
        120., 122., 124., 126., 128., 130., 132., 134., 136., 138., 140., 142.,
        144., 146., 148.])

Type: torch.float32

NEW input for model:
tensor([[  0.,   2.,   4.,   6.,   8.,  10.,  12.,  14.,  16.,  18.,  20.,  22.,
          24.,  26.,  28.,  30.,  32.,  34.,  36.,  38.,  40.,  42.,  44.,  46.,
          48.,  50.,  52.,  54.,  56.,  58.,  60.,  62.,  64.,  66.,  68.,  70.,
          72.,  74.,  76.,  78.,  80.,  82.,  84.,  86.,  88.,  90.,  92.,  94.,
          96.,  98., 100., 102., 104., 106., 108., 110., 112., 114., 116., 118.,
         120., 122., 124., 1

tensor([[  0.0000, -32.3327, -97.6060]], grad_fn=<LogSoftmaxBackward0>)

In [28]:
''' Most basic model ever. Embedding takes in v size and outputs dimension size of whatever requested.
    And that'll be input to the Linear layer of course.

    Errors:
    1) "RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar
        types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)"

        Solution: "uit = uit.type(t.long)" type conversion


    2) "IndexError: index out of range in self"

        Understanding the error: https://rollbar.com/blog/how-to-handle-index-out-of-range-in-self-pytorch/#
        The code at the beginning is simple and I made my own test code
        "r = t.nn.Embedding(10, 5)
        m = t.tensor([9])
        r(m)"
        The thing is, if the tensor value is 10, it breaks and gives same error. But if its 9 or less, it
        works fine. Which tells me that the MAX range it'll accept is 10. Anything greater and it breaks.
        Of course it indexes from 0. The above is just for a 1d dimensional tensor.

        Solution: I was given numerical input to the model when the VOCAB didn't match it. The vocab
          went to num 75. Yet I tried giving the model:
          "tensor([[  0.,   2.,   4.,   6.,   8.,  10.,  12.,  14.,  16.,  18.,  20.,  22.,
                      24.,  26.,  28.,  30.,  32.,  34.,  36.,  38.,  40.,  42.,  44.,  46.,
                      48.,  50.,  52.,  54.,  56.,  58.,  60.,  62.,  64.,  66.,  68.,  70.,
                      72.,  74.,  76.,  78.,  80.,  82.,  84.,  86.,  88.,  90.,  92.,  94.,
                      96.,  98., 100., 102., 104., 106., 108., 110., 112., 114., 116., 118.,
                      120., 122., 124., 126., 128., 130., 132., 134., 136., 138., 140., 142.,
                      144., 146., 148.]])"
          The aforementioned link helped me understand this with t.all(). Initially I thought
          t.nn.Embedding(vocab size, etc) meant the LENGTH of the input tensor couldn't be bigger
          than vocab size, but it was checking for INDIVIDUAL numbers instead. Example code below:

          " r = t.nn.Embedding(len(v), 5) # vocab len 75
            m = t.tensor([9,4,2,4,1,4,5,8,3,6,5,2,3,2,4,6,2,4,6,5,5,5,55,31,2]) # Will pass!
            # m = t.tensor([9,4,2,4,1,4,5,8,74,10,75]) # Will fail!
            # m = t.tensor([9,4,2,4,1,4,5,8,74,10,74]) # Will pass!

            # All literally checks ALL values in tensor.
            if t.all(m >= 0):
              print(f'Passed. Tensor is: {m.shape}')
            if t.all(m < r.num_embeddings):
              print(f'Passed. Tensor is: {m.shape}')
            else:
              print(f'!!! Failed. Tensor shape: {uit.shape} and num embeddings: {embedding.num_embeddings} !!!')"



    3) "RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x29 and 5x3)"
        Why did this happen? 1x29 is the size of the num input tenor which is:
        "tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 10, 12, 13, 14,  0, 15,
                 16, 17, 18, 19, 20, 14, 21, 22, 23, 24,  2])"
         Real tokenized sentence of above numerical sentence is:
            "['wall', 'st', '.', 'bears', 'claw', 'back', 'into', 'the', 'black', '(',
               'reuters', ')', 'reuters', '-', 'short-sellers', ',', 'wall', 'street', "'", 's',
               'dwindling\\band', 'of', 'ultra-cynics', ',', 'are', 'seeing', 'green', 'again', '.']"


        Solution 1: I commented out:
          "# x = t.mean(x,dim=1)
           # print(f'x shape after t.mean: {x.shape}\nx after t.mean:\n{x}')"
           And it worked perfectly. Why?

           Answer: Because if input is:
             "tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 10, 12, 13, 14,  0, 15,
                      16, 17, 18, 19, 20, 14, 21, 22, 23, 24,  2])", that's length 29. If
                      embedding dimension arg for model (which is the output of the embedding
                      layer and input to the linear layer) is say, 5, then the embedding
                      layer will output a 2d matrix of size 29,5 because there's 29 values in
                      the original tensor so theres a nested tensor for each one. Remember the
                      embedding dimension is INPUT to the linear layer, it has the same value
                      of 5. Since the embedding layer output shape is 29,5 , it works.


          Solution 2: I used t.mean(x,dim=0) which get the mean in a horizontal fashion. Why did
            it work? BEFORE matrix x goes into the embedding layer, the tensor is:
            "tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 10, 12, 13, 14,  0, 15,
                     16, 17, 18, 19, 20, 14, 21, 22, 23, 24,  2])" More importantly the shape/size
            is 29. AFTER the matrix x goes through the embedding layer, it's shape/size is (29,5).
            It's now 2d. Why? Well because the output of the embedding layer (called embed_dim)
            is 5. So it gets 29 matrices of length 5 each. Then the linear layers INPUT is embed_dim
            as well which is of course 5. So the linear layer can take a 2d matrix as long as the
            columns of the 2d matrix (the "5" in 29,5) matches the linear layers INPUT (input is
            also 5).
            Ex Code:
            "td = t.tensor([[1,4,2,8],
                            [6,9,3,7]], dtype=t.float32)
              x = t.nn.Linear(4, 1)
              x(td)"

            Will print something like:
            "tensor([[2.5558],
                     [4.6453]], grad_fn=<AddmmBackward0>)"

            Ex code 2 (1d):
            " # 1d linear layer test. Tensor is "tensor([5., 5., 5.])", shape is torch.Size([3])
              e = t.tensor([5] * 3, dtype=t.float32)
              x = t.nn.Linear(3, 1)
              # Will print something like "tensor([0.6625], grad_fn=<ViewBackward0>)" is linear output is 1.
              print(x(e))"



          Conclusion: Solving the issue WITHOUT messing with t.mean(x, dim=1) is impossible. Simply
            because of the dimensions of both x (after it goes through t.mean) and the input of the
            linear layer.
            1) Dimension of x after t.mean(x, dim=1) - torch.Size([29])
            2) Dimensions of linear layer - 5,3. 5 input, 3 output.

            I initially THOUGHT the REAL size was 30, due to programming indexing from 0. If that
            was the case I had an idea to resize the x matrix into shape (6,5) because that'll be
            accepted by the linear linear and it'd work. As a matter of fact I wrote some test
            code for just that issue, resizing a tensor that was initially 30 in length but the
            wrong input size for a linear layer that had input as 5. See code below:
            "
              # Get same values.
              t.manual_seed(0)

              # nt = new tensor. Make tensor of size 30, (0-29)
              nt = t.rand(30)
              print(f'New tensor:\n{nt}\nNew tensor SIZE: {nt.shape}\n\n')

              # Create a linear layer which only takes 1d tensors of 5, and 2d tensors of (n, 5).
              ll = t.nn.Linear(5, 1)

              # Try reshaping the new tensor to be in form (n, 5). First get size and check if its divisible by embed_dim (ed) 5
              ed = 5
              cur_mat_size = nt.size()[0]

              # Can 30 be divided by 5? If so, we can make a new tensor evenly.
              if cur_mat_size % ed == 0:
                rows_for_reshape = int(cur_mat_size / ed)

                # nm = new matrix.
                nm = t.reshape(nt, (rows_for_reshape, ed))
                print(f'New reshaped matrix is:\n{nm}\nNew reshaped matrix shape: {nm.shape}\n\n')

                print(ll(nm))
            "

            Feel free to copy and past in a different cell. The point of the code was to demonstrate how
            it would be POSSIBLE to reshape a tensor so it can be passed to a linear layer, which I initially
            thought was possible with the tensor([ 0,  1,  2,  3,  4,  5,  6,  7, etc]) input tensor used
            below. But I remembered it was size 29 and not 30. '''

class EmbedClassifier(t.nn.Module):
  def __init__(self, vocab_size, embed_dim, num_class):
    super().__init__()
    self.embedding = t.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
    self.fc = t.nn.Linear(embed_dim, num_class)

  def forward(self, x):
    print(f'x shape: {x.shape}\nx BEFORE embedding:\n{x}\n\n')
    x = self.embedding(x)
    print(f'x shape: {x.shape}\nx AFTER embedding is: {x}\n\n')
    x = t.mean(x,dim=0)
    print(f'x shape after t.mean: {x.shape}\nx after t.mean:\n{x}\n\n')

    return self.fc(x)

ec = EmbedClassifier(len(v), 5, 3)

''' Tokenized text is:
    "['wall', 'st', '.', 'bears', 'claw', 'back', 'into', 'the', 'black', '(',
      'reuters', ')', 'reuters', '-', 'short-sellers', ',', 'wall', 'street',
      "'", 's', 'dwindling\\band', 'of', 'ultra-cynics', ',', 'are', 'seeing',
      'green', 'again', '.']

    Must get appropriate input for the model. Length is 29. Actual tensor is:
    "tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 10, 12, 13, 14,  0, 15,
              16, 17, 18, 19, 20, 14, 21, 22, 23, 24,  2])"  '''
t_test = t.tensor([v.get_stoi()[cur_token] for cur_token in es], dtype=t.long)

x = ec(t_test)
print(f'Returned tensor x:\n{x}\nReturned tensor x shape: {x.shape}')

x shape: torch.Size([29])
x BEFORE embedding:
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 10, 12, 13, 14,  0, 15,
        16, 17, 18, 19, 20, 14, 21, 22, 23, 24,  2])


x shape: torch.Size([29, 5])
x AFTER embedding is: tensor([[ 8.6002e-02,  1.3401e+00,  8.4724e-01, -1.6921e-01, -7.4903e-01],
        [ 1.1128e+00, -1.0178e+00, -1.8242e-01,  1.9588e+00, -9.4974e-01],
        [-9.0016e-01,  3.8660e-01,  1.5842e+00, -5.5649e-01, -1.4217e+00],
        [-3.3141e-01, -1.5244e-01, -5.1314e-01, -2.0827e+00, -3.8774e-01],
        [-1.3159e+00,  1.2744e+00, -1.4552e+00,  6.2983e-01,  2.1051e+00],
        [ 9.5075e-01, -4.8679e-01,  8.6886e-01, -2.8780e-02, -6.3802e-01],
        [-1.6090e+00,  1.3224e-01, -1.1635e+00, -4.7096e-01, -6.7253e-01],
        [-3.4021e-01, -3.3820e-01, -2.0466e+00,  8.4488e-01, -7.6056e-01],
        [ 1.3313e+00,  6.9714e-01, -3.5009e-01,  9.5036e-01,  9.4620e-01],
        [ 1.3155e-01,  2.6122e-01, -1.5337e+00, -1.4152e-01,  1.7064e+00],
        [-1.3386e+

In [29]:
''' This code documents why solving the 3rd error: "RuntimeError: mat1 and mat2 shapes cannot
    be multiplied (1x29 and 5x3)" in the above cell, was impossible. '''

# Get same values.
t.manual_seed(0)

# nt = new tensor. Make tensor of size 30, (0-29)
nt = t.rand(30)
print(f'New tensor:\n{nt}\nNew tensor SIZE: {nt.shape}\n\n')

# Create a linear layer which only takes 1d tensors of 5, and 2d tensors of (n, 5).
ll = t.nn.Linear(5, 1)

# Try reshaping the new tensor to be in form (n, 5). First get size and check if its divisible by embed_dim (ed) 5
ed = 5
cur_mat_size = nt.size()[0]

# Can 30 be divided by 5? If so, we can make a new tensor evenly.
if cur_mat_size % ed == 0:
  rows_for_reshape = int(cur_mat_size / ed)

  # nm = new matrix.
  nm = t.reshape(nt, (rows_for_reshape, ed))
  print(f'New reshaped matrix is:\n{nm}\nNew reshaped matrix shape: {nm.shape}\n\n')

  print(ll(nm))

New tensor:
tensor([0.4963, 0.7682, 0.0885, 0.1320, 0.3074, 0.6341, 0.4901, 0.8964, 0.4556,
        0.6323, 0.3489, 0.4017, 0.0223, 0.1689, 0.2939, 0.5185, 0.6977, 0.8000,
        0.1610, 0.2823, 0.6816, 0.9152, 0.3971, 0.8742, 0.4194, 0.5529, 0.9527,
        0.0362, 0.1852, 0.3734])
New tensor SIZE: torch.Size([30])


New reshaped matrix is:
tensor([[0.4963, 0.7682, 0.0885, 0.1320, 0.3074],
        [0.6341, 0.4901, 0.8964, 0.4556, 0.6323],
        [0.3489, 0.4017, 0.0223, 0.1689, 0.2939],
        [0.5185, 0.6977, 0.8000, 0.1610, 0.2823],
        [0.6816, 0.9152, 0.3971, 0.8742, 0.4194],
        [0.5529, 0.9527, 0.0362, 0.1852, 0.3734]])
New reshaped matrix shape: torch.Size([6, 5])


tensor([[-0.3574],
        [-0.8912],
        [-0.4575],
        [-0.5929],
        [-0.6101],
        [-0.3124]], grad_fn=<AddmmBackward0>)


### 6.2) Updating embedding weights, using Gensim, model creation, save/load model

In [30]:
import gensim.downloader as api

'''

Commenting this out because it takes forever to download

'''

# # w2v has word embeddings and pytorch neural nets work with them.
# w2v = api.load('word2vec-google-news-300')

# print(type(w2v['taco'])) # Returns numpy.ndarray
# print(w2v['taco'][:10]) # Get first 10 values of 300 embedded vector.

# ''' [('Jackson', 0.5326348543167114),
#      ('Prince', 0.5306329727172852),
#      ('Tupou_V.', 0.5292826294898987),
#      ('KIng', 0.5227501392364502),
#      ('e_mail_robert.king_@', 0.5173623561859131)] '''
# print(w2v.similar_by_word('King', topn=5)) # This got the values in below comment.

# '''
# w is vehicle and x is 0.7821096181869507
# w is cars and x is 0.7423831224441528
# w is SUV and x is 0.7160962224006653
# w is minivan and x is 0.6907036900520325
# w is truck and x is 0.6735789775848389
# w is Car and x is 0.6677608489990234
# w is Ford_Focus and x is 0.667320191860199
# w is Honda_Civic and x is 0.6626849174499512
# w is Jeep and x is 0.651133120059967
# w is pickup_truck and x is 0.6441438794136047 '''

# # This gets comment above.
# for w, x in w2v.most_similar('car'):
#   print(f'w is {w} and x is {x}')

'\n\nCommenting this out because it takes forever to download\n\n'

In [31]:
# A bit quicker to download.
te = api.load("glove-twitter-25")



In [32]:
''' .vocab can't be used with this set of embeddings, so key_to_index returns a dictionary. Also
     index_to_key returns a list which is much better.'''
len(te.key_to_index)

1193514

In [33]:
all_words = te.index_to_key
selected_word = all_words[0]
emb = te[selected_word]

print(f'Word: {selected_word}\nEmbedding FOR {selected_word}:\n{emb}')

Word: <user>
Embedding FOR <user>:
[ 0.62415   0.62476  -0.082335  0.20101  -0.13741  -0.11431   0.77909
  2.6356   -0.46351   0.57465  -0.024888 -0.015466 -2.9696   -0.49876
  0.095034 -0.94879  -0.017336 -0.86349  -1.3348    0.046811  0.36999
 -0.57663  -0.48469   0.40078   0.75345 ]


In [34]:
''' The concept of this is literally one embedding layer borrowing/taking embedding values.
    In a real model, there might be certain words that don't have proper embedding values at
    all. So why not plug those holes up? First I'll use an example embedding layer. It'll
    take a vocab size and output a embed_dim size tensor.

    The issue that also happened in the EmbedClassifier model should be talked about here.
    The second error that happened was "IndexError: index out of range in self". Basically,
    to sum the error, if there's a tensor like t.tensor([15]) and the embedding layer is:
    t.nn.Embedding(num_embeddings=15, embedding_dim=5), we can't pass the tensor to it. The
    embedding first arg is the MAX value for ALL elements in a given tensor. So I need to be
    careful about the size of the embedding layer.

    Errors:
    1) "RuntimeError: a view of a leaf Variable that requires grad is being used in an
        in-place operation."

        Solution: el.weight.requires_grad = False '''

# Get max length of vocab dictionary.
el = t.nn.Embedding(num_embeddings=len(te.key_to_index), embedding_dim=25)
el.weight.requires_grad = False

num_words = 5

# Loop for first 5 words.
for i, x in zip(range(num_words), all_words):
  print(f'-----Index {i}-----\nWord: {x}\nWord embedding in glove embeddings:\n{te[x]}\n\n')
  print(f'--- Previous model embedding:\n{el.weight[i]}')
  # Update the embedding with embedding in downloaded word embeddings (currently "glove-twitter-25")
  el.weight[i] = t.tensor(te[x], dtype=t.float32)
  print(f'--- UPDATE model embedding:\n{el.weight[i]}\n\n')

# Get all new weights and display them.
updated_weights = el.weight[:num_words]
print(f'New updated first {num_words} word embeddings in embedding layer:\n{updated_weights}')

-----Index 0-----
Word: <user>
Word embedding in glove embeddings:
[ 0.62415   0.62476  -0.082335  0.20101  -0.13741  -0.11431   0.77909
  2.6356   -0.46351   0.57465  -0.024888 -0.015466 -2.9696   -0.49876
  0.095034 -0.94879  -0.017336 -0.86349  -1.3348    0.046811  0.36999
 -0.57663  -0.48469   0.40078   0.75345 ]


--- Previous model embedding:
tensor([ 0.4397,  0.1124,  0.6408,  0.4412, -0.2159, -0.7425,  0.5627,  0.2596,
         0.5229,  2.3022, -1.4689, -1.5867,  1.2032,  0.0845, -1.2001, -0.0048,
        -0.2303, -0.3918,  0.5433, -0.3952,  0.2055, -0.4503, -0.5731, -0.5554,
        -1.5312])
--- UPDATE model embedding:
tensor([ 0.6241,  0.6248, -0.0823,  0.2010, -0.1374, -0.1143,  0.7791,  2.6356,
        -0.4635,  0.5746, -0.0249, -0.0155, -2.9696, -0.4988,  0.0950, -0.9488,
        -0.0173, -0.8635, -1.3348,  0.0468,  0.3700, -0.5766, -0.4847,  0.4008,
         0.7534])


-----Index 1-----
Word: .
Word embedding in glove embeddings:
[ 0.69586  -1.1469   -0.41797  -0.022311 

In [35]:
''' Basic model. I desired to see how certain things worked.

    Error(s) resolved:

    1) "RuntimeError: mat1 and mat2 shapes cannot be multiplied (3x5 and 3x5)"
        Why did this happen? Ex: I remembered from a EmbedClassifier model I had
        this code:
        "self.embedding = t.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
         self.fc = t.nn.Linear(embed_dim, num_class)"
        vocab_size = 75
        embed_dim = 5
        num_class = 3
        So the linear layer is EXPECTING something of size 5. To quote my own comment
        "So the linear layer can take a 2d matrix as long as the columns of the 2d matrix
        (the "5" in 29,5) matches the linear layers INPUT (input is also 5)."

          - Dealing with 2d tensors.
        The above explanation applies to 1d tensors. In THIS case with the code below,
        the input of t.ones is 3,5 which is 3 rows and 5 columns. The linear layer is:
        "Linear(in_features=3, out_features=5, bias=True)". So in this case the t.ones
        tensor is 3 rows and 5 columns but it needs 5 rows and 3 columns, and that's why
        the code uses transpose with "preds = tnn(x.T)" which literally flips the
        dimensions FROM 3,5, to 5,3. As long as the matrix columns match the
        input of the Linear layer, it works. '''

model_h = 3
model_w = 5

class NeuralNetwork(t.nn.Module):
  def __init__(self):
    super().__init__()

    # ll = linear layer.
    self.layers = t.nn.Sequential(
        t.nn.Linear(model_h, model_w, 10)
    )

  def forward(self, x):
    logits = self.layers(x)
    return logits

# test neural network = tnn
tnn = NeuralNetwork()

# Create dummy data
x = t.ones((model_h, model_w))

preds = tnn(x.T)
print(preds)

tensor([[-1.4528,  0.6201,  0.6881,  1.1022, -0.4014],
        [-1.4528,  0.6201,  0.6881,  1.1022, -0.4014],
        [-1.4528,  0.6201,  0.6881,  1.1022, -0.4014],
        [-1.4528,  0.6201,  0.6881,  1.1022, -0.4014],
        [-1.4528,  0.6201,  0.6881,  1.1022, -0.4014]],
       grad_fn=<AddmmBackward0>)


In [36]:
# --- Save a model
import os

# ndn = new dir name
ndn = "testmodel"

curdir = os.getcwd()

# cd = combined dir
cd = os.path.join(curdir, ndn + ".pth")

# create dir
print(cd)

t.save(tnn.state_dict(), cd)

/content/testmodel.pth


In [37]:
# --- load model

# instantiate one first. lnn = loaded neural network
lnn = NeuralNetwork()
lnn.load_state_dict(t.load(cd))

lnn

NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=3, out_features=5, bias=True)
  )
)

### 6.3) Glove embeddings

In [38]:
# ttv = torchtext vocab
ttv = torchtext.vocab.GloVe(name='6B', dim=50)

.vector_cache/glove.6B.zip: 862MB [02:39, 5.39MB/s]                           
100%|█████████▉| 399999/400000 [00:15<00:00, 25397.90it/s]


In [39]:
# Very similar to the vocab created earlier since this has stoi and itos as well
ttv["Car"]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.])

In [40]:
ttv.stoi

{'the': 0,
 ',': 1,
 '.': 2,
 'of': 3,
 'to': 4,
 'and': 5,
 'in': 6,
 'a': 7,
 '"': 8,
 "'s": 9,
 'for': 10,
 '-': 11,
 'that': 12,
 'on': 13,
 'is': 14,
 'was': 15,
 'said': 16,
 'with': 17,
 'he': 18,
 'as': 19,
 'it': 20,
 'by': 21,
 'at': 22,
 '(': 23,
 ')': 24,
 'from': 25,
 'his': 26,
 "''": 27,
 '``': 28,
 'an': 29,
 'be': 30,
 'has': 31,
 'are': 32,
 'have': 33,
 'but': 34,
 'were': 35,
 'not': 36,
 'this': 37,
 'who': 38,
 'they': 39,
 'had': 40,
 'i': 41,
 'which': 42,
 'will': 43,
 'their': 44,
 ':': 45,
 'or': 46,
 'its': 47,
 'one': 48,
 'after': 49,
 'new': 50,
 'been': 51,
 'also': 52,
 'we': 53,
 'would': 54,
 'two': 55,
 'more': 56,
 "'": 57,
 'first': 58,
 'about': 59,
 'up': 60,
 'when': 61,
 'year': 62,
 'there': 63,
 'all': 64,
 '--': 65,
 'out': 66,
 'she': 67,
 'other': 68,
 'people': 69,
 "n't": 70,
 'her': 71,
 'percent': 72,
 'than': 73,
 'over': 74,
 'into': 75,
 'last': 76,
 'some': 77,
 'government': 78,
 'time': 79,
 '$': 80,
 'you': 81,
 'years': 82,
 'i

In [41]:
ttv.itos[379]

'born'

### 6.4) Etc Neural Network related topics/code

In [42]:
from torchvision.transforms import ToTensor, Lambda

# --- lambdas

# simple one. Type is <class 'function'>
a = lambda x: x * 2
print(f'Custom lambda: {a(2)}\nCustom lambda TYPE: {type(a)}\n\n')

''' this type is:
torchvision.transforms.transforms.Lambda
Does the same thing of course. All that's
done is the previous lambda is given to it. '''
b = Lambda(a)
print(f'torchvision.transforms Lambda: {b(4)}\ntorchvision.transforms Lambda TYPE: {type(b)}')

Custom lambda: 4
Custom lambda TYPE: <class 'function'>


torchvision.transforms Lambda: 8
torchvision.transforms Lambda TYPE: <class 'torchvision.transforms.transforms.Lambda'>


In [43]:
import torch.nn.functional as f

''' One hot encoding

"Used to indicate the presence
of a value and lack of presence
of other values."

Below goes in order in each
Tensor. Like index 0,1,2,3,4,5
Will be indicative of the slot
For each num. Hence why 5s tensor
Gets ITS 1 at end. '''

l = t.tensor([5,1,0,1,0])
f.one_hot(l, num_classes=6)

tensor([[0, 0, 0, 0, 0, 1],
        [0, 1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0]])

In [44]:
''' Loss function example. Loss funcs get difference between predictions
    and ground truths.

    Also requires_grad is used. Needed because back propagation to calculate
    gradients for each tensor that has "requires_grad" to true.

    The first tensor is the ground truths aka what the CORRECT values
    are, and the predictions are what a model would predict for example.
    The model got 4 out of 5 correct but there's more math in the background
    of the loss function going on, hence why the loss value isn't 0.8
    exactly. '''

lf = t.nn.CrossEntropyLoss()

ground_truths = t.tensor([1,0,1,1,0], dtype=t.float32, requires_grad=True)
preds = t.tensor([1,1,1,1,0], dtype=t.float32, requires_grad=True)

l = lf(ground_truths, preds)

print(f'Loss: {l}')

l.backward()

Loss: 6.271803855895996


In [45]:
# See the gradients mentioned above.
print(ground_truths.grad)
print(preds.grad)

tensor([ 0.0707, -0.6061,  0.0707,  0.0707,  0.3939])
tensor([1.3180, 2.3180, 1.3180, 1.3180, 2.3180])


In [46]:
# Stop ts from computational history. Sometimes computation isn't needed.
z = preds.detach()
print(z.requires_grad)

False


In [47]:
''' Flatten layer. Interesting because it dramatically changes the shape

    Testing an images nn input with 28 height and width
    Flatten - This helps it work. Because the code
    "t.ones((num, num)), gets a 2d matrix of say
    size (28,28). But the linear layer
    requires it to be 1d due to the input
    being num * num.

    Also you can change the height/width as much as
    you want because at the end of the day the
    multiplication math will work. Definitely works for 3d and up
    as well  '''

height = 25
width = 28

# tt = test tensor. Of proper size.
tt = t.ones((height, width))

# 28 * 28 = 784
ll = t.nn.Linear(height * width, 100)

ll(tt.flatten())

tensor([ 0.0741,  0.4089,  0.3737, -0.2707, -0.0272, -0.4843, -0.0669, -0.1538,
        -0.2415, -0.3134, -1.3957,  0.7815, -1.3746,  0.7480, -0.0760, -0.4360,
         1.1383,  0.4808,  0.6637,  0.3833, -0.7154,  0.5242, -0.3230,  0.7998,
        -0.9154,  0.1342,  0.4628,  0.6280, -0.0844,  1.2271, -0.1102, -1.0103,
         0.4213,  0.7421,  0.5243, -0.5112,  0.2357, -0.0971, -0.8332, -0.0836,
        -0.4302, -0.1669,  0.6612, -1.2081,  0.1913, -0.5197, -0.6149, -0.0942,
         0.9280, -0.0220, -0.6618,  0.6094,  0.3318,  0.1174, -0.9837, -1.1060,
         0.1541,  0.3663, -0.3196,  0.1374,  0.3263, -0.5648, -0.5835,  0.4989,
         0.0666,  0.1372,  0.7819,  0.1181, -0.5437,  1.0324,  0.9498, -0.0302,
        -0.1829, -0.0233, -0.4555, -0.6643,  0.3441,  0.2438,  0.2928, -0.2203,
        -0.0129, -0.4925, -1.5830,  0.5887, -1.0196,  0.0602,  0.3125,  0.1881,
         0.2223, -0.6951,  0.3112,  0.8646,  0.2355,  0.3442,  0.0583, -0.3290,
        -0.8411,  0.5196, -0.0149, -0.09

# 7) Data pipes

In [48]:
import torchdata.datapipes as dp

''' these are good for handling
Data. Different types of data
Pipes as well. Choosing one that
Fits needs/requirements is key
'''

x = t.tensor([5,4,7])

p = dp.iter.IterableWrapper(x)

for y in p:
   print(y)

tensor(5)
tensor(4)
tensor(7)


In [49]:
fl = dp.iter.FileLister(curdir)

print(curdir)

''' Prints the model path created earlier. If getting
    a sizeable number of files is necessary, FileLister is definitely
    the data pipe to go with. '''
for x in fl:
   print(x)

/content
/content/testmodel.pth


In [50]:
# convert to list, if desired
dpl = list(p)
dpl[1]

tensor(4)

In [51]:
# Mapping helps change all elements in simple code.
for x in p:
  print(x)
print('\n')

def multiply(val):
  return val * 2

# p = p.map(lambda x: x * 2) # Can do lambda but a not supported warning appears.
p = p.map(multiply)

for x in p:
  print(x)

tensor(5)
tensor(4)
tensor(7)


tensor(10)
tensor(8)
tensor(14)


# 8) Build vocab from iterator

In [52]:
from torchtext.vocab import build_vocab_from_iterator

''' PyTorch way to get a vocab
And a vocab is numerical versions
Of each word.
'''

# sentences for BASIC tokenization
s = [
   "This is a test",
   "Gaming is still fun but I don't have time for much of it anymore",
   "Hello there sir, how are you?"
]
print(f'Sentences:\n{s}\n')

''' ts = tokenized sentences. cs = current sentences.
    Loop through all sentences and get each word into
    lowercase and split the sentence up word by word.
'''
ts = [cs.lower().split() for cs in s]
print(f'TOKENIZED Sentences:\n{ts}\n')

# yield pauses when func is looped and continues where it left off
def get_ts(cs):
  for s in cs:
    yield s.lower().split()

''' if v gets given a word it
 doesn't know, it'll give unk '''
v = build_vocab_from_iterator(get_ts(s),
                              specials=["<pad>", "<unk>"],
                              special_first=True)

# stoi is string to int. Take notice to the special tokens.
v.get_stoi()

Sentences:
['This is a test', "Gaming is still fun but I don't have time for much of it anymore", 'Hello there sir, how are you?']

TOKENIZED Sentences:
[['this', 'is', 'a', 'test'], ['gaming', 'is', 'still', 'fun', 'but', 'i', "don't", 'have', 'time', 'for', 'much', 'of', 'it', 'anymore'], ['hello', 'there', 'sir,', 'how', 'are', 'you?']]



{'you?': 24,
 'there': 21,
 'test': 20,
 'still': 19,
 'sir,': 18,
 'time': 23,
 'this': 22,
 'of': 17,
 'much': 16,
 '<unk>': 1,
 'hello': 12,
 'i': 14,
 '<pad>': 0,
 'for': 8,
 'a': 3,
 'are': 5,
 'is': 2,
 'anymore': 4,
 "don't": 7,
 'it': 15,
 'how': 13,
 'but': 6,
 'gaming': 10,
 'fun': 9,
 'have': 11}

In [53]:
from torchtext.data.utils import get_tokenizer

tk = get_tokenizer('basic_english')

''' Quick comparison with the tokenizer torchtext
    has.
    Using get_tokenizer:
    1) s[0] - gets sentence.
    2) tk() - tokenizes that sentence.
    3) v - Pass to vocab to get numerical
        representations of words.

    Using my method:
    1) ts[0] - Sentence tokenized already with
        list comprehension code:
        "[cs.lower().split() for cs in s]"
        cs = current sentence

    2) v - Pass to vocab to get numerical
        representations of words. '''



print(v(tk(s[0])))
print(v(ts[0]))

# ns = num sentence
ns = v(ts[0])

[22, 2, 3, 20]
[22, 2, 3, 20]


# 9) Transforms

In [54]:
import torchtext.transforms as T

''' Transforms are good for altering text in a step by step manner. Similar to how a neural network class has a forward func
    which passes the output of one layer as input to the next.

      1) VocabTransform - Just takes the vocab and does exactly as seen in previous cell.

      2) AddToken - Can add a numerical token that'll do some sort of special job. Ex: the unk token signifies if any
          unknown values to the vocab are given. The primary reason why the output shows both pad and unk tokens is
          because earlier when the build vocab from iterator object was created, this line was present:
          "specials=["<pad>", "<unk>"]" and 0 was assigned to pad and 1 was assigned to unk. It literally
          shows that when we access the vocab dictionary with ".get_itos()"



3) AddToken - This time put it in the end

'''

x = T.Sequential(
    T.VocabTransform(vocab=v),
    T.AddToken(0, begin=True),
    T.AddToken(1, begin=False)
)

# Get numerical sentence WITH tokens
ns = x(ts[0])
print(f'Numerical sentence:\n{ns}')

# Convert sentence back to text
vocab_dict = v.get_itos()

# Loop through all the nums in numerical sentence (ns)
for i in ns:
  print(vocab_dict[i], end=" ")

Numerical sentence:
[0, 22, 2, 3, 20, 1]
<pad> this is a test <unk> 

# 10) Batches & Lambdas

In [55]:
''' Networks are always trained in batches. Data pipes have BucketBatch functions
    in them. Can give them a regular list, tensor, whatever iterable desired.
    1) batch_size - Size of each batch
    2) batch_num - # of batches IN a bucket.
    3) drop_last - If there's leftovers like if batch size if 4, but the iterator
        has 15 elements, the first 3 batches will have 4 each. A total of 12 values
        used so far. Setting drop_last to true will get rid of the final 3
        values in the last batch.

 '''

p = dp.iter.IterableWrapper(range(15))

x = p.bucketbatch(batch_size=4)

for i in x:
  print(i)

[3, 11, 9, 7]
[8, 2, 14, 1]
[12, 0, 4, 13]
[6, 5, 10]


In [56]:
''' Can use lambdas with the tokenizer
    and vocab for quick text editing.

    Remember variable "s" has multiple
    sentences in it.

    tl = tokenize lambda
    vl = vocab lambda. Input MUST be
      tokenized. always the specific vocab
      built in code below was created
      with build_vocab_from_iterator
      with the sentence in the "s"
      variable.
'''

tl = lambda x: tk(x)
print(f'Real sentence:\n{s[0]}\nTokenized sentence:\n{tl(s[0])}\n\n')

vl = lambda y: v(tk(s[0]))
print(f'Real sentence:\n{s[0]}\nNumerial sentence:\n{vl(s[0])}\n\n')

print(f'View vocab itos to see above numerical sentence MATCHES the vocab:')
for key, value in v.get_stoi().items():
  print(f'Key: {key} - Value: {value}')

Real sentence:
This is a test
Tokenized sentence:
['this', 'is', 'a', 'test']


Real sentence:
This is a test
Numerial sentence:
[22, 2, 3, 20]


View vocab itos to see above numerical sentence MATCHES the vocab:
Key: you? - Value: 24
Key: there - Value: 21
Key: test - Value: 20
Key: still - Value: 19
Key: sir, - Value: 18
Key: time - Value: 23
Key: this - Value: 22
Key: of - Value: 17
Key: much - Value: 16
Key: <unk> - Value: 1
Key: hello - Value: 12
Key: i - Value: 14
Key: <pad> - Value: 0
Key: for - Value: 8
Key: a - Value: 3
Key: are - Value: 5
Key: is - Value: 2
Key: anymore - Value: 4
Key: don't - Value: 7
Key: it - Value: 15
Key: how - Value: 13
Key: but - Value: 6
Key: gaming - Value: 10
Key: fun - Value: 9
Key: have - Value: 11


# 10) Packed Sequences

In [57]:
''' What are packed sequences?
    Mainly for RNN.

    [[1,2,3,4,5],
     [6,7,8,0,0],
     [9,0,0,0,0]]"
    Len is 5,3,1. RNN cells train with 1,6,9. Then 2,7. Then 3,8 Etc. So packed sequence is one
    vec "[1,6,9,2,7,3,8,4,5]" Len of vec still 5,3,1.

    First we need sentences of different lengths. "ts" is from section 1, tokenized sentences. '''

for tks in ts:
  print(tks)

['this', 'is', 'a', 'test']
['gaming', 'is', 'still', 'fun', 'but', 'i', "don't", 'have', 'time', 'for', 'much', 'of', 'it', 'anymore']
['hello', 'there', 'sir,', 'how', 'are', 'you?']


In [58]:
# encode them, function in section 1.
num_sents = [encode(tks) for tks in ts]

for ns in num_sents:
  print(ns)

[22, 2, 3, 20]
[10, 2, 19, 9, 6, 14, 7, 11, 23, 8, 16, 17, 15, 4]
[12, 21, 18, 13, 5, 24]


In [59]:
# Now pad. This code was written already in section 1 as well.
padded_sents = [t.nn.functional.pad(t.tensor(sent), (0, max_length - len(sent))) for sent in num_sents]

padded_sents

[tensor([22,  2,  3, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0]),
 tensor([10,  2, 19,  9,  6, 14,  7, 11, 23,  8, 16, 17, 15,  4,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0]),
 tensor([12, 21, 18, 13,  5, 24,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0])]

In [60]:
# Create 2d tensor of 0s to be used and have the num sents added to them soon.
a = t.zeros((len(padded_sents), len(padded_sents[0])))

# Get lengths of each numerical sentence
num_sent_lens = [len(s) for s in num_sents]
print(f'Num sent lengths: {num_sent_lens}')

# Get longest one, in this case it's 42.
max_len = max(num_sent_lens)
print(f'Max length: {max_len}\n')

''' Loop over the numerical sentences ALONG with the specific lengths. But do so with index,
    that's why enumerate is used with "i". Remember "a" is a 2d tensor and I can access each
    index with "a[i, etc]". num_sent will be current numerical sentence, and same concept for
    sent_len, which will be the current length.

    Ex numerical sentence: " 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.,
    10., 12., 13., 14.,  0., 15., 16., 17., 18., 19., 20., 14., 21., 22., 23., 24.,
    2."
    Ex length: 29

    So if i is 0, then in the tensor of 0's, which is "a", go to the tensor in index 0
    which would be "[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
    0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
    0., 0., 0., 0.]" and UP UNTIL the numerical sentence length, which is ":sent_len",
    apply the real numerical sentence. '''
for i, (num_sent, sent_len) in enumerate(zip(num_sents, num_sent_lens)):
  a[i, :sent_len] = t.FloatTensor(num_sent)

a

Num sent lengths: [4, 14, 6]
Max length: 14



tensor([[22.,  2.,  3., 20.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [10.,  2., 19.,  9.,  6., 14.,  7., 11., 23.,  8., 16., 17., 15.,  4.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [12., 21., 18., 13.,  5., 24.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [61]:
t.LongTensor(num_sent_lens)

tensor([ 4, 14,  6])

In [62]:
x, y = t.LongTensor(num_sent_lens).sort(0, descending=True)

print(x)
print(y)

tensor([14,  6,  4])
tensor([1, 2, 0])


# 11) Character encoding

In [63]:
# build vocab for chars

# counter keeps track
c = collections.Counter()

# need ds
ds = [
   (0, "A test sentence"),
   (1, "I'm here studying"),
   (1, "Tacos are unhealthy"),
   (0, "The Superbowl is over")
]

# ct is current tuple
for ct in ds:
   ''' list cast turns sentence
   into list of individual letters
   counter has dictionary in it.
   Keys are letters/chars but nums
   are how many times. Ex: letter
   "A" was only seen once so it
   is "'A':1" '''
   c.update(list(ct[1]))

# get stoi is dictionary
# can use stoi & itos
v = torchtext.vocab.vocab(c)
v.get_stoi()

{'w': 24,
 'l': 20,
 'S': 21,
 'o': 19,
 'a': 18,
 'T': 17,
 'g': 16,
 'i': 15,
 'y': 14,
 'v': 25,
 's': 4,
 'I': 7,
 'r': 11,
 't': 2,
 'e': 3,
 'd': 13,
 'n': 5,
 'c': 6,
 'b': 23,
 'A': 0,
 "'": 8,
 'm': 9,
 'p': 22,
 ' ': 1,
 'h': 10,
 'u': 12}

In [64]:
# func for encoding. c = current character

def encode_chars(s):
   return [v[cc] for cc in list(s)]

encode_chars(ds[0][1])

[0, 1, 2, 3, 4, 2, 1, 4, 3, 5, 2, 3, 5, 6, 3]

# 12) Bert

In [65]:
from transformers import BertTokenizer

''' Bert is a cool model for
natural language processing (nlp)
and the models let us skip the
sometimes tedious and long process
Of training a large model.
'''

b = BertTokenizer.from_pretrained('bert-base-uncased',
do_lower_case=True)

b

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [66]:

''' The comment below is from my own personal Kaggle project found
Here: https://www.kaggle.com/code/omarmoodie/social-media-pytorch-huggingface

"Using the tokenizer this way gets input_ids which are string
to num conversions, and also puts in special tokens like 101,
and 102 for us. Then includes the token_type_ids and
attention_mask."

Look at previous cell to see the special tokens mentioned
In the comment.

1) input IDs - string to num conversions according to the vocabulary.
2) Token type IDs - Pays attention to certain parts of the converted
string.
3) Attention mask - Which of the converted nums to pay attention to

All the above probably explained better than me, here:
https://huggingface.co/docs/transformers/en/glossary

'''

ts = "this is a test"
b(ts)

{'input_ids': [101, 2023, 2003, 1037, 3231, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [67]:
# .encode only gets input IDs
b.encode(ts)

[101, 2023, 2003, 1037, 3231, 102]

In [68]:
''' encode plus mainly gives
Extra OPTIONS to the user.
Example below where I made sure add
Special tokens was false so they
Don't appear.

See: https://stackoverflow.com/questions/61708486/whats-difference-between-tokenizer-encode-and-tokenizer-encode-plus-in-hugging
And also see: https://huggingface.co/transformers/v2.11.0/main_classes/tokenizer.html
'''
b.encode_plus(ts,
              add_special_tokens=False)

{'input_ids': [2023, 2003, 1037, 3231], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

# 13) Predictons

In [69]:
''' Not every portion of code aimed at model predictions is the same, but the below
    few cells represent some ways I've collectively seen.

    argmax(1) is useful because predictions for classes can use it. If there's
    5 classes, like "bike, car, scooter, skateboard, jetski", all numbered 0-4,
    then argmax will get the index of what the model predicted. I mainly did the
    above code to test argmax things on a 2d tensor. Why? Because argmax(1)
    goes row by row in a 2d column to get the index of the HIGHEST num in the
    inner/nested tensor. But argmax(0) only works with 1d tensors.

    Ex code:
    "r = t.tensor([3,5,7,22,5])
     t.argmax(r, dim=0)"
    Gives: tensor(3).
    And 3 is the index of course.

'''

# Same set of nums.
t.manual_seed(0)

# tt = test tensor, 2d shape
tt = t.rand((5,5))

# Create labels/ground_truths. tl = test labels
tl = {
    'bike':0,
    'car':1,
    'scooter':2,
    'skateboard':3,
    'jetski':4
}

print(f'Test labels:\n{tl}\n\nTest tensor:\n{tt}\n\n')

# Dummy model predictions.
x = tt.argmax(1)
print(f'"Model" predictions:\n{x}\n\n')

# rtl = reversed test labels dictionary. Get labels from test labels
rtl = {value:key for key, value in tl.items()}

# Then convert the indices to real labels
text_preds = [rtl[num.item()] for num in x]
print(f'Real predicted labels:\n{text_preds}\n\n')


''' Since manual seed is used, when the code runs again, outputs
    won't be the same. So to explain how it works, the tensor I'm
    CURRENTLY seeing is:
    "tensor([[0.4963, 0.7682, 0.0885, 0.1320, 0.3074],
        [0.6341, 0.4901, 0.8964, 0.4556, 0.6323],
        [0.3489, 0.4017, 0.0223, 0.1689, 0.2939],
        [0.5185, 0.6977, 0.8000, 0.1610, 0.2823],
        [0.6816, 0.9152, 0.3971, 0.8742, 0.4194]])"

    First row the highest index is 1, with the value of 0.7682.
    Hence why doing "tt[0].argmax(0)" gets 1. We can take that index
    and translate it into a str prediction with the list using
    "text_preds[tt[0].argmax(0)]". Since scooter is index 1, this gives
    us scooter. '''
text_preds[tt[0].argmax(0)]

Test labels:
{'bike': 0, 'car': 1, 'scooter': 2, 'skateboard': 3, 'jetski': 4}

Test tensor:
tensor([[0.4963, 0.7682, 0.0885, 0.1320, 0.3074],
        [0.6341, 0.4901, 0.8964, 0.4556, 0.6323],
        [0.3489, 0.4017, 0.0223, 0.1689, 0.2939],
        [0.5185, 0.6977, 0.8000, 0.1610, 0.2823],
        [0.6816, 0.9152, 0.3971, 0.8742, 0.4194]])


"Model" predictions:
tensor([1, 2, 1, 2, 1])


Real predicted labels:
['car', 'scooter', 'car', 'scooter', 'car']




'scooter'

In [70]:
# Simplier example of above
# --- Testing argmax again

classes = ["Taco", "John", "Dog"]

test_preds = t.tensor([
   [0.2, 0.1, 0.5],
   [0.85, 0.50, 0.3],
   [0.2, 0.92, 0.55]
])

''' get first row of 2d preds
And get argmax 0 which gets the
Highest value by its index in
That row. Then of course apply
That to to the list
'''
classes[test_preds[0].argmax(0)]

'Dog'

In [71]:
''' 3) Predictons and labels
       In code projects something like "torch.sum(predictions == labels).item()" so
       why not mess around with that here.

       x = ground truths
       y = predictions
'''

x = t.tensor([1,0,1,1,1,1,0,1]) # ground truths
y = t.tensor([1,1,0,1,1,1,0,1]) # predictions

''' == will get boolean vec on which columns MATCH. Ex in x and y above, index 0 is
    both 1, so that's true, they equal eachother. Then the sum just how many Trues and
    adds them since in a numerical fashion True is 1 and False is 0. So:
    t.sum(tensor([ True, False, False,  True,  True,  True,  True,  True])) will be 6.
    And of course .item just gets the number itself, out of the tensor. '''
# print(x == y)
# print(t.sum(x == y))
print(t.sum(x == y).item())

6


# 14) Tensor Dataset

In [72]:
from torch.utils.data import TensorDataset

''' Takes a tensor and makes each
Element an individual tensor.
'''

x = t.tensor([7,5,9])
td = TensorDataset(x)

for i in td:
   print(i)

(tensor(7),)
(tensor(5),)
(tensor(9),)


In [73]:
from torchtext import models

''' different models that require
Different args. Weights can also
Be set to true to utilize them.
'''

# tm = test model
tm = models.XLMR_LARGE_ENCODER

tm

RobertaBundle(_encoder_conf=RobertaEncoderConf(vocab_size=250002, embedding_dim=1024, ffn_dimension=4096, padding_idx=1, max_seq_len=514, num_attention_heads=16, num_encoder_layers=24, dropout=0.1, scaling=None, normalize_before=False), _path='https://download.pytorch.org/models/text/xlmr.large.encoder.pt', _head=None, transform=<function <lambda> at 0x78ca2b84a4d0>)