In [None]:
data = """ Various subfields of AI research are centered around particular goals and the use of particular tools. The traditional goals of AI research include learning, reasoning, knowledge representation, planning, natural language processing, perception, and support for robotics.[a] To reach these goals, AI researchers have adapted and integrated a wide range of techniques, including search and mathematical optimization, formal logic, artificial neural networks, and methods based on statistics, operations research, and economics.[b] AI also draws upon psychology, linguistics, philosophy, neuroscience, and other fields.[4] Some companies, such as OpenAI, Google DeepMind and Meta,[5] aim to create artificial general intelligence (AGI)—AI that can complete virtually any cognitive task at least as well as a human.

Artificial intelligence was founded as an academic discipline in 1956,[6] and the field went through multiple cycles of optimism throughout its history,[7][8] followed by periods of disappointment and loss of funding, known as AI winters.[9][10] Funding and interest vastly increased after 2012 when graphics processing units started being used to accelerate neural networks and deep learning outperformed previous AI techniques.[11] This growth accelerated further after 2017 with the transformer architecture.[12] In the 2020s, an ongoing period of rapid progress in advanced generative AI became known as the AI boom. Generative AI's ability to create and modify content has led to several unintended consequences and harms, which has raised ethical concerns about AI's long-term effects and potential existential risks, prompting discussions about regulatory policies to ensure the safety and benefits of the technology."""

In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
tokenizer = Tokenizer()

In [9]:
tokenizer.fit_on_texts([data])

In [10]:
tokenizer.word_index

{'and': 1,
 'of': 2,
 'ai': 3,
 'the': 4,
 'to': 5,
 'as': 6,
 'research': 7,
 'goals': 8,
 'a': 9,
 'artificial': 10,
 'in': 11,
 'particular': 12,
 'learning': 13,
 'processing': 14,
 'techniques': 15,
 'neural': 16,
 'networks': 17,
 'create': 18,
 'intelligence': 19,
 'an': 20,
 'funding': 21,
 'known': 22,
 'after': 23,
 'generative': 24,
 "ai's": 25,
 'has': 26,
 'about': 27,
 'various': 28,
 'subfields': 29,
 'are': 30,
 'centered': 31,
 'around': 32,
 'use': 33,
 'tools': 34,
 'traditional': 35,
 'include': 36,
 'reasoning': 37,
 'knowledge': 38,
 'representation': 39,
 'planning': 40,
 'natural': 41,
 'language': 42,
 'perception': 43,
 'support': 44,
 'for': 45,
 'robotics': 46,
 'reach': 47,
 'these': 48,
 'researchers': 49,
 'have': 50,
 'adapted': 51,
 'integrated': 52,
 'wide': 53,
 'range': 54,
 'including': 55,
 'search': 56,
 'mathematical': 57,
 'optimization': 58,
 'formal': 59,
 'logic': 60,
 'methods': 61,
 'based': 62,
 'on': 63,
 'statistics': 64,
 'operations': 

In [11]:
len(tokenizer.word_index)

183

In [16]:
input_sequences = []
for sentence in data.split('\n'):
  tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]

  for i in range(1,len(tokenized_sentence)):
    input_sequences.append(tokenized_sentence[:i+1])
    print(input_sequences)

[[28, 29]]
[[28, 29], [28, 29, 2]]
[[28, 29], [28, 29, 2], [28, 29, 2, 3]]
[[28, 29], [28, 29, 2], [28, 29, 2, 3], [28, 29, 2, 3, 7]]
[[28, 29], [28, 29, 2], [28, 29, 2, 3], [28, 29, 2, 3, 7], [28, 29, 2, 3, 7, 30]]
[[28, 29], [28, 29, 2], [28, 29, 2, 3], [28, 29, 2, 3, 7], [28, 29, 2, 3, 7, 30], [28, 29, 2, 3, 7, 30, 31]]
[[28, 29], [28, 29, 2], [28, 29, 2, 3], [28, 29, 2, 3, 7], [28, 29, 2, 3, 7, 30], [28, 29, 2, 3, 7, 30, 31], [28, 29, 2, 3, 7, 30, 31, 32]]
[[28, 29], [28, 29, 2], [28, 29, 2, 3], [28, 29, 2, 3, 7], [28, 29, 2, 3, 7, 30], [28, 29, 2, 3, 7, 30, 31], [28, 29, 2, 3, 7, 30, 31, 32], [28, 29, 2, 3, 7, 30, 31, 32, 12]]
[[28, 29], [28, 29, 2], [28, 29, 2, 3], [28, 29, 2, 3, 7], [28, 29, 2, 3, 7, 30], [28, 29, 2, 3, 7, 30, 31], [28, 29, 2, 3, 7, 30, 31, 32], [28, 29, 2, 3, 7, 30, 31, 32, 12], [28, 29, 2, 3, 7, 30, 31, 32, 12, 8]]
[[28, 29], [28, 29, 2], [28, 29, 2, 3], [28, 29, 2, 3, 7], [28, 29, 2, 3, 7, 30], [28, 29, 2, 3, 7, 30, 31], [28, 29, 2, 3, 7, 30, 31, 32], [28, 29

In [17]:
input_sequences

[[28, 29],
 [28, 29, 2],
 [28, 29, 2, 3],
 [28, 29, 2, 3, 7],
 [28, 29, 2, 3, 7, 30],
 [28, 29, 2, 3, 7, 30, 31],
 [28, 29, 2, 3, 7, 30, 31, 32],
 [28, 29, 2, 3, 7, 30, 31, 32, 12],
 [28, 29, 2, 3, 7, 30, 31, 32, 12, 8],
 [28, 29, 2, 3, 7, 30, 31, 32, 12, 8, 1],
 [28, 29, 2, 3, 7, 30, 31, 32, 12, 8, 1, 4],
 [28, 29, 2, 3, 7, 30, 31, 32, 12, 8, 1, 4, 33],
 [28, 29, 2, 3, 7, 30, 31, 32, 12, 8, 1, 4, 33, 2],
 [28, 29, 2, 3, 7, 30, 31, 32, 12, 8, 1, 4, 33, 2, 12],
 [28, 29, 2, 3, 7, 30, 31, 32, 12, 8, 1, 4, 33, 2, 12, 34],
 [28, 29, 2, 3, 7, 30, 31, 32, 12, 8, 1, 4, 33, 2, 12, 34, 4],
 [28, 29, 2, 3, 7, 30, 31, 32, 12, 8, 1, 4, 33, 2, 12, 34, 4, 35],
 [28, 29, 2, 3, 7, 30, 31, 32, 12, 8, 1, 4, 33, 2, 12, 34, 4, 35, 8],
 [28, 29, 2, 3, 7, 30, 31, 32, 12, 8, 1, 4, 33, 2, 12, 34, 4, 35, 8, 2],
 [28, 29, 2, 3, 7, 30, 31, 32, 12, 8, 1, 4, 33, 2, 12, 34, 4, 35, 8, 2, 3],
 [28, 29, 2, 3, 7, 30, 31, 32, 12, 8, 1, 4, 33, 2, 12, 34, 4, 35, 8, 2, 3, 7],
 [28,
  29,
  2,
  3,
  7,
  30,
  31,
  32,
  

In [18]:
max_len = max([len(x) for x in input_sequences])
max_len

140

In [19]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_input_sequences = pad_sequences(input_sequences, maxlen = max_len, padding='pre')

In [20]:
padded_input_sequences


array([[  0,   0,   0, ...,   0,  28,  29],
       [  0,   0,   0, ...,  28,  29,   2],
       [  0,   0,   0, ...,  29,   2,   3],
       ...,
       [  0,   0,  10, ...,   1, 182,   2],
       [  0,  10,  19, ..., 182,   2,   4],
       [ 10,  19, 101, ...,   2,   4, 183]], shape=(254, 140), dtype=int32)

In [21]:
max_len = max([len(x) for x in input_sequences])
max_len

140

In [25]:
x= padded_input_sequences[:,:-1]
x

array([[  0,   0,   0, ...,   0,   0,  28],
       [  0,   0,   0, ...,   0,  28,  29],
       [  0,   0,   0, ...,  28,  29,   2],
       ...,
       [  0,   0,  10, ..., 181,   1, 182],
       [  0,  10,  19, ...,   1, 182,   2],
       [ 10,  19, 101, ..., 182,   2,   4]], shape=(254, 139), dtype=int32)

In [23]:
y = padded_input_sequences[:,-1]
y

array([ 29,   2,   3,   7,  30,  31,  32,  12,   8,   1,   4,  33,   2,
        12,  34,   4,  35,   8,   2,   3,   7,  36,  13,  37,  38,  39,
        40,  41,  42,  14,  43,   1,  44,  45,  46,   9,   5,  47,  48,
         8,   3,  49,  50,  51,   1,  52,   9,  53,  54,   2,  15,  55,
        56,   1,  57,  58,  59,  60,  10,  16,  17,   1,  61,  62,  63,
        64,  65,   7,   1,  66,  67,   3,  68,  69,  70,  71,  72,  73,
        74,   1,  75,  76,  77,  78,  79,  80,   6,  81,  82,  83,   1,
        84,  85,  86,   5,  18,  10,  87,  19,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,   6,  99,   6,   9, 100,  19, 101,
       102,   6,  20, 103, 104,  11, 105, 106,   1,   4, 107, 108, 109,
       110, 111,   2, 112, 113, 114, 115, 116, 117, 118, 119, 120,   2,
       121,   1, 122,   2,  21,  22,   6,   3, 123, 124, 125,  21,   1,
       126, 127, 128,  23, 129, 130, 131,  14, 132, 133, 134, 135,   5,
       136,  16,  17,   1, 137,  13, 138, 139,   3,  15, 140, 14

In [26]:
x.shape

(254, 139)

In [27]:
y.shape

(254,)

In [28]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(y,num_classes=283)

In [29]:
y.shape

(254, 283)

In [30]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [31]:
model = Sequential()
model.add(Embedding(283, 100, input_length=56))
model.add(LSTM(150))
model.add(Dense(283, activation='softmax'))

model.build(input_shape=(None, 56))



In [32]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [33]:
model.summary()

In [35]:
model.fit(x,y,epochs=50)

Epoch 1/50


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 194ms/step - accuracy: 1.0000 - loss: 0.1970
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 188ms/step - accuracy: 1.0000 - loss: 0.1909
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 218ms/step - accuracy: 1.0000 - loss: 0.1857
Epoch 4/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 170ms/step - accuracy: 1.0000 - loss: 0.1797
Epoch 5/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 171ms/step - accuracy: 1.0000 - loss: 0.1747
Epoch 6/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 168ms/step - accuracy: 1.0000 - loss: 0.1693
Epoch 7/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 195ms/step - accuracy: 1.0000 - loss: 0.1652
Epoch 8/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 168ms/step - accuracy: 1.0000 - loss: 0.1602
Epoch 9/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1

<keras.src.callbacks.history.History at 0x210bbe896d0>

In [37]:
import numpy as np
import time
text = "Various subfields of"

for i in range(10):
  # tokenize
  token_text = tokenizer.texts_to_sequences([text])[0]
  # padding
  padded_token_text = pad_sequences([token_text], maxlen=56, padding='pre')
  # predict
  pos = np.argmax(model.predict(padded_token_text))

  for word,index in tokenizer.word_index.items():
    if index == pos:
      text = text + " " + word
      print(text)
      time.sleep(2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
Various subfields of ai
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step
Various subfields of ai research
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
Various subfields of ai research are
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Various subfields of ai research are centered
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
Various subfields of ai research are centered around
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
Various subfields of ai research are centered around particular
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
Various subfields of ai research are centered around particular goals
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
Various subfields of ai research are centered around particular goals and
[1m1/1[0m [32m━━━━