In [1]:
corpus = ["When we try to use a usual classifier to classify an imbalanced dataset, the model favors the majority class due to its larger volume presence",
          "You can also treat outliers as missing values. But then these missing values also have to be filled. So to fill missing values you can use any of the methods as discussed above in this article"]

In [2]:
for i in corpus:
    print(i)
    print("_"*50)

When we try to use a usual classifier to classify an imbalanced dataset, the model favors the majority class due to its larger volume presence
__________________________________________________
You can also treat outliers as missing values. But then these missing values also have to be filled. So to fill missing values you can use any of the methods as discussed above in this article
__________________________________________________


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np

In [4]:
tk = Tokenizer()

In [5]:
tk.fit_on_texts(corpus)

In [6]:
len(tk.word_counts)

46

In [7]:
len(tk.word_index)

46

In [8]:
data = tk.texts_to_sequences(corpus)

In [9]:
print(data)

[[10, 11, 12, 1, 5, 13, 14, 15, 1, 16, 17, 18, 19, 2, 20, 21, 2, 22, 23, 24, 1, 25, 26, 27, 28], [6, 7, 8, 29, 30, 9, 3, 4, 31, 32, 33, 3, 4, 8, 34, 1, 35, 36, 37, 1, 38, 3, 4, 6, 7, 5, 39, 40, 2, 41, 9, 42, 43, 44, 45, 46]]


In [10]:
inputs = []
outputs = []
for lst in data:
    for word in range(len(lst)-1):
        inputs.append(lst[:word+1])
        outputs.append(lst[word+1])


In [11]:
inputs

[[10],
 [10, 11],
 [10, 11, 12],
 [10, 11, 12, 1],
 [10, 11, 12, 1, 5],
 [10, 11, 12, 1, 5, 13],
 [10, 11, 12, 1, 5, 13, 14],
 [10, 11, 12, 1, 5, 13, 14, 15],
 [10, 11, 12, 1, 5, 13, 14, 15, 1],
 [10, 11, 12, 1, 5, 13, 14, 15, 1, 16],
 [10, 11, 12, 1, 5, 13, 14, 15, 1, 16, 17],
 [10, 11, 12, 1, 5, 13, 14, 15, 1, 16, 17, 18],
 [10, 11, 12, 1, 5, 13, 14, 15, 1, 16, 17, 18, 19],
 [10, 11, 12, 1, 5, 13, 14, 15, 1, 16, 17, 18, 19, 2],
 [10, 11, 12, 1, 5, 13, 14, 15, 1, 16, 17, 18, 19, 2, 20],
 [10, 11, 12, 1, 5, 13, 14, 15, 1, 16, 17, 18, 19, 2, 20, 21],
 [10, 11, 12, 1, 5, 13, 14, 15, 1, 16, 17, 18, 19, 2, 20, 21, 2],
 [10, 11, 12, 1, 5, 13, 14, 15, 1, 16, 17, 18, 19, 2, 20, 21, 2, 22],
 [10, 11, 12, 1, 5, 13, 14, 15, 1, 16, 17, 18, 19, 2, 20, 21, 2, 22, 23],
 [10, 11, 12, 1, 5, 13, 14, 15, 1, 16, 17, 18, 19, 2, 20, 21, 2, 22, 23, 24],
 [10,
  11,
  12,
  1,
  5,
  13,
  14,
  15,
  1,
  16,
  17,
  18,
  19,
  2,
  20,
  21,
  2,
  22,
  23,
  24,
  1],
 [10,
  11,
  12,
  1,
  5,
  13,
 

In [12]:
outputs

[11,
 12,
 1,
 5,
 13,
 14,
 15,
 1,
 16,
 17,
 18,
 19,
 2,
 20,
 21,
 2,
 22,
 23,
 24,
 1,
 25,
 26,
 27,
 28,
 7,
 8,
 29,
 30,
 9,
 3,
 4,
 31,
 32,
 33,
 3,
 4,
 8,
 34,
 1,
 35,
 36,
 37,
 1,
 38,
 3,
 4,
 6,
 7,
 5,
 39,
 40,
 2,
 41,
 9,
 42,
 43,
 44,
 45,
 46]

In [13]:
len(inputs), len(outputs)

(59, 59)

In [14]:
from keras.utils import pad_sequences, to_categorical

In [15]:
pad_sequences(inputs).shape

(59, 35)

In [16]:
fv = pad_sequences(inputs, padding = "pre")

In [17]:
class_label =  to_categorical(outputs, num_classes=46+1)

In [18]:
to_categorical(outputs).shape

(59, 47)

In [19]:
from keras.models import Sequential
from keras.layers import Dense, Input, Embedding, SimpleRNN

In [20]:
model = Sequential()
model.add(Input(shape=(40,)))
model.add(Embedding(47,2))
model.add(SimpleRNN(100, return_sequences=False))
model.add(Dense(47, activation="softmax"))

In [21]:
model.summary()

In [22]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [23]:
model.fit(fv, class_label, epochs=500)

Epoch 1/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.0000e+00 - loss: 3.8525
Epoch 2/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0s/step - accuracy: 0.0869 - loss: 3.8265  
Epoch 3/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.0877 - loss: 3.7964 
Epoch 4/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0869 - loss: 3.7765 
Epoch 5/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1329 - loss: 3.7407 
Epoch 6/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1538 - loss: 3.6741 
Epoch 7/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1651 - loss: 3.6221 
Epoch 8/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1538 - loss: 3.5649 
Epoch 9/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7585 - loss: 1.3766 
Epoch 70/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7793 - loss: 1.3130 
Epoch 71/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7906 - loss: 1.2533 
Epoch 72/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7680 - loss: 1.2371 
Epoch 73/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7472 - loss: 1.2521 
Epoch 74/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7906 - loss: 1.1874 
Epoch 75/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8227 - loss: 1.1624 
Epoch 76/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7481 - loss: 1.2309 
Epoch 77/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9236 - loss: 0.4812 
Epoch 138/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9227 - loss: 0.4489
Epoch 139/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9453 - loss: 0.4616 
Epoch 140/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9340 - loss: 0.4113 
Epoch 141/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9227 - loss: 0.4137 
Epoch 142/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9340 - loss: 0.4060 
Epoch 143/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9340 - loss: 0.4440 
Epoch 144/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9444 - loss: 0.4231 
Epoch 145/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9566 - loss: 0.2635 
Epoch 206/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9887 - loss: 0.2383 
Epoch 207/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9670 - loss: 0.2270 
Epoch 208/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9783 - loss: 0.2522 
Epoch 209/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9783 - loss: 0.2049 
Epoch 210/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9887 - loss: 0.1979 
Epoch 211/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9783 - loss: 0.2340 
Epoch 212/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9887 - loss: 0.1992 
Epoch 213/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 1.0000 - loss: 0.1281 
Epoch 274/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9783 - loss: 0.1237 
Epoch 275/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9887 - loss: 0.1063 
Epoch 276/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9783 - loss: 0.1451 
Epoch 277/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.1345 
Epoch 278/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9887 - loss: 0.1214 
Epoch 279/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9887 - loss: 0.1185 
Epoch 280/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9783 - loss: 0.1328 
Epoch 281/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0840 
Epoch 342/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0874 
Epoch 343/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9887 - loss: 0.0731 
Epoch 344/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0692 
Epoch 345/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0925 
Epoch 346/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0778 
Epoch 347/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0887 
Epoch 348/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0840 
Epoch 349/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0430 
Epoch 410/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0540 
Epoch 411/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0442 
Epoch 412/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0434 
Epoch 413/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0438 
Epoch 414/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0491 
Epoch 415/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0415 
Epoch 416/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0446 
Epoch 417/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 1.0000 - loss: 0.0402
Epoch 478/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0272 
Epoch 479/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0346 
Epoch 480/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0393 
Epoch 481/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0393 
Epoch 482/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 1.0000 - loss: 0.0311 
Epoch 483/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0288 
Epoch 484/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 1.0000 - loss: 0.0293 
Epoch 485/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<keras.src.callbacks.history.History at 0x17e79fa78d0>

In [24]:
x = ["Laxman Madasu from Jagtial"]
np.argmax(model.predict(pad_sequences(tk.texts_to_sequences(x), maxlen=35)))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step


7

In [25]:
tk.texts_to_sequences(x)

[[]]

In [26]:
tk.index_word[2]

'the'

In [27]:
np.argmax(model.predict(pad_sequences(tk.texts_to_sequences("discussed"), maxlen=35)))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step


7

In [28]:
tk.index_word[4]

'values'

In [29]:
import time

In [31]:
x = "classifier"

for i in range(10):
    word = tk.index_word[np.argmax(model.predict(pad_sequences(tk.texts_to_sequences(x), maxlen=113)))]
    print(x)
    x = x + " "+word
    time.sleep(0.9)
    

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
classifier
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
classifier to
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
classifier to to
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
classifier to to to
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
classifier to to to to
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
classifier to to to to to
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
classifier to to to to to to
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
classifier to to to to to to to
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
classifier to to to to to to to to
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
classifier to to to to to to to to to


In [32]:
df = pd.read_csv(r"C:\Users\madas\Downloads\archive (2)\spotify_reviews.csv", usecols=["content"])

In [33]:
df

Unnamed: 0,content
0,It's good
1,"I love this app so much, I've been using Spoti..."
2,Perfect
3,Best all around music streaming app I have use...
4,Are y'all fr gatekeeping the play button on so...
...,...
84160,The most decent music streaming app around rig...
84161,"As a premium user for a few years, Spotify is ..."
84162,"There is lot of ads all of a sudden, and it's ..."
84163,The UI could be better. I think there should b...


In [34]:
df.drop_duplicates(inplace=True)

In [35]:
df.duplicated().sum()

0

In [36]:
tk = Tokenizer()

In [37]:
tk.fit_on_texts(df.content[0:1000])

In [38]:
len(tk.word_counts)

2378

In [39]:
data = tk.texts_to_sequences(df.content[0:1000])

In [40]:
print(data)

[[26, 23], [3, 28, 11, 4, 20, 74, 118, 124, 125, 12, 10, 78, 112, 177, 155, 42, 300, 716, 16, 1008, 15, 11, 4, 7, 196, 103, 113, 46, 126, 51, 21, 14, 43, 3, 231, 11, 3, 36, 31, 106, 1, 24, 27, 1, 17, 211, 6, 7, 75, 119, 17, 30, 1009, 31, 22, 18, 53, 6, 8, 103, 197, 113, 7, 30, 70, 35, 62, 54, 388, 164, 338, 339, 32, 3, 51, 32, 58, 19], [232], [29, 40, 461, 9, 212, 4, 3, 21, 97, 1, 557, 233, 7, 47], [35, 389, 717, 1010, 1, 37, 234, 27, 17], [558, 5], [1, 4, 7, 23, 15, 5, 120, 340, 2, 301, 14, 5, 114, 44, 1011], [248, 4], [3, 32, 11, 12], [1012, 1, 1013], [1014], [341, 1015, 95, 8, 143, 57, 302, 156, 303, 15, 1, 390, 177, 213, 21, 124, 249, 10, 45, 3, 36, 32, 104, 19, 57, 144, 17, 2, 18, 53, 3, 36, 75, 1016, 2, 342, 57, 559, 88, 127, 165, 57, 560, 32, 3, 718, 97, 2, 83, 104, 178, 26, 561, 1, 719, 1017, 1018, 1019, 562, 2, 563, 6, 18, 1020, 7, 1021, 250, 20, 3, 51, 145, 343, 42, 251, 1022, 391, 3, 75, 36, 128, 104, 19, 31, 166, 27, 14, 115, 68, 214, 107, 11], [23], [20, 23], [68, 392, 13,

In [41]:
inputs = []
outputs = []
for lst in data:
    for word in range(len(lst)-1):
        inputs.append(lst[:word+1])
        outputs.append(lst[word+1])


In [42]:
inputs

[[26],
 [3],
 [3, 28],
 [3, 28, 11],
 [3, 28, 11, 4],
 [3, 28, 11, 4, 20],
 [3, 28, 11, 4, 20, 74],
 [3, 28, 11, 4, 20, 74, 118],
 [3, 28, 11, 4, 20, 74, 118, 124],
 [3, 28, 11, 4, 20, 74, 118, 124, 125],
 [3, 28, 11, 4, 20, 74, 118, 124, 125, 12],
 [3, 28, 11, 4, 20, 74, 118, 124, 125, 12, 10],
 [3, 28, 11, 4, 20, 74, 118, 124, 125, 12, 10, 78],
 [3, 28, 11, 4, 20, 74, 118, 124, 125, 12, 10, 78, 112],
 [3, 28, 11, 4, 20, 74, 118, 124, 125, 12, 10, 78, 112, 177],
 [3, 28, 11, 4, 20, 74, 118, 124, 125, 12, 10, 78, 112, 177, 155],
 [3, 28, 11, 4, 20, 74, 118, 124, 125, 12, 10, 78, 112, 177, 155, 42],
 [3, 28, 11, 4, 20, 74, 118, 124, 125, 12, 10, 78, 112, 177, 155, 42, 300],
 [3,
  28,
  11,
  4,
  20,
  74,
  118,
  124,
  125,
  12,
  10,
  78,
  112,
  177,
  155,
  42,
  300,
  716],
 [3,
  28,
  11,
  4,
  20,
  74,
  118,
  124,
  125,
  12,
  10,
  78,
  112,
  177,
  155,
  42,
  300,
  716,
  16],
 [3,
  28,
  11,
  4,
  20,
  74,
  118,
  124,
  125,
  12,
  10,
  78,
  112,
  

In [43]:
outputs

[23,
 28,
 11,
 4,
 20,
 74,
 118,
 124,
 125,
 12,
 10,
 78,
 112,
 177,
 155,
 42,
 300,
 716,
 16,
 1008,
 15,
 11,
 4,
 7,
 196,
 103,
 113,
 46,
 126,
 51,
 21,
 14,
 43,
 3,
 231,
 11,
 3,
 36,
 31,
 106,
 1,
 24,
 27,
 1,
 17,
 211,
 6,
 7,
 75,
 119,
 17,
 30,
 1009,
 31,
 22,
 18,
 53,
 6,
 8,
 103,
 197,
 113,
 7,
 30,
 70,
 35,
 62,
 54,
 388,
 164,
 338,
 339,
 32,
 3,
 51,
 32,
 58,
 19,
 40,
 461,
 9,
 212,
 4,
 3,
 21,
 97,
 1,
 557,
 233,
 7,
 47,
 389,
 717,
 1010,
 1,
 37,
 234,
 27,
 17,
 5,
 4,
 7,
 23,
 15,
 5,
 120,
 340,
 2,
 301,
 14,
 5,
 114,
 44,
 1011,
 4,
 32,
 11,
 12,
 1,
 1013,
 1015,
 95,
 8,
 143,
 57,
 302,
 156,
 303,
 15,
 1,
 390,
 177,
 213,
 21,
 124,
 249,
 10,
 45,
 3,
 36,
 32,
 104,
 19,
 57,
 144,
 17,
 2,
 18,
 53,
 3,
 36,
 75,
 1016,
 2,
 342,
 57,
 559,
 88,
 127,
 165,
 57,
 560,
 32,
 3,
 718,
 97,
 2,
 83,
 104,
 178,
 26,
 561,
 1,
 719,
 1017,
 1018,
 1019,
 562,
 2,
 563,
 6,
 18,
 1020,
 7,
 1021,
 250,
 20,
 3,
 51,
 145,
 343,
 

In [44]:
len(inputs), len(outputs)

(15644, 15644)

In [45]:
pad_sequences(inputs).shape

(15644, 105)

In [46]:
fv = pad_sequences(inputs, padding = "pre")

In [47]:
class_label =  to_categorical(outputs)

In [48]:
to_categorical(outputs).shape

(15644, 2379)

In [49]:
model = Sequential()
model.add(Input(shape=(114,)))
model.add(Embedding(2379,5))
model.add(SimpleRNN(100, return_sequences=False))
model.add(Dense(2379, activation="softmax"))

In [50]:
model.summary()

In [51]:
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

In [52]:
model.fit(fv[:1000], class_label[:1000], epochs=200)

Epoch 1/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.0118 - loss: 7.6378
Epoch 2/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0379 - loss: 6.1624
Epoch 3/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.0258 - loss: 5.7306
Epoch 4/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.0177 - loss: 5.6669
Epoch 5/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0139 - loss: 5.6975
Epoch 6/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0341 - loss: 5.5739
Epoch 7/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0271 - loss: 5.6470
Epoch 8/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0282 - loss: 5.5972
Epoch 9/200
[1m32/32[0m [32m━━━━━━━━━

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1982 - loss: 3.5660
Epoch 69/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1827 - loss: 3.5397
Epoch 70/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.2143 - loss: 3.4361
Epoch 71/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.2041 - loss: 3.4309
Epoch 72/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.2061 - loss: 3.4684
Epoch 73/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.2510 - loss: 3.2791
Epoch 74/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.2177 - loss: 3.3268
Epoch 75/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.2565 - loss: 3.2497
Epoch 76/200
[1m32/32[0m [32m━━━━━━━━━━━━━

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.3677 - loss: 4.8123
Epoch 136/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.6619 - loss: 1.9858
Epoch 137/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.6666 - loss: 1.8942
Epoch 138/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7033 - loss: 1.8177
Epoch 139/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6951 - loss: 1.7794
Epoch 140/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7276 - loss: 1.6588
Epoch 141/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7680 - loss: 1.5732
Epoch 142/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7604 - loss: 1.5571
Epoch 143/200
[1m32/32[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x17e7fe05f90>

In [54]:
X = "I love this "
import time
for y in range(20):
    word = tk.index_word[np.argmax(model.predict(pad_sequences(tk.texts_to_sequences([X]),maxlen=23)))]
    X = X+" "+word
    print(X)
    time.sleep(0.9)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
I love this  able
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
I love this  able so
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
I love this  able so much
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
I love this  able so much i've
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
I love this  able so much i've been
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
I love this  able so much i've been using
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
I love this  able so much i've been using spotify
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
I love this  able so much i've been using spotify for
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
I love this  able so much i've been using spotify for more
[1m1/1[0m [

In [56]:
df['content'][1]

"I love this app so much, I've been using Spotify for more than 2 years, with different accounts of course. But this app is sometimes really annoying if u don't have premium. when I say this, I can't even see the lyrics on the songs anymore? And is also playing songs that aint even in my playlist, and a really most annoying is that why are there only 6 skip per hour? like I don't like every song."

In [57]:
import pickle

In [60]:
Mdl1 = pickle.dump(model,open('model.pkl','wb'))
Mdl1 =pickle.load(open('model.pkl','rb'))

In [61]:
Mdl2 = pickle.dump(tk,open('tk.pkl','wb'))
Mdl2 =pickle.load(open('tk.pkl','rb'))

In [64]:
X = input("Enter your First word:")
for y in range(20):
    word = tk.index_word[np.argmax(Mdl1.predict(pad_sequences(Mdl2.texts_to_sequences([X]),maxlen=23)))]
    X = X+" "+word
    print(X)

Enter your First word:Best all around music
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Best all around music streaming
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Best all around music streaming app
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Best all around music streaming app i
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Best all around music streaming app i have
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Best all around music streaming app i have used
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Best all around music streaming app i have used the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Best all around music streaming app i have used the family
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Best all around music streaming app i have used the family plan
[1m1/1