<a href="https://colab.research.google.com/github/anopsy/romanticpsychobot/blob/main/QueBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import tensorflow as tf
import numpy as np 

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [14]:
# Download the dataset
!wget https://raw.githubusercontent.com/anopsy/romanticpsychobot/main/romanticpsychoclean.csv

--2023-03-29 15:53:42--  https://raw.githubusercontent.com/anopsy/romanticpsychobot/main/romanticpsychoclean.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 147668 (144K) [text/plain]
Saving to: ‘romanticpsychoclean.csv.1’


2023-03-29 15:53:43 (6.36 MB/s) - ‘romanticpsychoclean.csv.1’ saved [147668/147668]



In [15]:
# Load the dataset.
# The lyrics contain Polish characters, so decode explicitly as UTF-8 instead of relying on
# the platform default, and let the context manager close the file handle.
with open('./romanticpsychoclean.csv', encoding='utf-8') as dataset_file:
    data = dataset_file.read()

# Lowercase and split the text into one entry per line
corpus = data.lower().split("\n")

# Strip the double quotes and commas from every line
clean_corpus = [line.replace("\"", "").replace(",", "") for line in corpus]

# Preview only the first few lines; printing the whole corpus floods the cell output
print(clean_corpus[:10])

['śnieg pada na białe porshe', 'robię źle a chciałem dobrze', 'przepraszam nie jestem stąd', 'podejdziesz za blisko to ukąszę', 'nie płaczę po noterdame', 'ten ból ciągle płonie w nas', 'błagam pomóż mi go ugasić', 'nie płaczę po notre dame w radiu leci tout va bien', 'czuję się jak orelsan', 'jetlag to mój nowy drag i jestem naćpany tak', 'nie wiem czego wciąż mi brak', 'co tak cenne jest i', 'czego mogę chcieć i lamborghini jetski', 'popatrz w moje oczy piwne jak oktoberfest', 'i powiedz za czym tęsknisz avignon i brest', 'i w tle te lawendowe pola', "chcę jeszcze trochę słońca zanim trafię na de gaulle'a", 'rok spędziłem w samolotach pociągach i na promach', 'myślę o twoich dłoniach palcach na moich skroniach', 'lazur piasek ja i ona życiem pijani na skałce', 'i odsłonięte ramiona jej głowa na mojej klatce', 'usta jak pain au chocolat mógłbym ciągle na nie patrzeć', 'lepiej weź już szczędź ten morał bo ja nie żyję jak w bajce nie', 'śnieg pada na białe porshe', 'robię źle a chciałem

In [16]:
# Initialize the Tokenizer class
tokenizer = Tokenizer()

# Generate the word index dictionary.
# Fit on `clean_corpus` (not the raw `corpus`): the later cells tokenize `clean_corpus`,
# where quotes and commas were removed without inserting a space. Fitting on the raw lines
# would split such words differently, leaving some cleaned words out of the vocabulary.
tokenizer.fit_on_texts(clean_corpus)

# Define the total words. You add 1 for the index `0` which is just the padding token.
total_words = len(tokenizer.word_index) + 1

print(f'word index dictionary: {tokenizer.word_index}')
print(f'total words: {total_words}')

word index dictionary: {'nie': 1, 'w': 2, 'jak': 3, 'to': 4, 'na': 5, 'i': 6, 'z': 7, 'się': 8, 'a': 9, 'za': 10, 'mnie': 11, 'mi': 12, 'jest': 13, 'co': 14, 'mam': 15, 'tylko': 16, 'do': 17, 'że': 18, 'ale': 19, 'tak': 20, 'bo': 21, 'już': 22, 'jestem': 23, 'po': 24, 'o': 25, 'ja': 26, 'bez': 27, 'ze': 28, 'tu': 29, 'ten': 30, 'od': 31, 'dla': 32, 'nic': 33, 'czy': 34, 'ma': 35, 'tym': 36, 'te': 37, 'mój': 38, 'wszystko': 39, 'dlaczego': 40, 'no': 41, 'teraz': 42, 'moja': 43, 'życie': 44, 'gdy': 45, 'dziś': 46, 'czuję': 47, 'znowu': 48, 'przez': 49, 'gdzie': 50, 'nawet': 51, 'tych': 52, 'tam': 53, 'niż': 54, 'byłem': 55, 'wiem': 56, 'tego': 57, 'świat': 58, 'przy': 59, 'sie': 60, 'cię': 61, 'la': 62, 'więcej': 63, "c'est": 64, 'vie': 65, 'moje': 66, 'są': 67, 'nas': 68, 'chcę': 69, 'albo': 70, 'ludzi': 71, 'ci': 72, 'mogę': 73, 'które': 74, 'każdy': 75, 'jeszcze': 76, 'sobie': 77, 'chyba': 78, 'widzę': 79, 'może': 80, 'kiedy': 81, 'by': 82, 'robię': 83, 'dobrze': 84, 'jakbym': 85, 'co

In [17]:
# Every prefix of at least two tokens of every line becomes one training sample
input_sequences = []
for text in clean_corpus:

    # Encode the line as a list of word indices
    encoded = tokenizer.texts_to_sequences([text])[0]

    # Prefixes end at positions 2..len(encoded); lines with fewer than two tokens add nothing
    input_sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

# Length of the longest prefix, used as the common padded length
max_sequence_len = max(len(sequence) for sequence in input_sequences)

# Left-pad every prefix with zeros so all rows share the same length
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# The last column is the word to predict; everything before it is the model input
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]

# One-hot encode the labels over the whole vocabulary
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [18]:
# Take the first cleaned line and split it into words
sentence = clean_corpus[0].split()
print(f'sample sentence: {sentence}')

# Map each word to its index in the tokenizer vocabulary
token_list = [tokenizer.word_index[word] for word in sentence]

# Show the resulting indices
print(token_list)

sample sentence: ['śnieg', 'pada', 'na', 'białe', 'porshe']
[432, 317, 5, 433, 828]


In [19]:
# Index of the training sample to inspect
elem_number = 5

# Pull out the padded input row and its one-hot target once
sample_input = xs[elem_number]
sample_target = ys[elem_number]

# Show the input both as indices and decoded back to words
print(f'token list: {sample_input}')
print(f'decoded to text: {tokenizer.sequences_to_texts([sample_input])}')

# Show the target as a one-hot vector and as the index of its hot entry
print(f'one-hot label: {sample_target}')
print(f'index of label: {np.argmax(sample_target)}')

token list: [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  83 212]
decoded to text: ['robię źle']
one-hot label: [0. 0. 0. ... 0. 0. 0.]
index of label: 9


In [20]:
# Hyperparameters
embedding_dim = 1000
lstm_units = 100
learning_rate = 0.02

# Assemble the network layer by layer
model = Sequential()
# Inputs are the padded prefixes without their last token, hence max_sequence_len-1
model.add(Embedding(total_words, embedding_dim, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(lstm_units)))
# One softmax output per vocabulary index
model.add(Dense(total_words, activation='softmax'))

# Categorical crossentropy matches the one-hot labels of this multi-class problem
optimizer = Adam(learning_rate=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Print the model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 17, 1000)          6169000   
                                                                 
 bidirectional (Bidirectiona  (None, 200)              880800    
 l)                                                              
                                                                 
 dense (Dense)               (None, 6169)              1239969   
                                                                 
Total params: 8,289,769
Trainable params: 8,289,769
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Number of passes over the training data
epochs = 100

# Fit on the padded prefixes and their one-hot next-word targets;
# the returned object is kept so its recorded metrics can be plotted afterwards
history = model.fit(x=xs, y=ys, epochs=epochs)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

In [None]:
import matplotlib.pyplot as plt

# Plot utility
def plot_graphs(history, string):
    """Plot one recorded training metric against the epoch number.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value sequences (here: the return value of ``model.fit``).
        string: name of the metric to plot; also used as the y-axis label.

    Raises:
        KeyError: if ``string`` is not a key of ``history.history``.
    """
    metric_values = history.history[string]
    plt.plot(metric_values)
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.show()

# Visualize the accuracy first, then the loss
for metric_name in ('accuracy', 'loss'):
    plot_graphs(history, metric_name)

# TU W SEED_TEXT MOŻESZ WPISAĆ SWÓJ POCZĄTEK (you can enter your own opening line in `seed_text` below)

In [None]:
# Define seed text
seed_text = "Wjeżdżam na ostro jak habanero Co to jest balet zobaczysz dopiero"

# Define total words to predict
next_words = 100

# Greedy decoding: append the single most probable next word on every step
for _ in range(next_words):

    # Convert the seed text to a token sequence
    token_list = tokenizer.texts_to_sequences([seed_text])[0]

    # Pad the sequence to the input length the model was built with
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

    # Feed to the model and get the probabilities for each index.
    # verbose=0 silences the progress bar that would otherwise be printed on every step.
    probabilities = model.predict(token_list, verbose=0)

    # Get the index with the highest probability
    predicted = np.argmax(probabilities, axis=-1)[0]

    # Index 0 is just the padding token. With greedy decoding the seed text would stay
    # unchanged, so every remaining step would repeat this same prediction: stop here.
    if predicted == 0:
        break

    # Look up the word associated with the index
    output_word = tokenizer.index_word[predicted]

    # Combine with the seed text
    seed_text += " " + output_word

# Print the result
print(seed_text)

# TU W SEED_TEXT MOŻESZ WPISAĆ SWÓJ POCZĄTEK (you can enter your own opening line in `seed_text` below)

In [None]:
# Define seed text
seed_text = "Wjeżdżam na ostro jak habanero Co to jest balet zobaczysz dopiero"

# Define total words to predict
next_words = 100

# Sampled decoding: on every step pick one of the three most probable words at random.
# The loop body is indented with spaces only; mixing tabs and spaces is fragile in Python.
for _ in range(next_words):

    # Convert the seed text to a token sequence
    token_list = tokenizer.texts_to_sequences([seed_text])[0]

    # Pad the sequence to the input length the model was built with
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

    # Feed to the model and get the probabilities for each index.
    # verbose=0 silences the progress bar that would otherwise be printed on every step.
    probabilities = model.predict(token_list, verbose=0)

    # Pick a random number from [1,2,3]
    choice = np.random.choice([1,2,3])

    # Sort the indices by ascending probability
    # and take the random choice counted from the end of the array
    predicted = np.argsort(probabilities)[0][-choice]

    # Ignore if index is 0 because that is just the padding.
    if predicted != 0:

        # Look up the word associated with the index
        output_word = tokenizer.index_word[predicted]

        # Combine with the seed text
        seed_text += " " + output_word

# Print the result
print(seed_text)