In [1]:
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [27]:
!apt-get update
!apt-get -y install mecab libmecab-dev mecab-ipadic-utf8

!pip install pysrt
!pip install onnx
!pip install onnxruntime
!pip install mecab-python3

# Set the MECABRC environment variable
import os
os.environ['MECABRC'] = "/etc/mecabrc"

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,031 kB]
Ign:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy Release [5,713 B]
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy Release.gpg [793 B]
Get:9 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,396 kB]
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [3,278 kB]
Get:13 http://archive.ubuntu.com/ubuntu jam

In [28]:
# Import des bibliothèques nécessaires
import os
import pysrt
import re

# Chemin vers le fichier .srt
DATA_FOLDER = "/content/data_srt"
DATA_SRT = [os.path.join(DATA_FOLDER, f) for f in os.listdir(DATA_FOLDER) if f.endswith('.srt')]
output_file_path = "/content/data_qa.txt"  # Chemin du fichier de sortie
output_data = 0

# Fonction pour nettoyer le texte : supprimer les balises, symboles et parenthèses
def clean_text(text):
    # Supprime les balises comme {\\an8}, les notes musicales ♪ et les parenthèses ()
    cleaned = re.sub(r'\{.*?\}|♪|[\（\(].*?[\）\)]|～|⸺', '', text)
    cleaned = re.sub(r'<.*?>', '', cleaned)
    cleaned = cleaned.replace('\n', ' ')  # Remplace les retours à la ligne par un espace
    return cleaned.strip()  # Supprime les espaces en trop

# Fonction pour ignorer les interjections ou phrases très courtes
def is_valid_text(text):
    # On ignore les interjections ou les phrases très courtes qui n'ont pas de sens complet
    if len(text) < 3:  # Ignore les chaînes de moins de 3 caractères
        return False
    # Ignore certaines interjections communes ou symboles
    if re.match(r"^[あっ！ふっ！たあっ！-]+$", text):
        return False
    return True

with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for srt_file in DATA_SRT:
        # Ouverture du fichier .srt
        subs = pysrt.open(srt_file)

        # Boucle pour extraire et nettoyer les sous-titres
        for idT in range(len(subs) - 1):  # On itère sur tous les sous-titres, sauf le dernier
            # Extraction et nettoyage du texte du sous-titre
            question = clean_text(subs[idT].text)
            response = clean_text(subs[idT + 1].text)

            # Vérification si les textes sont valides
            if is_valid_text(question) and is_valid_text(response):
                # Formater la ligne à écrire
                line = f"{question}\t{response}\n"

                # Vérifier que la ligne a le bon format avant de l'écrire
                if line.strip() and "\t" in line:  # Vérifie qu'il y a quelque chose après le tab
                    output_file.write(line)
                    output_data += 1

# Afficher le résultat nettoyé
print(f"Résultat nettoyé : {output_data} messages.")


Résultat nettoyé : 20605 messages.


In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import onnx
import onnxruntime as rt
from tensorflow.keras.preprocessing.sequence import pad_sequences
import MeCab
import re

# Check GPU availability and set device accordingly
if tf.config.list_physical_devices('GPU'):
    device = '/GPU:0'  # Use GPU if available
else:
    device = '/CPU:0'  # Fallback to CPU

print("Device :",device)

with tf.device('/GPU:0'):

  def tokenize_japanese(text):
      """
      Tokenizes Japanese text using MeCab.

      Args:
          text: The input Japanese text.

      Returns:
          A list of tokens.
      """
      tagger = MeCab.Tagger("-Owakati")
      wakati_text = tagger.parse(text)
      tokens = wakati_text.strip().split()
      return tokens



  # 1. Préparation des données
  data_path = '/content/data_qa.txt'

  with open(data_path, 'r', encoding='utf-8') as f:
      lines = f.read().split('\n')

  # Tokenisation et création du vocabulaire
  vocab = set()
  encoder_input_data = []
  decoder_input_data = []
  decoder_target_data = []

  for line in lines:
      if line:
          input_text, target_text = line.split('\t')

          input_tokens = tokenize_japanese(input_text)
          target_tokens = tokenize_japanese(target_text)

          encoder_input_data.append(input_tokens)
          decoder_input_data.append(['<start>'] + target_tokens)
          decoder_target_data.append(target_tokens + ['<end>'])

          vocab.update(input_tokens + target_tokens)

  # Add <start> and <end> tokens to vocab before creating word_to_index
  vocab.add('<start>')
  vocab.add('<end>')

  # Création d'index pour le vocabulaire
  word_to_index = {token: index for index, token in enumerate(vocab)}
  index_to_word = {index: token for token, index in word_to_index.items()}
  vocab_size = len(vocab)

  # Padding des séquences
  max_encoder_seq_length = max(len(seq) for seq in encoder_input_data)
  max_decoder_seq_length = max(len(seq) for seq in decoder_input_data)

  encoder_input_data = [[word_to_index[token] for token in seq] + [0] * (max_encoder_seq_length - len(seq)) for seq in encoder_input_data]
  decoder_input_data = [[word_to_index[token] for token in seq] + [0] * (max_decoder_seq_length - len(seq)) for seq in decoder_input_data]
  decoder_target_data = [[word_to_index[token] for token in seq] + [0] * (max_decoder_seq_length - len(seq)) for seq in decoder_target_data]

  encoder_input_data = np.array(encoder_input_data)
  decoder_input_data = np.array(decoder_input_data)
  decoder_target_data = np.array(decoder_target_data)

  # 2. Création du modèle Seq2Seq avec LSTM et régularisation
  embedding_dim = 256
  units = 512

  encoder_inputs = keras.Input(shape=(None,))
  encoder_embedding = keras.layers.Embedding(vocab_size, embedding_dim)(encoder_inputs)
  encoder_lstm = keras.layers.LSTM(units, return_state=True, dropout=0.2, recurrent_dropout=0.2)  # Dropout ajouté
  encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
  encoder_states = [state_h, state_c]

  decoder_inputs = keras.Input(shape=(None,))
  decoder_embedding = keras.layers.Embedding(vocab_size, embedding_dim)(decoder_inputs)
  decoder_lstm = keras.layers.LSTM(units, return_sequences=True, return_state=True, dropout=0.2, recurrent_dropout=0.2)  # Dropout ajouté
  decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
  decoder_dense = keras.layers.Dense(vocab_size, activation='softmax', kernel_regularizer=keras.regularizers.l2(0.01))  # Régularisation L2 ajoutée
  decoder_outputs = decoder_dense(decoder_outputs)

  model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

  # Compilation et entraînement du modèle avec ajustement des hyperparamètres
  optimizer = keras.optimizers.Adam(learning_rate=0.001)  # Taux d'apprentissage ajusté
  model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

  model.fit([encoder_input_data, decoder_input_data], decoder_target_data, epochs=20, batch_size=32)  # Époques et taille du lot ajustées

# Enregistrez le modèle Keras
model.save("s2s_bot_mobile_ja.h5")




Device : /GPU:0
Epoch 1/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 118ms/step - accuracy: 0.7659 - loss: 2.7343
Epoch 2/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 119ms/step - accuracy: 0.7921 - loss: 1.6088
Epoch 3/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 120ms/step - accuracy: 0.7953 - loss: 1.5896
Epoch 4/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 121ms/step - accuracy: 0.7966 - loss: 1.5680
Epoch 5/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 122ms/step - accuracy: 0.7982 - loss: 1.5554
Epoch 6/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 120ms/step - accuracy: 0.8031 - loss: 1.5144
Epoch 7/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 122ms/step - accuracy: 0.8030 - loss: 1.5189
Epoch 8/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 122ms/step - accuracy: 0.8044 - loss: 1.4966


In [33]:
model.summary()

In [41]:
def generate_response(input_text, temperature=2.0):
    """Generates a response for a given input text.

    Args:
        input_text: The input text as a string.

    Returns:
        The generated response as a string.
    """

    # Tokenize the input text
    input_tokens = tokenize_japanese(input_text)

    # Convert tokens to indices
    input_seq = [word_to_index[token] for token in input_tokens if token in word_to_index]

    # Pad the input sequence
    input_seq = pad_sequences([input_seq], maxlen=max_encoder_seq_length, padding='post')

    # Get the initial decoder state
    states_value = encoder_model.predict(input_seq)

    # Initialize the target sequence with the start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word_to_index['<start>']

    # Generate tokens until the end token is reached
    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        # Apply temperature to softmax
        output_tokens = output_tokens[0, -1, :] / temperature
        sampled_token_index = np.argmax(output_tokens)

        # Convert the token index to word
        sampled_token = index_to_word[sampled_token_index]

        # If the end token is reached, stop generating
        if sampled_token == '<end>':
            break

        # Append the predicted token to the decoded sentence
        decoded_sentence += sampled_token + ' '

        # Update the target sequence and decoder state
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.strip()

# Create inference models
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_state_input_h = keras.Input(shape=(units,))
decoder_state_input_c = keras.Input(shape=(units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

# Example usage
input_text = "番ステキな季節が来た"
response = generate_response(input_text)
print("Question :",input_text)
print("Response :",response)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Question : 番ステキな季節が来た
Response : 私 は 何 を し てる の ？
