In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# ---------------------------
# 1️⃣ Load the CSV dataset
# ---------------------------
data = pd.read_csv('qa_sarcastic_eco_it.csv')

# Drop rows with a missing question or answer *before* casting to str:
# `astype(str)` would otherwise turn NaN into the literal text 'nan' and the
# model would be trained on those bogus pairs.
qa_pairs = data.dropna(subset=['Question', 'Reponse'])
if qa_pairs.empty:
    raise ValueError(
        "No complete Question/Reponse pairs found in 'qa_sarcastic_eco_it.csv'"
    )

questions = qa_pairs['Question'].astype(str).tolist()
answers = qa_pairs['Reponse'].astype(str).tolist()

# Wrap every answer in start/end markers so the decoder can learn where a
# reply begins and where it has to stop.
answers = [f'<start> {ans} <end>' for ans in answers]


num_words = 5000  # vocabulary cap: only the most frequent words get their own id

# Tokenizer for the questions: fit, convert to id sequences, then right-pad
# every sequence with 0 up to the longest question.
tokenizer_questions = Tokenizer(num_words=num_words, oov_token='<OOV>')
tokenizer_questions.fit_on_texts(questions)
questions_seq = tokenizer_questions.texts_to_sequences(questions)
max_len_questions = max(len(seq) for seq in questions_seq)
questions_seq = pad_sequences(questions_seq, maxlen=max_len_questions, padding='post')

# Tokenizer for the answers (same procedure as for the questions).
# NOTE(review): the Tokenizer's default `filters` include '<' and '>', so the
# '<start>' / '<end>' markers are presumably stored as 'start' / 'end' in
# `word_index` -- confirm before looking them up by name at inference time.
tokenizer_answers = Tokenizer(num_words=num_words, oov_token='<OOV>')
tokenizer_answers.fit_on_texts(answers)
answers_seq = tokenizer_answers.texts_to_sequences(answers)
max_len_answers = max(len(seq) for seq in answers_seq)
answers_seq = pad_sequences(answers_seq, maxlen=max_len_answers, padding='post')

# `word_index` lists every word seen during fitting, but `texts_to_sequences`
# only emits ids strictly below `num_words`. Cap the size so layers built from
# it are not dimensioned for ids that can never occur. The +1 accounts for the
# padding id 0.
vocab_size_answers = min(num_words, len(tokenizer_answers.word_index) + 1)


# Prepare the decoder data for the LSTM (teacher forcing): the decoder input
# is the answer without its last position, and the target is the same answer
# shifted one step to the left, so position t must predict token t + 1.
decoder_input_seq, decoder_target_seq = answers_seq[:, :-1], answers_seq[:, 1:]


embedding_dim = 128
lstm_units = 256

# Encoder: embeds the padded question and keeps only the final LSTM states,
# which summarise the question for the decoder.
# `mask_zero=True` makes the LSTM skip the trailing padding (id 0); without it
# the final states are computed after running over all the pad steps.
encoder_inputs = Input(shape=(max_len_questions,))
enc_emb = Embedding(
    input_dim=num_words,
    output_dim=embedding_dim,
    mask_zero=True,
)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder states and predicts, at every time step,
# a probability distribution over the answer vocabulary.
# The input is one step shorter than the padded answers because the target is
# the same sequence shifted by one position.
decoder_inputs = Input(shape=(max_len_answers-1,))
# Masking the padding here propagates through the LSTM and Dense layers, so
# padded positions no longer contribute to the loss or the accuracy metric.
dec_emb_layer = Embedding(
    input_dim=vocab_size_answers,
    output_dim=embedding_dim,
    mask_zero=True,
)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_answers, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Full training model: (question, shifted answer) -> next-token distributions.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Append a trailing axis of size 1 to the integer targets before handing them
# to the sparse categorical cross-entropy loss.
decoder_target_seq = decoder_target_seq[..., np.newaxis]

# Train on (question, decoder input) pairs; 10% of the samples are held out
# by Keras for validation.
model.fit(
    x=[questions_seq, decoder_input_seq],
    y=decoder_target_seq,
    batch_size=16,
    epochs=100,
    validation_split=0.1,
)
