In [None]:
import nltk
from nltk.corpus import stopwords
 
# Fetch the NLTK stop-word corpus only when it is not already installed, so
# re-running this cell does not contact the download server every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
#print(stopwords.words('english'))

In [None]:
import sys  
# IPython shell escape: install `contractions` using the interpreter that runs
# this kernel (sys.executable), so the package lands in the active environment.
!{sys.executable} -m pip install contractions

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold
from nltk.util import ngrams

# Tab-separated corpus: column 0 is the input text, column 1 the target text.
data_path = 'data.txt'
# Upper bound on the number of (input, target) pairs loaded.
max_samples = 600

input_texts = []
output_texts = []
# Explicit encoding so the result does not depend on the platform locale.
with open(data_path, encoding='utf-8') as f:
    lines = f.read().split('\n')

for line in lines:
    if len(input_texts) >= max_samples:
        break
    # Split once; any columns after the second are ignored.
    fields = line.split('\t')
    if len(fields) < 2:
        # Blank or malformed line (e.g. the empty string after a trailing
        # newline): skip it instead of raising IndexError.
        continue
    input_texts.append(fields[0])
    output_texts.append(fields[1])


import contractions
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Lazily-filled cache of the NLTK English stop-word list.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a set, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalise a sentence for tokenisation.

    Steps, in order:
      1. expand contractions with ``contractions.fix`` -- this must run while
         apostrophes are still present, i.e. before punctuation is stripped;
      2. lowercase;
      3. delete ASCII punctuation (``string.punctuation``);
      4. drop NLTK English stop words.

    Args:
        text: the raw sentence.

    Returns:
        The remaining words joined by single spaces (possibly an empty string).
    """
    text = contractions.fix(text)
    text = text.lower()
    text = text.translate(_PUNCTUATION_TABLE)
    stop_words = _english_stop_words()
    # NOTE: lemmatisation (WordNetLemmatizer) is currently not applied.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Run the same cleaning routine over both sides of the corpus
input_texts = list(map(preprocess_text, input_texts))
output_texts = list(map(preprocess_text, output_texts))

# Order of the word n-grams to extract
n = 2


def _ngram_text(sentence, order):
    """Return the sentence's word n-grams flattened into one space-separated string."""
    return ' '.join(' '.join(gram) for gram in ngrams(sentence.split(), order))


# One flattened n-gram string per sentence.
# NOTE(review): the words inside each n-gram are joined with spaces and the
# tokenizer below also splits on spaces, so the tokenizer sees individual words
# (interior words repeated), not n-gram tokens -- confirm this is intended.
input_ngram_sequences = [_ngram_text(sentence, n) for sentence in input_texts]
output_ngram_sequences = [_ngram_text(sentence, n) for sentence in output_texts]

# Shared vocabulary over source and target; no character filtering
tokenizer = Tokenizer(lower=True, filters='', split=' ')
tokenizer.fit_on_texts(input_ngram_sequences + output_ngram_sequences)

# Map every string to its list of integer ids
input_sequences = tokenizer.texts_to_sequences(input_ngram_sequences)
output_sequences = tokenizer.texts_to_sequences(output_ngram_sequences)

# Longest sequence on either side decides the padded width
max_seq_length = max(len(ids) for ids in [*input_sequences, *output_sequences])

# Right-pad (with 0) to that common width
_pad_options = dict(maxlen=max_seq_length, padding='post')
input_sequences = pad_sequences(input_sequences, **_pad_options)
output_sequences = pad_sequences(output_sequences, **_pad_options)

# +1 because id 0 is used for padding and never assigned to a word
vocab_size = 1 + len(tokenizer.word_index)

# Encoder sees the source ids as-is
encoder_input = input_sequences

# Decoder input is the target shifted right by one step (position 0 stays 0)
decoder_input = np.zeros_like(output_sequences)
decoder_input[:, 1:] = output_sequences[:, :-1]

# One-hot targets: row i of the identity matrix encodes id i
decoder_output = np.eye(vocab_size)[output_sequences]

# Seq2Seq hyper-parameters
latent_dim = 128
dropout_rate = 0.2

# Both LSTMs use the same input and recurrent dropout rates
_lstm_dropout = dict(dropout=dropout_rate, recurrent_dropout=dropout_rate)

# ---- Encoder: embed the source ids and keep only the final LSTM state ----
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True, **_lstm_dropout)
_encoder_result = encoder_lstm(encoder_embedding)
state_h, state_c = _encoder_result[1], _encoder_result[2]
encoder_states = [state_h, state_c]
encoder_dropout = Dropout(dropout_rate)
# NOTE(review): this tensor is not connected to anything below -- the decoder
# is initialised from the raw encoder_states. Confirm whether that is intended.
encoder_dropout_output = encoder_dropout(state_h)

# ---- Decoder: embed the shifted target ids, start from the encoder state ----
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, **_lstm_dropout)
_decoder_result = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_outputs = _decoder_result[0]
decoder_dropout = Dropout(dropout_rate)
decoder_dropout_outputs = decoder_dropout(decoder_outputs)
# Per-step probability distribution over the vocabulary
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_dropout_outputs)

model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

# Define K-fold cross-validation
k = 5
kf = KFold(n_splits=k, shuffle=True)

# Parameters of the self-prediction fine-tuning phase (labelled "RLHF" here)
rl_epochs = 5  # Number of fine-tuning passes per fold
rl_learning_rate = 0.001  # Learning rate used during those passes
rl_batch_size = 64  # Batch size used during those passes
reinforce_reward = 1  # Currently unused: no reward is computed below

# Snapshot of the model weights as they are before any fold is trained.
# Restoring it at the start of each fold keeps the folds independent;
# otherwise later folds would validate on samples the model has already been
# trained on in earlier folds.
# (If this cell is re-run, rebuild the model first so the snapshot is untrained.)
initial_weights = model.get_weights()

# (loss, accuracy) on the held-out split of every fold
fold_scores = []

for fold, (train_indices, val_indices) in enumerate(kf.split(input_sequences)):
    print(f'Fold {fold + 1}')
    x_train, x_val = input_sequences[train_indices], input_sequences[val_indices]
    y_train, y_val = decoder_output[train_indices], decoder_output[val_indices]
    decoder_input_train, decoder_input_val = decoder_input[train_indices], decoder_input[val_indices]

    # Start from the same weights in every fold; compiling with the 'adam'
    # string also creates a fresh optimizer.
    model.set_weights(initial_weights)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Stop once the validation loss has not improved for 3 epochs
    early_stop = EarlyStopping(monitor='val_loss', patience=3)

    # Supervised training with teacher forcing
    history = model.fit([x_train, decoder_input_train], y_train,
                        validation_data=([x_val, decoder_input_val], y_val),
                        epochs=10, batch_size=64, callbacks=[early_stop])

    # Fine-tune on the model's own greedy predictions
    for rl_epoch in range(rl_epochs):
        # Greedy decode: most likely token id at every step, mapped back to its
        # word; ids without a word (0 = padding) are dropped.
        predictions = model.predict([x_train, decoder_input_train])
        decoded_sequences = [
            [
                word
                for word in (tokenizer.index_word.get(token_id, '') for token_id in step_ids)
                if word != ''
            ]
            for step_ids in predictions.argmax(axis=-1)
        ]

        # Calculate rewards based on BLEU score or other metrics
        # rewards = calculate_rewards(decoded_sequences, ground_truth_sequences)

        # Convert the decoded words back to padded id sequences.
        # NOTE(review): these are the predictions for step t fed back as decoder
        # input at step t (not shifted right like decoder_input), and dropping
        # padding predictions can move later tokens to earlier positions --
        # confirm this is the intended training signal.
        rl_input_sequences = tokenizer.texts_to_sequences(decoded_sequences)
        rl_input_sequences = pad_sequences(rl_input_sequences, maxlen=max_seq_length, padding='post')

        # `learning_rate` is the supported attribute name; `lr` is a deprecated
        # alias that newer Keras releases no longer provide.
        model.optimizer.learning_rate.assign(rl_learning_rate)
        model.fit([x_train, rl_input_sequences], y_train, epochs=1, batch_size=rl_batch_size)

    # Evaluate on this fold's held-out validation split
    test_loss, test_acc = model.evaluate([x_val, decoder_input_val], y_val)
    print(test_loss)
    print(test_acc)
    fold_scores.append((test_loss, test_acc))

# Cross-validation summary: average of the per-fold metrics
if fold_scores:
    mean_loss, mean_acc = np.mean(fold_scores, axis=0)
    print(f'Mean loss over {len(fold_scores)} folds: {mean_loss}')
    print(f'Mean accuracy over {len(fold_scores)} folds: {mean_acc}')

