In [None]:
import yaml
import nltk
from nltk.stem import WordNetLemmatizer



In [None]:
# Download NLTK data packages. Each call fetches the named package into the
# local NLTK data directory.
nltk.download('punkt')  # tokenizer models
nltk.download('wordnet')  # presumably for the WordNetLemmatizer imported above -- TODO confirm usage

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Step 01: Load data
def load_yaml(file_path):
    """Parse a YAML file and return its contents as Python objects.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document (typically a
        dict or list; ``None`` when the file is empty).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 so non-ASCII training examples load the same
    # way on every platform instead of depending on the locale default.
    with open(file_path, 'r', encoding='utf-8') as file:
        # safe_load only builds plain Python types, never arbitrary objects.
        data = yaml.safe_load(file)
    return data



In [None]:
# Read the three YAML training files, in the same order as before.
nlu_data, stories_data, rules_data = map(
    load_yaml, ('nlu.yml', 'stories.yml', 'rules.yml')
)

# Bundle the parsed files into one dictionary keyed by file kind.
data = dict(nlu=nlu_data, stories=stories_data, rules=rules_data)

In [None]:
def _collect_named_steps(data, section_key, name_key):
    """Return ``[{name_key: name, 'steps': steps}, ...]`` for one section of ``data``."""
    # ``or`` guards cover keys that exist but hold None (an empty YAML file,
    # or a key written without a value), which ``.get(key, default)`` misses.
    section = data.get(section_key) or {}
    entries = section.get(section_key) or []
    return [
        {name_key: entry.get(name_key, ''), 'steps': entry.get('steps') or []}
        for entry in entries
    ]


def preprocess_stories_rules_data(data):
    """Reduce the loaded stories and rules files to name + steps records.

    Args:
        data: Dict with optional ``'stories'`` and ``'rules'`` entries, each
            being the parsed content of the matching YAML file (a dict whose
            own ``'stories'`` / ``'rules'`` key holds the list of entries).

    Returns:
        ``{'stories': [{'story': name, 'steps': [...]}, ...],
           'rules': [{'rule': name, 'steps': [...]}, ...]}``.
        Missing or null sections yield empty lists; a missing name becomes
        ``''`` and missing or null steps become ``[]``.
    """
    return {
        'stories': _collect_named_steps(data, 'stories', 'story'),
        'rules': _collect_named_steps(data, 'rules', 'rule'),
    }

# Reduce the combined `data` dictionary to the story/rule names and their steps.
train_stories_rules = preprocess_stories_rules_data(data)

# Show the full result for inspection (prints the whole dictionary).
print(train_stories_rules)

{'stories': [{'story': 'greet', 'steps': [{'intent': 'greet'}, {'action': 'utter_greet'}]}, {'story': 'goodbye', 'steps': [{'intent': 'goodbye'}, {'action': 'utter_goodbye'}]}, {'story': 'places_info', 'steps': [{'intent': 'placesinfo'}, {'action': 'utter_placesinfo'}]}, {'story': 'location', 'steps': [{'intent': 'location'}, {'action': 'utter_location'}]}, {'story': 'categories', 'steps': [{'intent': 'categories'}, {'action': 'utter_categories'}]}, {'story': 'recommendation', 'steps': [{'intent': 'recommendation'}, {'action': 'utter_recommendation'}]}, {'story': 'mood_happy', 'steps': [{'intent': 'mood_happy'}, {'action': 'utter_mood_happy'}]}, {'story': 'mood_unhappy', 'steps': [{'intent': 'mood_unhappy'}, {'action': 'utter_mood_unhappy'}]}, {'story': 'mood_sad', 'steps': [{'intent': 'mood_sad'}, {'action': 'utter_mood_sad'}]}, {'story': 'bot_challenge', 'steps': [{'intent': 'bot_challenge'}, {'action': 'utter_bot_challenge'}]}, {'story': 'happy', 'steps': [{'intent': 'happy'}, {'act

In [None]:
def _split_examples(examples):
    """Normalize an intent's examples to a list of strings."""
    if examples is None:
        return []
    if not isinstance(examples, str):
        return examples
    # A YAML block string such as "- hey\n- hello\n": one example per line,
    # each introduced by a dash.
    items = []
    for raw_line in examples.splitlines():
        line = raw_line.strip()
        if line == '-':
            continue
        if line.startswith('- '):
            line = line[2:].strip()
        if line:
            items.append(line)
    return items


def preprocess_nlu_data(data):
    """Reduce the loaded NLU file to intent + examples records.

    Args:
        data: Dict whose ``'nlu'`` entry is the parsed NLU YAML file. The list
            of intents is read from that file's ``'intents'`` key, falling
            back to its ``'nlu'`` key (the same "file key repeated inside the
            file" layout the stories and rules files use).

    Returns:
        ``{'nlu': [{'intent': name, 'examples': [...]}, ...]}``. A missing
        intent name becomes ``''``; examples given as a dash-prefixed block
        string are split into a list, lists are passed through unchanged.
    """
    # ``or`` guards also cover keys that are present but null.
    section = data.get('nlu') or {}
    nlu_intents = section.get('intents') or section.get('nlu') or []

    processed_data = {'nlu': []}
    for intent_data in nlu_intents:
        processed_data['nlu'].append({
            'intent': intent_data.get('intent', ''),
            'examples': _split_examples(intent_data.get('examples', [])),
        })
    return processed_data


In [None]:
def extract_dialogue_training_data(stories_rules_data):
    """Build aligned (user intent, bot action) pairs from the stories.

    Steps are walked in order and every action is paired with the most recent
    intent that precedes it in the same story. This keeps pairs aligned when
    one intent is followed by several actions; zipping the list of intents
    against the list of actions would shift later pairs in that case.

    Args:
        stories_rules_data: Dict with a ``'stories'`` list; each story is a
            dict whose ``'steps'`` list holds ``{'intent': ...}`` and
            ``{'action': ...}`` dicts. Only stories are read, not rules.

    Returns:
        ``{'user_input': [...], 'bot_response': [...]}`` where the two lists
        have equal length and index ``i`` of each forms one pair. Actions
        with no preceding non-empty intent, and empty actions, are skipped.
    """
    dialogue_data = {'user_input': [], 'bot_response': []}

    for story_data in stories_rules_data.get('stories') or []:
        current_intent = ''  # no user turn seen yet in this story
        for step in story_data.get('steps') or []:
            if 'intent' in step:
                # An empty intent clears the context so that later actions
                # are not attributed to an earlier, unrelated user turn.
                current_intent = step.get('intent') or ''
            action = step.get('action', '')
            if action and current_intent:
                dialogue_data['user_input'].append(current_intent)
                dialogue_data['bot_response'].append(action)

    return dialogue_data

# Turn the preprocessed stories into parallel lists of user intents and bot actions.
train_dialogue_data = extract_dialogue_training_data(train_stories_rules)

# Show the resulting 'user_input' / 'bot_response' lists for inspection.
print(train_dialogue_data)


{'user_input': ['greet', 'goodbye', 'placesinfo', 'location', 'categories', 'recommendation', 'mood_happy', 'mood_unhappy', 'mood_sad', 'bot_challenge', 'happy'], 'bot_response': ['utter_greet', 'utter_goodbye', 'utter_placesinfo', 'utter_location', 'utter_categories', 'utter_recommendation', 'utter_mood_happy', 'utter_mood_unhappy', 'utter_mood_sad', 'utter_bot_challenge', 'utter_happy']}


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def encode_dialogue_data(user_input, bot_response):
    """Encode user inputs and bot responses as token-count vectors.

    A single ``CountVectorizer`` is fitted on the concatenation of both
    sequences, so both outputs share one vocabulary (same column layout).

    Args:
        user_input: Sequence of user-side strings.
        bot_response: Sequence of bot-side strings.

    Returns:
        Tuple ``(user_input_vectors, bot_response_vectors)``: the rows of the
        dense count matrix belonging to ``user_input`` and to
        ``bot_response`` respectively.
    """
    # Row index where the bot responses start in the stacked matrix.
    split_at = len(user_input)

    # Fit and transform both sides together, then densify.
    encoded = CountVectorizer().fit_transform(user_input + bot_response).toarray()

    return encoded[:split_at], encoded[split_at:]

# Vectorize both sides of the dialogue pairs with a shared vocabulary.
train_user_input_vectors, train_bot_response_vectors = encode_dialogue_data(
    user_input=train_dialogue_data['user_input'],
    bot_response=train_dialogue_data['bot_response'],
)

# Show each matrix under its heading.
print("User Input Vectors:", train_user_input_vectors, sep="\n")
print("\nBot Response Vectors:", train_bot_response_vectors, sep="\n")


User Input Vectors:
[[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]

Bot Response Vectors:
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Register a dedicated padding token so batches can be padded.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# The added token enlarges the tokenizer's vocabulary; grow the model's
# embedding matrix to match, otherwise the new pad id has no embedding row.
model.resize_token_embeddings(len(tokenizer))

# Tell the model which id is padding
model.config.pad_token_id = tokenizer.pad_token_id

# Example: Fine-tune the model on your dialogue data
# Replace this with your actual dialogue data
dialogue_data = [
    "User: How are you?",
    "Bot: I'm good, thanks!",
    "User: Tell me a joke.",
    "Bot: Why don't scientists trust atoms? Because they make up everything!"
]

# Save the dialogue data to a text file, one utterance per line. Written as
# UTF-8 explicitly so the content does not depend on the platform default.
with open("dialogue_data.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(dialogue_data))

# Tokenize the dialogue data with padding and truncation.
# Note: this result is not passed to the Trainer in this cell; training below
# reads the text file through TextDataset instead.
tokenized_data = tokenizer(dialogue_data, return_tensors="pt", max_length=512, truncation=True, padding=True)

# Create a TextDataset for training
# NOTE(review): TextDataset appears to keep only complete block_size-token
# chunks, so a file shorter than block_size tokens (like the four sample lines
# above) may yield an empty dataset -- confirm, and lower block_size or supply
# more text if so.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="dialogue_data.txt",  # Use the path to the text file
    block_size=128,  # Adjust based on your data and available resources
    overwrite_cache=True,
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=1,
)

# Train the model; mlm=False selects the causal (next-token) language-modeling
# collator rather than masked language modeling.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=train_dataset,
)

# Fine-tune the GPT-2 model on your data
trainer.train()

# Save the fine-tuned model together with the tokenizer (including the added
# padding token) so both can be reloaded from the same directory.
model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")
