<a href="https://colab.research.google.com/github/abhishektripathi66/RecomendationSystem/blob/main/characterlevelPrediciton.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from torch.utils.data import Dataset, DataLoader

# Load the dataset
dataset_path = "testdata.csv"
df = pd.read_csv(dataset_path)

# Preprocess the data
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'eos_token': '[PAD]'})  # Add padding token
tokenizer.pad_token = tokenizer.eos_token  # Set padding token

# Tokenize the input and next words
input_texts = df["prefix"].tolist()
next_words = df["next_word"].tolist()
encoded_data = tokenizer(input_texts, text_pair=next_words, padding=True, truncation=True, return_tensors="pt")

class CustomDataset(Dataset):
    def __init__(self, encoded_data):
        self.input_ids = encoded_data["input_ids"]
        self.attention_mask = encoded_data["attention_mask"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.input_ids[idx]  # Use input_ids as labels
        }

train_dataset = CustomDataset(encoded_data)

# Define the model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Fine-tune the model with additional adjustments
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)  # Adjusted learning rate

model.train()
for epoch in range(100):  # Train for 5 epochs
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

# Save the model
model.save_pretrained("word_prediction_model")
tokenizer.save_pretrained("word_prediction_model")





{'input_ids': tensor([   81,  2200,  3697,  8808, 50257, 50257, 50257]), 'attention_mask': tensor([1, 1, 1, 1, 0, 0, 0]), 'labels': tensor([   81,  2200,  3697,  8808, 50257, 50257, 50257])}
Epoch 1, Loss: 37.49110164642334
Epoch 2, Loss: 5.981650543212891
Epoch 3, Loss: 3.753898525238037
Epoch 4, Loss: 2.8920857906341553
Epoch 5, Loss: 2.4336049556732178
Epoch 6, Loss: 2.1835779428482054
Epoch 7, Loss: 1.9286766290664672
Epoch 8, Loss: 1.6818181991577148
Epoch 9, Loss: 1.5303081750869751
Epoch 10, Loss: 1.2995360136032104
Epoch 11, Loss: 1.1748485565185547
Epoch 12, Loss: 1.0438292145729064
Epoch 13, Loss: 0.9968490958213806
Epoch 14, Loss: 0.855038583278656
Epoch 15, Loss: 0.7869224429130555
Epoch 16, Loss: 0.6829974412918091
Epoch 17, Loss: 0.7049263000488282
Epoch 18, Loss: 0.6196663618087769
Epoch 19, Loss: 0.6181263446807861
Epoch 20, Loss: 0.6203503727912902
Epoch 21, Loss: 0.56784508228302
Epoch 22, Loss: 0.6478658676147461
Epoch 23, Loss: 0.5848889231681824
Epoch 24, Loss: 0.5

('word_prediction_model/tokenizer_config.json',
 'word_prediction_model/special_tokens_map.json',
 'word_prediction_model/vocab.json',
 'word_prediction_model/merges.txt',
 'word_prediction_model/added_tokens.json')

In [35]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("word_prediction_model")
tokenizer = GPT2Tokenizer.from_pretrained("word_prediction_model")

# Function to generate predictions for custom input prefixes
def predict_next_word(prefix):
    # Tokenize the input prefix
    input_ids = tokenizer.encode(prefix, return_tensors="pt")

    # Generate predictions
    with torch.no_grad():
        output = model.generate(input_ids, max_length=20, num_return_sequences=1)

    print(output)
    # Decode and return the predicted word
    predicted_word = tokenizer.decode(output[0], skip_special_tokens=True)
    return predicted_word

# Example usage
prefix = input("Enter the prefix: ")  # Input prefix for prediction
predicted_word = predict_next_word(prefix)
# predicted_word = predicted_word.replace(prefix, "", 1)
print("Predicted next word:", predicted_word)


Enter the prefix: ref


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[ 5420,   330, 31688, 10659,    43, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257]])
Predicted next word: refacREFACTL
