<a href="https://colab.research.google.com/github/akash-assist/minorProject/blob/main/hindimodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# Define your dataset class
class HindiStoryDataset(Dataset):
    """Line-per-sample text dataset for causal language-model fine-tuning.

    Every non-blank line of ``data_file`` is one training sample. Lines are
    tokenized lazily in ``__getitem__`` so the whole corpus is never encoded
    up front.

    Args:
        data_file: Path to a UTF-8 text file, one sample per line.
        tokenizer: Object exposing
            ``encode(text, max_length=..., truncation=...)`` that returns a
            list of integer token ids.
        max_length: Upper bound on tokens per sample; longer lines are
            truncated by the tokenizer.
    """

    def __init__(self, data_file, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Blank / whitespace-only lines are dropped: they would otherwise
        # become empty tensors, and a batch made only of those has width 0.
        # Kept lines are stored unmodified (including their trailing newline)
        # so their tokenization is unchanged.
        with open(data_file, 'r', encoding='utf-8') as file:
            self.data = [line for line in file if line.strip()]

    def __len__(self):
        """Return the number of non-blank lines loaded from the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a 1-D ``torch.long`` tensor of token ids."""
        text = self.data[idx]
        input_ids = self.tokenizer.encode(text, max_length=self.max_length, truncation=True)
        return torch.tensor(input_ids, dtype=torch.long)

# Base checkpoint shared by the tokenizer and the model weights, and the
# text corpus the model is fine-tuned on.
pretrained_name = "gpt2"
corpus_path = "stories-akash.txt"

# Pretrained tokenizer and language-model head to fine-tune
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_name)
model = GPT2LMHeadModel.from_pretrained(pretrained_name)

# One training sample per line of the corpus file
dataset = HindiStoryDataset(corpus_path, tokenizer)

# DataLoader for batching and shuffling
def collate_fn(batch):
    """Stack variable-length 1-D tensors into one right-padded 2-D tensor.

    Shorter sequences are filled with 0 up to the length of the longest
    sequence in ``batch``; the result is laid out batch-first.
    """
    padded = pad_sequence(sequences=batch, batch_first=True, padding_value=0)
    return padded

def _collate_with_mask(batch):
    """Pad a batch and build the attention mask and labels that go with it.

    Sequences with fewer than two tokens are dropped: the model shifts the
    labels by one position internally, so such a sequence contributes no
    training target. Returns ``None`` when nothing usable is left, otherwise a
    tuple ``(input_ids, attention_mask, labels)`` where padded positions have
    mask 0 and label -100 (the index the loss ignores).
    """
    usable = [seq for seq in batch if seq.numel() > 1]
    if not usable:
        return None
    input_ids = collate_fn(usable)
    # The mask is derived from the true lengths rather than from the pad
    # value, because id 0 is also a legitimate vocabulary entry.
    lengths = torch.tensor([seq.size(0) for seq in usable])
    positions = torch.arange(input_ids.size(1)).unsqueeze(0)
    attention_mask = (positions < lengths.unsqueeze(1)).long()
    labels = input_ids.masked_fill(attention_mask == 0, -100)
    return input_ids, attention_mask, labels


dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=_collate_with_mask)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training parameters
num_epochs = 3
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    num_steps = 0
    for batch in dataloader:
        if batch is None:
            # Every sequence in this batch was too short to train on.
            continue
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_steps += 1

    # Report the mean loss over the epoch; the last batch alone is noisy and
    # is undefined when the loader yielded no usable batch.
    if num_steps:
        print(f"Epoch {epoch + 1}: Loss - {running_loss / num_steps}")
    else:
        print(f"Epoch {epoch + 1}: no usable batches")

# Save the trained model
model.save_pretrained("hindi_story_model")

Epoch 1: Loss - 0.8754977583885193
Epoch 2: Loss - 0.5614485740661621
Epoch 3: Loss - 0.6339242458343506


In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set up your model and tokenizer
model_path = "hindi_story_model"  # Replace this with the path to your trained model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained(model_path)
model.eval()
model.to(device)  # Move the model to the same device as the input

# Prompt generation
prompt = "मैं तुम्हारा एक सच्चा प्रेमी हूँ"

# Encode the prompt. Calling the tokenizer (instead of `encode`) also returns
# the attention mask, which `generate` asks for explicitly.
encoded = tokenizer(prompt, return_tensors="pt").to(device)
generated = encoded["input_ids"]

# Generate text. GPT-2 defines no pad token, so the end-of-sequence id is
# passed as `pad_token_id` explicitly rather than relying on the implicit
# fallback that triggers a warning.
sample_outputs = model.generate(
    input_ids=generated,
    attention_mask=encoded["attention_mask"],
    do_sample=True,
    max_length=200,
    top_k=50,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode the generated output
generated_text = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)

print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


मैं तुम्हारा एक सच्चा प्रेमी हूँ अपने को सके कहाने कु परेमे करने परिश्रपण और से परिश्रपण और सपने के सपने किया कपने को बुच्चाल गो
