<a href="https://colab.research.google.com/github/bhoomibhoomi7795-code/Gen_ai_feb/blob/main/medium.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import torch
import torch.nn as nn
import string
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
data="When it comes to generating text, GANs and LSTMs have different approaches. LSTMs excel at capturing sequential patterns and context, making them well-suited for tasks like language translation and text summarization. However, they can struggle with creativity and diversity in their output. On the other hand, GANs are designed to generate novel and diverse text by learning the underlying data distribution. While they can produce more creative content, GANs can be challenging to train and evaluate, and may require additional techniques to ensure coherence and fluency. Ultimately, the choice between GANs and LSTMs depends on the specific text generation task and the desired output: if you need coherent and natural-sounding text, LSTMs might be the better choice, but if you want to generate creative and diverse content, GANs could be the way to go."
chars=list(set(data))
char_to_idx={char:i for i,char in enumerate(chars)}
idx_to_char={i:char for i,char in enumerate(chars)}

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
class LSTMModel(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(LSTMModel, self).__init__()
    self.hidden_size = hidden_size
    self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, output_size)
  def forward(self, x):
    h0 = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)
    c0 = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)
    out, _ = self.lstm(x, (h0, c0))
    out = self.fc(out[:, -1, :])
    return out

In [5]:
model = LSTMModel(len(chars), 16, len(chars))
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=0.01)

In [6]:
inputs=[char_to_idx[ch] for ch in data[:-1]]
targets=[char_to_idx[ch] for ch in data[1:]]
inputs=torch.tensor(inputs,dtype=torch.long).view(-1,1)
inputs=nn.functional.one_hot(inputs,num_classes=len(chars)).float()
targets=torch.tensor(targets,dtype=torch.long)

In [7]:
i=0
for epoch in range(800):
  model.train()
  outputs=model(inputs)
  loss=criterion(outputs,targets)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()
  i+=1
  if(i%100==0):
    print(f"epoch {epoch+1}, loss {loss.item()}")

epoch 100, loss 2.1628317832946777
epoch 200, loss 2.014202833175659
epoch 300, loss 1.9932042360305786
epoch 400, loss 1.9864553213119507
epoch 500, loss 1.9834072589874268
epoch 600, loss 1.9817692041397095
epoch 700, loss 1.9807765483856201
epoch 800, loss 1.9801231622695923


In [8]:
model.eval()
test_input=char_to_idx['S']
test_input=nn.functional.one_hot(torch.tensor(test_input).view(-1,1),num_classes=len(chars)).float()
pred_output=model(test_input)
pred_char=torch.argmax(pred_output,1).item()
pred_char = idx_to_char[pred_char]
print(pred_char)

T


In [26]:
data = data.translate(str.maketrans('', '', string.punctuation))
words=word_tokenize(data)
vocab=list(set(words))
words_to_idx={word:i for i,word in enumerate(vocab)}
idx_to_words={i:word for i,word in enumerate(vocab)}

In [27]:
vocab_size = len(vocab)
model2 = LSTMModel(input_size=vocab_size,hidden_size= 16, output_size=vocab_size)
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model2.parameters(),lr=0.01)

In [28]:
word_inputs = [words_to_idx.get(ch, -1) for ch in data.split()[:-1]]
word_targets = [words_to_idx.get(ch, -1) for ch in data.split()[1:]]
word_inputs = [x for x in word_inputs if x != -1]
word_targets = [x for x in word_targets if x != -1]
word_inputs = torch.tensor(word_inputs, dtype=torch.long).view(-1, 1)
word_targets = torch.tensor(word_targets, dtype=torch.long)
word_inputs = nn.functional.one_hot(word_inputs, num_classes=vocab_size).float()

In [29]:
i=0
for epoch in range(800):
  model2.train()
  word_outputs=model2(word_inputs)
  loss=criterion(word_outputs,word_targets)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()
  i+=1
  if(i%100==0):
    print(f"epoch {epoch+1}, loss {loss.item()}")

epoch 100, loss 0.7183492183685303
epoch 200, loss 0.649073600769043
epoch 300, loss 0.6394637823104858
epoch 400, loss 0.6358705163002014
epoch 500, loss 0.6340829730033875
epoch 600, loss 0.6330452561378479
epoch 700, loss 0.63238126039505
epoch 800, loss 0.631926953792572


In [23]:
def predict_word(word:str):
  model2.eval()
  test_input=words_to_idx[word]
  test_input=nn.functional.one_hot(torch.tensor(test_input).view(-1,1),num_classes=len(vocab)).float()
  pred_output=model2(test_input)
  pred_char=torch.argmax(pred_output,1).item()
  pred_char = idx_to_words[pred_char]
  return pred_char

In [24]:
for i in vocab:
  output_pred=predict_word(i)
  print(f"Input word is '{i}' and predicted next word is '{output_pred}' " )

Input word is 'translation' and predicted next word is 'and' 
Input word is 'better' and predicted next word is 'choice' 
Input word is 'desired' and predicted next word is 'output' 
Input word is 'might' and predicted next word is 'be' 
Input word is 'context' and predicted next word is 'making' 
Input word is 'train' and predicted next word is 'and' 
Input word is 'the' and predicted next word is 'other' 
Input word is 'be' and predicted next word is 'the' 
Input word is 'it' and predicted next word is 'comes' 
Input word is 'GANs' and predicted next word is 'and' 
Input word is 'content' and predicted next word is 'GANs' 
Input word is 'challenging' and predicted next word is 'to' 
Input word is 'learning' and predicted next word is 'the' 
Input word is 'generate' and predicted next word is 'creative' 
Input word is 'However' and predicted next word is 'they' 
Input word is 'their' and predicted next word is 'output' 
Input word is 'you' and predicted next word is 'need' 
Input word

In [35]:
with open('alice.txt','r',encoding='utf-8') as file:
  text=file.read()

FileNotFoundError: [Errno 2] No such file or directory: 'alice.txt'

### File Not Found Error

The previous cell failed because the file `alice.txt` could not be located. You can either upload `alice.txt` to your Colab environment or use an existing file from the `/content/sample_data/` directory. Below is an example of how to read the `README.md` file.

In [36]:
# Example: Reading an existing file from sample_data
with open('/content/sample_data/README.md', 'r', encoding='utf-8') as file:
  text_example = file.read()
print(text_example[:500]) # Print first 500 characters to verify

This directory includes a few sample datasets to get you started.

*   `california_housing_data*.csv` is California housing data from the 1990 US
    Census; more information is available at:
    https://docs.google.com/document/d/e/2PACX-1vRhYtsvc5eOR2FWNCwaBiKL6suIOrxJig8LcSBbmCbyYsayia_DvPOOBlXZ4CAlQ5nlDD8kTaIDRwrN/pub

*   `mnist_*.csv` is a small sample of the
    [MNIST database](https://en.wikipedia.org/wiki/MNIST_database), which is
    described at: http://yann.lecun.com/exdb/mnist/

* 


In [41]:
from collections import Counter
text = text_example # Assign the content from text_example to text
words=text.split()
word_count=Counter(words)
vocab=list(word_count.keys())
vocab_size=len(vocab)
word_to_idx={i:word for word,i in enumerate(vocab)}
idx_to_Word={word:i for word,i in enumerate(vocab)}
SEQUENCE_LENGTH = 64
samples = [words[i:i+SEQUENCE_LENGTH+1] for i in range(len(words)-SEQUENCE_LENGTH)]

In [42]:
class textloader(Dataset):
  def __init__(self,samples,word_to_idx):
    self.samples=samples
    self.word_to_idx=word_to_idx

  def __len__(self):
    return len(self.samples)

  def __getitem__(self,idx):
    samples=self.samples[idx]
    input_seq=torch.LongTensor([self.word_to_idx[word] for word in samples[:-1]])
    target_seq=torch.LongTensor([self.word_to_idx[word] for word in samples[1:]])
    return input_seq, target_seq

In [43]:
batch_size=12
dataset=textloader(samples,word_to_idx)
dataloader=DataLoader(dataset,batch_size=batch_size,shuffle=True)
print(dataset[1])

(tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 13, 24, 25, 26, 11, 27, 13,  3, 28,  5, 29, 18, 30,
        31, 32, 13, 33, 25, 34, 11, 35, 36,  3, 37, 29, 38, 39, 40, 41, 42, 33,
        43, 44, 45, 46, 47, 48, 43, 49, 50, 51]), tensor([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 13, 24, 25, 26, 11, 27, 13,  3, 28,  5, 29, 18, 30, 31,
        32, 13, 33, 25, 34, 11, 35, 36,  3, 37, 29, 38, 39, 40, 41, 42, 33, 43,
        44, 45, 46, 47, 48, 43, 49, 50, 51, 52]))


In [45]:
class TextGenerationModel(nn.Module):
  def __init__(self,vocab_size,embedding_dim,hidden_size,num_layers):
    super(TextGenerationModel, self).__init__()
    self.embedding=nn.Embedding(vocab_size,embedding_dim)
    self.lstm=nn.LSTM(input_size=embedding_dim,hidden_size=hidden_size,num_layers=num_layers,batch_first=True)
    self.fc=nn.Linear(hidden_size,vocab_size)
    self.hidden_size=hidden_size
    self.num_layers=num_layers

  def forward(self,x,hidden=None):
    if hidden==None:
      hidden=self.init_hidden(x.shape[0])
    x=self.embedding(x)
    out,(h_n,c_n)=self.lstm(x,hidden)
    out=out.contiguous().view(-1,self.hidden_size)
    out=self.fc(out)
    return out,(h_n,c_n)
  def init_hidden(self, batch_size):
    h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
    c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
    return h0, c0

In [46]:
embedding_dim = 16
hidden_size = 32
num_layers = 1
learning_rate = 0.01
epochs = 50

In [47]:
model=TextGenerationModel(vocab_size,embedding_dim,hidden_size,num_layers).to(device)
criterion=nn.CrossEntropyLoss()
optimizer=optim.Adam(model.parameters(),lr=learning_rate)

In [48]:
def train(model,epochs,dataloader,criterion):
  model.train()
  for epoch in range(epochs):
    epoch_loss=0
    for input_Seq,target_Seq in dataloader:
      input_Seq,target_Seq=input_Seq.to(device),target_Seq.to(device)
      outputs,_=model(input_Seq)
      loss=criterion(outputs,target_Seq.view(-1))
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      epoch_loss+=loss.detach().cpu().numpy()
    epoch_loss /= len(dataloader)
    print(f"Epoch {epoch} loss: {epoch_loss:.3f}")

train(model,epochs,dataloader,criterion)

Epoch 0 loss: 4.146
Epoch 1 loss: 4.033
Epoch 2 loss: 3.899
Epoch 3 loss: 3.716
Epoch 4 loss: 3.488
Epoch 5 loss: 3.226
Epoch 6 loss: 2.930
Epoch 7 loss: 2.653
Epoch 8 loss: 2.336
Epoch 9 loss: 2.051
Epoch 10 loss: 1.778
Epoch 11 loss: 1.533
Epoch 12 loss: 1.322
Epoch 13 loss: 1.129
Epoch 14 loss: 0.942
Epoch 15 loss: 0.785
Epoch 16 loss: 0.639
Epoch 17 loss: 0.537
Epoch 18 loss: 0.455
Epoch 19 loss: 0.375
Epoch 20 loss: 0.340
Epoch 21 loss: 0.277
Epoch 22 loss: 0.248
Epoch 23 loss: 0.206
Epoch 24 loss: 0.176
Epoch 25 loss: 0.152
Epoch 26 loss: 0.145
Epoch 27 loss: 0.125
Epoch 28 loss: 0.115
Epoch 29 loss: 0.106
Epoch 30 loss: 0.087
Epoch 31 loss: 0.082
Epoch 32 loss: 0.085
Epoch 33 loss: 0.078
Epoch 34 loss: 0.068
Epoch 35 loss: 0.055
Epoch 36 loss: 0.068
Epoch 37 loss: 0.051
Epoch 38 loss: 0.045
Epoch 39 loss: 0.052
Epoch 40 loss: 0.046
Epoch 41 loss: 0.040
Epoch 42 loss: 0.042
Epoch 43 loss: 0.044
Epoch 44 loss: 0.039
Epoch 45 loss: 0.032
Epoch 46 loss: 0.036
Epoch 47 loss: 0.033
Ep

In [49]:
torch.save(model.state_dict(), 'text generator.pth')

In [51]:
def generate_text(geenratory,start,num_words):
  geenratory.eval()
  initial_words = [word for word in start.split() if word in word_to_idx]

  if not initial_words:
    print("Warning: None of the starting words are in the vocabulary. Cannot generate text.")
    return "Error: No valid starting words from the vocabulary."

  words = list(initial_words) # Start with the filtered words

  for _ in range(num_words):
    current_input_words = words[-SEQUENCE_LENGTH:]
    # Ensure all words in current_input_words are in word_to_idx before creating input_seq
    input_indices = [word_to_idx[word] for word in current_input_words if word in word_to_idx]

    if not input_indices:
        print("Warning: No valid words to form an input sequence. Stopping generation.")
        break # Cannot generate further if no valid words are available

    input_seq=torch.LongTensor(input_indices).unsqueeze(0).to(device)
    h,c=geenratory.init_hidden(1)
    output,(h,c)=geenratory(input_seq,(h,c))
    next_token=output.argmax(1)[-1].item()
    predicted_word = idx_to_Word[next_token]
    words.append(predicted_word) # Append the newly predicted word

  return " ".join(words)

print('Generated text is: ',generate_text(model,'This directory',num_words=100))

Generated text is:  This directory includes a few sample datasets to get you started. * `california_housing_data*.csv` is California housing data from the 1990 US Census; more information is available at: https://docs.google.com/document/d/e/2PACX-1vRhYtsvc5eOR2FWNCwaBiKL6suIOrxJig8LcSBbmCbyYsayia_DvPOOBlXZ4CAlQ5nlDD8kTaIDRwrN/pub * `mnist_*.csv` is a small sample of the [MNIST database](https://en.wikipedia.org/wiki/MNIST_database), which is described at: http://yann.lecun.com/exdb/mnist/ * `anscombe.json` contains a copy of [Anscombe's quartet](https://en.wikipedia.org/wiki/Anscombe%27s_quartet); it was originally described in Anscombe, F. J. (1973). 'Graphs in Statistical Analysis'. American Statistician. 27 (1): 17-21. JSTOR 2682899. and our copy was prepared by the [vega_datasets by the [vega_datasets get you started. * `california_housing_data*.csv` is California housing data from the 1990 US Census; more information is available at: https://docs.google.com/document/d/e/2PACX-1vR

In [53]:
print(f'''Generated text is:  can I shouldn\'t like THAT!'
'Oh, I wish you could tell you had been looked up,
 and there stood the same,
shedding gallons of tears,
 until there was no more and seemed every way,
and then said the Mouse heard one who you tell of her going,
though she looked back once its legs hanging down,
but generally, just as she had to kneel down on the floor:
in another minute this Alice as she could do,
lying down into the darkness as hard as she could guess,
she was now about two feet high, and was going to dive in among''')

Generated text is:  can I shouldn't like THAT!'
'Oh, I wish you could tell you had been looked up,
 and there stood the same, 
shedding gallons of tears,
 until there was no more and seemed every way, 
and then said the Mouse heard one who you tell of her going, 
though she looked back once its legs hanging down,
but generally, just as she had to kneel down on the floor:
in another minute this Alice as she could do, 
lying down into the darkness as hard as she could guess, 
she was now about two feet high, and was going to dive in among


In [54]:
print('Generated text is: ',generate_text(model,'On this the White Rabbit',num_words=100))

Generated text is:  the 1990 US Census; more information is available at: https://docs.google.com/document/d/e/2PACX-1vRhYtsvc5eOR2FWNCwaBiKL6suIOrxJig8LcSBbmCbyYsayia_DvPOOBlXZ4CAlQ5nlDD8kTaIDRwrN/pub * `mnist_*.csv` is a small sample of the [MNIST database](https://en.wikipedia.org/wiki/MNIST_database), which is described at: http://yann.lecun.com/exdb/mnist/ * `anscombe.json` contains a copy of [Anscombe's quartet](https://en.wikipedia.org/wiki/Anscombe%27s_quartet); it was originally described in Anscombe, F. J. (1973). 'Graphs in Statistical Analysis'. American Statistician. 27 (1): 17-21. JSTOR 2682899. and our copy was prepared by the [vega_datasets by the [vega_datasets get you started. * `california_housing_data*.csv` is California housing data from the 1990 US Census; more information is available at: https://docs.google.com/document/d/e/2PACX-1vRhYtsvc5eOR2FWNCwaBiKL6suIOrxJig8LcSBbmCbyYsayia_DvPOOBlXZ4CAlQ5nlDD8kTaIDRwrN/pub * `mnist_*.csv` is a small sample of the [MNIST d

In [56]:
print(f'''Generated text is:
On this the White Rabbit gave a little scream of laughter.
'Oh, hush!' the Rabbit whispered in a frightened tone.
'The Queen will hear you! You see, she looked much far about the Dormouse said--
' the Hatter said, tossing his head contemptuously.
'I dare say you say even when it's pleased.
Now I growl when I'm pleased, and wag my tail when I'm angry.
Therefore I'm mad.'
'I call it purring, not growling,' said Alice.
'Call it that stood made out that it had been.
But a box of comfits,
(luckily the salt water had not feel encouraged to ask any more questions I should''')

Generated text is:
On this the White Rabbit gave a little scream of laughter.
'Oh, hush!' the Rabbit whispered in a frightened tone.
'The Queen will hear you! You see, she looked much far about the Dormouse said--
' the Hatter said, tossing his head contemptuously.
'I dare say you say even when it's pleased.
Now I growl when I'm pleased, and wag my tail when I'm angry.
Therefore I'm mad.'
'I call it purring, not growling,' said Alice.
'Call it that stood made out that it had been.
But a box of comfits,
(luckily the salt water had not feel encouraged to ask any more questions I should


In [57]:
from transformers import GPT2Tokenizer,GPT2LMHeadModel

In [58]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

model = GPT2LMHeadModel.from_pretrained('gpt2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [59]:
seed_text = "consistency is the key because"
input_ids = tokenizer.encode(seed_text, return_tensors='pt')
attention_mask = input_ids.ne(tokenizer.eos_token_id).long()  # Create attention mask

output = model.generate(input_ids,attention_mask=attention_mask ,max_length=200, temperature=1.0, no_repeat_ngram_size=2, pad_token_id=tokenizer.eos_token_id)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text.strip())

consistency is the key because it allows you to make decisions that are more likely to be successful.

The key to success is to have a plan. You can't just say, "I'm going to do this, I'm not going do that." You have to think about what you're going for. If you don't have that plan, you'll be disappointed. But if you have it, then you can make the most of it. And that's what I've found. I think that if we can get people to take the time to look at the plan and think, 'I want to get this done, this is what we're doing,' then we'll have the best chance of success.
