In [3]:
import pandas as pd 

In [4]:
import pandas as pd

df = pd.read_json("hf://datasets/toughdata/quora-question-answer-dataset/Quora-QuAD.jsonl", lines=True)

In [5]:
df

Unnamed: 0,question,answer
0,Why whenever I get in the shower my girlfriend...,Isn‚Äôt it awful? You would swear that there was...
1,"What is a proxy, and how can I use one?",A proxy server is a system or router that prov...
2,"What song has the lyrics ""someone left the cak...",MacArthur's Park\n
3,I am the owner of an adult website called http...,Don't let apps that are liers put adds on your...
4,Does the Bible mention anything about a place ...,St. John in the book of Revelation mentions an...
...,...,...
56397,"Alexandria Ocasio-Cortez said ""Going by track ...","I think she‚Äôs right, one is a homosexual with ..."
56398,Is becoming a doctor financially worth it?,Yes if you want to help people and eliminate p...
56399,Where can one find the best biryani in bangalore?,Biryani crafts.These guys will give proper aut...
56400,Which smartphone is best for middle class people?,Oneplus nord\n[LINKED_TEXT: https://latesttech...


In [6]:
df.shape

(56402, 2)

In [7]:
def tokenize(text):

  text=text.lower()
  text=text.replace("?","")
  return text.split()

In [8]:
tokenize("Why whenever I get in the shower my girlfriend want to join?")

['why',
 'whenever',
 'i',
 'get',
 'in',
 'the',
 'shower',
 'my',
 'girlfriend',
 'want',
 'to',
 'join']

In [9]:
df.isnull().sum()

question    0
answer      0
dtype: int64

In [10]:
import re

def remove_url(text):
    return re.sub(r'http\S+|www\.\S+', '', text)


In [11]:
df["question"] = df["question"].apply(remove_url)
df["answer"]   = df["answer"].apply(remove_url)


In [12]:
vocab={"<unk>":0}

In [13]:
## make a vocab
def build_vocab(row):
  #print(row["question"],row["answer"])
  tokens_q=tokenize(row["question"])
  tokens_a=tokenize(row["answer"])
  merged_tokens=tokens_q+tokens_a
  #print(merged_tokens)
  ## har 
  for token in merged_tokens:
    if token not in vocab:
      vocab[token]=len(vocab)
    

In [14]:
df.apply(build_vocab,axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
56397    None
56398    None
56399    None
56400    None
56401    None
Length: 56402, dtype: object

In [15]:
vocab

{'<unk>': 0,
 'why': 1,
 'whenever': 2,
 'i': 3,
 'get': 4,
 'in': 5,
 'the': 6,
 'shower': 7,
 'my': 8,
 'girlfriend': 9,
 'want': 10,
 'to': 11,
 'join': 12,
 'isn‚Äôt': 13,
 'it': 14,
 'awful': 15,
 'you': 16,
 'would': 17,
 'swear': 18,
 'that': 19,
 'there': 20,
 'wasn‚Äôt': 21,
 'enough': 22,
 'hot': 23,
 'water': 24,
 'go': 25,
 'around!': 26,
 'what': 27,
 'is': 28,
 'a': 29,
 'proxy,': 30,
 'and': 31,
 'how': 32,
 'can': 33,
 'use': 34,
 'one': 35,
 'proxy': 36,
 'server': 37,
 'system': 38,
 'or': 39,
 'router': 40,
 'provides': 41,
 'gateway': 42,
 'between': 43,
 'users': 44,
 'internet.': 45,
 'therefore,': 46,
 'helps': 47,
 'prevent': 48,
 'cyber': 49,
 'attackers': 50,
 'from': 51,
 'entering': 52,
 'private': 53,
 'network.': 54,
 'server,': 55,
 'referred': 56,
 'as': 57,
 'an': 58,
 '‚Äúintermediary‚Äù': 59,
 'because': 60,
 'goes': 61,
 'end-users': 62,
 'web': 63,
 'pages': 64,
 'they': 65,
 'visit': 66,
 'online.': 67,
 'when': 68,
 'computer': 69,
 'connects': 70

In [16]:
## no of unique tokens
len(vocab)

339788

In [None]:
bahut jyada data hain

In [19]:
## convert words to indices 
def words_to_indices(text,vocab):
  indexed_text=[]
  for token in tokenize(text):
    if token in vocab:
      indexed_text.append(vocab[token])
    else:
      indexed_text.append(vocab["<unk>"])
  return indexed_text
  tokens=tokenize(text)

In [21]:
import torch 
from torch.utils.data import Dataset,DataLoader


In [47]:
MAX_ANSWER_LEN = 256

def truncate_sequence(seq, max_len):
    return seq[:max_len] if len(seq) > max_len else seq


In [48]:
class qadataset(Dataset):
  def __init__(self,df,vocab):
    self.df=df
    self.vocab=vocab
  def __len__(self):
    return self.df.shape[0]
  def __getitem__(self,idx):
    row=self.df.iloc[idx]
    question_indices=words_to_indices(row["question"],self.vocab)
    answer_indices=truncate_sequence(words_to_indices(row["answer"],self.vocab),MAX_ANSWER_LEN)
    return torch.tensor(question_indices),torch.tensor(answer_indices)

In [49]:
words_to_indices("hello how are you",vocab)

[2254, 32, 161, 16]

In [50]:
dataset=qadataset(df,vocab)

In [51]:
dataset[0]

(tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12]),
 tensor([13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 11, 25, 26]))

In [52]:
from torch.nn.utils.rnn import pad_sequence
import torch

def collate_fn(batch):
    questions = [item[0] for item in batch]   # extract question tensors
    answers = [item[1] for item in batch]     # extract answer tensors

    # pad questions
    questions_padded = pad_sequence(
        questions,
        batch_first=True,
        padding_value=0
    )

    # pad answers
    answers_padded = pad_sequence(
        answers,
        batch_first=True,
        padding_value=0
    )

    return questions_padded, answers_padded


In [53]:
dataloader=DataLoader(dataset,batch_size=32,collate_fn=collate_fn,shuffle=True)

In [54]:
for batch_idx, (questions, answers) in enumerate(dataloader):
    print(f"Batch {batch_idx+1}:")
    print("Questions shape:", questions.shape)
    print("Answers shape:", answers.shape)
    break   # just check first batch


Batch 1:
Questions shape: torch.Size([32, 38])
Answers shape: torch.Size([32, 256])


In [31]:
len(dataloader)

1763

In [32]:
for question,answer in dataloader:
  print(question,answer)

tensor([[   386,      6,  14097,  15591,    377,  62087,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0],
        [   109,     28,     29,   2670,  15941,   1507,  15942,     39,  15943,
           8355,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0],
        [    27,    161,      6,    429,   5939,   5940,   1186,   4133,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0],
        [   132,     16,    409,    494,  11119,     90,     77,   8433,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0],
        [    27,     28,      6,   1244,     43,  19569,     31,  19570,      0,
              0,      0,      0,      0

In [38]:
## now lets buildrnn model
import torch.nn as nn
import torch

class RNNModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, 128)

        # batch_first=True is VERY important
        self.rnn = nn.RNN(
            input_size=128,
            hidden_size=256,
            batch_first=True
        )

        self.linear = nn.Linear(256, vocab_size)

    def forward(self, question):
        # question shape: (batch, seq_len)
        embedded = self.embedding(question)  # (batch, seq_len, 128)

        output, hidden = self.rnn(embedded)

        # hidden = (1, batch, 256) ‚Üí we convert to (batch, 256)
        hidden = hidden.squeeze(0)

        # Predict next word ‚Üí shape (batch, vocab_size)
        logits = self.linear(hidden)

        return logits


In [35]:
da=nn.Embedding(33699,embedding_dim=128)

In [39]:
model=RNNModel(len(vocab))

In [36]:
dataset[0][0]

tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [37]:
da(dataset[0][0]).shape

torch.Size([12, 128])

In [40]:
leraning_rate=0.001
num_epochs=15
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=leraning_rate)

In [43]:
for question,answer in dataloader :
  print(question.shape[1],answer.shape[1])
 

52 16684
41 948
24 406
50 5069
45 799
27 1085
42 581
44 405
39 859
45 525
44 882
43 256
52 1358
36 518
50 715
32 1190
40 469
36 1013
47 520
38 443
38 806
34 764
34 981
29 861
33 563
42 657
48 604
42 464
37 2082
45 316
41 1747
38 893
38 2228
45 305
28 902
50 3444
42 706
45 864
50 626
27 850
41 1553
34 411
37 967
38 468
32 1983
38 956
50 1215
42 893
50 512
43 1499
42 628
36 1646
45 361
38 947
38 531
44 835
39 383
44 713
37 517
34 1475
31 601
45 4069
45 835
33 592
45 1224
32 598
28 594
36 713
44 561
45 1266
50 385
44 769
38 1039
52 549
45 512
44 1104
46 2213
35 1201
41 604
36 869
43 392
38 1056
36 548
25 802
45 631
30 537
35 1306
38 2328
48 850
45 25186
35 808
45 2276
36 486
42 716
30 334
37 394
40 969
43 480
34 535
50 369
44 764
38 455
26 877
34 1270
36 566
41 978
43 838
34 703
40 625
45 1092
39 652
45 583
38 601
40 2822
45 561
20 523
41 763
38 489
34 565
39 534
39 859
32 362
29 1700
44 303
45 322
37 517
25 778
48 389
50 551
38 1309
40 516
45 462
32 558
43 1206
31 502
37 520
39 634
42 61

In [None]:
for epoch in range(num_epochs):
    total_loss=0
    
        
    for question,answer in dataloader:
        optimizer.zero_grad()
        outputs=model(question)
        loss=criterion(outputs,answer[:,0])

        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
    print(f"epoch {epoch+1},loss:{total_loss}")

        