In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install nltk
!pip install pymed

[0mCollecting pymed
  Downloading pymed-0.8.9-py3-none-any.whl (9.6 kB)
Installing collected packages: pymed
Successfully installed pymed-0.8.9
[0m

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib
from pymed import PubMed
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
%matplotlib inline

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
torch.manual_seed(1337)

<torch._C.Generator at 0x7f269a4a1f10>

In [5]:
pubmed = PubMed(tool="MyTool", email="coolarun477@gmail.com")
results = pubmed.query("covid", max_results=9000)
results = [ele.toDict() for ele in results]

In [6]:
print('total: ', len(results))

total:  9000


In [7]:
text = ''
for doc in results:
  title = doc.get('title') or ''
  abstract = doc.get('abstract') or ''
  conclusions = doc.get('conclusions') or ''
  combined_text = title +  ' ' + abstract + ' ' + conclusions
  keywords = list(set(doc.get('keywords', [])))
  text += ' ' + combined_text
  text += ' '.join(keywords)

In [8]:
chars = sorted(list(set(text)))
stoi = {s:i for i, s in enumerate(chars)}
itos = {i:s for s, i in stoi.items()}
encode = lambda s: [stoi[i] for i in s]
decode = lambda e: "".join([itos[i] for i in e])
vocab_size = len(chars)

In [9]:
data = torch.tensor(encode(text), dtype=torch.long)
n = int(len(data)* 0.9)
train_data = data[:n] # 90% train data
val_data = data[n:]

In [10]:
dropout = 0.2
block_size = 356 # max context length
embd_dim = 400
n_layer = 10
num_heads = 10
device = 'cuda' if torch.cuda.is_available() else 'cpu'
learning_rate = 3e-4
batch_size = 64
eval_iters = 200
max_iters = 17000
eval_interval = 500

In [11]:
# sigle attention head
class Head(nn.Module):
  def __init__(self, embd_dim, head_size):
    super().__init__()
    self.query = nn.Linear(embd_dim, head_size, bias=False)
    self.key = nn.Linear(embd_dim, head_size, bias=False)
    self.val = nn.Linear(embd_dim, head_size, bias=False)
    self.dropout = nn.Dropout(dropout)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
  
  def forward(self, x):
    B, T, C = x.shape
    q = self.query(x) # (B, T, head_size)
    k = self.key(x) # (B, T, head_size)
    v = self.val(x) # (B, T, head_size)
    wei = q @ k.transpose(-2, -1) * C**-0.5 # (B, T, T)
    wei = wei.masked_fill(self.tril[:T, :T]==0, float('-inf')) # (B, T, T)
    wei = F.softmax(wei, dim=-1) # (B, T, T)
    wei = self.dropout(wei) # (B, T, T)
    out = wei @ v # (B, T, head_size)
    return out

In [12]:
# mulit head attention (just a compose of single head attention)
class MultiHeadAttention(nn.Module):
  def __init__(self, embd_dim, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(embd_dim, head_size) for _ in range(num_heads)])
    self.dropout = nn.Dropout(dropout)
    self.proj = nn.Linear(embd_dim, embd_dim)
  
  def forward(self, x):
    out = [h(x) for h in self.heads]
    out = torch.cat(out, dim=-1)
    out = self.proj(out)
    out = self.dropout(out)
    return out

In [13]:
class FeedForward(nn.Module):
  def __init__(self, embd_dim):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(embd_dim, 4*embd_dim),
        nn.ReLU(),
        nn.Linear(4*embd_dim, embd_dim),
        nn.Dropout(dropout)
    )

  def forward(self, x):
    return self.net(x)

In [14]:
class Block(nn.Module):
  def __init__(self, embd_dim, num_heads):
    super().__init__()
    head_size = embd_dim // num_heads
    self.sa = MultiHeadAttention(embd_dim, num_heads, head_size)
    self.fw = FeedForward(embd_dim)
    self.ln1 = nn.LayerNorm(embd_dim)
    self.ln2 = nn.LayerNorm(embd_dim)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.fw(self.ln2(x))
    return x

In [15]:
class BigramModel(nn.Module):
  def __init__(self,):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, embd_dim)
    self.position_embedding_table = nn.Embedding(block_size, embd_dim)
    self.blocks = nn.Sequential(*[Block(embd_dim, num_heads) for _ in range(n_layer)])
    self.lf = nn.LayerNorm(embd_dim)
    self.lm_head = nn.Linear(embd_dim, vocab_size)

  def forward(self, idx, target=None):
    B, T = idx.shape

    token_emb = self.token_embedding_table(idx)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))
    x = token_emb + pos_emb
    x = self.blocks(x)
    x = self.lf(x)
    logits = self.lm_head(x)

    if target is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      target = target.view(B*T)
      loss = F.cross_entropy(logits, target)
    
    return loss, logits

In [16]:
model = BigramModel()
m = model.to(device)

In [17]:
n_params = sum(ele.numel() for ele in m.parameters())/1e6
print(n_params, 'M parameters')

20.522222 M parameters


In [18]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [19]:
def get_batch(split):
  data = train_data if split=='train' else val_data
  ix = torch.randint(len(data)-block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+1+block_size] for i in ix])
  x, y = x.to(device), y.to(device)
  return x, y

In [20]:
@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      loss, logits = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

In [21]:
for iter in range(max_iters):
  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  xb, yb = get_batch('train')
  loss, logits = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

step 0: train loss 7.3712, val loss 7.3718
step 500: train loss 2.2353, val loss 2.2373
step 1000: train loss 1.4904, val loss 1.5058
step 1500: train loss 1.2754, val loss 1.2917
step 2000: train loss 1.1791, val loss 1.2021
step 2500: train loss 1.1294, val loss 1.1607
step 3000: train loss 1.0777, val loss 1.1048
step 3500: train loss 1.0296, val loss 1.0629
step 4000: train loss 1.0036, val loss 1.0425
step 4500: train loss 0.9794, val loss 1.0169
step 5000: train loss 0.9591, val loss 1.0000
step 5500: train loss 0.9386, val loss 0.9880
step 6000: train loss 0.9274, val loss 0.9741
step 6500: train loss 0.9132, val loss 0.9615
step 7000: train loss 0.8997, val loss 0.9524
step 7500: train loss 0.8867, val loss 0.9424
step 8000: train loss 0.8780, val loss 0.9414
step 8500: train loss 0.8679, val loss 0.9306
step 9000: train loss 0.8605, val loss 0.9324
step 9500: train loss 0.8485, val loss 0.9193
step 10000: train loss 0.8416, val loss 0.9168
step 10500: train loss 0.8366, val lo

In [22]:
def generate(idx, max_new_tokens):
  for _ in range(max_new_tokens):
    idx_cond = idx[:, -block_size:] # crop idx to the block_size tokens
    loss, logits = model(idx_cond)
    logits = logits[:, -1, :]
    probs = F.softmax(logits, dim=-1)
    idx_next = torch.multinomial(probs, num_samples=1)
    idx = torch.cat((idx, idx_next), dim=1)
  return idx

In [23]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(generate(context, max_new_tokens=1000)[0].tolist()))


Link-of-license COVID-19 pandemic Increased dental support well-being compliants COVID-19 Nigoblods, Lengthanolizations Have Not: The Viral RNA-1global Treatment option. Consensual racism and ethical enhances a patient with insect of existin drugs (vbra) at discourse, even after an elementopy networking in the care service. The Virus, a failure Iral SARS, one of determinant hierarchy, MDT is less often explained. Consent of exhibits information can following older adults during October 2020, recently studies, which focused on patient with two drugs of extra service (e.g., unit) and at time "case" monitoring of 'can used nasopharyngeal infusion in pregnant women, eventually suggested, allergical conditions and regularity. Medicines used individuals around two of them brokeen equipment inclusion criteria, outcomes and the factoristical points of the anti-SST code of 28 monitoring. Corresponding to a useful survey (rate of pregnancy of tawal signs, women under commercial judocular positi

In [24]:
open('generated_output.txt', 'w').write(decode(generate(context, max_new_tokens=10000)[0].tolist()))

10001