<a href="https://colab.research.google.com/github/utanashati/conversational-ai-workshop/blob/main/1/Conversational_AI_workshop_Task_1a.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Can your AI beat the Turing test?

## Task 1: Language models

Welcome to the workshop! 

In this first part you will train a character-based language model called minGPT from scratch! This will *not* give you a chatbot yet, but it gives you a model that can generate text, which lays the foundation for open-domain chatbots. Please have a bit of patience - we will get to this in Task 2!


### Important resources/links
* [The workshop GitHub repo](https://github.com/utanashati/conversational-ai-workshop)
* [Official minGPT repo](https://github.com/karpathy/minGPT)

## Setting things up

In [None]:
# Print a warning when the output of nvidia-smi contains the word 'failed'
# (presumably what it reports when no GPU driver is reachable -- confirm on your runtime).
!nvidia-smi | grep -q 'failed' && echo "You are using a runtime without a GPU. Change the runtime type before going further!"
# Clone the workshop material into ./workshop_repo; the dataset files are read from there later.
!git clone https://github.com/aig-upf/conversational-ai-workshop.git workshop_repo
# Clone the minGPT repository used by this notebook ...
!git clone https://github.com/lorenzosteccanella/minGPT
# ... and copy its `mingpt` package into the working directory so `from mingpt...` imports resolve.
!cp -r minGPT/mingpt .

In [None]:
# make deterministic: call minGPT's seeding helper with a fixed seed
# (presumably seeds the Python / NumPy / PyTorch RNGs -- see mingpt.utils.set_seed to confirm)
from mingpt.utils import set_seed
set_seed(42)

In [None]:
import numpy as np
import math
import os
import sys
import torch

# PyTorch implements a generic dataset class for us to load data into our model
from torch.utils.data import Dataset

# We import some modules from minGPT - no worries, we will explain the code later!
from mingpt.utils import sample
from mingpt.trainer import Trainer, TrainerConfig
from mingpt.model import GPT, GPTConfig

In [None]:
class CharDataset(Dataset):
    """Character-level dataset that serves next-character prediction examples.

    Every distinct character of ``data`` gets an integer id (ids follow the
    sorted order of the characters). Item ``idx`` is a pair ``(x, y)`` of
    ``torch.long`` tensors of length ``block_size``: ``x`` encodes
    ``data[idx : idx + block_size]`` and ``y`` encodes the same window shifted
    one character to the right, i.e. the target for every position of ``x``.

    Args:
        data: the full training text as a single string.
        block_size: number of characters in each input window.

    Attributes set in ``__init__``: ``stoi`` (character -> id), ``itos``
    (id -> character), ``block_size``, ``vocab_size`` and ``data``.
    """

    def __init__(self, data, block_size):
        # get set of unique characters in dataset (sorted so ids are reproducible)
        chars = sorted(list(set(data)))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))

        # build look-up table of character -> idx and idx -> character
        self.stoi = { ch: i for i, ch in enumerate(chars) }
        self.itos = { i: ch for i, ch in enumerate(chars) }
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        # each item needs block_size + 1 characters, so the last valid start
        # index is len(data) - block_size - 1
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[idx:(idx + self.block_size + 1)]
        # encode every character to an integer
        dix = [self.stoi[s] for s in chunk]
        # inputs are all but the last character, targets all but the first
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y


In [None]:
#@title Pick a dataset { run: "auto" }
# Name of the corpus to train on; the loading cell reads workshop_repo/1/datasets/<dataset>.txt
dataset = 'donald_trump_tweets' #@param ["shakespeare", "chess", "donald_trump_tweets", "music","javascript", "typescript", "json", "html"]

In [None]:
# how large our input context window is (number of characters per training example)
block_size = 64
# load entire data into memory :)
# The context manager closes the file handle again, and the explicit encoding
# keeps decoding independent of the platform's default locale.
dataset_path = os.path.join('workshop_repo', '1', 'datasets', dataset + '.txt')
with open(dataset_path, 'r', encoding='utf-8') as f:
    text = f.read()
# create an instance of our dataset class
train_dataset = CharDataset(text, block_size)
# Create an instance of the configuration class, defining how large of a model we want to train
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size, n_layer=8, n_head=8, n_embd=512)
# Initialize a model based on the config
model = GPT(mconf)

In [None]:
#@title Training configuration { run: "auto" }
# max_epochs: forwarded to TrainerConfig(max_epochs=...) in the training cell
max_epochs = 1 #@param {type: "integer"}
# batch_size: intended mini-batch size for training
batch_size = 128 #@param {type: "integer"}

In [None]:
# Your task here is to implement, on your own, the multiclass cross-entropy loss that will be used to train your model.


import torch.nn.functional as F

def loss_fn(logits, targets):
  """Multiclass cross entropy, averaged over every predicted position.

  Args:
    logits: float tensor whose last dimension holds the unnormalised scores
      of each class (here: each character of the vocabulary). All leading
      dimensions are flattened into one "example" dimension.
    targets: integer tensor of class indices with one entry per example,
      i.e. with as many elements as ``logits`` has rows after flattening.

  Returns:
    A scalar tensor: the mean negative log-probability assigned to the
    target classes.
  """
  ### reshape logits and targets: one row of scores / one class index per example
  n_classes = logits.size(-1)
  logits = logits.reshape(-1, n_classes)
  targets = targets.reshape(-1)

  ### turn logits into log-probabilities and one-hot encode the targets
  # log_softmax is used instead of softmax(...).log(): the latter underflows to
  # log(0) = -inf for very unlikely classes, and 0 * -inf would poison the sum with nan.
  log_prob = F.log_softmax(logits, dim=1)
  # the number of classes comes from the logits, so any vocabulary size works
  one_hot_targets = F.one_hot(targets, num_classes=n_classes)

  ### multiclass cross entropy: minus the log-probability of the true class, averaged
  loss = - (one_hot_targets * log_prob).sum() / targets.shape[0]

  return loss

In [None]:
# initialize a trainer instance and kick off training
# batch_size comes from the "Training configuration" form instead of a second
# hard-coded copy, so changing the form value actually takes effect.
# NOTE(review): final_tokens assumes two passes over the data while max_epochs
# defaults to 1 -- confirm this learning-rate decay horizon is intended.
tconf = TrainerConfig(max_epochs=max_epochs, batch_size=batch_size, learning_rate=6e-4,
                      lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*block_size,
                      num_workers=4)
# no test dataset is supplied (third argument is None); our own loss_fn is handed to the trainer
trainer = Trainer(model, train_dataset, None, tconf, loss_fn)
trainer.train()

# Generate some text!

In order to generate text we need to set a context. The model will then extend it.

In [None]:
#@title Sampling parameters { run: "auto" }
# These three values are passed unchanged to mingpt.utils.sample(...) in the generation cell.
# temperature: presumably rescales the logits before sampling (higher = more random).
# NOTE(review): the slider allows 0 -- confirm sample() tolerates that value.
temperature = 1 #@param {type: "slider", "min":0, "max": 10, "step": 0.1}
# top_p: NOTE(review): nucleus-sampling thresholds are normally probabilities in [0, 1];
# a value of 1.6 probably keeps every token -- confirm against sample().
top_p = 1.6 #@param {type: "slider", "min":0, "max": 10, "step": 0.1}
# top_k: presumably restricts sampling to the k most likely characters -- confirm against sample().
top_k =  10 #@param {type: "slider", "min":1, "max": 20, "step": 1}

In [None]:
# Interactive generation loop: read a context, let the model extend it, print the result.
# Interrupt the cell (or send EOF) to leave the loop.
while True:
  try:
    context = input('Enter a context:\n>>> ')
  except (EOFError, KeyboardInterrupt):
    # stop cleanly instead of ending the cell with a traceback
    break
  if len(context) == 0:
    continue
  # The look-up table only contains characters that occurred in the training text;
  # anything else would raise a KeyError below, so ask for a different context instead.
  unknown = sorted(set(context) - set(train_dataset.stoi))
  if unknown:
    print('These characters do not occur in the training data, please try again without them: %r' % unknown)
    continue
  # encode the context and add a leading batch dimension of size 1
  x = torch.tensor([train_dataset.stoi[s] for s in context], dtype=torch.long)[None,...].to(trainer.device)
  # ask the model for 1000 further characters and keep the single sequence of the batch
  y = sample(model, x, 1000, temperature=temperature, sample=True, top_k=top_k, top_p=top_p)[0]
  # decode the generated ids back into text
  completion = ''.join([train_dataset.itos[int(i)] for i in y])
  print(completion)