<a href="https://colab.research.google.com/github/tahaShm/knowledge-distillation/blob/transfer-run/transferset_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 4.7 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 65.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 64.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.14.0-py3-none-any.whl (175 kB)
[K     |████████████████████████████████

In [None]:
from transformers import pipeline, set_seed

# The original passed `model=xxx`, an undefined name, so this cell raised
# NameError and `generator` was never created for the cells below.
# NOTE(review): 'gpt2' is assumed to be the intended checkpoint (the rest of
# the notebook works with GPT-2 models) - confirm.
generator = pipeline('text-generation', model='gpt2')

# Seed the RNGs so the sampled continuations are reproducible.
set_seed(42)

# Smoke test: five unconditional samples of at most 30 tokens each.
generator("", max_length=30, num_return_sequences=5)

NameError: ignored

In [None]:
# Keep the 500 sampled continuations in a variable so later cells can use
# them; previously the result was only echoed as cell output and then lost.
transfer_samples = generator("Hello, I'm a language model,", max_length=30, num_return_sequences=500)

# Show a short preview instead of dumping all 500 entries into the output.
transfer_samples[:5]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Hello, I\'m a language model, and the problem we\'re going to solve is the concept of the "logical relationship," so we can try'},
 {'generated_text': 'Hello, I\'m a language model, and I can be mean like this: "oh my god I am very bad", because I\'m talking about'},
 {'generated_text': "Hello, I'm a language model, but, this is a world I live in.\n\nI see that my brain doesn't seem to take"},
 {'generated_text': 'Hello, I\'m a language model, I\'m not going to talk about other languages, I\'m going to talk about the actual programming language."\n'},
 {'generated_text': "Hello, I'm a language model, a language model which is not just a binary model, but a model of language data, which is a model"},
 {'generated_text': "Hello, I'm a language model, and I'm a master's in that. So I need to understand what I'm doing… what it feels"},
 {'generated_text': "Hello, I'm a language model, and it's just about what I thought I was having fun learning there. I've made it so e

In [None]:
import csv
import os
import argparse
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_cosine_with_hard_restarts_schedule_with_warmup
import time
import warnings
warnings.filterwarnings('ignore')

In [None]:
class MyDataset(Dataset):
    """In-memory dataset of ``"<label>: <text> <|endoftext|> "`` strings.

    The source is a tab-separated file whose first column is the label and
    whose second column is the text. Every non-blank row becomes one item.
    """

    def __init__(self, data_file_name, data_dir='./'):
        """Read the TSV file and build the list of training strings.

        Parameters:
            data_file_name: Name (or path) of the tab-separated data file.
            data_dir: Directory that ``data_file_name`` is resolved against.
                An absolute ``data_file_name`` is used unchanged.
        Raises:
            ValueError: If a non-blank row has fewer than two columns.
        """
        super().__init__()

        # `data_dir` used to be accepted but silently ignored; honour it now.
        # With the default './' a relative file name resolves exactly as before.
        data_path = os.path.join(data_dir, data_file_name)

        self.data_list = []
        self.end_of_text_token = " <|endoftext|> "

        # The csv module asks for newline='' so that newlines embedded in
        # quoted fields are handled by the reader rather than by open().
        with open(data_path, newline='', encoding='utf-8') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter='\t')

            for line_no, row in enumerate(csv_reader, start=1):
                if not row:
                    # Blank lines (e.g. a trailing newline) carry no sample.
                    continue
                if len(row) < 2:
                    raise ValueError(
                        f"{data_path}: row {line_no} has {len(row)} column(s); "
                        "expected '<label>\\t<text>'"
                    )
                self.data_list.append(f"{row[0]}: {row[1]}{self.end_of_text_token}")

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.data_list)

    def __getitem__(self, item):
        """Return the formatted training string at index ``item``."""
        return self.data_list[item]

In [None]:
def get_data_loader(data_file_name):
    """Wrap the TSV file in a shuffling ``DataLoader`` that yields one sample per batch.

    Parameters:
        data_file_name: Path of the tab-separated file handed to ``MyDataset``.
    Returns:
        A ``DataLoader`` over ``MyDataset(data_file_name)`` with ``batch_size=1``
        and shuffling enabled.
    """
    return DataLoader(MyDataset(data_file_name), shuffle=True, batch_size=1)

In [None]:
def train(epochs, data_loader, batch_size, tokenizer, model, device,
          optimizer=None, scheduler=None, max_seq_len=None):
    """Fine-tune ``model`` with gradient accumulation over single-sample batches.

    The loader is expected to yield one text per iteration (``txt[0]`` is the
    string). Gradients of ``batch_size`` consecutive samples are accumulated
    before each optimizer step; the per-sample losses are summed, not averaged.

    Parameters:
        epochs: Number of passes over ``data_loader``.
        data_loader: Iterable yielding a one-element batch of text per step.
        batch_size: Number of samples accumulated per optimizer step (>= 1).
        tokenizer: Object with ``encode(str) -> list[int]``.
        model: Callable as ``model(ids, labels=ids)`` whose first output is the loss.
        device: Device the input ids are moved to.
        optimizer: Optimizer to step. When omitted, a module-level variable named
            ``optimizer`` is used, which is how this function worked originally.
        scheduler: Optional LR scheduler stepped with the optimizer. When omitted,
            a module-level variable named ``scheduler`` is used if it exists.
        max_seq_len: Optional cap on tokens per sample; ``None`` disables truncation.
    Returns:
        The same ``model`` object after training.
    Raises:
        ValueError: If no optimizer is available or ``batch_size`` is below 1.
    """
    # Backward compatibility: the original body read `optimizer` and `scheduler`
    # straight from notebook globals. Explicit arguments take precedence.
    if optimizer is None:
        optimizer = globals().get('optimizer')
    if scheduler is None:
        scheduler = globals().get('scheduler')
    if optimizer is None:
        raise ValueError("train() needs an optimizer: pass optimizer=... or define a global `optimizer`")
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    batch_counter = 0
    sum_loss = 0.0

    def apply_accumulated_gradients():
        """Take one optimizer/scheduler step and report the loss every 10 steps."""
        nonlocal batch_counter, sum_loss
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        optimizer.zero_grad()
        model.zero_grad()

        batch_counter += 1
        if batch_counter == 10:
            print(f"Total Loss is {sum_loss}") #printed after every 10*batch_size
            batch_counter = 0
            sum_loss = 0.0

    for epoch in range(epochs):
        print (f'Running {epoch+1} epoch')

        # Samples whose gradients are accumulated but not yet applied.
        pending = 0

        for txt in data_loader:
            token_ids = tokenizer.encode(txt[0])
            if max_seq_len is not None:
                token_ids = token_ids[:max_seq_len]
            input_ids = torch.tensor(token_ids).unsqueeze(0).to(device)

            # Language-model objective: the inputs double as the labels.
            loss = model(input_ids, labels=input_ids)[0]
            loss.backward()
            # .item() keeps the running total a plain Python float.
            sum_loss += loss.item()
            pending += 1

            # The original test `idx % batch_size == 0` fired on the very first
            # sample; step only once a full batch has been accumulated.
            if pending == batch_size:
                apply_accumulated_gradients()
                pending = 0

        # Apply the final partial batch instead of leaving its gradients behind.
        if pending:
            apply_accumulated_gradients()

    return model

In [None]:
def save_model(model, name):
    """
    Summary:
        Write the model's state dict to disk.
    Parameters:
        model: Trained model object; its ``state_dict()`` is what gets saved
        name: Destination of the saved file, formatted into a string first
    """
    print ("Saving model to Disk")
    weights = model.state_dict()
    destination = f"{name}"
    torch.save(weights, destination)

def load_models():
    """
    Summary:
        Fetch the pre-trained 'gpt2-medium' tokenizer and LM-head model.
    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    print ('Loading/Downloading GPT-2 Model')
    checkpoint = 'gpt2-medium'
    # Tuple elements are evaluated left to right: tokenizer first, then model.
    return (
        GPT2Tokenizer.from_pretrained(checkpoint),
        GPT2LMHeadModel.from_pretrained(checkpoint),
    )

In [None]:

# Training configuration. These constants stand in for the command-line flags
# of the script version; each comment is the help text of the matching flag.
EPOCHS = 1                  # Number of epochs to run
WARMUP_STEPS = 300          # Number of warmup steps to run
MODEL_NAME = 'mymodel.pt'   # Name of the model file
DATA_FILE = 'mydata.tsv'    # Name of the data file
BATCH_SIZE = 32             # Batch size
LEARNING_RATE = 3e-5        # Learning rate for the model
MAX_SEQ_LEN = 200           # Maximum length of sequence

In [None]:
# Load the pre-trained tokenizer/model and wrap the training file in a loader.
TOKENIZER, MODEL = load_models()
LOADER = get_data_loader(DATA_FILE)

# Use the GPU when torch can see one, otherwise stay on the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Move the model to the selected device; `model` is the handle used for training.
model = MODEL.to(DEVICE)

Loading/Downloading GPT-2 Model


Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

In [None]:
# Display which device ('cpu' or 'cuda') was selected for training.
DEVICE

'cuda'

In [None]:
# Switch the model to training mode before fine-tuning.
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# One optimizer/scheduler step is taken per BATCH_SIZE samples, and LOADER
# yields one sample per iteration, so this is the number of steps in the run.
NUM_TRAINING_STEPS = EPOCHS * ((len(LOADER) + BATCH_SIZE - 1) // BATCH_SIZE)

# The original passed num_training_steps=-1. The schedule then treats the
# post-warmup span as a single step, so the learning rate falls to 0 right
# after warmup. Passing the real step count gives the intended cosine decay.
# NOTE(review): if NUM_TRAINING_STEPS <= WARMUP_STEPS the run never leaves the
# warmup ramp - consider lowering WARMUP_STEPS for small datasets.
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=NUM_TRAINING_STEPS,
)

In [None]:
# Fine-tune the model, then persist the resulting weights under MODEL_NAME.
model = train(
    epochs=EPOCHS,
    data_loader=LOADER,
    batch_size=BATCH_SIZE,
    tokenizer=TOKENIZER,
    model=MODEL,
    device=DEVICE,
)
save_model(model, MODEL_NAME)

Running 1 epoch
Total Loss is 1801.5150146484375
Total Loss is 1941.7943115234375
Total Loss is 1821.6468505859375
Total Loss is 1700.4072265625
Total Loss is 1544.7379150390625
Total Loss is 1466.67724609375
Total Loss is 1388.838134765625
Total Loss is 1343.998291015625
Total Loss is 1296.03857421875
Total Loss is 1264.1611328125
Total Loss is 1288.0946044921875
Total Loss is 1267.6551513671875
Total Loss is 1255.740478515625
Total Loss is 1241.79248046875
Total Loss is 1264.81298828125
Total Loss is 1239.6658935546875
Total Loss is 1230.7244873046875
Saving model to Disk


## Generation

In [None]:
def choose_from_top_k_top_n(probs, k=50, p=0.8):
    """Sample one token id using combined top-k and nucleus (top-p) filtering.

    The ``k`` most probable tokens are ranked by probability. The shortest
    prefix of that ranking whose cumulative probability reaches ``p`` is kept
    (all ``k`` when the threshold is never reached), renormalised, and one
    token is drawn from it with ``np.random.choice``.

    Parameters:
        probs: 1-D array of token probabilities, indexed by token id.
        k: Maximum number of candidates; clamped to ``len(probs)``.
        p: Cumulative-probability threshold for the nucleus cut-off.
    Returns:
        The sampled token id as a Python ``int``.
    Raises:
        ValueError: If ``probs`` is not a non-empty 1-D array or ``k < 1``.
    """
    probs = np.asarray(probs)
    if probs.ndim != 1 or probs.size == 0:
        raise ValueError("probs must be a non-empty 1-D array")
    if k < 1:
        raise ValueError("k must be >= 1")

    # argpartition raises when k exceeds the array length, so clamp it first.
    k = min(int(k), probs.size)

    # Select the k largest entries, then order them by descending probability.
    # The stable sort keeps ties in the order argpartition returned them.
    top_ids = np.argpartition(probs, -k)[-k:]
    top_probs = probs[top_ids]
    ranking = np.argsort(-top_probs, kind='stable')
    top_ids = top_ids[ranking]
    top_probs = top_probs[ranking].astype(np.float64)

    # Index of the first candidate at which the running total reaches p.
    cumulative = np.cumsum(top_probs)
    keep = min(int(np.searchsorted(cumulative, p, side='left')) + 1, k)

    candidate_ids = top_ids[:keep]
    weights = top_probs[:keep] / top_probs[:keep].sum()

    # Index the 1-element result explicitly: calling int() directly on an
    # array is deprecated in recent NumPy releases.
    return int(np.random.choice(candidate_ids, 1, p=weights)[0])

In [None]:
def generate(tokenizer, model, sentences, label, max_new_tokens=100):
    """Sample ``sentences`` continuations of ``label``, print them and return them.

    Each continuation starts from the encoded ``label`` and grows one token at a
    time, sampled with ``choose_from_top_k_top_n``, until an end-of-text token is
    produced or ``max_new_tokens`` tokens have been added.

    Parameters:
        tokenizer: Object with ``encode(str) -> list[int]`` and ``decode(list[int]) -> str``.
        model: Callable as ``model(ids)`` whose first output is the logits,
            indexed here as ``[batch, position, vocab]``.
        sentences: Number of continuations to generate.
        label: Prompt text every continuation starts from.
        max_new_tokens: Upper bound on sampled tokens per continuation
            (the previously hard-coded 100).
    Returns:
        List of the decoded texts, in generation order. Earlier versions only
        printed them; each text is still printed as before.
    """
    # Loop-invariant work: the EOS ids and the prompt encoding never change.
    eos_ids = set(tokenizer.encode('<|endoftext|>'))
    prompt_ids = tokenizer.encode(label)

    # Keep the input ids on the same device as the model's weights; fall back
    # to the CPU for model objects that do not expose parameters().
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration, TypeError):
        device = torch.device('cpu')

    generated_texts = []
    with torch.no_grad():
        for _ in range(sentences):
            cur_ids = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)

            for _ in range(max_new_tokens):
                # No labels are passed: the loss was computed and then discarded.
                logits = model(cur_ids)[0]

                # Distribution over the vocabulary for the next position only.
                softmax_logits = torch.softmax(logits[0, -1].float(), dim=0)

                next_token_id = choose_from_top_k_top_n(softmax_logits.cpu().numpy()) #top-k-top-n sampling
                next_token = torch.tensor([[next_token_id]], dtype=torch.long, device=device)
                cur_ids = torch.cat([cur_ids, next_token], dim=1)

                if next_token_id in eos_ids:
                    break

            # Output is identical whether or not the sequence hit end-of-text.
            output_text = tokenizer.decode(cur_ids[0].tolist())
            print (output_text)
            generated_texts.append(output_text)

    return generated_texts

In [None]:
def load_models(model_name=None):
    """
    Summary:
        Loading the 'gpt2-medium' tokenizer and model, optionally restoring
        fine-tuned weights saved with ``torch.save(model.state_dict(), ...)``.
    Parameters:
        model_name: Path of the saved state dict. When omitted, the plain
            pre-trained model is returned, so the zero-argument call used in
            the training section keeps working after this definition replaces
            the earlier one of the same name.
    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    if model_name is None:
        print ('Loading/Downloading GPT-2 Model')
    else:
        print ('Loading Trained GPT-2 Model')

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
    model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

    if model_name is not None:
        # map_location='cpu' lets a checkpoint saved from a GPU run be restored
        # on a machine without CUDA.
        # NOTE: torch.load unpickles the file - only load checkpoints you trust.
        state_dict = torch.load(model_name, map_location='cpu')
        model.load_state_dict(state_dict)

    return tokenizer, model

In [None]:

# Generation run configuration. These constants stand in for the command-line
# flags of the script version:
#   MODEL_NAME - Name of the model file
#   SENTENCES  - Number of sentences in outputs
#   LABEL      - Label for which to produce text

# The timer covers both model loading and generation.
start = time.time()

MODEL_NAME = 'mymodel.pt'
SENTENCES = 100
LABEL = 'store'

TOKENIZER, MODEL = load_models(MODEL_NAME)
generate(TOKENIZER, MODEL, SENTENCES, LABEL)

end = time.time()
print(end - start)

Loading Trained GPT-2 Model
store:true},

{

{ " id " : " 1 ",

" email " : " lisa@dw.com ",

" phone " : " +62 823452284 ",

" call xtd " : " 0861 003 4555 ",

" message " : " Ok ok ok ok ok?",

" password " : "???? ",

" account " :
store:1.00p.<|endoftext|>
store:  <|endoftext|>
store:<|endoftext|>
store:1","display_in_home":"0","title":"$250.00 for 1 month. Now $400.00 2 years. Unlimited phone calls, texts and data. This is a special Offer. Promo code BANQUET50. For details call 0300661250. 2 month minimum spend 8.50 per line 2 months min purchase £20.00 plus charge 2p per min use 20GB minimum. ","excludes_uplink":"No offers
store: http://www.thegoldrushstore.com/storeid/2009575669901.aspx <|endoftext|>
store: _______]/ <|endoftext|>
store: A unique chance to win £20,000 in 2 years, £500 vouchers, free online and 4 years £100 travel rewards - so come on out! - Book Online <|endoftext|>
store: $1.99/month for months 7 - 9, $2.99/month for months 10 - 12. Standard All Access Digital 

KeyboardInterrupt: ignored