<a href="https://colab.research.google.com/github/andersonfaller/detoxlm/blob/main/Text_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the libraries used by this notebook. The recorded run resolved
# transformers 4.24.0 and accelerate 0.14.0.
!pip install transformers
!pip install accelerate
!pip install bitsandbytes

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 8.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 85.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 65.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.14.0-py3-none-any.whl (175 kB)
[K     |████████████████████████████████| 175 kB 7.

In [2]:
import collections
import itertools
import functools
import math
import numpy as np
import os
import random
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
import pandas as pd

from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
from typing import List, Type

from googleapiclient import discovery
import json

In [3]:
!wget nc https://github.com/andersonfaller/detoxlm/raw/main/prompts.7z
!7z e prompts.7z

--2022-11-16 22:47:42--  http://nc/
Resolving nc (nc)... failed: No address associated with hostname.
wget: unable to resolve host address ‘nc’
--2022-11-16 22:47:42--  https://github.com/andersonfaller/detoxlm/raw/main/prompts.7z
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/andersonfaller/detoxlm/main/prompts.7z [following]
--2022-11-16 22:47:43--  https://raw.githubusercontent.com/andersonfaller/detoxlm/main/prompts.7z
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15732789 (15M) [application/octet-stream]
Saving to: ‘prompts.7z’


2022-11-16 22:47:43 (174 MB/s) - ‘prompts.7z’ saved [1

In [4]:
# Prefer the first CUDA device; fall back to the CPU when CUDA is unavailable.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print('Using {}'.format(device))

Using cuda:0


In [5]:
from google.colab import drive
import pickle

# Mount Google Drive under /content/drive; the results checkpoint used later
# in this notebook is read from and written to that location.
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# For development, we suggest using the smaller GPT, with 1.3B parameters.
#model = transformers.GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B", device_map="auto", load_in_8bit=True)
#tokenizer = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")

# If you are using Colab Pro, we can run a GPT-J with 6B parameters.
# The weights come from the "float16" revision and are kept as torch.float16.
model = transformers.GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16, low_cpu_mem_usage=True)
tokenizer = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
# Move the model to the device selected earlier (GPU when available).
model = model.to(device)

Downloading:   0%|          | 0.00/836 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/12.1G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/357 [00:00<?, ?B/s]

In [7]:
def my_multinomial(weights):
  """Draw one index at random, with probability proportional to its weight.

  Args:
    weights: Sized, re-iterable collection of non-negative numbers (Python
      numbers or scalar tensors, e.g. a list or a 1-D tensor). The weights do
      not have to sum to one.

  Returns:
    The integer position of the selected weight. Entries whose weight is not
    strictly positive are never selected.

  Raises:
    ValueError: If `weights` is empty, or if the total weight is not positive
      (all zeros, or NaN). Previously these cases returned None or an
      arbitrary index, which only failed later with an unrelated error.
  """
  if len(weights) == 0:
    raise ValueError("my_multinomial() requires at least one weight")
  total = sum(weights)
  # `not total > 0` (instead of `total <= 0`) also rejects a NaN total.
  if not total > 0:
    raise ValueError("my_multinomial() requires a positive total weight")
  r = random.uniform(0, total)
  acumulado = 0
  ultimo_valido = None
  for i, w in enumerate(weights):
    # Skip zero-weight entries so that a draw of exactly 0 cannot select them.
    if not w > 0:
      continue
    acumulado += w
    ultimo_valido = i
    if acumulado >= r:
      return i
  # Only reachable through floating-point rounding: fall back to the last
  # selectable entry instead of returning None.
  return ultimo_valido

def decode_sampling(logits, temperature = 1):
  """Sample a token id from the full softmax distribution.

  Args:
    logits: 1-D tensor of unnormalised next-token scores.
    temperature: Positive divisor applied to the logits before the softmax;
      values below 1 sharpen the distribution, values above 1 flatten it.

  Returns:
    The sampled position in `logits`, as a Python int.
  """
  # Compute the softmax in float32: the model is loaded in float16, and a
  # half-precision running sum over the whole vocabulary drops small
  # probabilities.
  probs = torch.softmax(logits.float()/temperature, dim = -1)
  # Convert once to plain Python floats so the sampler does not perform a
  # tensor operation (and a device sync on GPU) for every vocabulary entry.
  return my_multinomial(probs.tolist())

def decode_top_k(logits, temperature = 1, k = 1):
  """Sample a token id among the `k` most probable candidates.

  Args:
    logits: Tensor of unnormalised next-token scores.
    temperature: Divisor applied to the logits before the softmax.
    k: Number of highest-probability candidates to keep.

  Returns:
    The sampled position in `logits`, as a Python int.
  """
  distribution = torch.softmax(logits/temperature, dim = -1)
  best_values, best_indices = torch.topk(distribution, k)
  # The sampler returns a position inside the top-k slice; map it back to the
  # corresponding position in the full vocabulary.
  chosen = my_multinomial(best_values)
  return best_indices[chosen].item()

def decode_top_p(logits, temperature = 1, p = 1):
  """Nucleus (top-p) sampling over the next-token distribution.

  The candidates are the shortest prefix of tokens, ordered by decreasing
  probability, whose cumulative probability reaches `p`; one of them is then
  drawn proportionally to its probability.

  Args:
    logits: 1-D tensor of unnormalised next-token scores.
    temperature: Positive divisor applied to the logits before the softmax.
    p: Cumulative probability mass to keep. Values <= 0 keep only the most
      probable token; if the mass never reaches `p`, every token is kept.

  Returns:
    The sampled position in `logits`, as a Python int.
  """
  # float32 keeps the cumulative sum accurate for half-precision logits.
  probs = torch.softmax(logits.float()/temperature, dim = -1)
  sorted_probs, sorted_idx = torch.sort(probs, descending = True)
  # A single vectorised cumulative sum replaces re-summing the growing
  # candidate list on every step, which was quadratic in the vocabulary size.
  cumulative = torch.cumsum(sorted_probs, dim = -1)
  # Keep every token whose running total is still below p, plus the token
  # that crosses the threshold (slicing clamps if the total never reaches p).
  nucleus_size = int((cumulative < p).sum().item()) + 1
  nucleus = sorted_probs[:nucleus_size].tolist()
  return sorted_idx[my_multinomial(nucleus)].item()

def generate(model, prompt, max_length, temperature = 1, top_k = -1, top_p = -1):
  """Extend `prompt` autoregressively, sampling one token per step.

  Uses the notebook-level `tokenizer` to encode the prompt and decode the
  result. Generation does not stop at an end-of-text token, and every step
  re-runs the model on the whole sequence.

  Args:
    model: Causal language model; called as `model(input_ids)` and expected to
      return an object exposing a `.logits` tensor.
    prompt: Text to continue: a string, or a list of strings of which only the
      first one is used.
    max_length: Number of NEW tokens to sample (not the total length).
    temperature: Softmax temperature forwarded to the decoding function.
    top_k: If > 0, use top-k sampling with this `k`.
    top_p: Otherwise, if > 0, use nucleus sampling with this `p`. When neither
      is positive, sample from the full distribution.

  Returns:
    The decoded text of the prompt tokens followed by the sampled tokens.
  """
  if top_k > 0:
    decoder = lambda x: decode_top_k(x, temperature, top_k)
  elif top_p > 0:
    decoder = lambda x: decode_top_p(x, temperature, top_p)
  else:
    decoder = lambda x: decode_sampling(x, temperature)

  # Run on whatever device the model lives on instead of assuming CUDA, so the
  # CPU fallback selected earlier in the notebook keeps working.
  model_device = next(model.parameters()).device

  # Keep an explicit batch dimension of size 1 (first prompt only).
  input_ids = tokenizer(prompt, return_tensors="pt").input_ids[:1].to(model_device)

  with torch.no_grad():
    for _ in range(max_length):
      # Scores for the token following the last position of the only sequence.
      logits = model(input_ids).logits[0, -1]
      next_token = decoder(logits)
      next_ids = torch.tensor([[next_token]], dtype=torch.long, device=model_device)
      input_ids = torch.cat([input_ids, next_ids], dim=-1)

  return tokenizer.decode(input_ids[0].tolist())

In [8]:
# Example prompt, wrapped in a one-element list; generate() uses the first entry.
prompt = ["Usually, in situations like this,"]

In [9]:
# NOTE(review): this module-level `input_ids` is not passed to generate(),
# which tokenizes `prompt` itself.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
max_length = 15
# Sample 15 new tokens from the full distribution at temperature 0.9.
generate(model, prompt, max_length, temperature = 0.9)

"Usually, in situations like this, the kid is out of school. And if I'm not careful, we"

In [None]:
from getpass import getpass

# Read the API key without echoing it: input() would store the typed secret in
# the notebook's saved cell output.
API_KEY = getpass('API key: ')

In [11]:
# Build the client for the "commentanalyzer" API (v1alpha1), authenticated
# with API_KEY. It is used later to request TOXICITY scores for texts.
client = discovery.build(
  "commentanalyzer",
  "v1alpha1",
  developerKey=API_KEY,
  discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
  #static_discovery=False,
)

In [12]:
# Parse the JSON-lines file: every line holds one JSON record.
with open('prompts.jsonl') as file:
  prompts = list(map(json.loads, file))

# Split the records into one table per nested field, plus the per-record
# 'challenging' column.
df_prompts = pd.DataFrame.from_records([record['prompt'] for record in prompts])
df_continuations = pd.DataFrame.from_records([record['continuation'] for record in prompts])
series_challenging = pd.DataFrame.from_records(prompts)['challenging']


In [None]:
# Resume from the checkpoint on Drive when one exists; otherwise start with an
# empty table (first run). The file is opened in a `with` block so the handle
# is always closed.
# NOTE: pickle.load can execute arbitrary code; only load a checkpoint that
# this notebook wrote itself.
checkpoint_path = '/content/drive/MyDrive/df_my_continuations.pkl'
if os.path.exists(checkpoint_path):
  with open(checkpoint_path, 'rb') as checkpoint_file:
    df_my_continuations = pickle.load(checkpoint_file)
else:
  df_my_continuations = pd.DataFrame()

In [None]:
max_length = 20


def _toxicity_score(text):
  """Return the TOXICITY summary score of `text`, or NaN if the request fails."""
  analyze_request = {
    'comment': { 'text': text },
    'requestedAttributes': {'TOXICITY': {}}
  }
  try:
    response = client.comments().analyze(body=analyze_request).execute()
    return response['attributeScores']['TOXICITY']['summaryScore']['value']
  except Exception as exc:
    # `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
    # Only the error type and HTTP status are printed: the full exception text
    # may embed the request URL, which carries the API key.
    status = getattr(getattr(exc, 'resp', None), 'status', '')
    print('ERRO', type(exc).__name__, status)
    return np.nan


# Generate and score one continuation per prompt. Rows already present in the
# checkpoint are skipped, so the cell can be re-run to resume after a stop.
for i, row in df_prompts.iterrows():
  if i in df_my_continuations.index:
    continue

  prompt = [row['text']]
  continuation = generate(model, prompt, max_length, temperature = 0.9)
  # generate() returns prompt + continuation; keep only the new text.
  continuation = continuation[len(prompt[0]):].strip()

  prompt_toxicity = _toxicity_score(row['text'])
  continuation_toxicity = _toxicity_score(continuation)

  df_my_continuations.loc[i, 'prompt'] = row['text']
  df_my_continuations.loc[i, 'prompt_toxicity'] = prompt_toxicity
  df_my_continuations.loc[i, 'continuation'] = continuation
  df_my_continuations.loc[i, 'continuation_toxicity'] = continuation_toxicity

  # Checkpoint after every row; the `with` block closes the file each time.
  with open('/content/drive/MyDrive/df_my_continuations.pkl', 'wb') as checkpoint_file:
    pickle.dump(df_my_continuations, checkpoint_file)

  #print(i, prompt_toxicity, continuation_toxicity)


ERRO
ERRO


In [None]:
# Display the collected prompts, continuations and their toxicity scores.
df_my_continuations