In [1]:
# prompt: install datasets
# !pip install datasets

In [2]:
# !pip install transformers

In [3]:
# !pip install torch numpy tqdm openai nltk matplotlib

# Fast Detect GPT

In [4]:

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
import os

def from_pretrained(cls, model_name, kwargs, cache_dir):
    local_path = os.path.join(cache_dir, 'local.' + model_name.replace("/", "_"))
    try:
        obj = cls.from_pretrained(local_path, **kwargs)
    except Exception as ex:
        print(ex)
        obj = cls.from_pretrained(model_name, **kwargs, cache_dir=cache_dir)
        obj.save_pretrained(local_path)
    return obj

# predefined models
model_fullnames = {  'gpt2': 'gpt2',
                     'gpt2-xl': 'gpt2-xl',
                     'opt-2.7b': 'facebook/opt-2.7b',
                     'gpt-neo-2.7B': 'EleutherAI/gpt-neo-2.7B',
                     'gpt-j-6B': 'EleutherAI/gpt-j-6B',
                     'gpt-neox-20b': 'EleutherAI/gpt-neox-20b',
                     'mgpt': 'sberbank-ai/mGPT',
                     'pubmedgpt': 'stanford-crfm/pubmedgpt',
                     'mt5-xl': 'google/mt5-xl',
                     'llama-13b': 'huggyllama/llama-13b',
                     'llama2-13b': 'TheBloke/Llama-2-13B-fp16',
                     'bloom-7b1': 'bigscience/bloom-7b1',
                     'opt-13b': 'facebook/opt-13b',
                     }
float16_models = ['gpt-j-6B', 'gpt-neox-20b', 'llama-13b', 'llama2-13b', 'bloom-7b1', 'opt-13b']

def get_model_fullname(model_name):
    return model_fullnames[model_name] if model_name in model_fullnames else model_name

def load_model(model_name, device, cache_dir):
    model_fullname = get_model_fullname(model_name)
    print(f'Loading model {model_fullname}...')
    model_kwargs = {}
    if model_name in float16_models:
        model_kwargs.update(dict(torch_dtype=torch.float16))
    if 'gpt-j' in model_name:
        model_kwargs.update(dict(revision='float16'))
    model = from_pretrained(AutoModelForCausalLM, model_fullname, model_kwargs, cache_dir)
    print('Moving model to GPU...', end='', flush=True)
    start = time.time()
    model.to(device)
    print(f'DONE ({time.time() - start:.2f}s)')
    return model

def load_tokenizer(model_name, for_dataset, cache_dir):
    model_fullname = get_model_fullname(model_name)
    optional_tok_kwargs = {}
    if "facebook/opt-" in model_fullname:
        print("Using non-fast tokenizer for OPT")
        optional_tok_kwargs['fast'] = False
    if for_dataset in ['pubmed']:
        optional_tok_kwargs['padding_side'] = 'left'
    else:
        optional_tok_kwargs['padding_side'] = 'right'
    base_tokenizer = from_pretrained(AutoTokenizer, model_fullname, optional_tok_kwargs, cache_dir=cache_dir)
    if base_tokenizer.pad_token_id is None:
        base_tokenizer.pad_token_id = base_tokenizer.eos_token_id
        if '13b' in model_fullname:
            base_tokenizer.pad_token_id = 0
    return base_tokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# !pip install scikit-learn

In [6]:

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve, auc

# 15 colorblind-friendly colors
COLORS = ["#0072B2", "#009E73", "#D55E00", "#CC79A7", "#F0E442",
            "#56B4E9", "#E69F00", "#000000", "#0072B2", "#009E73",
            "#D55E00", "#CC79A7", "#F0E442", "#56B4E9", "#E69F00"]


def get_roc_metrics(real_preds, sample_preds):
    fpr, tpr, _ = roc_curve([0] * len(real_preds) + [1] * len(sample_preds), real_preds + sample_preds)
    roc_auc = auc(fpr, tpr)
    return fpr.tolist(), tpr.tolist(), float(roc_auc)


def get_precision_recall_metrics(real_preds, sample_preds):
    precision, recall, _ = precision_recall_curve([0] * len(real_preds) + [1] * len(sample_preds),
                                                  real_preds + sample_preds)
    pr_auc = auc(recall, precision)
    return precision.tolist(), recall.tolist(), float(pr_auc)


In [7]:
import random

import numpy as np
import torch
import torch.nn.functional as F
import tqdm
import argparse
import json

In [8]:

def get_samples(logits, labels):
    assert logits.shape[0] == 1
    assert labels.shape[0] == 1
    nsamples = 10000
    lprobs = torch.log_softmax(logits, dim=-1)
    distrib = torch.distributions.categorical.Categorical(logits=lprobs)
    samples = distrib.sample([nsamples]).permute([1, 2, 0])
    return samples

def get_likelihood(logits, labels):
    assert logits.shape[0] == 1
    assert labels.shape[0] == 1
    labels = labels.unsqueeze(-1) if labels.ndim == logits.ndim - 1 else labels
    lprobs = torch.log_softmax(logits, dim=-1)
    log_likelihood = lprobs.gather(dim=-1, index=labels)
    return log_likelihood.mean(dim=1)

def get_sampling_discrepancy(logits_ref, logits_score, labels):
    assert logits_ref.shape[0] == 1
    assert logits_score.shape[0] == 1
    assert labels.shape[0] == 1
    if logits_ref.size(-1) != logits_score.size(-1):
        # print(f"WARNING: vocabulary size mismatch {logits_ref.size(-1)} vs {logits_score.size(-1)}.")
        vocab_size = min(logits_ref.size(-1), logits_score.size(-1))
        logits_ref = logits_ref[:, :, :vocab_size]
        logits_score = logits_score[:, :, :vocab_size]

    samples = get_samples(logits_ref, labels)
    log_likelihood_x = get_likelihood(logits_score, labels)
    log_likelihood_x_tilde = get_likelihood(logits_score, samples)
    miu_tilde = log_likelihood_x_tilde.mean(dim=-1)
    sigma_tilde = log_likelihood_x_tilde.std(dim=-1)
    discrepancy = (log_likelihood_x.squeeze(-1) - miu_tilde) / sigma_tilde
    return discrepancy.item()

def get_sampling_discrepancy_analytic(logits_ref, logits_score, labels):
    assert logits_ref.shape[0] == 1
    assert logits_score.shape[0] == 1
    assert labels.shape[0] == 1
    if logits_ref.size(-1) != logits_score.size(-1):
        # print(f"WARNING: vocabulary size mismatch {logits_ref.size(-1)} vs {logits_score.size(-1)}.")
        vocab_size = min(logits_ref.size(-1), logits_score.size(-1))
        logits_ref = logits_ref[:, :, :vocab_size]
        logits_score = logits_score[:, :, :vocab_size]

    labels = labels.unsqueeze(-1) if labels.ndim == logits_score.ndim - 1 else labels
    lprobs_score = torch.log_softmax(logits_score, dim=-1)
    probs_ref = torch.softmax(logits_ref, dim=-1)
    log_likelihood = lprobs_score.gather(dim=-1, index=labels).squeeze(-1)
    mean_ref = (probs_ref * lprobs_score).sum(dim=-1)
    var_ref = (probs_ref * torch.square(lprobs_score)).sum(dim=-1) - torch.square(mean_ref)
    discrepancy = (log_likelihood.sum(dim=-1) - mean_ref.sum(dim=-1)) / var_ref.sum(dim=-1).sqrt()
    discrepancy = discrepancy.mean()
    return discrepancy.item()



In [9]:
import random

import numpy as np
import torch
import os
import glob
import argparse
import json
import transformers
import datasets

In [10]:
# !git clone https://github.com/baoguangsheng/fast-detect-gpt.git

In [11]:
# %cd fast-detect-gpt
%cd /home/ziangcao2022/workspace/CS330/STF_CS330_FastGPT/private_support_code/fast-detect-gpt

/mnt/disks/disk/CS330/STF_CS330_FastGPT/private_support_code/fast-detect-gpt


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [12]:
%ls

LICENSE       [0m[01;34mexp_main[0m/         main_ext.sh       supervised.sh
README.md     gpt3to4.sh        requirements.txt  temperature.sh
attack.sh     [01;34mlocal_infer_ref[0m/  [01;34mscripts[0m/          topk.sh
[01;34mexp_gpt3to4[0m/  main.sh           setup.sh          topp.sh


In [13]:
# reference_model_name = "gpt-j-6B"
# scoring_model_name = "gpt-neo-2.7B"

reference_model_name = "gpt2"
scoring_model_name = "gpt2"


dataset = "xsum"
ref_path = "./local_infer_ref"
device = "cpu"
cache_dir = "../cache"

In [14]:
class ProbEstimator:
    def __init__(self):
        self.real_crits = []
        self.fake_crits = []
        for result_file in glob.glob(os.path.join(ref_path, '*.json')):
            with open(result_file, 'r') as fin:
                res = json.load(fin)
                self.real_crits.extend(res['predictions']['real'])
                self.fake_crits.extend(res['predictions']['samples'])
        print(f'ProbEstimator: total {len(self.real_crits) * 2} samples.')


    def crit_to_prob(self, crit):
        offset = np.sort(np.abs(np.array(self.real_crits + self.fake_crits) - crit))[100]
        cnt_real = np.sum((np.array(self.real_crits) > crit - offset) & (np.array(self.real_crits) < crit + offset))
        cnt_fake = np.sum((np.array(self.fake_crits) > crit - offset) & (np.array(self.fake_crits) < crit + offset))
        return cnt_fake / (cnt_real + cnt_fake)

In [15]:



class FastDetectGPT:
    def __init__(self):
        self.device = device
        # load model
        self.scoring_tokenizer = load_tokenizer(scoring_model_name, dataset, cache_dir)
        self.scoring_model = load_model(scoring_model_name, device, cache_dir)
        self.scoring_model.eval()
        self.reference_model_name = reference_model_name
        self.scoring_model_name = scoring_model_name
        if self.reference_model_name != self.scoring_model_name:
            self.reference_tokenizer = load_tokenizer(self.reference_model_name, dataset, cache_dir)
            self.reference_model = load_model(self.reference_model_name, device, cache_dir)
            self.reference_model.eval()
        # evaluate criterion
        self.criterion_name = "sampling_discrepancy_analytic"
        self.criterion_fn = get_sampling_discrepancy_analytic
        self.prob_estimator = ProbEstimator()
        # input text
        print('Local demo for Fast-DetectGPT, where the longer text has more reliable result.')
        print('')

    def infer(self, text):
        # evaluate text     # (1, 112)
        tokenized = self.scoring_tokenizer(text, return_tensors="pt", padding=True, return_token_type_ids=False).to(self.device)
        labels = tokenized.input_ids[:, 1:]
        with torch.no_grad():
            logits_score = self.scoring_model(**tokenized).logits[:, :-1]
            if self.reference_model_name == self.scoring_model_name:
                logits_ref = logits_score
            else:
                tokenized = self.reference_tokenizer(text, return_tensors="pt", padding=True, return_token_type_ids=False).to(self.device)
                assert torch.all(tokenized.input_ids[:, 1:] == labels), "Tokenizer is mismatch."
                logits_ref = self.reference_model(**tokenized).logits[:, :-1]
            crit = self.criterion_fn(logits_ref, logits_score, labels)
        # estimate the probability of machine generated text
        prob = self.prob_estimator.crit_to_prob(crit)
        print(f'Fast-DetectGPT criterion is {crit:.4f}, suggesting that the text has a probability of {prob * 100:.0f}% to be fake.')
        return prob

In [16]:
detector = FastDetectGPT()

Loading model gpt2...
Moving model to GPU...DONE (0.00s)
ProbEstimator: total 1800 samples.
Local demo for Fast-DetectGPT, where the longer text has more reliable result.



In [17]:
from typing import List

def model2hfname(model: str) -> str:
    return {
        "bert-tiny": "prajjwal1/bert-tiny",
        "bert-med": "prajjwal1/bert-medium",
        "small": "gpt2",
        "med": "gpt2-medium",
        "large": "gpt2-large",
        "full": "gpt2-xl",
        "gpt2-sm": "gpt2",
        "gpt2-med": "gpt2-medium",
        "gpt2-lg": "gpt2-large",
        "gpt2": "gpt2-xl",
        "neo": "EleutherAI/gpt-neo-2.7B",
    }[model]

def get_model_and_tokenizer(model: str, Cls = transformers.AutoModelForCausalLM, **model_kwargs):
    hf_model_name = model2hfname(model)

    m = Cls.from_pretrained(hf_model_name, **model_kwargs)
    if isinstance(m, transformers.GPT2LMHeadModel):
        m.transformer.gradient_checkpointing_enable()

    tok = transformers.AutoTokenizer.from_pretrained(hf_model_name)

    if tok.pad_token_id is None:
        if Cls == transformers.AutoModelForCausalLM:
            tok.pad_token = tok.eos_token
        else:
            print("Adding pad token to tokenizer")
            tok.add_special_tokens({"pad_token": "[PAD]"})
            tok.pad_token = "[PAD]"
    return m, tok


def stop_tokens(tokenizer, stop_string: str = ".") -> List[int]:
    tokens = []
    for idx in range(len(tokenizer)):
        if tokenizer.decode(idx) == stop_string:
            tokens.append(idx)
    return tokens

def top_k_logits(logits, k):
    if k == 0:
        return logits
    values, _ = torch.topk(logits, k)
    min_values = values[:, -1]
    return torch.where(logits < min_values, torch.ones_like(logits, dtype=logits.dtype) * -1e10, logits)


In [18]:
max_sample_tokens = 10
model_name = "med"

In [19]:
# import gymnasium as gym
# action_space = gym.spaces.MultiDiscrete([2, 10])
# action_space.sample()

In [20]:
# observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(50257,), dtype=np.float32)
# observation_space.sample(), observation_space.shape

In [21]:
# env.cur_logits

In [22]:
# env.vocab_size

In [23]:
# !which python

In [38]:
class LMEnv:
    def __init__(self, initial_text, sampling_mode: str = "likelihood", topK_logistics: int=10):

        ## Basic Config
        self.max_sample_tokens = max_sample_tokens
        self.model, self.tok = get_model_and_tokenizer(model_name)
        assert isinstance(self.model, transformers.GPT2LMHeadModel)
        self.stop_tokens = stop_tokens(self.tok)
        self._seed = None
        self.vocab_size = len(self.tok)
        # Current inputs and logits
        self.initial_text = initial_text


        self.topK_logistics = topK_logistics

        ## Basic Action Space and Obs Space
        # The first integer can take values 0 or 1 (2 possibilities)
        # The second integer can take values 1 to 10 (10 possibilities)
        import gymnasium as gym
        self.action_space = gym.spaces.MultiDiscrete([2, self.topK_logistics])
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(self.topK_logistics,), dtype=np.float32)


        self.sampling_mode = sampling_mode  # "likelihood" or "argmax"
        self.purturb_mode = "argmax"

        # self.input_ids, self.input_ids_perturb
        self.input_ids = None
        # self.input_ids_perturb = None

        self.reset()

    def _feedforward(self):
        # sampled_tokens = []
        # cum_logits = []
        # n = 0
        # cur_input = self.input_ids
        # past_kvs = None
        # with torch.inference_mode():
        #     while n < max_tokens:
        #         # print(cur_input.shape)
        #         outputs = self.model(cur_input, past_key_values=past_kvs, use_cache=True)
        #         local_logits = outputs.logits[:, -1, :]

        outputs = self.model(self.input_ids, past_key_values=None, use_cache=True)
        local_logits = outputs.logits[:, -1, :]
        return local_logits
    
    def _sample_tokens(self, local_logits):
        sampled_tokens = []
        if self.sampling_mode == "argmax":
            sampled_token = torch.argmax(local_logits, dim=-1)
        elif self.sampling_mode == "likelihood":
            x = F.softmax(local_logits, dim=-1)
            # print(local_logits.shape, x.shape)
            sampled_token = torch.multinomial(F.softmax(local_logits, dim=-1), num_samples=1).squeeze(dim=1)
            # sampled_token = torch.multinomial(x, num_samples=1).squeeze(dim=1)
        else:
            raise NotImplementedError

        sampled_tokens.append(sampled_token[0])

        return sampled_tokens
         
    def _perturb_tokens(self, local_logits, perturb_ranking):
        sampled_tokens = []
        # Get the top k predictions （1-10）
        topk_values, topk_indices = torch.topk(local_logits, perturb_ranking)
        # Select the last item
        sampled_tokens.append(topk_values[0][-1])
        return sampled_tokens

    def _obs_wrapper(self, local_logits):
        # Sorted topk_values
        topk_values, topk_indices = torch.topk(local_logits, self.topK_logistics)
        return topk_values.detach().numpy()

    def _cat_new_word(self, sampled_tokens):
        return torch.cat((self.input_ids, torch.tensor(sampled_tokens).long().unsqueeze(dim=0)), dim=1)


    def reset(self):
        ## initial_text
        initial_text = self.initial_text
        self.input_ids = self.tok(initial_text, return_tensors="pt")["input_ids"]
        ## First 1 step
        local_logits = self._feedforward()
        self.last_logits = local_logits

        sampled_tokens = self._sample_tokens(local_logits)
        # TODO: Update the self.ids based ons the ampled_tokens??
        self.input_ids = self._cat_new_word(sampled_tokens)

        obs = self._obs_wrapper(local_logits)
        return obs
    
    def step(self, action):
        # TODO: def step(self, action):
        reward = 0

        # Parse Action
        ## perturb: Binary variable perturb -- either 1 or 0
        perturb = action[0]
        ## perturb_ranking: 10 options -- shift the choice from 0-9 toward 1-10
        perturb_ranking = action[1] + 1

        sampled_tokens = self._sample_tokens(self.last_logits)
        sampled_output = self._cat_new_word(sampled_tokens)

        if not perturb:
            self.input_ids = sampled_output
        else:
            reward -= 1 # Cost of applying perturb
            perturbed_tokens = self._perturb_tokens(self.last_logits, perturb_ranking)


            perturbed_output = self._cat_new_word(perturbed_tokens)

            # Record Scores -- prob
            sampled_score = detector.infer(self.tok.decode(torch.squeeze(sampled_output, dim=0)))
            perturbed_score = detector.infer(self.tok.decode(torch.squeeze(perturbed_output, dim=0)))

            assert sampled_score>=0
            assert perturbed_score>=0

            reward += (sampled_score-perturbed_score) * 100 # Benefits of applying perturb

            self.input_ids = perturbed_output


        ## GET NEW OBS
        local_logits = self._feedforward()
        self.last_logits = local_logits

        obs = self._obs_wrapper(local_logits)


        info = None


        # TODO: Find an appropriate way to update 'done' -- Somehow use your sample_done(self)
        # if if sampled_token[0].item() in self.stop_tokens:, done=True
        done = False

        return obs, reward, done, info



    def get_text(self):
        return self.tok.decode(torch.squeeze(self.input_ids, dim=0))

    def sample_done(self):
        return self.input_ids[-1] in self.stop_tokens or self.cur_input.shape[1] >= self.max_sample_tokens


In [39]:
env = LMEnv("Here is the start of my new era!")

In [40]:
env.input_ids

tensor([[4342,  318,  262,  923,  286,  616,  649, 6980,    0,  921]])

In [41]:
env.stop_tokens

[13, 764]

In [42]:
for idx in tqdm.tqdm(range(10), desc=f"Sampling..."):
  env.step([0, 0])
  # print(env.input_ids)
  # print(env.cur_logits)
print(env.tok.decode(torch.squeeze(env.input_ids, dim=0)))

Sampling...: 100%|██████████| 10/10 [00:01<00:00,  8.42it/s]

Here is the start of my new era! You

Can even add a split in between each





In [43]:
env.get_text()

'Here is the start of my new era! You\n\nCan even add a split in between each'

In [35]:
detector.infer(env.get_text())

Fast-DetectGPT criterion is 1.2470, suggesting that the text has a probability of 45% to be fake.


0.45

In [44]:
env.reset()
for idx in tqdm.tqdm(range(20), desc=f"Sampling..."):
  if idx % 2:
    env.step(action=[0, int(idx)])
  else:
    env.step(action=[1, 2])
  # print(env.input_ids)
  # print(env.cur_logits)
print(env.tok.decode(torch.squeeze(env.input_ids, dim=0)))

Sampling...:   0%|          | 0/20 [00:00<?, ?it/s]

Fast-DetectGPT criterion is 0.2564, suggesting that the text has a probability of 12% to be fake.





OverflowError: out of range integral type conversion attempted

In [None]:
detector.infer(env.get_text())

Fast-DetectGPT criterion is -4.6827, suggesting that the text has a probability of 0% to be fake.


0.0