In [138]:
import numpy as np
import scipy
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.style.use('ggplot')
%matplotlib inline
from tqdm import tqdm
from datasets import load_dataset

import logging
import spacy


In [139]:
from scipy.interpolate import interp1d
import pandas as pd
from src.PrepareSentenceContext import PrepareSentenceContext

In [140]:
import spacy
import re


class Sentences(object):
    """
    Minimal stand-in for a parsed document.

    Exposes a single attribute, ``sents``, which is a one-shot generator
    over the items of ``texts`` in their original order.
    """

    def __init__(self, texts):
        def _lazy_items(source):
            # Delegation keeps evaluation lazy: `source` is not touched
            # until the generator is first advanced.
            yield from source

        self.sents = _lazy_items(texts)


class PandasParser(object):
    """
    Callable that wraps one column of a dataframe as a ``Sentences`` object.

    Parameters
    ----------
    text_value : str
        Name of the column whose entries are treated as sentences.
    """

    def __init__(self, text_value='text'):
        self.sents = None  # placeholder; not assigned anywhere else in this class
        self.text_value = text_value

    def __call__(self, df):
        # Materialize the selected column so that each entry becomes one "sentence".
        column = df[self.text_value]
        return Sentences(list(column))



class PrepareSentenceContext(object):
    """
    Parse text and extract length and context information

    This information is needed for evaluating log-perplexity of the text with respect to a language model
    and later on to test the likelihood that the sentence was sampled from the model with the relevant context.

    Parameters
    ----------
    engine : str
        'spacy' loads the ``en_core_web_sm`` pipeline; 'pandas' uses
        ``PandasParser`` so that a dataframe can be passed instead of a string.
        Any other value raises ``ValueError``.
    context_policy : str or None
        'previous_sentence' uses the previously kept sentence as the context
        of the current one. ``None`` or any other value yields no context.
    context : object or None
        If not ``None``, this fixed context is attached to every sentence and
        ``context_policy`` is ignored.
    """

    # A markup tag such as ``<edit>`` or ``</edit>``.
    _TAG_RE = re.compile(r"</?[a-zA-Z0-9 ]+>")
    # A tag followed by whitespace; used to turn the tag into its own "sentence".
    _TAG_THEN_SPACE_RE = re.compile(r"(</?[a-zA-Z0-9 ]+>)\s+")

    def __init__(self, engine='spacy', context_policy=None,
                 context=None):
        if engine == 'spacy':
            self.nlp = spacy.load("en_core_web_sm")
        elif engine == 'pandas':
            self.nlp = PandasParser()
        else:
            # Fail at construction instead of with an AttributeError on first use.
            raise ValueError(
                "Unknown engine {!r}; expected 'spacy' or 'pandas'".format(engine))

        self.context_policy = context_policy
        self.context = context

    def __call__(self, text):
        """Shorthand for :meth:`parse_text`."""
        return self.parse_text(text)

    def parse_text(self, text):
        """
        Split ``text`` into sentences and collect per-sentence metadata.

        Parameters
        ----------
        text : str or object accepted by the configured engine
            For the 'pandas' engine this is a dataframe; tag normalization is
            only applied to string input.

        Returns
        -------
        dict
            Parallel lists under the keys 'text', 'length', 'context' and
            'tags'. Sentences that contain a tag are not included; they only
            toggle the tag recorded for the sentences that follow.
        """
        texts = []
        contexts = []
        lengths = []
        tags = []
        previous = None

        # Append ". " after every tag that is followed by whitespace so that the
        # sentence splitter isolates the tag. Non-string input (e.g. a dataframe
        # for the 'pandas' engine) cannot be passed through a regex and is
        # handed to the engine unchanged.
        if isinstance(text, str):
            text = self._TAG_THEN_SPACE_RE.sub(r"\1. ", text)
        parsed = self.nlp(text)

        tag = None
        for sent in parsed.sents:
            sent_text = str(sent)
            found_tags = self._TAG_RE.findall(sent_text)
            if found_tags:
                # First tag seen opens a region; the next tag seen closes it.
                tag = found_tags[0] if tag is None else None
                continue  # tag-bearing sentences are not emitted

            tags.append(tag)
            # len() of a spaCy span counts tokens; for a plain string it counts characters.
            lengths.append(len(sent))
            texts.append(sent_text)

            if self.context is not None:
                context = self.context
            elif self.context_policy == 'previous_sentence':
                context = previous
                previous = sent_text
            else:
                context = None

            contexts.append(context)
        return {'text': texts, 'length': lengths, 'context': contexts, 'tags': tags}


In [141]:
# Document to analyse; the path is relative to the notebook's working directory.
input_file = "Data/ChatGPT/mix/American Civil War mix.txt"

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale encoding. NOTE(review): assumes the file is UTF-8 encoded; confirm.
with open(input_file, 'rt', encoding='utf-8') as f:
    text = f.read()

In [123]:
import re

# Scratch check: does the regex separate a leading tag from the text after it?
# `nlp` is not defined by any other cell, so load the pipeline here to keep the
# cell runnable on a fresh kernel (same model PrepareSentenceContext loads).
nlp = spacy.load("en_core_web_sm")
sent = nlp("</edit>\n It was fought. ")
tag_text = re.findall(r"(</?[a-zA-Z0-9 ]+>)[\s]+(.+)", str(sent))
tag_text

[('</edit>', 'It was fought. ')]

In [142]:
# Sentence-split the loaded document; each kept sentence gets the previously
# kept sentence as its context.
parser = PrepareSentenceContext(
    context_policy='previous_sentence',
    engine='spacy',
)
chunks = parser(text)

In [145]:
from test_text_detect import get_pval_func_dict

# Reference responses read from CSV; the cells below use its `length` and
# `response` columns.
null_data_file = "results/gpt2_no_context_wiki_machine.csv"
df_null = pd.read_csv(null_data_file)

# Indexable collection of callables built from the null data
# (presumably one p-value function per sentence length; confirm in test_text_detect).
pval_functions = get_pval_func_dict(df_null)


In [154]:
# Look up entry 5 of `pval_functions` and call it with 7.
# NOTE(review): presumably the key is a sentence length and the argument a
# response value (log-perplexity); confirm against get_pval_func_dict.
pval_functions[5](7)

array(0.04352532)

In [172]:
# Fraction of rows in df_null with length == 9 whose response exceeds x.
x = 5.403795
len9_responses = df_null.loc[df_null.length == 9, 'response']
print(np.mean(len9_responses > x))

0.10521849809079338


In [137]:
# Sentences that fall inside a tagged region (their recorded tag is not None).
[sentence for sentence, tag in zip(chunks['text'], chunks['tags']) if tag is not None]

['The Civil War is one of the most extensively studied and written about episodes in U.S. history.',
 'The central cause of the war was the dispute over whether slavery would be permitted to expand into the western territories, leading to more slave states, or be prevented from doing so, which was widely believed would place slavery on a course of ultimate extinction.',
 "An initial seven southern slave states responded to Lincoln's victory by seceding from the United States and, in February 1861, forming the Confederacy."]

array([False, False, False, False,  True, False, False, False, False])