In [1]:
import numpy as np
import scipy
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.style.use('ggplot')
%matplotlib inline
from tqdm import tqdm
from datasets import load_dataset

import logging
import spacy


In [2]:
from scipy.interpolate import interp1d
import pandas as pd
from src.PrepareSentenceContext import PrepareSentenceContext

In [3]:
import spacy
import re


class Sentences(object):
    """
    Minimal container that exposes a collection of texts through ``sents``.

    ``sents`` is a generator over ``texts``: nothing is read from ``texts``
    until iteration starts, and the generator can be consumed only once.
    """

    def __init__(self, texts):
        def _stream(items):
            # Generator function, so ``items`` is only touched on first ``next()``.
            yield from items

        self.sents = _stream(texts)


class PandasParser(object):
    """
    Callable that wraps the values of one dataframe column in a ``Sentences`` object.

    Args:
        text_value: key of the column to read when the parser is called
            (defaults to ``'text'``).
    """

    def __init__(self, text_value='text'):
        # Placeholder attribute; it is not assigned anywhere else in this class.
        self.sents = None
        # Column key looked up on the dataframe in ``__call__``.
        self.text_value = text_value

    def __call__(self, df):
        """Return a ``Sentences`` built from the list of values in ``df[self.text_value]``."""
        column = df[self.text_value]
        return Sentences(list(column))



class PrepareSentenceContext(object):
    """
    Parse text and extract length and context information

    This information is needed for evaluating log-perplexity of the text with respect to a language model
    and later on to test the likelihood that the sentence was sampled from the model with the relevant context.

    Args:
        engine: ``'spacy'`` loads the ``en_core_web_sm`` pipeline; ``'regex'`` instantiates
            ``SentenceParser`` (which must be available in the enclosing namespace).
            Any other value leaves ``self.nlp`` unset.
        context_policy: ``None`` (no context) or ``'previous_sentence'`` (each sentence gets
            the previous non-tag sentence as its context). Other values behave like ``None``.
        context: if not ``None``, this fixed value is used as the context of every sentence
            and ``context_policy`` is ignored.
    """

    def __init__(self, engine='spacy', context_policy=None,
                 context=None):
        if engine == 'spacy':
            self.nlp = spacy.load("en_core_web_sm")
        if engine == 'regex':
            self.nlp = SentenceParser()

        self.context_policy = context_policy
        self.context = context

    def __call__(self, text):
        """Shorthand for :meth:`parse_sentences`."""
        return self.parse_sentences(text)

    def parse_sentences(self, text):
        """
        Split ``text`` into sentences and collect per-sentence information.

        Markup tags such as ``<edit>`` / ``</edit>`` are used to label sentences: a
        sentence containing a tag is not emitted; instead it toggles the current tag
        (the first tag found in it becomes current if none is open, otherwise the
        current tag is cleared). Every other sentence is emitted with the tag that is
        current at that point.

        Args:
            text: the raw text to parse.

        Returns:
            dict of four parallel lists with keys ``'text'`` (sentence strings),
            ``'length'`` (``len()`` of each sentence object; for spaCy spans this is
            the token count), ``'context'`` and ``'tag'`` (current tag or ``None``).
        """
        texts = []
        contexts = []
        lengths = []
        tags = []
        previous = None

        # Put ". " after every tag so the sentence splitter isolates tags in their own sentences.
        text = re.sub(r"(</?[a-zA-Z0-9 ]+>)\s+", r"\1. ", text)
        parsed = self.nlp(text)

        tag = None
        for sent in parsed.sents:
            sent_text = str(sent)
            tag_text = re.findall(r"(</?[a-zA-Z0-9 ]+>)", sent_text)
            if len(tag_text) > 0:
                if tag is None:  # opening tag
                    tag = tag_text[0]
                else:  # closing tag
                    tag = None
                continue  # tag sentences are never emitted

            tags.append(tag)
            lengths.append(len(sent))
            texts.append(sent_text)

            if self.context is not None:
                context = self.context
            elif self.context_policy == 'previous_sentence':
                context = previous
                # Only tracked under this policy; tag sentences never become a context.
                previous = sent_text
            else:
                context = None

            contexts.append(context)
        return {'text': texts, 'length': lengths, 'context': contexts, 'tag': tags}


In [36]:
input_file = "Data/ChatGPT/mix/American Civil War mix.txt"

# Decode explicitly instead of relying on the platform's locale default, so the
# notebook reads the same text on every machine.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open(input_file, 'rt', encoding='utf-8') as f:
    text = f.read()

In [37]:
# Inline sample passage in which some spans are wrapped in <edit> ... </edit> markers.
# Rebinds `text`, replacing the value read from `input_file` when cells run in file order.
text = """
Barclays sponsored the league.\nNike provided a new match ball\u00a0\u2013 the T90 Ascente\u00a0\u2013 for this season.\nNewcastle United were champions, having finished in first place on the league table.\nManchester City were runners-up, followed by Chelsea in third place.\nSunderland and Manchester United were the two relegated teams from the 2008\u201309 Premier League.\n\n\n    \n\n\n13 August 2009:\nArsenal host Portsmouth at Emirates Stadium in their opening game of the 2009\u201310 Premier League season.\n\n\n16 August 2009:\n <edit> Chelsea host Everton at Stamford Bridge in their first game of the 2009\u201310 Premier League season.\n\n </edit>\n <edit> 19 August 2009: </edit>\nTottenham Hotspur host Wigan Athletic at White Hart Lane in their first game of the 2009\u201310 Premier League season.\n\n\n24 August 2009:\nLiverpool host Stoke City at Anfield in their opening game of the 2009\u201310 Premier League season.\n\n\n27 August 2009:\n <edit> Aston Villa host Fulham at Villa Park in their opening game of the 2009-10 Premier League season.\n\n1 September 2009: </edit>\n<edit> Manchester United travel to Swansea City to play in their opening game of the 2009-10 Premier League season.\n\n </edit>\n3 September 2009:\nNewcastle United travel to Arsenal to play in their opening game of the 2009-10 Premier League season.\n\n\n8 September 2009:\nManchester City travel to Sunderland to play in their opening game of the 2009-10 Premier.
"""

In [38]:
# Build the sentence parser with the spaCy engine; context_policy and context keep
# their defaults (None), so no context is attached to the parsed sentences.
parser = PrepareSentenceContext(engine='spacy')

no.  0

Akiem Jamar Hicks (born November 16, 1989) is an American football defensive end for the Detroit Lions of the National Football League (NFL).

--------------------------
no.  1
He played college football at Cleveland State.



--------------------------
no.  2
Hicks was drafted in the fourth round of the 2011 NFL Draft by the Chicago Bears.

--------------------------
no.  3
He has also played for the Houston Texans, New England Patriots, and Detroit Lions.



--------------------------
no.  4
Hicks attended Cleveland State University.

--------------------------
no.  5
He started all 26 games he played in his four-year career, recording 122 tackles, 20 sacks and six forced fumbles.



--------------------------
no.  6
<edit>
--------------------------
no.  7
After his first season at Regina, he was drafted by the Omaha Nighthawks of the United Football League, but again decided to stay in college.   
--------------------------
no.  8
</edit>
<edit> Akiem Jamar Hicks (born Nove

In [24]:
# Print len() of every sentence the parser's pipeline finds in `text`
# (for spaCy sentence spans this is the token count).
# Goes through `parser.nlp` because no visible cell binds a bare `nlp`,
# which would raise NameError on a fresh kernel.
for s in parser.nlp(text).sents:
    print(len(s))


Akiem Jamar Hicks (born November 16, 1989) is an American football defensive end for the Detroit Lions of the National Football League (NFL).

[]
He played college football at Cleveland State.



[]
Hicks was drafted in the fourth round of the 2011 NFL Draft by the Chicago Bears.

[]
He has also played for the Houston Texans, New England Patriots, and Detroit Lions.



[]
Hicks attended Cleveland State University.

[]
He started all 26 games he played in his four-year career, recording 122 tackles, 20 sacks and six forced fumbles.



[]
<edit>
[]
After his first season at Regina, he was drafted by the Omaha Nighthawks of the United Football League, but again decided to stay in college.   
[]
</edit>
<edit> Akiem Jamar Hicks (born November 16, 1989) is an American football defensive end for the Chicago Bears of the National Football League (NFL).
[]
</edit>
On September 5, 2014, Hicks was signed to the Houston Texans' practice squad.

[]
He was promoted to the active roster on October 

In [19]:




# Debug view: print each sentence found in `text` followed by the list of
# <tag> / </tag> markers it contains (same pattern as PrepareSentenceContext).
# `re` is already imported in the cell that defines the parser classes, and the
# pipeline is reached through `parser.nlp` because no visible cell binds a bare `nlp`.
sents = parser.nlp(text).sents
for s in sents:
    print("===============================================")
    print(s)
    tag_text = re.findall(r"(</?[a-zA-Z0-9 ]+>)", str(s))
    print(tag_text)

Manchester City travel to Sunderland to play in their opening game of the 2009-10 Premier.

In [41]:
# NOTE(review): `sent` is not bound at notebook scope by any visible cell (it only
# appears as a loop variable inside PrepareSentenceContext.parse_sentences), and the
# output saved under this cell is a list of tags - confirm what was meant to be shown.
sent



[None,
 None,
 None,
 None,
 None,
 None,
 '<edit>',
 None,
 None,
 '<edit>',
 None,
 None,
 None,
 None,
 None]

In [15]:
# Rebuild the parser so each sentence receives the previous non-tag sentence as its
# context, then parse `text` into the dict of parallel lists ('text', 'length',
# 'context', 'tag') returned by parse_sentences.
parser = PrepareSentenceContext(engine='spacy', context_policy='previous_sentence')
chunks = parser(text)



</edit>
 <edit> 19 August 2009: </edit>
Tottenham Hotspur host Wigan Athletic at White Hart Lane in their first game of the 2009–10 Premier League season.



</edit>
<edit>


In [145]:
# Read the CSV of stored results into `df_null`.
# NOTE(review): presumably the null-hypothesis reference sample, judging by the names - confirm.
null_data_file = "results/gpt2_no_context_wiki_machine.csv"
df_null = pd.read_csv(null_data_file)

# Project-local helper; what it returns is not visible here. A later cell indexes the
# result with an integer and calls the entry with a number.
from test_text_detect import get_pval_func_dict
pval_functions = get_pval_func_dict(df_null)


In [154]:
# Evaluate the function stored under key 5 at the value 7.
# NOTE(review): presumably key = sentence length and argument = response value - confirm
# against test_text_detect.get_pval_func_dict.
pval_functions[5](7)

array(0.04352532)

In [172]:
# Fraction of rows with length == 9 whose `response` exceeds the threshold `x`.
x = 5.403795
responses_len9 = df_null.loc[df_null["length"] == 9, "response"]
print(np.mean(responses_len9 > x))

0.10521849809079338


In [137]:
# Sentences that fall inside a tagged span. parse_sentences returns the tags under
# the key 'tag' (not 'tags'), so the original lookup raised KeyError.
[sentence for sentence, tag in zip(chunks['text'], chunks['tag']) if tag is not None]

['The Civil War is one of the most extensively studied and written about episodes in U.S. history.',
 'The central cause of the war was the dispute over whether slavery would be permitted to expand into the western territories, leading to more slave states, or be prevented from doing so, which was widely believed would place slavery on a course of ultimate extinction.',
 "An initial seven southern slave states responded to Lincoln's victory by seceding from the United States and, in February 1861, forming the Confederacy."]

In [173]:
# NOTE(review): scratch arithmetic on unlabeled literals - the notebook does not say
# what these quantities are; confirm and name them, or drop the cell.
255672 - 5000 - 13000 - 10000

227672

In [174]:
# Halves 227672, the value displayed by the preceding subtraction cell.
# NOTE(review): purpose of the split is not stated - confirm.
227672 / 2

113836.0