# Imports and Data Loading

In [1]:
import pandas as pd
import requests
import os

In [2]:
# Root folder of the claim-identification corpus. Set the CLAIM_CORPUS_DIR
# environment variable to point the notebook at another copy of the data;
# when it is unset, the original hard-coded location is used.
DATADIR = os.environ.get("CLAIM_CORPUS_DIR", r"D:\data\claim_identification_corpus")

In [3]:
# Essay-level spreadsheet; later cells read its 'Essay' and 'Holistic Score' columns.
essays = pd.read_excel(os.path.join(DATADIR, "essay_scores.xlsx"))

# Extract Essays

In [15]:
def get_essay(filename):
    """Rebuild the plain text of one essay from its annotation CSV.

    Reads ``<DATADIR>/Data/<filename>.csv`` and walks the paragraph labels
    ``Paragraph_1``, ``Paragraph_2``, ... in order, stopping at the first
    label that has no rows.

    Args:
        filename: Name of the CSV file without the ``.csv`` extension.

    Returns:
        The essay as a single string: the ``discourse_text`` segments of a
        paragraph are joined with a space, and paragraphs are joined with a
        newline. An empty string if no ``Paragraph_1`` rows exist.
    """
    df = pd.read_csv(os.path.join(DATADIR, "Data", filename + ".csv"))

    paragraphs = []

    # Loop until a paragraph label is missing instead of bounding the counter
    # by the row count: with one row per paragraph, `ind_para < len(df)` would
    # stop one short and silently drop the final paragraph. The loop always
    # terminates because each pass needs at least one row with a new label.
    ind_para = 1
    while True:
        this_para = df['discourse_part'] == "Paragraph_" + str(ind_para)
        if not this_para.any():
            break
        paragraphs.append(' '.join(df.loc[this_para, 'discourse_text']))
        ind_para += 1

    return '\n'.join(paragraphs)

In [16]:
# Rebuild each essay's text from the CSV named after its 'Essay' value.
essays['Text'] = essays['Essay'].apply(get_essay)

In [17]:
# Length, in characters, of the longest reconstructed essay.
max(essays['Text'].apply(len))

4273

# Extract Essays by Annotation

In [17]:
# Collect one annotation table per essay, tagging every row with the essay id
# and the essay's holistic score. An essay whose CSV is missing is reported
# and skipped rather than aborting the whole pass.
all_essays = []

for _, record in essays.iterrows():
    essay_id = record['Essay']

    try:
        segments = pd.read_csv(f"{DATADIR}/Data/{essay_id}.csv")
    except FileNotFoundError as e:
        print(e)
        continue

    all_essays.append(
        segments.assign(essay=essay_id, score=record['Holistic Score'])
    )

In [19]:
# Stack the per-essay annotation tables into one frame with a fresh 0..n-1 index.
annot = pd.concat(all_essays, ignore_index=True)
annot

Unnamed: 0,discourse_id,discourse_part,discourse_boundary,discourse_text,adjudicated_discourse_type,adjudicated_effectiveness,adjudicated_hierarchical,adjudicated_parallel,essay,score
0,"1_MSU_essay|Paragraph_1|Nonannotated|0,20",Paragraph_1,"(0, 20)",Heroes or Celebrities,Nonannotated,-,-,-,1_MSU_essay,5.0
1,"1_MSU_essay|Paragraph_2|Nonannotated|0,565",Paragraph_2,"(0, 565)",Celebrities and heroes are often confused in t...,Nonannotated,-,-,-,1_MSU_essay,5.0
2,"1_MSU_essay|Paragraph_2|Final_Claim|567,660",Paragraph_2,"(567, 660)",This idea of a hero has been forgotten and sho...,Final_Claim,adequate,-,-,1_MSU_essay,5.0
3,"1_MSU_essay|Paragraph_3|Data|0,519",Paragraph_3,"(0, 519)",How does one define a hero? Is it because of t...,Data,effective,"Paragraph_3|Primary_Claim|521,628",-,1_MSU_essay,5.0
4,"1_MSU_essay|Paragraph_3|Primary_Claim|521,628",Paragraph_3,"(521, 628)","Although many people do heroic things, they ar...",Primary_Claim,effective,"Paragraph_2|Final_Claim|567,660","Paragraph_4|Primary_Claim|449,553",1_MSU_essay,5.0
...,...,...,...,...,...,...,...,...,...,...
2259,"9b_MSU_essay|Paragraph_2|Data|130,443",Paragraph_2,"(130, 443)",The first time a person learns that a certain ...,Data,adequate,"Paragraph_2|Primary_Claim|0,128",-,9b_MSU_essay,3.5
2260,"9b_MSU_essay|Paragraph_3|Primary_Claim|0,170",Paragraph_3,"(0, 170)",It is hard for people to develop new original ...,Primary_Claim,adequate,"Paragraph_1|Final_Claim|142,726","Paragraph_2|Primary_Claim|0,128; Paragraph_4|P...",9b_MSU_essay,3.5
2261,"9b_MSU_essay|Paragraph_3|Data|172,489",Paragraph_3,"(172, 489)","For example, producers of TV shows strive to p...",Data,adequate,"Paragraph_3|Primary_Claim|0,170",-,9b_MSU_essay,3.5
2262,"9b_MSU_essay|Paragraph_4|Primary_Claim|0,174",Paragraph_4,"(0, 174)",Almost every product one can think of has alre...,Primary_Claim,adequate,"Paragraph_1|Final_Claim|142,726","Paragraph_2|Primary_Claim|0,128; Paragraph_3|P...",9b_MSU_essay,3.5


In [13]:
# NOTE(review): GPT2PPL is defined in a later cell of this notebook (under
# "Tayyab and Chua's Implementation"). On a fresh kernel, that cell and its
# imports must be run before this one or this line raises NameError.
gpt_sifr = GPT2PPL()

In [22]:
# Sanity check: perplexity of the first annotated segment.
gpt_sifr.getPPL(annot['discourse_text'][0])

421

In [23]:
# Score every annotated segment; getPPL runs the language model for each row.
annot['ppl'] = annot['discourse_text'].apply(gpt_sifr.getPPL)

In [25]:
# Persist the segments with their perplexities. The frame's index is written
# as the first, unnamed CSV column (pandas' default for to_csv).
annot.to_csv(f"{DATADIR}/AnnotsWithPPL.csv")

## For uploading to GPTZero

In [20]:
def export_essays(essays, folder = "Exports", extension = "txt"):
    """Write each essay's text to its own file under ``<DATADIR>/<folder>``.

    Args:
        essays: DataFrame with an 'Essay' column (used as the file stem) and
            a 'Text' column (the file contents).
        folder: Sub-folder of DATADIR that receives the files. It is created
            if it does not exist yet.
        extension: File extension, without the leading dot.

    Raises:
        AssertionError: If ``essays`` has no 'Text' column.
    """
    assert 'Text' in essays.columns, "essays must have a 'Text' column"

    # Create the target folder up front; open(..., 'w') does not create
    # missing directories and would fail on the first essay otherwise.
    out_dir = os.path.join(DATADIR, folder)
    os.makedirs(out_dir, exist_ok=True)

    # A plain loop: the export is done for its side effects, so there is no
    # result worth collecting through DataFrame.apply.
    for name, text in zip(essays['Essay'], essays['Text']):
        filename = os.path.join(out_dir, f"{name}.{extension}")
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(text)

In [21]:
# Export with the defaults: one .txt file per essay in the "Exports" sub-folder of DATADIR.
export_essays(essays)

# GPTZero

The class GPTZeroAPI comes courtesy of https://github.com/Haste171/gptzero.

See https://gptzero.me/docs for more details.

In [None]:
class GPTZeroAPI:
    """Minimal client for the GPTZero v2 text-prediction endpoint."""

    def __init__(self, api_key, timeout = 60):
        """Store the credentials and request settings.

        Args:
            api_key: GPTZero API key, sent in the ``X-Api-Key`` header.
            timeout: Seconds to wait for the server before giving up (passed
                straight to ``requests.post``). ``None`` waits indefinitely.
        """
        self.api_key = api_key
        self.timeout = timeout
        self.base_url = 'https://api.gptzero.me/v2/predict'
    
    def text_predict(self, document):
        """Submit ``document`` for prediction and return the decoded JSON reply.

        Args:
            document: The text to analyse.

        Returns:
            The response body parsed from JSON.

        Raises:
            requests.exceptions.Timeout: If the server does not answer within
                ``self.timeout`` seconds.
        """
        url = f'{self.base_url}/text'
        headers = {
            'accept': 'application/json',
            'X-Api-Key': self.api_key,
            'Content-Type': 'application/json'
        }
        data = {
            'document': document
        }
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.post(url, headers=headers, json=data, timeout=self.timeout)
        return response.json()

# Tayyab and Chua's Implementation

Using the GPTZero API costs money :(

Let's try this version instead, where Tayyab and Chua attempt to imitate
GPTZero's methods. They claim that they achieve the same results.

https://github.com/BurhanUlTayyab/GPTZero

In [11]:
import torch
import re
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from collections import OrderedDict

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
class GPT2PPL:
    """Perplexity scorer backed by a pretrained GPT-2 language model."""

    def __init__(self, device = "cpu", model_id = "gpt2"):
        """Load the model and tokenizer named ``model_id`` onto ``device``."""
        self.device = device
        self.model_id = model_id
        self.model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
        self.tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

        # Window size comes from the model config; consecutive windows start
        # `stride` tokens apart.
        self.max_length = self.model.config.n_positions
        self.stride = 512

    def __call__(self, sentence):
        """Score a text as a whole and line by line.

        The text is split after sentence-ending punctuation followed by a
        space or an opening bracket, and after newlines. Pieces without any
        alphanumeric character are skipped.

        Returns:
            None when the text has fewer than 100 alphanumeric characters.
            Otherwise an OrderedDict with 'perplexity' (whole text),
            'ppl_per_line' (mean over the scored lines) and 'burstiness'
            (maximum over the scored lines).
        """
        results = OrderedDict()

        alnum_runs = re.findall("[a-zA-Z0-9]+", sentence)
        if sum(map(len, alnum_runs)) < 100:
            return None

        raw_pieces = re.split(r'(?<=[.?!][ \[\(])|(?<=\n)\s*', sentence)
        pieces = [piece for piece in raw_pieces if piece]

        results['perplexity'] = self.getPPL(sentence)

        carry = ""  # opening bracket cut off the previous piece, if any
        line_scores = []
        for piece in pieces:
            if re.search("[a-zA-Z0-9]+", piece) is None:
                continue
            if carry:
                piece = carry + piece
                carry = ""

            # Trim one leading newline/space, then either one trailing
            # newline/space or a trailing opening bracket, which is moved to
            # the front of the next scored piece.
            if piece[0] in ('\n', ' '):
                piece = piece[1:]
            tail = piece[-1]
            if tail in ('\n', ' '):
                piece = piece[:-1]
            elif tail in ('[', '('):
                carry = tail
                piece = piece[:-1]

            line_scores.append(self.getPPL(piece))

        results['ppl_per_line'] = sum(line_scores) / len(line_scores)
        results['burstiness'] = max(line_scores)

        return results

    def getPPL(self, sentence):
        """Return the perplexity of ``sentence``, truncated to an int.

        The token sequence is scored in windows of at most ``max_length``
        tokens that start ``stride`` tokens apart. In each window only the
        tokens not covered by the previous window count as targets.
        """
        token_ids = self.tokenizer(sentence, return_tensors = "pt").input_ids
        seq_len = token_ids.size(1)

        window_nlls = []
        prev_end_loc = 0
        for begin_loc in range(0, seq_len, self.stride):
            end_loc = min(begin_loc + self.max_length, seq_len)
            n_targets = end_loc - prev_end_loc

            window = token_ids[:, begin_loc:end_loc].to(self.device)
            labels = window.clone()
            # -100 marks positions excluded from the loss (context only).
            labels[:, :-n_targets] = -100

            with torch.no_grad():
                window_loss = self.model(window, labels = labels).loss
                window_nlls.append(window_loss * n_targets)

            prev_end_loc = end_loc
            if end_loc == seq_len:
                break

        return int(torch.exp(torch.stack(window_nlls).sum() / end_loc))

In [33]:
# Instantiate the scorer with its defaults: the "gpt2" model on the CPU.
gpt_sifr = GPT2PPL()

In [34]:
# Whole-text perplexity, mean per-line perplexity and burstiness of the first essay.
gpt_sifr(essays['Text'][0])

1965


OrderedDict([('perplexity', 21),
             ('ppl_per_line', 64.375),
             ('burstiness', 421)])

In [35]:
def averagePPL(text):
    """Return the mean per-line perplexity of ``text``.

    Args:
        text: The essay text to score with the module-level ``gpt_sifr`` scorer.

    Returns:
        The scorer's 'ppl_per_line' value, or NaN when the scorer returns
        None (it does so for texts with fewer than 100 alphanumeric
        characters), so one short essay cannot abort a whole ``.apply`` pass.
    """
    data = gpt_sifr(text)
    if data is None:
        return float('nan')
    return data['ppl_per_line']

In [36]:
# Mean per-line perplexity for every essay; each row is scored in full by gpt_sifr.
essays['LinePPL'] = essays['Text'].apply(averagePPL)

1965
1334
939
1046
1670
1177
1662
1419
1989
935
997
1462
892
2137
1411
1429
735
3068
2952
2253
1164
2574
2251
1141
1391
1768
1977
1277
1178
1957
2237
2096
1484
795
1721
1736
1872
1693
1538
2472
736
1739
1240
1222
1992
1989
1417
1159
2102
1528
1606
1035
2318
1486
2200
1380
1617
536
1489
1055
763
1404
1141
2199
1835
1568
1492
1075
1579
1129
651
1158
2129
983
1465
1460
1735
2582
1236
1330
891
1086
2289
1046
1741
2307
792
2336
799
1500
1119
1381
2881
1260
1796
1960
1933
2262
1275
1269
1829
1966
1217
2754
1523
1384
3061
2246
2314
2655
1224
963
1049
1352
1581
1504
2479
1483
2422
723
1890
1910
1602
1261
1651
1189
1536
1660
1424
2271
689
1003
1438
1313
1962
1445
1287
2056
1021
1982
2268
1230
969
730
1970
1049
2241
1110
1962
2103
1032
1280
1684
729
706
2224
2041
997
1722
1214
1795
1214
1109
1121
1528
1653
1234
1727
2664
1381
2385
1606
1174
1334
895
1450
1338
1492
1389
790
1624
1573
1271
1096
868
2647
1640
1081
703
2392
1922
1435
1328
1411
1709
1286
2079
1864
2554
2771
1182
1127
2074
1645
2215
1

In [37]:
# Inspect the essay table, now including the Text and LinePPL columns.
essays

Unnamed: 0,Essay,Number of words,Holistic Score,Text,LinePPL
0,1_MSU_essay,426,5.0,Heroes or Celebrities\nCelebrities and heroes ...,64.375000
1,10_MSU_essay,262,3.5,"Musicians, artists, writers, scientists, and m...",46.083333
2,100_MSU_essay,191,2.5,People have the freedom to choose what they wa...,73.214286
3,100b_MSU_essay,233,3.5,"Anyone can become a celebrity, but to become a...",69.727273
4,101_MSU_essay,365,1.5,Most people at a young age have those certain ...,94.900000
...,...,...,...,...,...
309,98_MSU_essay,542,4.5,"In our modern day society, it has become commo...",46.407407
310,98b_MSU_essay,480,4.0,"In an age of celebrity worship, where movie st...",71.562500
311,99_MSU_essay,357,2.5,"A limo pulls up, cameras are constantly flashi...",60.941176
312,99b_MSU_essay,378,2.0,Heroes and celebrities are clearly different. ...,57.136364


In [38]:
# Persist the essay table with the reconstructed text and LinePPL. The frame's
# index is written as the first, unnamed CSV column (pandas' default for to_csv).
essays.to_csv(os.path.join(DATADIR, "EssaysWithPPL.csv"))