In [25]:
import pandas as pd
import re
import numpy as np
import spacy
from lexicalrichness import LexicalRichness

nlp = spacy.load('en_core_web_sm')

answers = pd.read_csv("data/generated4_gpt4b_clean_octo_pair_id.csv")

In [26]:
# Preprocesses data with spaCy for later use

answers["question_spacy"] = answers["question"].apply(lambda x: nlp(x))

answers["REAL_spacy"] = answers["real_answer"].apply(lambda x: nlp(x))
answers["REAL_lemma"] = answers["REAL_spacy"].apply(lambda x: " ".join([y.lemma_ for y in x]))

answers["GPT_spacy"] = answers["generated_answer"].apply(lambda x: nlp(x))
answers["GPT_lemma"] = answers["GPT_spacy"].apply(lambda x: " ".join([y.lemma_ for y in x]))

In [27]:
# Countes the number of sentences per essay

def num_of_sent(text):
    return len([sentence for sentence in text.sents])

answers["REAL_sent_count"] = answers["REAL_spacy"].apply(lambda x: num_of_sent(x))
answers["GPT_sent_count"] = answers["GPT_spacy"].apply(lambda x: num_of_sent(x))

In [28]:
# Countes the number of words per essay

def num_of_words(text):
    count = len([token.text.lower() for token in text if not token.is_stop and not token.is_punct and not token.is_space])
    return count

answers["question_word_count"] = answers["question_spacy"].apply(lambda x: num_of_words(x))
answers["REAL_word_count"] = answers["REAL_spacy"].apply(lambda x: num_of_words(x))
answers["GPT_word_count"] = answers["GPT_spacy"].apply(lambda x: num_of_words(x))

### Sentence complexity

Based on a number of particular dependnecy labels found in each sentence (Clausal modifier of noun; Conjunct; Adverbial clause modifier; Clausal complement; Clausal subject; Discourse; Parataxis)
The output values are mean values of the number of the tags per sentence

In [29]:
# Calcualtes the number of specified dependency label within a sentence
def calculate_dep_score(text):
    temp = []
    for sentence in nlp(text).sents:
        temp.append(sent_complexity_structure(sentence))
    return np.mean(temp)

# Return the number of specified dependency labels found
def sent_complexity_structure(doc):
    return len([token for token in doc if (token.dep_ == "acl" or token.dep_ == "conj" or token.dep_ == "advcl"or token.dep_ == "ccomp"
                                            or token.dep_ == "csubj" or token.dep_ == "discourse" or token.dep_ == "parataxis")])

# TODO: fix col name
answers["REAL_sent_complex_tags"] = answers["real_answer"].apply(lambda x: calculate_dep_score(x))
answers["GPT_sent_complex_tags"] = answers["generated_answer"].apply(lambda x: calculate_dep_score(x))

### Lexical diversity

Calculating lexical diverstity score using MTLD measure.

In [30]:
# calculates MTLD score for the whole essay

def calculate_lex_richness_MTLD2(text):
    lex = LexicalRichness(text) 
    lex_rich_score = lex.mtld()
    return(lex_rich_score)

answers["REAL_LD"] = answers["real_answer"].apply(lambda x: calculate_lex_richness_MTLD2(x))
answers["GPT_LD"] = answers["generated_answer"].apply(lambda x: calculate_lex_richness_MTLD2(x))

### Discourse markers

Calculating number of discourse markers from Penn Discourse Tree Bank per essay. Some discourse markers (about, as, by, both, for, from, given, in, like, on, once, only, still, when, with, without, yet, and) were excluded from the list because they can often be used as not discourse markers).

In [31]:
# Counts the number of discourse markers using PDTB list

discourse = pd.read_csv("markers/connectives_discourse_markers_PDTB.txt", sep="\'", encoding="UTF-8", header=None, usecols = [1,3])

discourse[3] = discourse[3].apply(lambda x: x.replace("t_conn_", ""))
discourse[1] = discourse[1].apply(lambda x: " " + x + " ")
discourse.sort_values(3, inplace=True, ascending=False)

# Countes the total numbers of discourse markers per essay
def count_discourse_markers(text):
    i = 0
    for marker in discourse.itertuples():
        if marker[1] in text:
            i += text.count(marker[1])
    return i


answers["REAL_discourse"] = answers["REAL_lemma"].apply(lambda x: count_discourse_markers(x))
answers["GPT_discourse"] = answers["GPT_lemma"].apply(lambda x: count_discourse_markers(x))

### Modals

Counting the number of modals using POS tag "MD" and the modals.csv.

In [32]:
# Counts the number of modals from the list of modals

modals = pd.read_csv("markers/modals.csv", sep=",", encoding="UTF-8", header=None)
modals[0] = modals[0].apply(lambda x: x.replace('_', ' '))

# Counts the number of modals per essay
def count_total_modals(text):
    counter = 0
    for modal in modals.itertuples():
        if modal[1] in text:
            counter += text.count(modal[1])
    return counter

answers["REAL_modals1"] = answers["REAL_lemma"].apply(lambda x: count_total_modals(x))
answers["GPT_modals1"] = answers["GPT_lemma"].apply(lambda x: count_total_modals(x))

In [33]:
# Counts the number of modals using POS tagging

answers["REAL_pos"] = answers["REAL_spacy"].apply(lambda x: " ".join([y.tag_ for y in x]))
answers["GPT_pos"] = answers["GPT_spacy"].apply(lambda x: " ".join([y.tag_ for y in x]))

answers["REAL_modals2"] = answers["REAL_pos"].str.count(r'MD')
answers["GPT_modals2"] = answers["GPT_pos"].str.count(r'MD')

In [34]:
# Calculates total number of modals per essay

answers["REAL_modals_all"] = answers["REAL_modals2"] + answers["REAL_modals1"]
answers["GPT_modals_all"] = answers["GPT_modals2"] + answers["GPT_modals1"]

### Epistemic markers
 
Getting the number of epistemic markers.

In [35]:
# Counts the total number of epistemic markers per essay

def find_epistemic_markers(text):
    epistemic_markers_count = []
    epistemic_markers_count.extend(re.findall(r'(?:I|We|we|One|one)(?:\s\w+)?(?:\s\w+)?\s(?:believes?|think|thinks|means?|worry|worries|know|guesse?s?|assumes?|wonders?|feels?)\b(?:that)?', text))
    epistemic_markers_count.extend(re.findall(r'(?:I|We|we|One|one)\s(?:don\'t|\sdoesn\'t\sdo\snot|\sdoes\snot)\s(?:believe|think|mean|worry|know|guess|assume|wonder|feel)\b(?:that)?', text))
    epistemic_markers_count.extend(re.findall(r'(?:It|it)\sis\s(?:believed|known|assumed|thought)\b(?:that)?', text))
    epistemic_markers_count.extend(re.findall(r'(?:I|We|we)\s(?:am|are|was|were)(?:\sjust)?\s(?:thinking|guessing|wondering)\b(?:that)?', text))
    epistemic_markers_count.extend(re.findall(r'(?:I\'m|[Ww]e\'re)(?:\sjust)?\s(?:thinking|guessing|wondering)\b(?:that)?', text)) 
    epistemic_markers_count.extend(re.findall(r'(?:I|We|we|One|one)(?:\s\w+)?\s(?:do|does)\snot\s(?:believe?|think|know)\b(?:that)?', text)) 
    epistemic_markers_count.extend(re.findall(r'(?:I|We|we|One|one)\swould(?:\s\w+)?(?:\snot)?\ssay\b(?:that)?', text)) 
    epistemic_markers_count.extend(re.findall(r'(?:I\sam|I\'m)(?:\s\w+)?\s(?:afraid|sure|confident)\b(?:that)?', text)) 
    epistemic_markers_count.extend(re.findall(r'(?:My|my|Our|our)\s(?:personal\s)?(?:experience|opinion|belief|view|knowledge|worry|worries|concerns?|guesse?s?|position|perception)(?:\son\s\w+)?\s(?:is|are)\b(?:that)?', text)) 
    epistemic_markers_count.extend(re.findall(r'[Ii]n\s(?:my|our)(?:\s\w+)?\s(?:view|opinion)\b',text))
    epistemic_markers_count.extend(re.findall(r'[Fr]rom\s(?:my|our)\s(?:point\sof\sview|perspective)\b', text)) 
    epistemic_markers_count.extend(re.findall(r'As\sfar\sas\s(?:I|We|we)\s(?:am|are)\sconcerned', text))
    epistemic_markers_count.extend(re.findall(r'(?:I|We|we|One|one)\s(?:can|could|may|might)(?:\s\w+)?\sconclude\b(?:that)?', text)) 
    epistemic_markers_count.extend(re.findall(r'I\s(?:am\swilling\sto|must)\ssay\b(?:that)?', text)) 
    epistemic_markers_count.extend(re.findall(r'"One\s(?:can|could|may|might)\ssay\b(?:that)?', text)) 
    epistemic_markers_count.extend(re.findall(r'[Oo]ne\s(?:can|could|may|might)\ssay\b(?:that)?',text)) 
    epistemic_markers_count.extend(re.findall(r'[Ii]t\sis\s(?:obvious|(?:un)?clear)\b', text)) 
    epistemic_markers_count.extend(re.findall(r'[Ii]t(?:\sjust)?\s(?:seems|feels|looks)', text)) 
    epistemic_markers_count.extend(re.findall(r'[Pp]ersonally\s(?:for\sme|speaking)', text))
    epistemic_markers_count.extend(re.findall(r'(?:[Ff]rankly|[Hh]onestly|[Cc]learly)', text))
    return len(epistemic_markers_count)

answers["REAL_EpMarkers"] = answers["real_answer"].apply(lambda x: find_epistemic_markers(x))
answers["GPT_EpMarkers"] = answers["generated_answer"].apply(lambda x: find_epistemic_markers(x))

### Nominalisations

Counting the number of nominalisations per essay. 

In [36]:
# Counts the total number of nominalisations per essay

def nominalisation_counter(text):
    suffixes_n = r'\b[A-Z]*\w+(?:tion|ment|ance|ence|ion|it(?:y|ies)|ness|ship)(?:s|es)?\b'
    
    nom_nouns = []    
    nouns = [token.text for token in text if token.pos_ == 'NOUN']  
    nom_nouns = [noun for noun in nouns if re.match(suffixes_n, noun)] 
    
    return(len(nom_nouns))
    
answers["REAL_nominalisation"] = answers["REAL_spacy"].apply(lambda x: nominalisation_counter(x))
answers["GPT_nominalisation"] = answers["GPT_spacy"].apply(lambda x: nominalisation_counter(x))

### Compute question/answer word overlap

In [37]:
def compute_overlap(text1, text2):
    text1_tokens = set([token.text.lower() for token in text1 if not token.is_stop and not token.is_punct and not token.is_space])
    text2_tokens = set([token.text.lower() for token in text2 if not token.is_stop and not token.is_punct and not token.is_space])
    return len(text1_tokens.intersection(text2_tokens))

answers["REAL_overlap"] = answers.apply(lambda x: compute_overlap(x["question_spacy"], x["REAL_spacy"]), axis=1)
answers["GPT_overlap"] = answers.apply(lambda x: compute_overlap(x["question_spacy"], x["GPT_spacy"]), axis=1)

### Compute averages

In [38]:
# Counts the average number of features (discourse markers, modals, epistemic markers, nominalisations) per sentence for each essay

def average_per_sentence(feature, sent):
    average = feature/sent
    return(average)

answers["REAL_dm_per_sent"] = answers.apply(lambda row: average_per_sentence(row["REAL_discourse"], row["REAL_sent_count"]), axis=1)
answers["GPT_dm_per_sent"] = answers.apply(lambda row: average_per_sentence(row["GPT_discourse"], row["GPT_sent_count"]), axis=1)

answers["REAL_mod_per_sent"] = answers.apply(lambda row: average_per_sentence(row["REAL_modals_all"], row["REAL_sent_count"]), axis=1)
answers["GPT_mod_per_sent"] = answers.apply(lambda row: average_per_sentence(row["GPT_modals_all"], row["GPT_sent_count"]), axis=1)

answers["REAL_ep_per_sent"] = answers.apply(lambda row: average_per_sentence(row["REAL_EpMarkers"], row["REAL_sent_count"]), axis=1)
answers["GPT_ep_per_sent"] = answers.apply(lambda row: average_per_sentence(row["GPT_EpMarkers"], row["GPT_sent_count"]), axis=1)

answers["REAL_nom_per_sent"] = answers.apply(lambda row: average_per_sentence(row["REAL_nominalisation"], row["REAL_sent_count"]), axis=1)
answers["GPT_nom_per_sent"] = answers.apply(lambda row: average_per_sentence(row["GPT_nominalisation"], row["GPT_sent_count"]), axis=1)

answers["REAL_overlap_per_word"] = answers.apply(lambda row: average_per_sentence(row["REAL_overlap"], row["question_word_count"]), axis=1)
answers["GPT_overlap_per_word"] = answers.apply(lambda row: average_per_sentence(row["GPT_overlap"], row["question_word_count"]), axis=1)

In [39]:
print("Average sentence complexity")
print("Real:", np.mean(answers["REAL_sent_complex_tags"]))
print("GPT: ", np.mean(answers["GPT_sent_complex_tags"]))
print()

print("Average MTLD lexical diversity score")
print("Real: ", np.mean(answers["REAL_LD"]))
print("GPT:", np.mean(answers["GPT_LD"]))
print()

print("Average number of discourse markers per answer")
print("Real:", np.mean(answers["REAL_dm_per_sent"]))
print("GPT: ", np.mean(answers["GPT_dm_per_sent"]))
print()

print("Average number of modals per answer")
print("Real:", np.mean(answers["REAL_mod_per_sent"]))
print("GPT: ", np.mean(answers["GPT_mod_per_sent"]))
print()

print("Average number of epistemic markers per answer")
print("Real:", np.mean(answers["REAL_ep_per_sent"]))
print("GPT: ", np.mean(answers["GPT_ep_per_sent"]))
print()

print("Average number of nominalisations per answer")
print("Real:", np.mean(answers["REAL_nom_per_sent"]))
print("GPT: ", np.mean(answers["GPT_nom_per_sent"]))
print()

print("Average overlap with question per answer normalized by question length")
print("Real:", np.mean(answers["REAL_overlap_per_word"]))
print("GPT: ", np.mean(answers["GPT_overlap_per_word"]))
print()

Average sentence complexity
Real: 2.4036480035927568
GPT:  2.147702687269995

Average MTLD lexical diversity score
Real:  68.50903895647262
GPT: 104.84397511886124

Average number of discourse markers per answer
Real: 0.7106143722156866
GPT:  0.5489400129784745

Average number of modals per answer
Real: 0.6309684307031302
GPT:  0.5679658215715908

Average number of epistemic markers per answer
Real: 0.266035858117717
GPT:  0.04532062488793258

Average number of nominalisations per answer
Real: 0.6482649640275412
GPT:  1.3725764406533638

Average overlap with question per answer normalized by question length
Real: 0.20403642604922023
GPT:  0.5591483720471176



In [40]:
answers.to_csv("data/answers-with-linguistic-markers.csv", index=False)