# Import packages

In [1]:
import ssl
import re
import string 

import pandas as pd
import numpy as np
from lexicalrichness import LexicalRichness

from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import tree

import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
try:
     _create_unverified_https_context =     ssl._create_unverified_context
except AttributeError:
     pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aycaprivate/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Import dataset

In [2]:
df = pd.read_json(path_or_buf="all.jsonl", lines=True)
df.shape

(24322, 5)

# Data Preprocessing

In [3]:
def data_preprocessing(df):
    '''This function is specifically designed for the HC3 dataset and the preprocessing thereof. 
    It mostly joins the data, deletes empty rows and duplicate rows.'''
    
    # Join all data, making it a string instead of a list
    df['chatgpt_answers'] = df['chatgpt_answers'].map(lambda x: ''.join(x))
    df['human_answers'] = df['human_answers'].map(lambda x: ''.join(x))

    # Deleting empty rows
    df = df.drop(df[df['chatgpt_answers'].map(lambda x: len(x) == 0)].index)
    df = df.drop(df[df['human_answers'].map(lambda x: len(x) == 0)].index)

    # Deleting duplicate rows
    # Answering the same question will result in the same answers
    # Therefore focusing on only question column
    df = df.drop_duplicates(subset = ['question'], ignore_index = True)
    df = df.drop_duplicates(subset = ['human_answers'], ignore_index = True)
    df = df.drop_duplicates(subset = ['chatgpt_answers'], ignore_index = True)
    
    return df

def remove_punc(df):
    '''Function to remove punctuation and make everything lowercase.'''
        
    # Remove punctuation in ChatGPT answers column
    df['chatgpt_answers'] = df['chatgpt_answers'].map(lambda x: re.sub(r'[^\w\s]', '', x.lower()))
    
    # Remove punctuation in human answers column
    df['human_answers'] = df['human_answers'].map(lambda x: re.sub(r'\s(\'\S)', r'\1', x))
    df['human_answers'] = df['human_answers'].map(lambda x: re.sub(r'[^\w\s]', ' ', x.lower()))
    
    # Replace double spaces
    df['human_answers'].replace("  ", " ", regex = True, inplace = True)
    df['human_answers'].replace("   ", " ", regex = True, inplace = True)
    
    return df

def label_data(df):
    '''This functions gives the each answer a label (0 for human answers, 1 for ChatGPT answers).'''
    
    # Labeling human answers
    human = df[['question', 'human_answers', 'source']]
    human['type'] = "human"
    human.rename(columns = {'human_answers':'answer'}, inplace = True)
    human['class'] = 0
    
    # Labeling ChatGPT answers
    chatgpt = df[['question', 'chatgpt_answers', 'source']]
    chatgpt['type'] = "chatgpt"
    chatgpt.rename(columns = {'chatgpt_answers':'answer'}, inplace = True)
    chatgpt['class'] = 1
    
    # Shuffle dataset and reset index
    df = pd.concat([human, chatgpt], ignore_index = True)
    df = shuffle(df)
    df.reset_index(drop = True, inplace = True)
    
    return df

def data_cleaning(df):
    '''This function replaces some of the HC3-specific messiness, mostly in human answers.'''
    
    df['answer'].replace(" n t ", "nt ", regex = True, inplace = True)
    df['answer'].replace(" nt ", "nt ", regex = True, inplace = True)
    df['answer'].replace(" s ", "s ", regex = True, inplace = True)
    df['answer'].replace(" ve ", "ve ", regex = True, inplace = True)
    df['answer'].replace(" re ", "re ", regex = True, inplace = True)
    df['answer'].replace("i m ", "im ", regex = True, inplace = True)
    df['answer'].replace(" ll ", "ll ", regex = True, inplace = True)
    df['answer'].replace(" t ", "t ", regex = True, inplace = True)
    df['answer'].replace(" i d  ", " id ", regex = True, inplace = True)
    df['answer'].replace(" e  ", " ", regex = True, inplace = True)
    df['answer'].replace("thf_media", " ", regex = True, inplace = True)
    df['answer'].replace("url_32", " ", regex = True, inplace = True)
    df['answer'].replace("url_40", " ", regex = True, inplace = True)
    df['answer'] = df['answer'].apply(lambda x: re.sub(r'\r\n', ' ', x))
    df['answer'] = df['answer'].apply(lambda x: re.sub(r'\r\n\r', ' ', x))
    df['answer'] = df['answer'].apply(lambda x: re.sub(r'\n\n', ' ', x))
    df['answer'].replace("  ", " ", regex = True, inplace = True)
    
    return df

In [4]:
hc3_test = data_cleaning(label_data(remove_punc(data_preprocessing(df))))

# Make evenly sized chunks (200 words)

In [5]:
# First split the data into human and chatgpt
hc3_human = hc3_test[hc3_test['class'] == 0]
hc3_chatgpt = hc3_test[hc3_test['class'] == 1]

# Join the data and make it into a list format
human_string = ' '.join(hc3_human['answer'].tolist())
chatgpt_string = ' '.join(hc3_chatgpt['answer'].tolist())

# Split data into words
# That way I always have full words
human_split = human_string.split(" ")
chatgpt_split = chatgpt_string.split(" ")

# Make evenly sized chunks: 200 words per answer
human_list = [human_split[i:i + 200] for i in range(0, len(human_split), 200)]
chatgpt_list = [chatgpt_split[i:i + 200] for i in range(0, len(chatgpt_split), 200)]

# The chunks are still split
# Joining the evenly sized chunks into one "sentence"
human_joined = []
for answer in human_list:
    new_list = []
    new_list.append(' '.join(answer))
    human_joined.append(new_list)

chatgpt_joined = []
for answer in chatgpt_list:
    new_list = []
    new_list.append(' '.join(answer))
    chatgpt_joined.append(new_list)
    
# Put both classes into a new Dataframe and concat them together
human_df = pd.DataFrame({'answer' : human_joined, 'class' : 0})
chatgpt_df = pd.DataFrame({'answer' : chatgpt_joined, 'class' : 1})
    
# Again, joining the answers, because they were still in a list format
human_df['answer'] = human_df['answer'].map(lambda x: ''.join(x))
chatgpt_df['answer'] = chatgpt_df['answer'].map(lambda x: ''.join(x))

# Lexical Diversity Analysis

## Whole dataset

In [6]:
human_df['ttr'] = human_df['answer'].map(lambda x: LexicalRichness(x).ttr)
human_df['ttr'].mean()

0.611465584771711

In [7]:
human_df['yulek'] = human_df['answer'].map(lambda x: LexicalRichness(x).yulek)
human_df['yulek'].mean()

110.23107570837516

In [8]:
human_df['mtld'] = human_df['answer'].map(lambda x: LexicalRichness(x).mtld(threshold = 0.72))
human_df['mtld'].mean()

86.76497772019333

In [9]:
chatgpt_df['ttr'] = chatgpt_df['answer'].map(lambda x: LexicalRichness(x).ttr)
chatgpt_df['ttr'].mean()

0.5360142792529885

In [10]:
chatgpt_df['yulek'] = chatgpt_df['answer'].map(lambda x: LexicalRichness(x).yulek)
chatgpt_df['yulek'].mean()

147.34450712339435

In [11]:
chatgpt_df['mtld'] = chatgpt_df['answer'].map(lambda x: LexicalRichness(x).mtld(threshold = 0.72))
chatgpt_df['mtld'].mean()

62.91089110379055

## Per category

In [12]:
hc3_test[['answer', 'class', 'source']]

def data_source(df, category):
    human = df[df['class'] == 0][df['source'] == category]
    chatgpt = df[df['class'] == 1][df['source'] == category]
    
    human_string = ' '.join(human['answer'].tolist())
    chatgpt_string = ' '.join(chatgpt['answer'].tolist())
    
    human_split = human_string.split(" ")
    chatgpt_split = chatgpt_string.split(" ")

    human_list = [human_split[i:i + 200] for i in range(0, len(human_split), 200)]
    chatgpt_list = [chatgpt_split[i:i + 200] for i in range(0, len(chatgpt_split), 200)]
    
    human_joined = []
    for answer in human_list:
        new_list = []
        new_list.append(' '.join(answer))
        human_joined.append(new_list)

    chatgpt_joined = []
    for answer in chatgpt_list:
        new_list = []
        new_list.append(' '.join(answer))
        chatgpt_joined.append(new_list)
    
    human_df = pd.DataFrame({'answer' : human_joined, 'class' : 0, 'source': category})
    chatgpt_df = pd.DataFrame({'answer' : chatgpt_joined, 'class' : 1, 'source': category})

    human_df['answer'] = human_df['answer'].map(lambda x: ''.join(x))
    chatgpt_df['answer'] = chatgpt_df['answer'].map(lambda x: ''.join(x))
    
    return human_df, chatgpt_df

# Join data based on source
reddit_eli5 = data_source(hc3_test[['answer', 'class', 'source']], 'reddit_eli5')
finance = data_source(hc3_test[['answer', 'class', 'source']], 'finance')
medicine = data_source(hc3_test[['answer', 'class', 'source']], 'medicine')
wiki_csai = data_source(hc3_test[['answer', 'class', 'source']], 'wiki_csai')
open_qa = data_source(hc3_test[['answer', 'class', 'source']], 'open_qa')

# Create function
def LD_mean(human, chatgpt):
        ttr_human = human['answer'].map(lambda x: LexicalRichness(x).ttr)
        yulek_human = human['answer'].map(lambda x: LexicalRichness(x).yulek)
        mtld_human = human['answer'].map(lambda x: LexicalRichness(x).mtld(threshold = 0.72))
    
        ttr_chat_gpt = chatgpt['answer'].map(lambda x: LexicalRichness(x).ttr)
        yulek_chat_gpt = chatgpt['answer'].map(lambda x: LexicalRichness(x).yulek)
        mtld_chat_gpt = chatgpt['answer'].map(lambda x: LexicalRichness(x).mtld(threshold = 0.72))
    
        human_chatgpt = ["Human", "ChatGPT"]
        metrics = ["TTR", "Yule's K", "MTLD"]
        data = [[ttr_human.mean(), ttr_chat_gpt.mean()],
                [yulek_human.mean(), yulek_chat_gpt.mean()],
                [mtld_human.mean(), mtld_chat_gpt.mean()]]
        print(pd.DataFrame(data, metrics, human_chatgpt))

In [13]:
LD_mean(reddit_eli5[0], reddit_eli5[1])

               Human     ChatGPT
TTR         0.609441    0.538726
Yule's K  110.853722  142.349614
MTLD       86.051112   63.413385


In [14]:
LD_mean(open_qa[0], open_qa[1])

               Human     ChatGPT
TTR         0.670314    0.503938
Yule's K  124.889604  184.901221
MTLD       98.326788   56.632939


In [15]:
LD_mean(wiki_csai[0], wiki_csai[1])

               Human     ChatGPT
TTR         0.608622    0.542230
Yule's K  127.085706  157.635022
MTLD       76.286572   59.335829


In [16]:
LD_mean(finance[0], finance[1])

               Human     ChatGPT
TTR         0.605182    0.524778
Yule's K  111.524756  154.899536
MTLD       84.418261   60.852995


In [17]:
LD_mean(medicine[0], medicine[1])

               Human     ChatGPT
TTR         0.659574    0.554276
Yule's K   77.129425  124.041407
MTLD      119.287712   71.274707
