# Import packages

In [None]:
import re
import pandas as pd
from lexicalrichness import LexicalRichness # Lexical Analysis
import scipy.stats as stats # Statistical Analysis

# Import dataset

In [8]:
df = pd.read_json(path_or_buf="all.jsonl", lines=True)
df.shape

(24322, 5)

# Data Preprocessing

In [9]:
def data_preprocessing(df):
    '''This function is specifically designed for the HC3 dataset and the preprocessing thereof. 
    It mostly joins the data, deletes empty rows and duplicate rows.'''
    
    # Join all data, making it a string instead of a list
    df['chatgpt_answers'] = df['chatgpt_answers'].map(lambda x: ''.join(x))
    df['human_answers'] = df['human_answers'].map(lambda x: ''.join(x))

    # Deleting empty rows
    df = df.drop(df[df['chatgpt_answers'].map(lambda x: len(x) == 0)].index)
    df = df.drop(df[df['human_answers'].map(lambda x: len(x) == 0)].index)

    # Deleting duplicate rows
    # Answering the same question will result in the same answers
    # Therefore focusing on only question column
    df = df.drop_duplicates(subset = ['question'], ignore_index = True)
    df = df.drop_duplicates(subset = ['human_answers'], ignore_index = True)
    df = df.drop_duplicates(subset = ['chatgpt_answers'], ignore_index = True)
    
    return df

def remove_punc(df):
    '''Function to remove punctuation and make everything lowercase.'''
        
    # Remove punctuation in ChatGPT answers column
    df['chatgpt_answers'] = df['chatgpt_answers'].map(lambda x: re.sub(r'[^\w\s]', '', x.lower()))
    
    # Remove punctuation in human answers column
    df['human_answers'] = df['human_answers'].map(lambda x: re.sub(r'\s(\'\S)', r'\1', x))
    df['human_answers'] = df['human_answers'].map(lambda x: re.sub(r'[^\w\s]', ' ', x.lower()))
    
    # Replace double spaces
    df['human_answers'].replace("  ", " ", regex = True, inplace = True)
    df['human_answers'].replace("   ", " ", regex = True, inplace = True)
    
    return df

def label_data(df):
    '''This functions gives the each answer a label (0 for human answers, 1 for ChatGPT answers).'''
    
    # Labeling human answers
    human = df[['question', 'human_answers', 'source']]
    human['type'] = "human"
    human.rename(columns = {'human_answers':'answer'}, inplace = True)
    human['class'] = 0
    
    # Labeling ChatGPT answers
    chatgpt = df[['question', 'chatgpt_answers', 'source']]
    chatgpt['type'] = "chatgpt"
    chatgpt.rename(columns = {'chatgpt_answers':'answer'}, inplace = True)
    chatgpt['class'] = 1
    
    # Shuffle dataset and reset index
    df = pd.concat([human, chatgpt], ignore_index = True)
    df = shuffle(df)
    df.reset_index(drop = True, inplace = True)
    
    return df

def data_cleaning(df):
    '''This function replaces some of the HC3-specific messiness, mostly in human answers.'''
    
    df['answer'].replace(" n t ", "nt ", regex = True, inplace = True)
    df['answer'].replace(" nt ", "nt ", regex = True, inplace = True)
    df['answer'].replace(" s ", "s ", regex = True, inplace = True)
    df['answer'].replace(" ve ", "ve ", regex = True, inplace = True)
    df['answer'].replace(" re ", "re ", regex = True, inplace = True)
    df['answer'].replace("i m ", "im ", regex = True, inplace = True)
    df['answer'].replace(" ll ", "ll ", regex = True, inplace = True)
    df['answer'].replace(" t ", "t ", regex = True, inplace = True)
    df['answer'].replace(" i d  ", " id ", regex = True, inplace = True)
    df['answer'].replace(" e  ", " ", regex = True, inplace = True)
    df['answer'].replace("thf_media", " ", regex = True, inplace = True)
    df['answer'].replace("url_32", " ", regex = True, inplace = True)
    df['answer'].replace("url_40", " ", regex = True, inplace = True)
    df['answer'] = df['answer'].apply(lambda x: re.sub(r'\r\n', ' ', x))
    df['answer'] = df['answer'].apply(lambda x: re.sub(r'\r\n\r', ' ', x))
    df['answer'] = df['answer'].apply(lambda x: re.sub(r'\n\n', ' ', x))
    df['answer'].replace("  ", " ", regex = True, inplace = True)
    
    return df

In [10]:
hc3_test = data_cleaning(label_data(remove_punc(data_preprocessing(df))))

# Make evenly sized chunks (200 words)

In [11]:
# First split the data into human and chatgpt
hc3_human = hc3_test[hc3_test['class'] == 0]
hc3_chatgpt = hc3_test[hc3_test['class'] == 1]

# Join the data and make it into a list format
human_string = ' '.join(hc3_human['answer'].tolist())
chatgpt_string = ' '.join(hc3_chatgpt['answer'].tolist())

# Split data into words
# That way I always have full words
human_split = human_string.split(" ")
chatgpt_split = chatgpt_string.split(" ")

# Make evenly sized chunks: 200 words per answer
human_list = [human_split[i:i + 200] for i in range(0, len(human_split), 200)]
chatgpt_list = [chatgpt_split[i:i + 200] for i in range(0, len(chatgpt_split), 200)]

# The chunks are still split
# Joining the evenly sized chunks into one "sentence"
human_joined = []
for answer in human_list:
    new_list = []
    new_list.append(' '.join(answer))
    human_joined.append(new_list)

chatgpt_joined = []
for answer in chatgpt_list:
    new_list = []
    new_list.append(' '.join(answer))
    chatgpt_joined.append(new_list)
    
# Put both classes into a new Dataframe and concat them together
human_df = pd.DataFrame({'answer' : human_joined, 'class' : 0})
chatgpt_df = pd.DataFrame({'answer' : chatgpt_joined, 'class' : 1})
    
# Again, joining the answers, because they were still in a list format
human_df['answer'] = human_df['answer'].map(lambda x: ''.join(x))
chatgpt_df['answer'] = chatgpt_df['answer'].map(lambda x: ''.join(x))

# Lexical Diversity Analysis

In [12]:
human_df['ttr'] = human_df['answer'].map(lambda x: LexicalRichness(x).ttr)
human_df['yulek'] = human_df['answer'].map(lambda x: LexicalRichness(x).yulek)
human_df['mtld'] = human_df['answer'].map(lambda x: LexicalRichness(x).mtld(threshold = 0.72))

In [13]:
chatgpt_df['ttr'] = chatgpt_df['answer'].map(lambda x: LexicalRichness(x).ttr)
chatgpt_df['yulek'] = chatgpt_df['answer'].map(lambda x: LexicalRichness(x).yulek)
chatgpt_df['mtld'] = chatgpt_df['answer'].map(lambda x: LexicalRichness(x).mtld(threshold = 0.72))

# Statistical Analysis

In [14]:
t_stat, p_value = stats.ttest_ind(a = human_df['ttr'], b = chatgpt_df['ttr'], alternative = 'two-sided', equal_var = False)
t_stat, p_value

(154.48248841822172, 0.0)

In [15]:
t_stat, p_value = stats.ttest_ind(a = human_df['yulek'], b = chatgpt_df['yulek'], alternative = 'two-sided', equal_var = False)
t_stat, p_value

(-97.85798160612441, 0.0)

In [16]:
t_stat, p_value = stats.ttest_ind(a = human_df['mtld'], b = chatgpt_df['mtld'], alternative = 'two-sided', equal_var = False)
t_stat, p_value

(116.2379270954642, 0.0)