## Install environment packages

In [1]:
!pip install pandas
!pip install numpy
!pip install datasets
!pip install transformers
!pip install sentence-transformers
!pip install scikit-learn
!pip install torch
!pip install tqdm
print("Installed packages")

Installed packages


## Mount Google Drive if running in the cloud (Google Colab)

In [2]:
# Mount Google Drive when running in Google Colab, because the data paths used
# in this notebook live under /content/drive. Outside Colab the `google.colab`
# package is not importable and this cell does nothing.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running in Colab; nothing to mount

if drive is not None:
    drive.mount('/content/drive')

## Data Inspection

**Description:** In this section, we inspect the dataset to better understand its content and composition.

In [3]:
# Import packages
import pandas as pd

# Location of the raw dataset (the path assumes Google Drive is mounted at /content/drive)
data_file_path = '/content/drive/MyDrive/human_v_machine_data/data.parquet'
# Read the Parquet file into a DataFrame
df = pd.read_parquet(data_file_path)
# Preview the first rows
df.head()

Unnamed: 0,text,source,prompt_id,text_length,word_count
0,"Federal law supersedes state law, and cannabis...",Bloom-7B,0,967,157
1,Miles feels restless after working all day. He...,Bloom-7B,0,5068,778
2,So first of I am danish. That means that I fol...,Bloom-7B,0,1602,267
3,In this paper we present a novel rule-based ap...,Bloom-7B,0,5469,848
4,"Most social progressives, love democracy, and ...",Bloom-7B,0,2379,380


In [4]:
# DataFrame.shape is a (rows, columns) pair; read each dimension by position
row_count = df.shape[0]
col_count = df.shape[1]
print(f"Number of rows: {row_count}")
print(f"Number of columns: {col_count}")

Number of rows: 788922
Number of columns: 5


In [5]:
# Inspect sources
names_of_sources = sorted(df.source.unique())
print("*** List of Sources ***\n")
# Plain loop: a list comprehension should not be used only for its side effects
for source in names_of_sources:
    print(source)

# Print number of machine sources.
# Exclude the non-LLM labels explicitly instead of subtracting a hard-coded 2,
# which would give a wrong count if either label were absent from the data.
non_llm_sources = {'Human', 'Unknown'}
num_of_llms = sum(source not in non_llm_sources for source in names_of_sources)
print(f"\nNumber of LLMs: {num_of_llms}")

*** List of Sources ***

Bloom-7B
Claude-Instant-v1
Claude-v1
Cohere-Command
Dolphin-2.5-Mixtral-8x7B
Dolphin-Mixtral-8x7B
Falcon-180B
Flan-T5-Base
Flan-T5-Large
Flan-T5-Small
Flan-T5-XL
Flan-T5-XXL
GLM-130B
GPT-3.5
GPT-4
GPT-J
GPT-NeoX
Gemini-Pro
Goliath-120B
Human
LLaMA-13B
LLaMA-2-70B
LLaMA-2-7B
LLaMA-30B
LLaMA-65B
LLaMA-7B
LZLV-70B
Mistral-7B
Mistral-7B-OpenOrca
Mixtral-8x7B
MythoMax-L2-13B
Neural-Chat-7B
Noromaid-20B
Nous-Capybara-34B
Nous-Capybara-7B
Nous-Hermes-LLaMA-2-13B
Nous-Hermes-LLaMA-2-70B
OPT-1.3B
OPT-125M
OPT-13B
OPT-2.7B
OPT-30B
OPT-350M
OPT-6.7B
OpenChat-3.5
OpenHermes-2-Mistral-7B
OpenHermes-2.5-Mistral-7B
PaLM-2
Psyfighter-13B
Psyfighter-2-13B
RWKV-5-World-3B
StripedHyena-Nous-7B
T0-11B
T0-3B
Text-Ada-001
Text-Babbage-001
Text-Curie-001
Text-Davinci-001
Text-Davinci-002
Text-Davinci-003
Toppy-M-7B
Unknown
YI-34B

Number of LLMs: 61


In [6]:
# Frequency table of sources, ordered from most to least frequent
source_counts = (
    df['source']
    .value_counts()
    .rename_axis('source')
    .reset_index(name='frequency')
)
print("Most and least frequent sources")
display(source_counts)

Most and least frequent sources


Unnamed: 0,source,frequency
0,Human,347692
1,GPT-3.5,52346
2,Text-Davinci-003,22860
3,Text-Davinci-002,21436
4,OPT-1.3B,18467
...,...,...
58,Toppy-M-7B,433
59,LLaMA-2-7B,409
60,Dolphin-Mixtral-8x7B,407
61,Cohere-Command,390


In [7]:
# Count the records whose source is "Unknown" by summing a boolean mask
is_unknown_source = df['source'] == 'Unknown'
unknown_source_rows = int(is_unknown_source.sum())
print(f"Number of rows with 'Unknown' source: {unknown_source_rows}")

Number of rows with 'Unknown' source: 6093


In [8]:
# Count the records whose text is missing by summing a boolean mask
has_missing_text = df['text'].isnull()
missing_text_rows = int(has_missing_text.sum())
print(f"Number of rows with missing text: {missing_text_rows}")

Number of rows with missing text: 0


In [9]:
# Count rows that are exact duplicates of an earlier row
is_duplicate_row = df.duplicated()
duplicate_rows = int(is_duplicate_row.sum())
print(f"Number of duplicate rows: {duplicate_rows}")

Number of duplicate rows: 0


In [10]:
# Summary statistics of the word_count column
word_counts = df['word_count']
average_word_count = word_counts.mean()
median_word_count = word_counts.median()
max_word_count = word_counts.max()
min_word_count = word_counts.min()

# Each statistic is reported as an integer (int() drops any fractional part)
word_count_summary = (
    ("Average", average_word_count),
    ("Median", median_word_count),
    ("Max", max_word_count),
    ("Min", min_word_count),
)
for stat_name, stat_value in word_count_summary:
    print(f"{stat_name} word count: {int(stat_value)}")

Average word count: 497
Median word count: 324
Max word count: 71543
Min word count: 25


In [11]:
# Count the records whose word_count is at or under 500.
# The 500-word cut-off is meant to stay under BERT's 512-token limit while
# leaving 12 slots for tokens such as [CLS] and [SEP] added during tokenization.
# NOTE(review): word count is only a proxy for token count; a subword tokenizer
# may emit more tokens than words, so confirm truncation is applied at tokenization.
is_within_limit = df['word_count'] <= 500
under_500 = int(is_within_limit.sum())
print(f"Number of records with word_count at or under 500: {under_500}")

Number of records with word_count at or under 500: 510783


## Data Preprocessing

**Description:** In this section, we ...

1. Exclude text from "Unknown" sources.
2. Filter the dataset to retain only samples whose text is no longer than 500 words. Word count is used here as a rough proxy for token count, to stay close to the 512-token context limit of BERT while allowing a buffer of 12 for special tokens.
3. Add a `labels` column to distinguish between human-written and machine-generated text. Human-written = 0, and machine-generated = 1.
4. Use the StyleDistance model to generate style embeddings of the text and store them in a `style_embedding` column.
5. Based on the inspection above, there are no duplicate rows or rows with missing text. Use stratified sampling to split the data into training, validation, and test sets, and save the results as Parquet files for subsequent use in model training. Stratified sampling ensures that each subset has a similar distribution of human-written and machine-generated rows as the original dataset.


### Excluding text from "Unknown" sources

In [12]:
# Keep only the rows whose source is something other than "Unknown"
has_known_source = df['source'].ne('Unknown')
df = df[has_known_source]
print(f"Number of rows after excluding 'Unknown' sources: {len(df)}")

Number of rows after excluding 'Unknown' sources: 782829


### Excluding text longer than 500 words

In [13]:
# Exclude text longer than 500 words.
# .copy() makes the result an independent DataFrame, so the columns added in
# later cells are not written into a filtered slice of the previous frame
# (which pandas may flag with SettingWithCopyWarning).
df = df[df.word_count <= 500].copy()
print(f"Number of rows after excluding text longer than 500 words: {len(df)}")

Number of rows after excluding text longer than 500 words: 504690


### Add a `labels` column to distinguish between human-written and machine-generated text

In [14]:
# Binary target: 0 = human-written ('Human' source), 1 = machine-generated (any other source).
# The column is built in one vectorized step, and `assign` returns a new DataFrame,
# so nothing is written into a filtered slice of an earlier frame (which pandas
# may flag with SettingWithCopyWarning).
df = df.assign(labels=(df['source'] != 'Human').astype('int64'))
print("Created 'labels' column and assigned values based on 'source'.")
print(df[['source', 'labels']].head())

Created 'labels' column and assigned values based on 'source'.
     source  labels
0  Bloom-7B       1
2  Bloom-7B       1
4  Bloom-7B       1
5  Bloom-7B       1
6  Bloom-7B       1


### Add style embeddings to dataset using StyleDistance



In [15]:
# Create the 'style_embedding' column up front as a placeholder (all None);
# it is overwritten with the real embeddings once they have been generated.
df['style_embedding'] = None
print("Added 'style_embedding' column and initialized with None.")

Added 'style_embedding' column and initialized with None.


In [16]:
from sentence_transformers import SentenceTransformer
import torch

# Choose the compute device: CUDA when a GPU is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the StyleDistance model through sentence-transformers
style_model = SentenceTransformer('StyleDistance/styledistance')
print("StyleDistance model loaded.")

# Place the model on the chosen device
style_model.to(device)
print(f"StyleDistance model moved to {device}.")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/249M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

StyleDistance model loaded.
StyleDistance model moved to cuda.


In [17]:
# Helper that turns raw texts into StyleDistance embeddings
def get_style_embeddings(texts):
    """Encode a batch of texts with the StyleDistance model.

    Args:
        texts: The texts to embed; passed unchanged to ``style_model.encode``.

    Returns:
        The embeddings of the batch as a NumPy array.
    """
    # Ask the encoder for a tensor, then copy it to the CPU and convert to NumPy
    return style_model.encode(texts, convert_to_tensor=True).cpu().numpy()

print("Function 'get_style_embeddings' defined.")

Function 'get_style_embeddings' defined.


In [18]:
import numpy as np
from tqdm import tqdm

# Generate StyleDistance embeddings
batch_size = 32  # Number of texts encoded per forward pass

# Let `encode` handle the batching itself instead of slicing the DataFrame by
# hand: one call shows its own progress bar and returns a single array with one
# row per input text, in input order.
style_embeddings = style_model.encode(
    df['text'].tolist(),
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
)

# One 1-D embedding per row of df, stored as a list so each cell holds a vector
style_embeddings_list = list(style_embeddings)
assert len(style_embeddings_list) == len(df), "expected exactly one embedding per row"

df['style_embedding'] = style_embeddings_list

print("Style embeddings generated and assigned to 'style_embedding' column.")
print(df[['text', 'style_embedding']].head())

Generating StyleDistance embeddings: 100%|██████████| 15772/15772 [48:07<00:00,  5.46it/s]


Style embeddings generated and assigned to 'style_embedding' column.
                                                text  \
0  Federal law supersedes state law, and cannabis...   
2  So first of I am danish. That means that I fol...   
4  Most social progressives, love democracy, and ...   
5  finally gets the fish up onto the ice and he s...   
6  BSkyB utilises the VideoGuard pay-TV scramblin...   

                                     style_embedding  
0  [-0.5177671, 0.22710235, 0.3778064, 0.20268035...  
2  [0.16673793, 0.0333912, 0.34050757, -0.2283511...  
4  [-0.23736966, 0.59904814, 0.6017204, -0.159238...  
5  [0.2704919, 0.18244076, 0.2994834, 0.2050366, ...  
6  [-0.6002549, 0.52633953, 0.3603375, 0.17297883...  


## Data Sampling

**Description:** Here, we split the preprocessed data into training, validation, and test datasets. We use stratified sampling so that each split keeps a similar proportion of human-written and machine-generated text as the full preprocessed dataset.

In [19]:
# Class balance of the `labels` column, as absolute counts and as percentages
label_counts = df['labels'].value_counts()
label_percentages = df['labels'].value_counts(normalize=True).mul(100)

for heading, distribution in (
    ("Current Class Distribution (Absolute Counts):", label_counts),
    ("\nCurrent Class Distribution (Percentages):", label_percentages),
):
    print(heading)
    print(distribution)


Current Class Distribution (Absolute Counts):
labels
1    310258
0    194432
Name: count, dtype: int64

Current Class Distribution (Percentages):
labels
1    61.474965
0    38.525035
Name: proportion, dtype: float64


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split

# One seed shared by both splits so the partition is reproducible
SEED = 42

# Split df into training (80%) and temporary (20%) sets, stratified by 'labels'
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['labels'], random_state=SEED)
print(f"Initial split: train_df shape: {train_df.shape}, temp_df shape: {temp_df.shape}")

# Split temp_df evenly into validation (10%) and test (10%) sets, stratified by 'labels'
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['labels'], random_state=SEED)
print(f"Second split: val_df shape: {val_df.shape}, test_df shape: {test_df.shape}")

# Sanity check: the three subsets together account for every row of df
assert len(train_df) + len(val_df) + len(test_df) == len(df), "split sizes do not add up to len(df)"

Initial split: train_df shape: (403752, 7), temp_df shape: (100938, 7)
Second split: val_df shape: (50469, 7), test_df shape: (50469, 7)


## Save preprocessed datasets

**Description:** We save the final training, test, and validation datasets.


In [21]:
import os

# Define the directory to save the Parquet files
output_dir = '/content/drive/MyDrive/human_v_machine_data'
os.makedirs(output_dir, exist_ok=True)

# Destination of each split
train_output_path = os.path.join(output_dir, 'train_data.parquet')
val_output_path = os.path.join(output_dir, 'val_data.parquet')
test_output_path = os.path.join(output_dir, 'test_data.parquet')

splits_to_save = [
    ('train_df', train_df, train_output_path),
    ('val_df', val_df, val_output_path),
    ('test_df', test_df, test_output_path),
]

# Save every split with the same logic. A failure is reported and remembered,
# and the remaining splits are still attempted (best effort).
failed_saves = []
for split_name, split_df, split_path in splits_to_save:
    try:
        split_df.to_parquet(split_path, index=False)
        print(f"{split_name} successfully saved to {split_path}")
    except Exception as e:
        print(f"Error saving {split_name}: {e}")
        failed_saves.append(split_name)

# Final confirmation.
# Success requires that no save raised in THIS run; checking only that the files
# exist would also pass on stale files left behind by an earlier run.
all_files_exist = all(os.path.exists(split_path) for _, _, split_path in splits_to_save)
if not failed_saves and all_files_exist:
    print("\nAll DataFrames (train_df, val_df, test_df) have been successfully saved.")
else:
    print("\nWarning: One or more DataFrames might not have been saved correctly.")
    if failed_saves:
        print(f"Failed: {', '.join(failed_saves)}")

train_df successfully saved to /content/drive/MyDrive/human_v_machine_data/train_data.parquet
val_df successfully saved to /content/drive/MyDrive/human_v_machine_data/val_data.parquet
test_df successfully saved to /content/drive/MyDrive/human_v_machine_data/test_data.parquet

All DataFrames (train_df, val_df, test_df) have been successfully saved.
