In [3]:
import gzip
import json
import os
from functools import lru_cache

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Construct the Combined Dataset

In [5]:
# Input files: one JSON-lines training file per domain
file_path_1 = 'data/domain1_train.json'
file_path_2 = 'data/domain2_train.json'

# Load each domain into its own dataframe
df1, df2 = (pd.read_json(path, lines=True) for path in (file_path_1, file_path_2))

# df2 also has a 'model' column (summarised below); keep only 'label' and
# 'text' from it before stacking the two domains into one frame.
df2_cut = df2.loc[:, ['label', 'text']]
df_comb = pd.concat([df1, df2_cut], axis=0, ignore_index=True)

# Dataset overview: shape of each domain
print(df1.shape, df2.shape, sep="\n")

# Per-model row counts for domain 2. count() tallies non-null cells, and
# groupby drops rows whose 'model' is missing, so the 'label' total is the
# number of rows that have a model id.
machine_models = df2.groupby(df2['model']).count()
machine_generated_text = machine_models['label'].sum()
display(machine_models)

# Rows without a model id are reported as human-written by the message below.
human_generated_text = df2.shape[0] - machine_generated_text
print(f"There are {human_generated_text} human generated text and {machine_generated_text} machine generated text in domain 2.  ")


(19500, 2)
(14900, 3)


Unnamed: 0_level_0,text,label
model,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,2364,2364
1.0,2357,2357
2.0,2339,2339
3.0,2358,2358
4.0,789,789
5.0,780,780
6.0,1763,1763


There are 2150 human generated text and 12750 machine generated text in domain 2.  


# Convert the Combined Data to Lists

In [6]:
# Each 'text' entry is a sequence; render it as a single space-separated
# string so it can later be encoded and compressed.
X = [" ".join(str(token) for token in sequence) for sequence in df_comb['text'].to_list()]

# Labels aligned index-for-index with X.
# Original author's note: 1 for human, 0 for machine (not verifiable from this notebook alone).
y = df_comb['label'].to_list()

# Train test Split

In [7]:
# Hold out 20% of the combined data for evaluation. The seed is fixed so the
# split is reproducible; the cached NCD matrices loaded further down are
# presumably tied to this exact split, so changing either value means
# recomputing them.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Compression Pre-processing

In [14]:
# NCD with compressed lengths
@lru_cache(maxsize=None)
def _compressed_len(text):
    """Return the gzip-compressed size, in bytes, of ``text`` (UTF-8 encoded).

    Memoised: when building a pairwise matrix every string takes part in many
    pairs, so its individual compressed length only needs computing once.
    """
    return len(gzip.compress(text.encode()))


def ncd(x, x2):
    """Normalized Compression Distance between two strings.

    Computes ``(C(x + " " + x2) - min(C(x), C(x2))) / max(C(x), C(x2))`` where
    ``C`` is the gzip-compressed length in bytes.

    Parameters
    ----------
    x, x2 : str
        The two texts to compare.

    Returns
    -------
    float
        Smaller values mean the texts share more compressible structure.
        The value is not guaranteed to be symmetric in ``x`` and ``x2``
        because the concatenation order matters to the compressor.
    """
    x_compressed = _compressed_len(x)
    x2_compressed = _compressed_len(x2)
    # The concatenation is specific to this pair, so caching it would only cost memory.
    xx2 = len(gzip.compress(" ".join([x, x2]).encode()))
    return (xx2 - min(x_compressed, x2_compressed)) / max(x_compressed, x2_compressed)

# train_ncd = [[ncd(X_train[i], X_train[j]) for j in range(len(X_train))] for i in range(len(X_train))]


## save train_ncd

In [7]:
# np.save('compression_train_ncd.npy', train_ncd)

# Load Train

In [8]:
# Load the cached train-vs-train NCD matrix instead of recomputing it.
train_ncd = np.load('compression_train_ncd.npy')

# Fail fast if the cache does not match the current split: a stale file would
# otherwise surface later as an opaque scikit-learn shape error.
assert train_ncd.shape == (len(X_train), len(X_train)), (
    f"compression_train_ncd.npy has shape {train_ncd.shape}, expected "
    f"({len(X_train)}, {len(X_train)}); recompute it for the current split"
)

# Compute and Save Test

In [34]:
# test_ncd = [[ncd(X_test[i], X_train[j]) for j in range(len(X_train))] for i in range(len(X_test))]

In [35]:
# np.save('compression_test_ncd.npy', test_ncd)

# Load Test

In [9]:
# Load the cached test-vs-train NCD matrix instead of recomputing it.
test_ncd = np.load('compression_test_ncd.npy')

# Fail fast if the cache does not match the current split: one row is expected
# per held-out text and one column per training text.
assert test_ncd.shape == (len(X_test), len(X_train)), (
    f"compression_test_ncd.npy has shape {test_ncd.shape}, expected "
    f"({len(X_test)}, {len(X_train)}); recompute it for the current split"
)

# KNN Model Fit

In [10]:
# 7-nearest-neighbour classifier fitted on the rows of the train NCD matrix.
# NOTE(review): with the default metric, each NCD row is treated as a feature
# vector and neighbours are found by Euclidean distance between rows. If the
# NCD values themselves were meant to be the distances, metric='precomputed'
# would be required. Confirm intent before changing, since it alters results.
model_knn = KNeighborsClassifier(n_neighbors=7).fit(train_ncd, y_train)
model_knn  # echo the fitted estimator, as the bare .fit() call did

In [11]:
# Held-out accuracy: share of test rows whose predicted label equals y_test.
print("Accuracy:", accuracy_score(y_test, model_knn.predict(test_ncd)))

Accuracy: 0.8066860465116279


# Write the Kaggle Submission CSV

## Read the Test data

In [12]:
# Kaggle test set, stored as JSON lines
file_path_test = 'data/test_set.json'
df_test = pd.read_json(file_path_test, lines=True)

# Render each 'text' sequence as one space-separated string, the same
# transformation applied to the training texts.
X_Kaggle = [" ".join(str(token) for token in sequence) for sequence in df_test['text'].to_list()]

## Compression Preprocessing

In [15]:
# NCD of every Kaggle text against every training text:
# one row per Kaggle text, one column per training text (in X_train order).
test_ncd_Kaggle = [
    [ncd(kaggle_text, train_text) for train_text in X_train]
    for kaggle_text in X_Kaggle
]

## Predict with the Fitted KNN (KNN has no real training step; `fit` essentially just stores the training data)

In [16]:
# Predict a label for every Kaggle text from its row of NCDs to the training texts
predictions_knn = model_knn.predict(test_ncd_Kaggle)

# Add predictions to the test DataFrame
df_test['class'] = predictions_knn

# Select only the columns you want to include in the CSV
selected_columns = ['id', 'class']  # Include other columns as needed

# Save the selected columns to a CSV file. The output directory is created
# first because to_csv raises if it does not exist.
output_path = 'prediction/compression_knn.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_test[selected_columns].to_csv(output_path, index=False)