In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, accuracy_score, classification_report
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV


scaler = StandardScaler()
def read_json_to_dataframe(file_path):
    return pd.read_json(file_path, lines=True)

# read files
set1 = read_json_to_dataframe('../data/domain1_train.json')
set2 = read_json_to_dataframe('../data/domain2_train.json')
predict_x = read_json_to_dataframe('../data/test_set.json')

In [2]:
X1 = set2["text"].apply(lambda x: ' '.join(map(str, x)))
y1 = set2["label"]

# Split data into minority and majority classes, less human than machine in set2
minority_class = X1[y1 == 1]  
majority_class = X1[y1 == 0]  

# Determine the size of the minority class
minority_size = len(minority_class)

# Randomly select a subset of the majority class to match the size of the minority class
majority_sampled = majority_class.sample(n=minority_size, random_state=42)

# Combine the minority and majority sampled data
X_balanced = pd.concat([minority_class, majority_sampled])
y_balanced = pd.Series([1] * minority_size + [0] * minority_size)


In [3]:
# Create a function to calculate text entropy from tokenized sequences
def text_entropy(tokens):
    if len(tokens) == 0:
        return 0.0
    else:
        prob = [float(tokens.count(token)) / len(tokens) for token in set(tokens)]
        entropy = -sum([p * math.log(p) / math.log(2.0) for p in prob])
        return entropy

X = set1["text"].apply(lambda x: ' '.join(map(str, x)))  # Convert lists to space-separated strings
y = set1["label"]


# concate two set of data
X_2 = pd.concat([X_balanced, X])
y_2 = pd.concat([y_balanced, y])


In [4]:
# Extract unigram features
vectorizer = CountVectorizer(ngram_range=(1, 1))
X_bow = vectorizer.fit_transform(X_2)

# TF-IDF transformation
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_bow)

# Calculate text length and text entropy as features
text_length = np.array([len(text.split()) for text in X_2])

text_entropy = np.array([text_entropy(text.split()) for text in X_2])

X_combined = np.column_stack((text_length, text_entropy, X_tfidf.toarray()))
X_scaled = scaler.fit_transform(X_combined)

n_components = 1000  # Specify the number of components
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

In [None]:
# increase size of test as training takes too long, find the near best c first, then try to find better c value
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_2, test_size=0.5, random_state=42)
# Build an SVM model
svm_classifier = SVC()

# Define the parameter grid for GridSearch
f1_scores = []

for C_value in [0.1, 0.2, 0.3, 0.4]:
    for kernel_value in ['rbf']:
        svm_model = SVC(C=C_value, kernel=kernel_value)
        svm_model.fit(X_train, y_train)
        y_pred = svm_model.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        f1_scores.append((C_value, kernel_value, f1))
        print(C_value, kernel_value, f1)

# Choose the model with the highest F1-score
best_C, best_kernel, best_f1 = max(f1_scores, key=lambda x: x[2])

print("Best Model (based on highest F1-score):")
print("F1-score:", best_f1)

# Train the model with the best hyperparameters
# Evaluate the model
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


# show the final accuracy of the model on test data
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

0.1 rbf 0.832663422889782
0.2 rbf 0.8395237755756755
0.3 rbf 0.8436039404616381
0.4 rbf 0.8450561390800434
Best Model (based on highest F1-score):
F1-score: 0.8450561390800434
Accuracy: 0.8202521008403362
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.66      0.79      5938
           1       0.74      0.98      0.85      5962

    accuracy                           0.82     11900
   macro avg       0.86      0.82      0.82     11900
weighted avg       0.86      0.82      0.82     11900



In [5]:
# Split data into train and test sets, for tfidf features
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_2, test_size=0.5, random_state=42)

In [16]:
# Build an SVM model
svm_classifier = SVC()

# Define the parameter grid for GridSearch
f1_scores = []

for C_value in [0.1, 0.2, 0.3, 0.4]:
    for kernel_value in ['rbf']:
        svm_model = SVC(C=C_value, kernel=kernel_value)
        svm_model.fit(X_train, y_train)
        y_pred = svm_model.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        f1_scores.append((C_value, kernel_value, f1))
        print(C_value, kernel_value, f1)

# Choose the model with the highest F1-score
best_C, best_kernel, best_f1 = max(f1_scores, key=lambda x: x[2])

print("Best Model (based on highest F1-score):")
print("F1-score:", best_f1)



# Train the model with the best hyperparameters
# Evaluate the model
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


# show the final accuracy of the model on unseen data
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

0.1 rbf 0.8019068573524019
0.2 rbf 0.8323674608285039
0.3 rbf 0.8391768839176884
0.4 rbf 0.8428334654464221
Best Model (based on highest F1-score):
F1-score: 0.8428334654464221
Accuracy: 0.8167226890756303
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.65      0.78      5938
           1       0.74      0.98      0.84      5962

    accuracy                           0.82     11900
   macro avg       0.86      0.82      0.81     11900
weighted avg       0.85      0.82      0.81     11900



In [7]:
# Train the model with the best hyperparameters
# Evaluate the model

svm_model = SVC(C=0.4, kernel='rbf')
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# show the final accuracy of the model on unseen data
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8167226890756303
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.65      0.78      5938
           1       0.74      0.98      0.84      5962

    accuracy                           0.82     11900
   macro avg       0.86      0.82      0.81     11900
weighted avg       0.85      0.82      0.81     11900



In [8]:
# Build an SVM model
svm_classifier = SVC()

# Define the parameter grid for GridSearch
f1_scores = []

for C_value in [0.367,0.368,0.369]:
    for kernel_value in ['rbf']:
        svm_model = SVC(C=C_value, kernel=kernel_value)
        svm_model.fit(X_train, y_train)
        y_pred = svm_model.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        f1_scores.append((C_value, kernel_value, f1))
        print(C_value, kernel_value, f1)

# Choose the model with the highest F1-score
best_C, best_kernel, best_f1 = max(f1_scores, key=lambda x: x[2])

print("Best Model (based on highest F1-score):")
print("C:", best_C)
print("Kernel:", best_kernel)
print("F1-score:", best_f1)

0.367 rbf 0.8410986482599943
0.368 rbf 0.8410986482599943
0.369 rbf 0.8410986482599943
Best Model (based on highest F1-score):
C: 0.367
Kernel: rbf
F1-score: 0.8410986482599943


In [10]:
# Build an SVM model
svm_classifier = SVC()

# Define the parameter grid for GridSearch
f1_scores = []

for C_value in [0.37,0.375,0.38]:
    for kernel_value in ['rbf']:
        svm_model = SVC(C=C_value, kernel=kernel_value)
        svm_model.fit(X_train, y_train)
        y_pred = svm_model.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        f1_scores.append((C_value, kernel_value, f1))
        print(C_value, kernel_value, f1)

# Choose the model with the highest F1-score
best_C, best_kernel, best_f1 = max(f1_scores, key=lambda x: x[2])

print("Best Model (based on highest F1-score):")
print("C:", best_C)
print("Kernel:", best_kernel)
print("F1-score:", best_f1)

0.37 rbf 0.8410381767201092
0.375 rbf 0.8413406214039125
0.38 rbf 0.8413783181066109
Best Model (based on highest F1-score):
C: 0.38
Kernel: rbf
F1-score: 0.8413783181066109


In [9]:
# Train the model with the best hyperparameters
# Evaluate the model
svm_model = SVC(C=0.38, kernel='rbf')
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


# show the final accuracy of the model on test data
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8147058823529412
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.65      0.78      5938
           1       0.74      0.98      0.84      5962

    accuracy                           0.81     11900
   macro avg       0.85      0.81      0.81     11900
weighted avg       0.85      0.81      0.81     11900



In [11]:
# Train the final svm model using best c value of 0.38
best_svm_classifier = SVC(C=0.38, kernel="rbf")
best_svm_classifier.fit(X_scaled, y_2)

In [13]:
def text_entropy(tokens):
    if len(tokens) == 0:
        return 0.0
    else:
        prob = [float(tokens.count(token)) / len(tokens) for token in set(tokens)]
        entropy = -sum([p * math.log(p) / math.log(2.0) for p in prob])
        return entropy
x_predict_transformed = predict_x['text'].apply(lambda x: ' '.join(map(str, x)))
x_predict_transformed = pd.concat([x_predict_transformed])
x_predict_transformed
# Extract unigram features
X_predict_bow = vectorizer.transform(x_predict_transformed)

# TF-IDF transformation
X_predict_tfidf = tfidf_transformer.transform(X_predict_bow)

# Calculate text length and text entropy as features
text_length = np.array([len(text.split()) for text in x_predict_transformed])

text_entropy = np.array([text_entropy(text.split()) for text in x_predict_transformed])

X_combined = np.column_stack((text_length, text_entropy, X_predict_tfidf.toarray()))
X_scaled_test = scaler.fit_transform(X_combined)

In [14]:
prediction = best_svm_classifier.predict(X_scaled_test)

In [15]:
pred = {'id': predict_x['id'], 
        'class': prediction} 

# Create a new DataFrame, save the predictions
pred_df = pd.DataFrame(pred)
pred_df.to_csv('svm_test_0.38.csv', index=False)

In [16]:
# inspect results of predictions
pred_df["class"].value_counts()

1    653
0    347
Name: class, dtype: int64