In [2]:
import numpy as np
import json
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Construct the Combined Dataset

In [3]:
# file paths
file_path_1 = 'data/domain1_train.json'
file_path_2 = 'data/domain2_train.json'

# create pandas dataframes
df1 = pd.read_json(file_path_1, lines=True)
df2 = pd.read_json(file_path_2, lines=True)
df2_cut = df2[['label','text']]
df_comb = pd.concat([df1, df2_cut],axis=0,ignore_index=True)

# info of dataset
print(df1.shape)
print(df2.shape)
machine_models = df2.groupby(df2['model']).count()
machine_generated_text = machine_models['label'].sum()
display(machine_models)
print(f"There are {df2.shape[0] - machine_generated_text} human generated text and {machine_generated_text} machine generated text in domain 2.  ")


(19500, 2)
(14900, 3)


Unnamed: 0_level_0,text,label
model,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,2364,2364
1.0,2357,2357
2.0,2339,2339
3.0,2358,2358
4.0,789,789
5.0,780,780
6.0,1763,1763


There are 2150 human generated text and 12750 machine generated text in domain 2.  


# Read the Combined Data to list

In [4]:
# Numeric sequences and corresponding labels
X = df_comb['text'].to_list()
X= [" ".join(map(str, x)) for x in X]
y =  df_comb['label'].to_list() # Corresponding labels (1 for human, 0 for machine)


# Tf-Idf Preprocessing

In [5]:
# define a function for tfidf for the convenience of changing: 
# input text (in training set, validation and test); max features needed; and ngrams
def tfidf(cleanedText_train, cleanedText_test, maxFeatures = None, ngram = 1):
    '''
    This function take string form cleaned Text and process it to Tf-Idf sparse matrix
    '''
    vectorizer = TfidfVectorizer(max_features = maxFeatures,ngram_range = (ngram,ngram))
    trained_vec = vectorizer.fit(cleanedText_train)
    tfidf = trained_vec.transform(cleanedText_test)
    # check the names of terms
    #display(vectorizer.get_feature_names_out())
    return tfidf 

# Convert words to tfidf matrix
tfidf_vectors = tfidf(cleanedText_train = X, cleanedText_test = X, maxFeatures = None, ngram = 1)

# Train test Split

In [6]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_vectors, y, test_size=0.2, random_state=42)

# Logistic Regression

## Train Model

In [27]:
model_lgr = LogisticRegression(class_weight='balanced')
model_lgr.fit(X_train, y_train)

## Test Model

In [28]:
# Make predictions on the test data
predictions = model_lgr.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.8377906976744186


# SVM 

## Train Model

In [29]:
model_svm = SVC(class_weight='balanced')  
model_svm.fit(X_train, y_train)

## Test Model

In [30]:
svm_predictions = model_svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
print("SVM Accuracy:", svm_accuracy)

SVM Accuracy: 0.8655523255813954


# Neural Network: MLP

## Train Model

In [8]:
model_mlp = MLPClassifier()
model_mlp.fit(X_train, y_train)

In [9]:
mlp_predictions = model_mlp.predict(X_test)
mlp_accuracy = accuracy_score(y_test, mlp_predictions)
print("MLP Accuracy:", mlp_accuracy)

MLP Accuracy: 0.8143895348837209


# Send to Kaggle csv

## Read the Test data

In [31]:
file_path_test = 'data/test_set.json'
df_test = pd.read_json(file_path_test, lines=True)

# to convert dataframe to list
X_Kaggle = df_test['text'].to_list()
X_Kaggle = [" ".join(map(str, x)) for x in X_Kaggle]

## Preprocessing Tfidf

In [32]:
tfidf_vectors_Kaggle = tfidf(X, X_Kaggle)

## Predict using Trained LGR

In [33]:
predictions_lgr = model_lgr.predict(tfidf_vectors_Kaggle)

# # Add predictions to the test DataFrame
df_test['class'] = predictions_lgr

# # Select only the columns you want to include in the CSV
selected_columns = ['id', 'class']  # Include other columns as needed

# # Save the selected columns to a CSV file
df_test[selected_columns].to_csv('tfidf_lgr.csv', index=False)

## Predict using Trained SVM

In [34]:
predictions_svm = model_svm.predict(tfidf_vectors_Kaggle)

# # Add predictions to the test DataFrame
df_test['class'] = predictions_svm

# # Select only the columns you want to include in the CSV
selected_columns = ['id', 'class']  # Include other columns as needed

# # Save the selected columns to a CSV file
df_test[selected_columns].to_csv('tfidf_svm.csv', index=False)


## Predict using Trained MLP

In [40]:
predictions_mlp = model_mlp.predict(tfidf_vectors_Kaggle)

# # Add predictions to the test DataFrame
df_test['class'] = predictions_mlp

# # Select only the columns you want to include in the CSV
selected_columns = ['id', 'class']  # Include other columns as needed

# # Save the selected columns to a CSV file
df_test[selected_columns].to_csv('tfidf_mlp.csv', index=False)