In [45]:
import json
import pandas as pd
from collections import Counter
import seaborn as sns
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.model_selection import cross_val_score
from scipy.sparse import csr_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification

## Read Data

In [113]:
dom1 = []
dom2 = []
Test_set = []
with open('../data/domain1_train.json', 'r') as file:
    for line in file:
        dom1.append(json.loads(line))
        
with open('../data/domain2_train.json', 'r') as file:
    for line in file:
        dom2.append(json.loads(line))
        
sam = pd.read_csv("../data/sample.csv")

with open('../data/test_set.json', 'r') as file:
    for line in file:
        Test_set.append(json.loads(line))
        
dom1 = pd.DataFrame.from_dict(dom1)
dom2 = pd.DataFrame.from_dict(dom2)
dom2 = dom2[dom2['text'].apply(len) > 0]
Test_set = pd.DataFrame.from_dict(Test_set)

# human as model 8, dom1 AI as model 7
dom1['model'] = np.where(dom1['label'] == 1, 8, 7)
dom2["model"] = dom2["model"].replace(np.nan, 8)

## Resample 

#### 1. full dom1 + even dom2

In [215]:
dom2_0 = dom2.loc[dom2['label']==0]
dom2_1 = dom2.loc[dom2['label']==1]

length_dom2_1 = len(dom2_1)
mol_num = len(dom2['model'].value_counts())
least_mol_num = length_dom2_1/(mol_num-1)
sam_dom2_0 = dom2_0.groupby('model').apply(lambda x: x.sample(n=int(least_mol_num)))
sam_dom2_0.reset_index(drop=True, inplace=True)
dom2_even = result = pd.concat([dom2_1, sam_dom2_0])

df = pd.concat([dom1, dom2_even])
df = df.sample(frac=1).reset_index(drop=True)

In [216]:
df['model'].value_counts()

model
8.0    11899
7.0     9750
1.0      307
2.0      307
3.0      307
4.0      307
6.0      307
5.0      307
0.0      307
Name: count, dtype: int64

#### 2. model1-7 same langth, even label

In [91]:
# df_all = pd.concat([dom1, dom2])
# df_all0 = df_all.loc[df_all['label']==0]
# df_all1 = df_all.loc[df_all['label']==1]
# mol_num = len(df_all0['model'].value_counts())
# least_mol_num = df_all0['model'].value_counts().tolist()[-1]
# sam_df_all0 = df_all0.groupby('model').apply(lambda x: x.sample(n=int(least_mol_num)))
# sam_df_all1 = df_all1.sample(n=least_mol_num*mol_num, random_state=42)
# df = pd.concat([sam_df_all0, sam_df_all1])
# df = df.sample(frac=1).reset_index(drop=True)

In [92]:
# df['model'].value_counts()

model
8.0    6240
4.0     780
7.0     780
5.0     780
0.0     780
2.0     780
6.0     780
3.0     780
1.0     780
Name: count, dtype: int64

#### 3. full data

In [144]:
# add new feature first 
dom1['text_length'] = dom1['text'].apply(lambda x: len(x))
dom2['text_length'] = dom2['text'].apply(lambda x: len(x))

minimax_scale = MinMaxScaler()
dom1['text_length'] = minimax_scale.fit_transform(dom1[['text_length']])
dom2['text_length'] = minimax_scale.fit_transform(dom2[['text_length']])

In [155]:
df_all = []
df_merge = pd.concat([dom1,dom2])
AI_num = 8
df_all0 = df_merge.loc[df_merge['label']==0]
df_all1= df_merge.loc[df_merge['label']==1]
least_mod_num = df_all0['model'].value_counts().tolist()[-1]
for _ in range(5):
    df_all0_even = df_all0.groupby('model').apply(lambda x: x.sample(n=int(least_mod_num)))
    df_all0_even.reset_index(drop=True, inplace=True)

    df_all1_equal = df_all1.groupby('model').apply(lambda x: x.sample(n=int(least_mod_num*AI_num)))
    df_merge = pd.concat([df_all0_even,df_all1_equal])
    df_merge = df_merge.sample(frac=1).reset_index(drop=True)
    df_all.append(df_merge)

In [136]:
# selected_dfs1 = []
# num_iterations = 5
# AI_num = 8
# rows_per_iteration = int(least_mod_num*AI_num)
# total_rows = num_iterations * rows_per_iteration
# # Create a list to keep track of which rows have been selected
# selected_indices = []
# ind = []
# for _ in range(num_iterations):
#     # If all rows have been selected, randomly choose to keep rows
#     if len(selected_indices) >= len(dom1):
#         random_indices = np.random.choice(len(df_all1), rows_per_iteration, replace=False)
#     else:
#         # Select the next rows (if available)
#         start_idx = 0 if len(selected_indices) == 0 else selected_indices[-1] + 1
#         end_idx = start_idx + rows_per_iteration
#         random_indices = np.arange(start_idx, min(end_idx, len(df_all1)))
#     # Append the selected rows to the list
#     selected_indices.extend(random_indices)
#     # Create a DataFrame with the selected rows
#     selected_df = df_all1.iloc[random_indices]
#     # Append the seected DataFrame to the list
#     selected_dfs1.append(selected_df)
#     ind.append(selected_df.index)
# # for i, selected_df in enumerate(selected_dfs):
# #     print(f"DataFrame {i+1}:")
# #     print(selected_df)
# add = selected_dfs1[0].sample(n=len(selected_dfs1[0]) - len(selected_dfs1[1]), random_state=42)
# selected_dfs1[1] = pd.concat([selected_dfs1[1], add])

In [138]:
# selected_dfs0 = []
# rows_per_label = least_mod_num  
# num_iterations = 5 
# for _ in range(num_iterations):
#     # Create an empty DataFrame with the same columns as df0_train
#     selected_df = pd.DataFrame(columns=df_all0.columns)
#     # Iterate through each unique label
#     for label in df_all0['model'].unique():
#         # Get rows with the current label
#         label_rows = df_all0[df_all0['model'] == label]
#         # If the label has fewer than 30 rows, randomly select rows to reach 30
#         if len(label_rows) < rows_per_label:
#             selected_rows = label_rows.sample(n=rows_per_label, replace=True)
#         else:
#             selected_rows = label_rows.sample(n=rows_per_label, replace=False)
#         # Concatenate the selected rows to the selected_df
#         selected_df = pd.concat([selected_df, selected_rows])
#     # Append the selected DataFrame to the list
#     selected_dfs0.append(selected_df)
# # Print the selected DataFrames for each iteration
# # for i, selected_df in enumerate(selected_dfs0):
# #     print(f"DataFrame {i+1}:")
# #     print(selected_df)

In [165]:
df_all[0]['model'].value_counts()

model
8.0    6240
2.0     780
1.0     780
6.0     780
4.0     780
3.0     780
7.0     780
5.0     780
0.0     780
Name: count, dtype: int64

## Vectorizer

#### 1. CountV

In [None]:
# vectorizer = CountVectorizer()
# df_text_vec = vectorizer.fit_transform([' '.join(str(word) for word in sentence) for sentence in df['text']]).toarray()
# dom2_text_vec = vectorizer.fit_transform([' '.join(str(word) for word in sentence) for sentence in dom2_even['text']]).toarray()
# Test_text_vec = vectorizer.fit_transform([' '.join(str(word) for word in sentence) for sentence in Test_set['text']]).toarray()

#### 2. Sparse

In [217]:
def to_sparse_matrix(sequences, vocab_size):
    indptr = [0]
    indices = []
    data = []
    for seq in sequences:
        feature_counter = {}
        for index in seq:
            if index != 0:  # Skip 0s, other words
                if index not in feature_counter:
                    feature_counter[index] = 1
                else:
                    feature_counter[index] += 1
        indices.extend(feature_counter.keys())
        data.extend(feature_counter.values())
        indptr.append(len(indices))
    return csr_matrix((data, indices, indptr), dtype=int, shape=(len(sequences), vocab_size + 1))
vocab_size = 4999

Test_text_vec = to_sparse_matrix(Test_set['text'], vocab_size)

In [218]:
df_text_vec = to_sparse_matrix(df['text'], vocab_size)

#### for df_all

In [199]:
df_text_vec_all = []
y_df_all = []
for i in range(len(df_all)):
    df_text_vec_all.append(to_sparse_matrix(df_all[i]['text'], vocab_size))
    y_df_all.append(df_all[i]['label'])

## Add New Features

In [219]:
df['text_length'] = df['text'].apply(lambda x: len(x))
Test_set['text_length'] = Test_set['text'].apply(lambda x: len(x))

minimax_scale = MinMaxScaler()
df['text_length'] = minimax_scale.fit_transform(df[['text_length']])
Test_set['text_length'] = minimax_scale.fit_transform(Test_set[['text_length']])

## Aggregation

#### 1. for CountVec

In [73]:
# length_df = pd.DataFrame(df['text_length'])
# X_df = np.concatenate([length_df, df_text_vec], axis=1)
# y_df = df['label']
# dom2_length_df = pd.DataFrame(dom2_even['text_length'])
# X_dom2 = np.concatenate([dom2_length_df, dom2_text_vec], axis=1)
# y_dom2 = dom2['label']
# Test_length_df = pd.DataFrame(Test_set['text_length'])
# X_Test = np.concatenate([Test_length_df, Test_text_vec], axis=1)

#### 2. for sparse

In [220]:
length_df = pd.DataFrame(df['text_length'])
Test_length_df = pd.DataFrame(Test_set['text_length'])
X_df = hstack([length_df, df_text_vec], format='csr')
X_Test = hstack([Test_length_df, Test_text_vec], format='csr')
# concatenated_df = pd.DataFrame(concatenated_data.toarray())
y_df = df['label']

#### for sparse df_all

In [200]:
length_df = []
X_df_all = []
for i in range(len(df_all)):
    length_df.append(pd.DataFrame(df_all[i]['text_length']))
    X_df_all.append(hstack([length_df[i], df_text_vec_all[i]], format='csr'))

## Feature Selection

#### 1. Kbest for Logistic

In [249]:
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)
k_values = []
accuracies = []

for k in range(100, (X_train.shape[1] + 1),100):
    # Apply SelectKBest class to extract top k best features
    best_features = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = best_features.fit_transform(X_train, y_train)
    X_test_kbest = best_features.transform(X_test)
    # train classifier
    classifier = SVC(C=10, gamma=0.001)
    classifier.fit(X_train_kbest, y_train)
    # Predict
    y_pred = classifier.predict(X_test_kbest)
    acc = f1_score(y_test, y_pred)
    k_values.append(k)
    accuracies.append(acc)
    
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 6))
plt.plot(k_values, accuracies, marker='o', linestyle='-')
plt.xlabel('Number of Features (k)')
plt.ylabel('Accuracy')
plt.title('k vs Accuracy')
plt.grid(True)
plt.show()

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


In [241]:
selector = SelectKBest(chi2, k=5001)
X_new = selector.fit_transform(X_df, y_df)

X_Test_new = selector.transform(X_Test)

## Train

#### split

In [242]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y_df, test_size=0.2, random_state=42)

#### split for data all

In [201]:
X_train_list = []
X_test_list = []
y_train_list = []
y_test_list = []

# Loop through each dataset and perform train-test splitting
for i in range(len(df_all)):
    X_train, X_test, y_train, y_test = train_test_split(
        X_df_all[i], y_df_all[i], test_size=0.2, random_state=42
    )
    X_train_list.append(X_train)
    X_test_list.append(X_test)
    y_train_list.append(y_train)
    y_test_list.append(y_test)

## Model

In [229]:
model_svm = svm.SVC(C=10, gamma=0.001).fit(X_train, y_train)
# Evaluation
# y_pred_train = model_svm.predict(X_train)
# print("Train Accuracy:", accuracy_score(y_train, y_pred_train))
# y_pred_train = model_svm.predict(X_train)
# print("Train f1:", f1_score(y_train, y_pred_train))
y_pred_test = model_svm.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
y_pred_test = model_svm.predict(X_test)
print("Test f1:", f1_score(y_test, y_pred_test))

Test Accuracy: 0.8737394957983193
Test f1: 0.8765153071707419


In [102]:
X_new

<12480x5001 sparse matrix of type '<class 'numpy.float64'>'
	with 561339 stored elements in Compressed Sparse Row format>

In [235]:
test_pre = model_svm.predict(X_Test_new)
result = pd.DataFrame({
    'class': test_pre,
})
result['id'] = result.index
result = result[['id'] + [col for col in result.columns if col != 'id']]
result['class'].value_counts()

class
0    512
1    488
Name: count, dtype: int64

In [236]:
result.to_csv('result_svm.csv', index=False)

#### Logistic

In [243]:
model_log = LogisticRegression(solver='liblinear',multi_class='auto', penalty="l2", C=1, max_iter=100).fit(X_train, y_train)
# Evaluation
# y_pred_train = model_log.predict(X_train)
# print("Train Accuracy:", accuracy_score(y_train, y_pred_train))
# y_pred_train = model_log.predict(X_train)
# print("Train f1:", f1_score(y_train, y_pred_train))
y_pred_test = model_log.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
y_pred_test = model_log.predict(X_test)
print("Test f1:", f1_score(y_test, y_pred_test))

Test Accuracy: 0.8449579831932773
Test f1: 0.8482106129164952


In [244]:
test_pre = model_log.predict(X_Test_new)
result = pd.DataFrame({
    'class': test_pre,
})
result['id'] = result.index
result = result[['id'] + [col for col in result.columns if col != 'id']]
result['class'].value_counts()

class
1    550
0    450
Name: count, dtype: int64

In [245]:
result.to_csv('result_logistic.csv', index=False)

#### MLPC

In [246]:
model_mplc = MLPClassifier(activation= 'relu',solver='adam', alpha=0.0001, hidden_layer_sizes=(20,), random_state=1,
                      learning_rate='adaptive').fit(X_train, y_train)
# Evaluation
# y_pred_train = model_mplc.predict(X_train)
# print("Train Accuracy:", accuracy_score(y_train, y_pred_train))
# y_pred_train = model_mplc.predict(X_train)
# print("Train f1:", f1_score(y_train, y_pred_train))
y_pred_test = model_mplc.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
y_pred_test = model_mplc.predict(X_test)
print("Test f1:", f1_score(y_test, y_pred_test))

Test Accuracy: 0.8418067226890756
Test f1: 0.839479855041569


In [247]:
test_pre = model_mplc.predict(X_Test_new)
result = pd.DataFrame({
    'class': test_pre,
})
result['id'] = result.index
result = result[['id'] + [col for col in result.columns if col != 'id']]
result['class'].value_counts()

class
1    523
0    477
Name: count, dtype: int64

In [248]:
result.to_csv('result_mlpc.csv', index=False)

#### stack by data all

In [203]:
estimators = {
        ('gnb', GaussianNB(var_smoothing = 1.873817422860383e-06).fit(X_train_list[0].toarray(), y_train_list[0])),
       
        ('rf',RandomForestClassifier(n_estimators=800, criterion='entropy', min_samples_split=5, min_samples_leaf=1,
                                  max_features='sqrt',max_depth= 100,bootstrap= False).fit(X_train_list[1], y_train_list[1])),
        
        ('svm', svm.SVC(C=20, gamma=0.0001).fit(X_train_list[2], y_train_list[2])),
        
        #('knn', KNeighborsClassifier(n_neighbors = 9, metric = 'minkowski', p = 2).fit(X_train_list[3], y_train_list[3])),
        
        ('mlpc', MLPClassifier(activation= 'relu',solver='adam', alpha=0.0001, hidden_layer_sizes=(20,), random_state=1,
                      learning_rate='adaptive').fit(X_train_list[3], y_train_list[3]))
    }
stack_para = StackingClassifier(
        estimators=estimators, final_estimator=LogisticRegression(solver='liblinear',multi_class='auto', penalty="l2", C=0.5, max_iter=100))
stack = stack_para.fit(X_train_list[4].toarray(), y_train_list[4])

In [204]:
# Evaluation
y_pred_train = stack.predict(X_train.toarray())
print("Train Accuracy:", accuracy_score(y_train, y_pred_train))
y_pred_train = stack.predict(X_train.toarray())
print("Train f1:", f1_score(y_train, y_pred_train))
y_pred_test = stack.predict(X_test.toarray())
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
y_pred_test = stack.predict(X_test.toarray())
print("Test f1:", f1_score(y_test, y_pred_test))

Train Accuracy: 0.9997996794871795
Train f1: 0.9997999199679871
Test Accuracy: 0.8918269230769231
Test f1: 0.8831168831168831


In [213]:
test_pre = stack.predict(X_Test.toarray())
result = pd.DataFrame({
    'class': test_pre,
})
result['id'] = result.index
result = result[['id'] + [col for col in result.columns if col != 'id']]
result['class'].value_counts()

class
0    698
1    302
Name: count, dtype: int64

In [214]:
result.to_csv('result_stack.csv', index=False)