In [2]:
# read the data
# 0:machine 1:human
import json
import random
from collections import Counter
def load_json_lines(path):
    """Read a JSON-lines file and return its records as a list.

    Each non-blank line of ``path`` is parsed with ``json.loads``. A line that
    fails to parse is reported on stdout and skipped, so one malformed record
    does not abort the whole load.

    Parameters
    ----------
    path : str
        Location of the JSON-lines file.

    Returns
    -------
    list
        The parsed records, in file order.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as json_file:
        for line in json_file:
            if not line.strip():
                # A blank line (e.g. a trailing newline) carries no record.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON: {e}")
    return records


# Both training files are read the same way, so they share one loader.
domain1 = load_json_lines('../data/domain1_train.json')
domain2 = load_json_lines('../data/domain2_train.json')

# This is the second LightGBM approach: first train the model on domain 1, then continue training (fine-tune) it on domain 2. It did not give results as good as expected.

In [3]:
# Partition the domain 2 records by their 'label' value (0 or 1).
data_label_0, data_label_1 = [], []
for item in domain2:
    if item['label'] == 0:
        data_label_0.append(item)
    elif item['label'] == 1:
        data_label_1.append(item)

# Draw a fixed-size, seeded sample of label-0 records and append it to all
# label-1 records.
# NOTE(review): final_domain_2 is not referenced by the later cells shown in
# this notebook — confirm whether this cell is still needed.
random.seed(42)
chosen_domain_2 = random.sample(data_label_0, 2150)
final_domain_2 = data_label_1 + chosen_domain_2

In [4]:
# Stack both domains (domain 1 records first) and pull the texts and labels
# out into two parallel lists.
all_domain = domain1 + domain2
old_text = [record['text'] for record in all_domain]
labels = [record['label'] for record in all_domain]

In [5]:
# Build the vocabulary: every distinct value found in any training text,
# sorted so that column j of each BoW vector always corresponds to vocabulary[j].
data = old_text
unique_integers = set()
for integer_list in data:
    unique_integers.update(integer_list)
vocabulary = sorted(unique_integers)

# Count each document's entries once with Counter instead of calling
# list.count() for every vocabulary word, which rescans the whole document
# len(vocabulary) times. Counter returns 0 for absent keys, so the vectors
# are identical to the ones list.count() produced.
bow_data = []
for integer_list in data:
    token_counts = Counter(integer_list)
    bow_data.append([token_counts[word] for word in vocabulary])

In [8]:
# all_domain was built as domain1 + domain2, so the first len(domain1) rows
# belong to domain 1 and the remainder to domain 2. Deriving the boundary from
# the data (instead of the literal 19500) avoids a silent mis-split if the
# number of loaded domain 1 records ever differs.
n_domain1 = len(domain1)
bow_data_1 = bow_data[:n_domain1]  # domain 1 bow
bow_data_2 = bow_data[n_domain1:]  # domain 2 bow
labels_1 = labels[:n_domain1]  # domain 1 label
labels_2 = labels[n_domain1:]  # domain 2 label

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import lightgbm as lgb
from sklearn.metrics import accuracy_score, roc_auc_score

# The first-stage model is fit on the domain 1 features and labels.
X, y = bow_data_1, labels_1

In [10]:
# Hold out 20% of domain 1 for validation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_test = np.array(X_train), np.array(X_test)

# Wrap both splits as LightGBM Datasets; the validation set is created with
# the training Dataset passed as its reference.
train_data = lgb.Dataset(data=X_train, label=y_train)
test_data = lgb.Dataset(data=X_test, label=y_test, reference=train_data)

In [11]:
# Stage 1: parameters for the binary LightGBM model trained on domain 1.
# NOTE(review): early_stopping_rounds equals num_round (100), so early stopping
# is unlikely to ever trigger in this run — the training log below reports
# "Did not meet early stopping". Confirm whether that is intended.
params = dict(
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    early_stopping_rounds=100,
)

# Train on the domain 1 split, validating on the held-out part, then save the
# booster to disk so it can be reloaded for the domain 2 stage.
num_round = 100
bst_domain1 = lgb.train(
    params,
    train_data,
    num_boost_round=num_round,
    valid_sets=[test_data],
)
bst_domain1.save_model("domain1_model.txt")



[LightGBM] [Info] Number of positive: 7793, number of negative: 7807
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8459
[LightGBM] [Info] Number of data points in the train set: 15600, number of used features: 2499
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499551 -> initscore=-0.001795
[LightGBM] [Info] Start training from score -0.001795
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.172466


<lightgbm.basic.Booster at 0x307b98dd0>

In [12]:
# Texts and labels of the domain 2 records only, in file order.
old_text_2 = [record['text'] for record in domain2]
labels_2 = [record['label'] for record in domain2]

In [15]:
# Reload the saved domain 1 booster; it is the starting point for the
# domain 2 fine-tuning stage.
bst_domain1 = lgb.Booster(model_file="domain1_model.txt")

# Split domain 2 80/20 with the same seed as the domain 1 split.
X_2, y_2 = bow_data_2, labels_2
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_2, y_2, test_size=0.2, random_state=42
)

In [30]:
# Use all of domain 2 and weight the samples by class instead of subsampling:
# label 0 rows get weight 1.0, label 1 rows get weight 5.0.
class_weights_domain2 = np.array([1.0, 5.0])
X_train_2, X_test_2 = np.array(X_train_2), np.array(X_test_2)

# Indexing the weight table with the label list yields one weight per
# training row.
sample_weight_2 = class_weights_domain2[y_train_2]
train_data_2 = lgb.Dataset(X_train_2, label=y_train_2, weight=sample_weight_2)
test_data_2 = lgb.Dataset(X_test_2, label=y_test_2, reference=train_data_2)

In [31]:
# Domain 2 (fine-tuning): continue boosting from the domain 1 model with a
# smaller learning rate than the first stage.
# NOTE(review): early_stopping_rounds (100) is larger than the 50 rounds
# requested here, so early stopping is unlikely to trigger — the log below
# reports "Did not meet early stopping". Confirm whether that is intended.
num_round_domain2 = 50
params_domain2 = dict(
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.01,
    feature_fraction=0.9,
    early_stopping_rounds=100,
)

# Train the final model, passing the domain 1 booster as init_model.
bst_domain2 = lgb.train(
    params_domain2,
    train_data_2,
    num_boost_round=num_round_domain2,
    valid_sets=[test_data_2],
    init_model=bst_domain1,
)

[LightGBM] [Info] Number of positive: 1714, number of negative: 10206
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19971
[LightGBM] [Info] Number of data points in the train set: 11920, number of used features: 4421
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[150]	valid_0's binary_logloss: 0.370469


In [32]:
# Load the test set (one JSON record per line); lines that fail to parse are
# reported and skipped.
test = []
with open('../data/test_set.json', 'r') as json_file:
    for raw_line in json_file:
        try:
            test.append(json.loads(raw_line))
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")

# Extract the text of every test record to get the predictions.
# There are 1000 samples that need to be predicted.
X_train_final = [record['text'] for record in test]

In [33]:
# Convert the test texts into bag-of-words vectors.
# The vocabulary is rebuilt from the training texts (old_text) only, so the
# test vectors use the same columns as the training vectors; values that occur
# only in the test texts have no column and are therefore not counted.
data = old_text
unique_integers = set()
for integer_list in data:
    unique_integers.update(integer_list)
vocabulary = sorted(unique_integers)

# Count each document's entries once with Counter instead of calling
# list.count() for every vocabulary word, which rescans the whole document
# len(vocabulary) times. Counter returns 0 for absent keys, so the vectors
# are identical to the ones list.count() produced.
bow_data_final = []
for integer_list in X_train_final:
    token_counts = Counter(integer_list)
    bow_data_final.append([token_counts[word] for word in vocabulary])

In [34]:
# Convert the list of BoW vectors into a NumPy array for prediction.
bow_data_final = np.asarray(bow_data_final)

In [35]:
# Score the test vectors with the fine-tuned model at its recorded best
# iteration.
# NOTE(review): predict_disable_shape_check=True presumably suppresses the
# feature-count validation — confirm it is still required, since the test
# vectors are built from the same vocabulary as the training data.
y_pred_final = bst_domain2.predict(
    bow_data_final,
    num_iteration=bst_domain2.best_iteration,
    predict_disable_shape_check=True,
)

# Scores above 0.5 become class 1, everything else class 0.
y_pred_binary_final = (y_pred_final > 0.5).astype(int)

In [38]:
# Pair every prediction with its row index to form the (id, class) rows.
# The number of rows comes from the predictions themselves rather than a
# hard-coded 1000, which would raise IndexError on a smaller test set and
# silently drop rows on a larger one.
# The index list is called sample_ids so it does not shadow the builtin id().
sample_ids = list(range(len(y_pred_binary_final)))
# int() turns the NumPy integers into plain Python ints; the text written to
# the CSV file is unchanged.
answer = [
    (sample_id, int(pred))
    for sample_id, pred in zip(sample_ids, y_pred_binary_final)
]

In [39]:
import csv


# Write the predictions to a CSV file: a header row followed by one
# (id, class) row per entry of `answer`.
file_name = '../Predicted_answer/lgb_bow_balance_two_model_fine_tuning.csv'
column_name = ['id', 'class']
with open(file_name, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(column_name)
    writer.writerows(answer)

print(f'Data has been written to {file_name}')

Data has been written to ../Predicted_answer/lgb_bow_balance_two_model_fine_tuning.csv
