# LGB model
# Light Gradient Boosting Machine

In [1]:
# read the data
# 0:machine 1:human
import json
import random
domain1 = []
with open('../data/domain1_train.json', 'r') as json_file:
    for line in json_file:
        try:
            json_data = json.loads(line)
            domain1.append(json_data)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
domain2 = []
with open('../data/domain2_train.json', 'r') as json_file:
    for line in json_file:
        try:
            json_data = json.loads(line)
            domain2.append(json_data)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")

In [2]:
# Balancing the data in domain2 
# 0: machine 1:human
# randomly choose 2150 number of label=0
data_label_0 = [item for item in domain2 if item['label'] == 0]
data_label_1 = [item for item in domain2 if item['label'] == 1]
random.seed(42)  # set seed to make sure that we can see the same data for every iterations
chosen_domain_2 = random.sample(data_label_0, 2150)
final_domain_2 = data_label_1 + chosen_domain_2

In [3]:
all_domain = final_domain_2 + domain1
# Adding a new feature called text length
text_len = []
for i in range(len(all_domain)):
    text_len.append(len(all_domain[i]['text']))
# Getting the original text data which are in token formatting
old_text = []
labels = []
for i in range(len(all_domain)):
    old_text.append(all_domain[i]['text'])
    labels.append(all_domain[i]['label'])

In [4]:
# Represent tokens into bow formatting
data = old_text
unique_integers = set()
for integer_list in data:
    unique_integers.update(integer_list)

# changing all the unique_intergers into a list for storing the bow data
vocabulary = sorted(list(unique_integers))


# creating BOW representation
bow_data = []
for integer_list in data:
    bow_vector = [integer_list.count(word) for word in vocabulary]
    bow_data.append(bow_vector)


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import lightgbm as lgb
from sklearn.metrics import accuracy_score, roc_auc_score,confusion_matrix
import pandas as pd

# Getting the features and the corresponding labels
df = pd.DataFrame({'bow': bow_data, 'text_len': text_len})
X = df[['bow','text_len']]
y = labels 

In [12]:
# Getting the train and testing data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# changing the features into np.array formatting 
X_train_bow = np.array(X_train['bow'].tolist())
X_test_bow = np.array(X_test['bow'].tolist())

# creating the lgb formatting data
train_data = lgb.Dataset(X_train_bow, label=y_train)
test_data = lgb.Dataset(X_test_bow, label=y_test, reference=train_data)

# define the hyperparameters after fine tuning
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    # default value 31
    'num_leaves': 50,
    # default value 0.1
    'learning_rate': 0.1,
    # default value 100
    'num_iterations':200,
    'lambda_l1':0.0,
    # default value 1.0
    'feature_fraction': 0.6,
    'early_stopping_rounds': 100
}

# Model training
#num_round = 200
bst = lgb.train(params, train_data,valid_sets=[test_data])
# Find the best iteration which gives the best results 
best_iteration = bst.best_iteration

bst = lgb.train(params, train_data,valid_sets=[test_data],num_boost_round=best_iteration)

# Model predictions
y_pred_prob = bst.predict(X_test_bow, num_iteration=bst.best_iteration)
y_pred_binary = (y_pred_prob > 0.5).astype(int)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred_binary)
confusion = confusion_matrix(y_test, y_pred_binary)
roc_auc = roc_auc_score(y_test, y_pred_binary)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{confusion}')
print(f'ROC AUC: {roc_auc}')




[LightGBM] [Info] Number of positive: 9582, number of negative: 9458
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16451
[LightGBM] [Info] Number of data points in the train set: 19040, number of used features: 4099
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503256 -> initscore=0.013025
[LightGBM] [Info] Start training from score 0.013025
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[199]	valid_0's binary_logloss: 0.227935
[LightGBM] [Info] Number of positive: 9582, number of negative: 9458
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16451
[LightGBM] [Info] Number of data points in the train set: 19040, number of used features: 4099
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503256 -> initscore=0.013025
[LightGBM] [Info] Start training from score 0.013025
Trainin

For better accuracy:

Use large max_bin (may be slower)
Use small learning_rate with large num_iterations
Use large num_leaves(may cause over-fitting)
Use bigger training data
Try dart
Try to use categorical feature directly\

To deal with over-fitting:
Use small max_bin
Use small num_leaves
Use min_data_in_leaf and min_sum_hessian_in_leaf
Use bagging by set bagging_fraction and bagging_freq
Use feature sub-sampling by set feature_fraction
Use bigger training data
Try lambda_l1, lambda_l2 and min_gain_to_split to regularization
Try max_depth to avoid growing deep tree

# Testing the data and get results!!!!!


In [None]:
# testing on the test dataset
test = []
with open('../data/test_set.json', 'r') as json_file:
    for line in json_file:
        try:
            json_data = json.loads(line)
            test.append(json_data)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
# taking all the data out to get the predictions
X_train_final = []
for i in range(len(test)):
    X_train_final.append(test[i]['text'])

In [None]:
# check the individual word count for the testing data
check_num = []
for i in range(len(X_train_final)):
    for value in X_train_final[i]:
        check_num.append(value)
print(len(set(check_num)))

4538


In [None]:
# transfer the data into bow formatting
# The number of features in data (4538) is not the same as it was in training data (5000).
# Apply the word bank of the old text data which consist 5000 words 
data = old_text
unique_integers = set()
for integer_list in data:
    unique_integers.update(integer_list)
vocabulary = sorted(list(unique_integers))
bow_data_final = []
for integer_list in X_train_final: 
    bow_vector = [integer_list.count(word) for word in vocabulary]
    bow_data_final.append(bow_vector)

In [None]:
# transfer into np.array formatt
bow_data_final = np.array(bow_data_final)

In [None]:
# get the predictions
y_pred_final = bst.predict(bow_data_final, num_iteration=bst.best_iteration,predict_disable_shape_check=True)
y_pred_binary_final = (y_pred_final > 0.5).astype(int)

In [None]:
# write it into file 
len(y_pred_binary_final)

1000

In [None]:
# count the number of human and machine predictions
count = 0 
for value in y_pred_binary_final:
    if value == 0:
        count +=1
count

479

In [None]:
# Generate the final answer 
id = []
for i in range(0,1000):
    id.append(i)

answer = []
for i in range(0,1000):
    answer.append((id[i],y_pred_binary_final[i]))

In [None]:
import csv
# write the answer into file 
file_name = '../Predicted_answer/lgb_bow_balance_version21.csv'
column_name = ['id','class']
with open(file_name, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(column_name)
    for row in answer:
        writer.writerow(row)
print(f'Data has been written to {file_name}')


Data has been written to ../Predicted_answer/lgb_bow_balance_version21.csv
