In [1]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Using device: cuda:0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [4]:
seed = 25
# random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [5]:
train_data = pd.read_csv('/content/drive/MyDrive/datasets/subtask_1/es/train.tsv',sep='\t')
train_data = train_data.reset_index(drop=True)
print(train_data.head())

      id                                               text      label
0   5464  Entrada en vigor. La presente Directiva entrar...      human
1  30129  Preguntas: 1. ¿Cuáles son los principales argu...  generated
2  19553  ¿Desea algo? Póngame una caja de madera. ¿Qué ...  generated
3  13005  @victor28088 1665 Tweets no originales, que as...      human
4  16919  De pequeño Dios me dio a elegir entre tener un...      human


In [6]:
train_data_texts = train_data['text'].to_list()
train_data_labels = train_data['label'].to_list()
print("Unique labels: ", set(train_data_labels))

Unique labels:  {'human', 'generated'}


In [7]:
from sklearn.model_selection import train_test_split
train_data_texts = train_data['text'].to_list()
train_data_labels = train_data['label'].to_list()
train_texts, test_texts, train_labels, test_labels = train_test_split(train_data_texts, train_data_labels, test_size=0.1, random_state=25)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=25)
print('train data size: ', len(train_texts))
print('validation data size: ', len(val_texts))
print('test data size: ', len(test_texts))


train data size:  25969
validation data size:  2886
test data size:  3207


In [8]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [9]:
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load pre-trained BERT model and tokenizer
model_name = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

# Convert train and test texts to BERT embeddings
train_embeddings = []
for text in tqdm(train_texts):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    train_embeddings.append(outputs.last_hidden_state[:,0,:].cpu().numpy())

test_embeddings = []
for text in tqdm(test_texts):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    test_embeddings.append(outputs.last_hidden_state[:,0,:].cpu().numpy())





Downloading (…)okenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 25969/25969 [05:36<00:00, 77.10it/s]
100%|██████████| 3207/3207 [00:40<00:00, 79.65it/s]


In [10]:
train_embeddings = [emb.reshape(-1) for emb in train_embeddings]
test_embeddings = [emb.reshape(-1) for emb in test_embeddings]
# Train Random Forest classifier on train embeddings
rf_classifier = RandomForestClassifier()
rf_classifier.fit(train_embeddings, train_labels)

# Predict labels for test embeddings
pred_labels = rf_classifier.predict(test_embeddings)

# Generate classification report
report = classification_report(test_labels, pred_labels)
print(report)

              precision    recall  f1-score   support

   generated       0.80      0.85      0.83      1628
       human       0.84      0.79      0.81      1579

    accuracy                           0.82      3207
   macro avg       0.82      0.82      0.82      3207
weighted avg       0.82      0.82      0.82      3207



In [11]:
'''
  XGBoost is a gradient boosting algorithm, which means that it builds a series of weak learners sequentially, 
  where each new learner tries to improve the errors of the previous ones. 
  On the other hand, Random Forest is a bagging algorithm that builds multiple decision trees 
  in parallel and combines their predictions by taking the majority vote.


  XGBoost is a powerful algorithm that can handle complex relationships between features and 
  the target variable, especially for large datasets, while Random Forest is a reliable 
  algorithm that is easier to interpret and generally works well for smaller datasets with fewer features. 
'''
import xgboost as xgb
import re
# training xgboost 

# XGBoost only supports ASCII characters, so you may need to preprocess your data to remove any non-ASCII characters before feeding it to XGBoost.

def preprocess_text(text):
    # Replace any non-ASCII characters with their ASCII equivalents
    text = text.encode('ascii', 'ignore').decode('utf-8')
    # Remove any remaining non-alphanumeric characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()
    return text

# Preprocess the train and test texts
train_texts_ascii = [preprocess_text(text) for text in train_texts]
test_texts_ascii = [preprocess_text(text) for text in test_texts]

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

# Convert train and test texts to BERT embeddings
train_embeddings = []
for text in tqdm(train_texts_ascii):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    train_embeddings.append(outputs.last_hidden_state[:,0,:].cpu().numpy())

test_embeddings = []
for text in tqdm(test_texts_ascii):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    test_embeddings.append(outputs.last_hidden_state[:,0,:].cpu().numpy())




Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 25969/25969 [05:35<00:00, 77.37it/s]
100%|██████████| 3207/3207 [00:41<00:00, 76.44it/s]


In [12]:
from sklearn.preprocessing import LabelEncoder

# Convert labels to integer labels
le = LabelEncoder()
train_labels_encoded = le.fit_transform(train_labels)
test_labels_encoded = le.transform(test_labels)

# Train XGBoost classifier on train embeddings
params = {
    'objective': 'multi:softmax',
    'num_class': 6,
    'tree_method': 'gpu_hist'
}

# create the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(**params)
train_embeddings = np.vstack(train_embeddings)
xgb_classifier.fit(train_embeddings, train_labels_encoded)

print("embed size check", np.array(test_embeddings).shape)

# Predict labels for test embeddings
test_embeddings = np.vstack(test_embeddings)
pred_labels_encoded = xgb_classifier.predict(np.array(test_embeddings))

# Convert predicted integer labels back to original labels
pred_labels = le.inverse_transform(pred_labels_encoded)

# Generate classification report
report = classification_report(test_labels, pred_labels)
print(report)

embed size check (3207, 1, 768)
              precision    recall  f1-score   support

   generated       0.72      0.74      0.73      1628
       human       0.72      0.71      0.72      1579

    accuracy                           0.72      3207
   macro avg       0.72      0.72      0.72      3207
weighted avg       0.72      0.72      0.72      3207



In [13]:
# grid search to tune the parameters of xgboost
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Define the XGBoost classifier
xgb_classifier = XGBClassifier(tree_method='gpu_hist', n_jobs=-1)

# Define the hyperparameters to tune
params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [50, 100, 200],
}

# Define the grid search
grid_search = GridSearchCV(
    estimator=xgb_classifier, 
    param_grid=params, 
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

# Train the grid search
grid_search.fit(train_embeddings, train_labels_encoded)

# Print the best hyperparameters and score
print('Best parameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

# Use the best model to predict the test set and evaluate the performance
best_xgb = grid_search.best_estimator_
test_pred = best_xgb.predict(test_embeddings)
test_acc = accuracy_score(test_labels_encoded, test_pred)
print('Test accuracy:', test_acc)




Best parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Best score: 0.7271365559096877
Test accuracy: 0.7271593389460556
