In [1]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Using device: cuda:0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [4]:
seed = 25
# random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [7]:
train_data = pd.read_csv('/content/drive/MyDrive/datasets/subtask_2/en/train.tsv',sep='\t')
train_data = train_data.reset_index(drop=True)
print(train_data.head())

      id                                               text label
0   6239  It was not until many years later that it coul...     A
1   9255  Users can then pin these images to their profi...     F
2   1674  The best songs are those that I can sing along...     B
3   5001  I found this book to be poorly written. It was...     D
4  20779  Regulates the application of the EU tariff quo...     E


In [8]:
train_data_texts = train_data['text'].to_list()
train_data_labels = train_data['label'].to_list()
print("Unique labels: ", set(train_data_labels))

Unique labels:  {'B', 'C', 'A', 'E', 'F', 'D'}


In [9]:
from sklearn.model_selection import train_test_split
train_data_texts = train_data['text'].to_list()
train_data_labels = train_data['label'].to_list()
train_texts, test_texts, train_labels, test_labels = train_test_split(train_data_texts, train_data_labels, test_size=0.1, random_state=25)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=25)
print('train data size: ', len(train_texts))
print('validation data size: ', len(val_texts))
print('test data size: ', len(test_texts))

train data size:  18156
validation data size:  2018
test data size:  2242


In [10]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m90.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m82.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [20]:
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

# Convert train and test texts to BERT embeddings
train_embeddings = []
for text in tqdm(train_texts):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    train_embeddings.append(outputs.last_hidden_state[:,0,:].cpu().numpy())

test_embeddings = []
for text in tqdm(test_texts):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    test_embeddings.append(outputs.last_hidden_state[:,0,:].cpu().numpy())





Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 18156/18156 [04:49<00:00, 62.62it/s]
100%|██████████| 2242/2242 [00:32<00:00, 68.77it/s]


In [22]:
# train_embeddings = [emb.reshape(-1) for emb in train_embeddings]
# test_embeddings = [emb.reshape(-1) for emb in test_embeddings]
# Train Random Forest classifier on train embeddings
rf_classifier = RandomForestClassifier()
rf_classifier.fit(train_embeddings, train_labels)

# Predict labels for test embeddings
pred_labels = rf_classifier.predict(test_embeddings)

# Generate classification report
report = classification_report(test_labels, pred_labels)
print(report)

              precision    recall  f1-score   support

           A       0.35      0.43      0.39       359
           B       0.22      0.20      0.21       364
           C       0.25      0.19      0.22       353
           D       0.31      0.34      0.33       386
           E       0.30      0.28      0.29       365
           F       0.53      0.55      0.54       415

    accuracy                           0.34      2242
   macro avg       0.33      0.33      0.33      2242
weighted avg       0.33      0.34      0.33      2242



In [26]:
'''
  XGBoost is a gradient boosting algorithm, which means that it builds a series of weak learners sequentially, 
  where each new learner tries to improve the errors of the previous ones. 
  On the other hand, Random Forest is a bagging algorithm that builds multiple decision trees 
  in parallel and combines their predictions by taking the majority vote.


  XGBoost is a powerful algorithm that can handle complex relationships between features and 
  the target variable, especially for large datasets, while Random Forest is a reliable 
  algorithm that is easier to interpret and generally works well for smaller datasets with fewer features. 
'''
import xgboost as xgb
import re
# training xgboost 

# XGBoost only supports ASCII characters, so you may need to preprocess your data to remove any non-ASCII characters before feeding it to XGBoost.

def preprocess_text(text):
    # Replace any non-ASCII characters with their ASCII equivalents
    text = text.encode('ascii', 'ignore').decode('utf-8')
    # Remove any remaining non-alphanumeric characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()
    return text

# Preprocess the train and test texts
train_texts_ascii = [preprocess_text(text) for text in train_texts]
test_texts_ascii = [preprocess_text(text) for text in test_texts]

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

# Convert train and test texts to BERT embeddings
train_embeddings = []
for text in tqdm(train_texts_ascii):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    train_embeddings.append(outputs.last_hidden_state[:,0,:].cpu().numpy())

test_embeddings = []
for text in tqdm(test_texts_ascii):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    test_embeddings.append(outputs.last_hidden_state[:,0,:].cpu().numpy())




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 18156/18156 [04:28<00:00, 67.67it/s]
100%|██████████| 2242/2242 [00:32<00:00, 68.76it/s]


In [33]:
from sklearn.preprocessing import LabelEncoder

# Convert labels to integer labels
le = LabelEncoder()
train_labels_encoded = le.fit_transform(train_labels)
test_labels_encoded = le.transform(test_labels)

# Train XGBoost classifier on train embeddings
params = {
    'objective': 'multi:softmax',
    'num_class': 6,
    'tree_method': 'gpu_hist'
}

# create the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(**params)
train_embeddings = np.vstack(train_embeddings)
xgb_classifier.fit(train_embeddings, train_labels_encoded)

print("embed size check", np.array(test_embeddings).shape)

# Predict labels for test embeddings
test_embeddings = np.vstack(test_embeddings)
pred_labels_encoded = xgb_classifier.predict(np.array(test_embeddings))

# Convert predicted integer labels back to original labels
pred_labels = le.inverse_transform(pred_labels_encoded)

# Generate classification report
report = classification_report(test_labels, pred_labels)
print(report)

embed size check (2242, 1, 768)
              precision    recall  f1-score   support

           A       0.40      0.41      0.40       359
           B       0.23      0.23      0.23       364
           C       0.24      0.24      0.24       353
           D       0.31      0.31      0.31       386
           E       0.27      0.29      0.28       365
           F       0.61      0.59      0.60       415

    accuracy                           0.35      2242
   macro avg       0.34      0.34      0.34      2242
weighted avg       0.35      0.35      0.35      2242



In [36]:
# grid search to tune the parameters of xgboost
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Define the XGBoost classifier
xgb_classifier = XGBClassifier(tree_method='gpu_hist', n_jobs=-1)

# Define the hyperparameters to tune
params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [50, 100, 200],
}

# Define the grid search
grid_search = GridSearchCV(
    estimator=xgb_classifier, 
    param_grid=params, 
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

# Train the grid search
grid_search.fit(train_embeddings, train_labels_encoded)

# Print the best hyperparameters and score
print('Best parameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

# Use the best model to predict the test set and evaluate the performance
best_xgb = grid_search.best_estimator_
test_pred = best_xgb.predict(test_embeddings)
test_acc = accuracy_score(test_labels_encoded, test_pred)
print('Test accuracy:', test_acc)




Best parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Best score: 0.355970478078872
Test accuracy: 0.35459411239964317
