In [1]:
'''Import libraries'''
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import wandb
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import os
from torch.utils.data import Dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW, Trainer, TrainingArguments
from tqdm import tqdm
from torch.nn import functional as F
import torch.nn as nn

wandb.login()

  from .autonotebook import tqdm as notebook_tqdm
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33malberto-rodero557[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [20]:
'''Variables and parameters'''

SAMPLES_TO_TRAIN=10000

N_LABELS=2
MAX_LEN = 256
EPOCHS=100
PATIENCE=10
LEARNING_RATE=.00005
WEIGHT_DECAY=.01
BATCH_SIZE=100
METRIC_FOR_BEST_MODEL='eval_loss'
if METRIC_FOR_BEST_MODEL=='eval_loss':
    GREATER_IS_BETTER = False
else:
    GREATER_IS_BETTER = True

In [5]:
'''Preparing dataset'''

df = pd.read_json(os.getcwd()+'/datasets/subtaskA_train_monolingual.jsonl', lines=True)
df = df[['text', 'label']]

train_df=df.sample(round(SAMPLES_TO_TRAIN))
test_train_df=df.sample(round(SAMPLES_TO_TRAIN*.2))

df = pd.read_json(os.getcwd()+'/datasets/subtaskA_dev_monolingual.jsonl', lines=True)
df = df[['text', 'label']]

val_df= df.sample(round(SAMPLES_TO_TRAIN*.2))
test_dev_df= df.sample(round(SAMPLES_TO_TRAIN*.2))

# we balance the training set
print(f'Dataset size before balancing: {train_df.shape}')
counts = train_df['label'].value_counts()
sampler = RandomUnderSampler(random_state=42)
x_text, y = sampler.fit_resample(train_df[['text']], train_df['label'])

print(f'Dataset size after balancing: {x_text.shape}')
print(f'Entried dropped: {train_df.shape[0]-x_text.shape[0]}')

# Create a new balanced DataFrame
train_df = pd.DataFrame({'text': x_text['text'], 'label': y})

# Print the balanced DataFrame
print("\nBalanced DataFrame:")
print(train_df['label'].value_counts())

Dataset size before balancing: (10000, 2)
Dataset size after balancing: (9298, 1)
Entried dropped: 702

Balanced DataFrame:
label
0    4649
1    4649
Name: count, dtype: int64


In [6]:
'''loading glove'''
embeddings_index={}
with open('../OtherData/glove.6B.200d.txt','r',encoding='utf-8') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embeddings_index[word]=vectors
f.close()
print('Found %s word vectors.' % len(embeddings_index))


Found 400000 word vectors.


In [7]:
'''glove building'''

from nltk.tokenize import word_tokenize
from tqdm import tqdm 

def sent2vec(s):
    """ Function Creates a normalized vector for the whole sentence"""
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(200)
    return v / np.sqrt((v ** 2).sum())

print('Training df:')
train_x = np.array([sent2vec(x) for x in tqdm(train_df['text'])])
print(train_x.shape)
train_y=train_df['label']

print('\nValidation df:')
val_x = np.array([sent2vec(x) for x in tqdm(val_df['text'])])
print(val_x.shape)
val_y=val_df['label']

print('\nTesting from training dataset df:')
test_train_x = np.array([sent2vec(x) for x in tqdm(test_train_df['text'])])
print(test_train_x.shape)
test_train_y=test_train_df['label']

print('\nTesting from dev dataset df:')
test_dev_x = np.array([sent2vec(x) for x in tqdm(test_dev_df['text'])])
print(test_dev_x.shape)
test_dev_y=test_dev_df['label']


Training df:


100%|██████████| 9298/9298 [00:14<00:00, 645.76it/s]


(9298, 200)

Validation df:


100%|██████████| 2000/2000 [00:02<00:00, 852.13it/s]


(2000, 200)

Testing from training dataset df:


100%|██████████| 2000/2000 [00:02<00:00, 675.61it/s]


(2000, 200)

Testing from dev dataset df:


100%|██████████| 2000/2000 [00:02<00:00, 903.25it/s]

(2000, 200)





In [8]:
'''Preparing for training'''

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Initialize the StandardScaler
scaler = StandardScaler()
# Fit the scaler to the training data and transform the data
train_x = scaler.fit_transform(train_x)
val_x = scaler.transform(val_x)  # Note that we use the same scaler to transform the test data
test_train_x = scaler.transform(test_train_x)
test_dev_x = scaler.transform(test_dev_x)
# If this scaler is going to be used later on for prediction it must be saved, for example with pickle
import pickle

# Save the trained scaler
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


In [9]:
'''creating custom dataset'''
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return {
            'x': self.features[idx],  # changed 'features' to 'x'
            'label': self.labels[idx]  # changed 'labels' to 'label'
        }

# Verify the shape and type of the data
print(train_x.shape, train_y.shape, val_x.shape, val_y.shape)
print(test_train_x.shape, test_train_y.shape, test_dev_x.shape, test_dev_y.shape)

# Convert numpy arrays to PyTorch tensors
train_x = torch.tensor(train_x, dtype=torch.float32)
val_x = torch.tensor(val_x, dtype=torch.float32)

test_train_x = torch.tensor(test_train_x, dtype=torch.float32)
test_dev_x = torch.tensor(test_dev_x, dtype=torch.float32)


# now y
train_y = torch.tensor(train_y.to_numpy(), dtype=torch.float32) if isinstance(train_y, pd.Series) else torch.tensor(train_y, dtype=torch.float32)
val_y = torch.tensor(val_y.to_numpy(), dtype=torch.float32) if isinstance(val_y, pd.Series) else torch.tensor(val_y, dtype=torch.float32)

test_train_y = torch.tensor(test_train_y.to_numpy(), dtype=torch.float32) if isinstance(test_train_y, pd.Series) else torch.tensor(test_train_y, dtype=torch.float32)
test_dev_y = torch.tensor(test_dev_y.to_numpy(), dtype=torch.float32) if isinstance(test_dev_y, pd.Series) else torch.tensor(test_dev_y, dtype=torch.float32)

# Create dataset and dataloaders
train_dataset = CustomDataset(train_x, train_y)
val_dataset = CustomDataset(val_x, val_y)
test_train_dataset = CustomDataset(test_train_x, test_train_y)
test_dev_dataset = CustomDataset(test_dev_x, test_dev_y)

# Check the length of datasets
print(len(train_dataset), len(val_dataset))
print(len(test_train_dataset), len(test_dev_dataset))

(9298, 200) (9298,) (2000, 200) (2000,)
(2000, 200) (2000,) (2000, 200) (2000,)
9298 2000
2000 2000


In [21]:
'''building custom model'''
class MyModel(nn.Module):
    def __init__(self, input_dim):
        super(MyModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 200)
        self.dropout1 = nn.Dropout(0.2)
        self.batchnorm1 = nn.BatchNorm1d(200)
        self.fc2 = nn.Linear(200, 100)
        self.dropout2 = nn.Dropout(0.2)
        self.batchnorm2 = nn.BatchNorm1d(100)
        self.fc3 = nn.Linear(100, 100)
        self.dropout3 = nn.Dropout(0.2)
        self.batchnorm3 = nn.BatchNorm1d(100)
        self.fc4 = nn.Linear(100, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, labels=None):
        x = F.relu(self.batchnorm1(self.dropout1(self.fc1(x))))
        x = F.relu(self.batchnorm2(self.dropout2(self.fc2(x))))
        x = F.relu(self.batchnorm3(self.dropout3(self.fc3(x))))
        logits = self.fc4(x)
        outputs = self.sigmoid(logits)

        if labels is not None:
            loss = F.binary_cross_entropy(outputs, labels.unsqueeze(-1))
            return loss, outputs
        return outputs

# Instantiate the model
model = MyModel(input_dim=train_x.shape[1])

In [11]:
'''metrics'''

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'auc': auc,
        'precision': precision,
        'recall': recall,
    }

In [27]:
'''Define training arguments and initialize trainer'''

from transformers import EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_steps=500,
    weight_decay=WEIGHT_DECAY,
    metric_for_best_model=METRIC_FOR_BEST_MODEL,
    greater_is_better=GREATER_IS_BETTER,
    logging_dir='./logs',
    logging_steps=1500,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    logging_first_step=False,
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="wandb"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)]
)

In [28]:
trainer.train()

  _warn_prf(average, modifier, msg_start, len(result))

  2%|▏         | 158/9300 [00:00<00:26, 341.31it/s]

{'eval_loss': 1.1568818092346191, 'eval_accuracy': 0.504, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.0265, 'eval_samples_per_second': 75370.698, 'eval_steps_per_second': 753.707, 'epoch': 1.0}


  _warn_prf(average, modifier, msg_start, len(result))

  2%|▏         | 232/9300 [00:00<00:27, 327.53it/s]

{'eval_loss': 1.1559313535690308, 'eval_accuracy': 0.504, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.028, 'eval_samples_per_second': 71301.385, 'eval_steps_per_second': 713.014, 'epoch': 2.0}


  _warn_prf(average, modifier, msg_start, len(result))

  4%|▎         | 345/9300 [00:01<00:27, 328.29it/s]

{'eval_loss': 1.1506930589675903, 'eval_accuracy': 0.504, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.0305, 'eval_samples_per_second': 65511.433, 'eval_steps_per_second': 655.114, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))

  4%|▍         | 416/9300 [00:01<00:27, 318.07it/s]

{'eval_loss': 1.1889064311981201, 'eval_accuracy': 0.504, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.0285, 'eval_samples_per_second': 70096.078, 'eval_steps_per_second': 700.961, 'epoch': 4.0}


  _warn_prf(average, modifier, msg_start, len(result))

  6%|▌         | 530/9300 [00:01<00:26, 334.00it/s]

{'eval_loss': 1.1270395517349243, 'eval_accuracy': 0.504, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.0331, 'eval_samples_per_second': 60428.82, 'eval_steps_per_second': 604.288, 'epoch': 5.0}


  _warn_prf(average, modifier, msg_start, len(result))

  7%|▋         | 605/9300 [00:01<00:26, 333.59it/s]

{'eval_loss': 1.1756553649902344, 'eval_accuracy': 0.504, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.029, 'eval_samples_per_second': 68887.257, 'eval_steps_per_second': 688.873, 'epoch': 6.0}


  _warn_prf(average, modifier, msg_start, len(result))

  8%|▊         | 720/9300 [00:02<00:24, 344.32it/s]

{'eval_loss': 1.1502429246902466, 'eval_accuracy': 0.504, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.0271, 'eval_samples_per_second': 73858.984, 'eval_steps_per_second': 738.59, 'epoch': 7.0}


  _warn_prf(average, modifier, msg_start, len(result))

  9%|▊         | 797/9300 [00:02<00:24, 340.72it/s]

{'eval_loss': 1.1706552505493164, 'eval_accuracy': 0.504, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.0245, 'eval_samples_per_second': 81575.059, 'eval_steps_per_second': 815.751, 'epoch': 8.0}


  _warn_prf(average, modifier, msg_start, len(result))

  9%|▉         | 880/9300 [00:02<00:24, 346.32it/s]

{'eval_loss': 1.1813157796859741, 'eval_accuracy': 0.504, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.026, 'eval_samples_per_second': 76893.395, 'eval_steps_per_second': 768.934, 'epoch': 9.0}


  _warn_prf(average, modifier, msg_start, len(result))

 11%|█         | 997/9300 [00:03<00:24, 345.83it/s]

{'eval_loss': 1.1832988262176514, 'eval_accuracy': 0.504, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.0275, 'eval_samples_per_second': 72678.981, 'eval_steps_per_second': 726.79, 'epoch': 10.0}


  _warn_prf(average, modifier, msg_start, len(result))

 12%|█▏        | 1074/9300 [00:03<00:24, 334.31it/s]

{'eval_loss': 1.2138298749923706, 'eval_accuracy': 0.504, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.0298, 'eval_samples_per_second': 67172.813, 'eval_steps_per_second': 671.728, 'epoch': 11.0}


  _warn_prf(average, modifier, msg_start, len(result))

 13%|█▎        | 1189/9300 [00:03<00:23, 340.05it/s]

{'eval_loss': 1.1950109004974365, 'eval_accuracy': 0.504, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.0283, 'eval_samples_per_second': 70777.398, 'eval_steps_per_second': 707.774, 'epoch': 12.0}


  _warn_prf(average, modifier, msg_start, len(result))

 14%|█▎        | 1259/9300 [00:03<00:25, 318.24it/s]

{'eval_loss': 1.2060163021087646, 'eval_accuracy': 0.504, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.0266, 'eval_samples_per_second': 75273.984, 'eval_steps_per_second': 752.74, 'epoch': 13.0}


  _warn_prf(average, modifier, msg_start, len(result))

 15%|█▍        | 1371/9300 [00:04<00:23, 333.19it/s]

{'eval_loss': 1.2148991823196411, 'eval_accuracy': 0.504, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.0279, 'eval_samples_per_second': 71720.798, 'eval_steps_per_second': 717.208, 'epoch': 14.0}


  _warn_prf(average, modifier, msg_start, len(result))

 15%|█▌        | 1395/9300 [00:04<00:24, 326.46it/s]

{'eval_loss': 1.2498315572738647, 'eval_accuracy': 0.504, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.0275, 'eval_samples_per_second': 72678.981, 'eval_steps_per_second': 726.79, 'epoch': 15.0}
{'train_runtime': 4.2726, 'train_samples_per_second': 217619.434, 'train_steps_per_second': 2176.662, 'train_loss': 0.3092324533770161, 'epoch': 15.0}





TrainOutput(global_step=1395, training_loss=0.3092324533770161, metrics={'train_runtime': 4.2726, 'train_samples_per_second': 217619.434, 'train_steps_per_second': 2176.662, 'train_loss': 0.3092324533770161, 'epoch': 15.0})

In [24]:
trainer.evaluate()

  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 20/20 [00:00<00:00, 722.43it/s]


{'eval_loss': 0.412348210811615,
 'eval_accuracy': 0.5315,
 'eval_f1': 0.0,
 'eval_auc': 0.5,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_runtime': 0.0297,
 'eval_samples_per_second': 67253.594,
 'eval_steps_per_second': 672.536,
 'epoch': 99.0}

In [25]:
trainer.evaluate(test_train_dataset)

  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 20/20 [00:00<00:00, 848.07it/s]


{'eval_loss': 0.412348210811615,
 'eval_accuracy': 0.5315,
 'eval_f1': 0.0,
 'eval_auc': 0.5,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_runtime': 0.0266,
 'eval_samples_per_second': 75238.876,
 'eval_steps_per_second': 752.389,
 'epoch': 99.0}

In [26]:
trainer.evaluate(test_dev_dataset)

  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 20/20 [00:00<00:00, 792.54it/s]


{'eval_loss': 1.175024390220642,
 'eval_accuracy': 0.5065,
 'eval_f1': 0.0,
 'eval_auc': 0.5,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_runtime': 0.0282,
 'eval_samples_per_second': 70829.39,
 'eval_steps_per_second': 708.294,
 'epoch': 99.0}