In [2]:
'''Import libraries'''
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import wandb
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import os
from torch.utils.data import Dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW, Trainer, TrainingArguments
from tqdm import tqdm
from torch.nn import functional as F
import torch.nn as nn

wandb.login()

  from .autonotebook import tqdm as notebook_tqdm
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33malberto-rodero557[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [31]:
'''Variables and parameters'''

SAMPLES_TO_TRAIN=10000
DIMENSIONS=200

N_LABELS=2
MAX_LEN = 256
EPOCHS=50
PATIENCE=10
LEARNING_RATE=.00005
WEIGHT_DECAY=.01
BATCH_SIZE=16
METRIC_FOR_BEST_MODEL='eval_loss'
if METRIC_FOR_BEST_MODEL=='eval_loss':
    GREATER_IS_BETTER = False
else:
    GREATER_IS_BETTER = True

In [4]:
'''Preparing dataset'''

df = pd.read_json(os.getcwd()+'/datasets/subtaskA_train_monolingual.jsonl', lines=True)
df = df[['text', 'label']]

df=df.sample(round(SAMPLES_TO_TRAIN))
# test_train_df=df.sample(round(SAMPLES_TO_TRAIN*.2))

# df = pd.read_json(os.getcwd()+'/datasets/subtaskA_dev_monolingual.jsonl', lines=True)
# df = df[['text', 'label']]

# val_df= df.sample(round(SAMPLES_TO_TRAIN*.2))
# test_dev_df= df.sample(round(SAMPLES_TO_TRAIN*.2))

# we balance the training set
print(f'Dataset size before balancing: {df.shape}')
counts = df['label'].value_counts()
sampler = RandomUnderSampler(random_state=42)
x_text, y = sampler.fit_resample(df[['text']], df['label'])

print(f'Dataset size after balancing: {x_text.shape}')
print(f'Entried dropped: {df.shape[0]-x_text.shape[0]}')

# Create a new balanced DataFrame
df = pd.DataFrame({'text': x_text['text'], 'label': y})

# Print the balanced DataFrame
print("\nBalanced DataFrame:")
print(df['label'].value_counts())

Dataset size before balancing: (10000, 2)
Dataset size after balancing: (9454, 1)
Entried dropped: 546

Balanced DataFrame:
label
0    4727
1    4727
Name: count, dtype: int64


In [5]:
'''loading glove'''
embeddings_index={}
with open('../0 playground and indoor/OtherData/glove.6B.200d.txt','r',encoding='utf-8') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embeddings_index[word]=vectors
f.close()
print('Found %s word vectors.' % len(embeddings_index))


Found 400000 word vectors.


In [6]:
'''glove building'''

from nltk.tokenize import word_tokenize
from tqdm import tqdm 

def sent2vec(s):
    """ Function Creates a normalized vector for the whole sentence"""
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(200)
    return v / np.sqrt((v ** 2).sum())

print('Training df:')
df_x = np.array([sent2vec(x) for x in tqdm(df['text'])])
print(df_x.shape)
train_y=df['label']


Training df:


100%|██████████| 9454/9454 [00:13<00:00, 696.70it/s]

(9454, 200)





In [7]:
'''Preparing for training'''

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Initialize the StandardScaler
scaler = StandardScaler()
# Fit the scaler to the training data and transform the data
train_x = scaler.fit_transform(df_x)

import pickle

# Save the trained scaler
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


In [8]:
'''metrics'''

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'auc': auc,
        'precision': precision,
        'recall': recall,
    }

In [23]:
class Data(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32).unsqueeze(1)
        self.y = torch.tensor(y, dtype=torch.long)
        self.len = len(X)

    def __getitem__(self, index):
        return {'x': self.X[index], 'label': self.y[index], 'label_ids': index}

    def __len__(self):
        return self.len

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y.values, test_size=0.2, random_state=42)
traindata = Data(X_train, y_train)
testdata = Data(X_test, y_test)

In [32]:
# number of features (len of X cols)
input_dim = train_x.shape[-1]

# number of classes (unique of y)
output_dim = 2

class CNN1D(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(CNN1D, self).__init__()
        
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=100, kernel_size=5, stride=1, padding=2)
        self.bn1 = nn.BatchNorm1d(100)
        self.conv2 = nn.Conv1d(in_channels=100, out_channels=150, kernel_size=5, stride=1, padding=2)
        self.bn2 = nn.BatchNorm1d(150)
        self.dropout1 = nn.Dropout(0.5)
        
        self.fc1 = nn.Linear(150 * 200, 256)
        self.dropout2 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x, labels=None):
        x = self.conv1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = self.dropout1(x)
        
        # Flatten the output for the dense layer
        x = torch.flatten(x, 1) 
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(x, labels)
            return loss, x
        
        return x

# Instantiate the model with appropriate dimensions
model = CNN1D(input_dim=200, num_classes=2)

In [34]:
from transformers import EarlyStoppingCallback

model = CNN1D(input_dim=200, num_classes=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_steps=500,
    weight_decay=WEIGHT_DECAY,
    metric_for_best_model=METRIC_FOR_BEST_MODEL,
    greater_is_better=GREATER_IS_BETTER,
    logging_dir='./logs',
    logging_steps=15000,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    logging_first_step=False,
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="wandb"
)

# Create trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=traindata,             # training dataset
    eval_dataset=testdata, 
    compute_metrics=compute_metrics,# training dataset
    callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)]
)

# Train the model
print(trainer.evaluate())

trainer.train()

print(trainer.evaluate())

100%|██████████| 119/119 [00:00<00:00, 729.54it/s]


{'eval_loss': 0.6939518451690674, 'eval_accuracy': 0.5076679005817029, 'eval_f1': 0.6119216340141727, 'eval_auc': 0.5086693110862032, 'eval_precision': 0.5037748798901853, 'eval_recall': 0.7791932059447984, 'eval_runtime': 0.1669, 'eval_samples_per_second': 11331.875, 'eval_steps_per_second': 713.111}


  2%|▏         | 466/23650 [00:01<01:12, 320.03it/s]
  2%|▏         | 499/23650 [00:02<02:00, 192.20it/s]

{'eval_loss': 0.5128917694091797, 'eval_accuracy': 0.7440507667900582, 'eval_f1': 0.7034313725490196, 'eval_auc': 0.7435539477246135, 'eval_precision': 0.8318840579710145, 'eval_recall': 0.6093418259023354, 'eval_runtime': 0.1136, 'eval_samples_per_second': 16639.943, 'eval_steps_per_second': 1047.146, 'epoch': 1.0}


  4%|▍         | 938/23650 [00:03<01:12, 312.79it/s]
  4%|▍         | 970/23650 [00:03<01:56, 194.38it/s]

{'eval_loss': 0.48852044343948364, 'eval_accuracy': 0.7514542570068746, 'eval_f1': 0.7528916929547845, 'eval_auc': 0.7514860877132931, 'eval_precision': 0.7458333333333333, 'eval_recall': 0.7600849256900213, 'eval_runtime': 0.1107, 'eval_samples_per_second': 17080.271, 'eval_steps_per_second': 1074.856, 'epoch': 2.0}


  6%|▌         | 1396/23650 [00:05<01:06, 332.66it/s]
  6%|▌         | 1430/23650 [00:05<01:47, 207.02it/s]

{'eval_loss': 0.4398241639137268, 'eval_accuracy': 0.7948175568482284, 'eval_f1': 0.7844444444444445, 'eval_auc': 0.7946503079563022, 'eval_precision': 0.8228438228438228, 'eval_recall': 0.7494692144373672, 'eval_runtime': 0.1078, 'eval_samples_per_second': 17543.14, 'eval_steps_per_second': 1103.984, 'epoch': 3.0}


  8%|▊         | 1860/23650 [00:06<01:04, 337.24it/s]
  8%|▊         | 1922/23650 [00:07<01:36, 224.55it/s]

{'eval_loss': 0.4376523196697235, 'eval_accuracy': 0.7937599153886833, 'eval_f1': 0.7724620770128354, 'eval_auc': 0.7934242995755953, 'eval_precision': 0.8575129533678757, 'eval_recall': 0.70276008492569, 'eval_runtime': 0.1047, 'eval_samples_per_second': 18061.441, 'eval_steps_per_second': 1136.6, 'epoch': 4.0}


 10%|▉         | 2357/23650 [00:08<01:05, 323.55it/s]
 10%|█         | 2390/23650 [00:08<01:45, 200.62it/s]

{'eval_loss': 0.4150914251804352, 'eval_accuracy': 0.7985193019566367, 'eval_f1': 0.7804034582132564, 'eval_auc': 0.7982248606757811, 'eval_precision': 0.8537200504413619, 'eval_recall': 0.7186836518046709, 'eval_runtime': 0.1086, 'eval_samples_per_second': 17408.832, 'eval_steps_per_second': 1095.532, 'epoch': 5.0}


 12%|█▏        | 2829/23650 [00:10<01:06, 313.52it/s]
 12%|█▏        | 2861/23650 [00:10<01:47, 194.24it/s]

{'eval_loss': 0.46151089668273926, 'eval_accuracy': 0.7932310946589106, 'eval_f1': 0.7651651651651652, 'eval_auc': 0.792799549866996, 'eval_precision': 0.8810511756569848, 'eval_recall': 0.6762208067940552, 'eval_runtime': 0.1125, 'eval_samples_per_second': 16814.063, 'eval_steps_per_second': 1058.103, 'epoch': 6.0}


 14%|█▍        | 3297/23650 [00:11<00:58, 345.78it/s]
 14%|█▍        | 3332/23650 [00:12<01:35, 212.55it/s]

{'eval_loss': 0.4040926694869995, 'eval_accuracy': 0.8254891591750396, 'eval_f1': 0.8252118644067797, 'eval_auc': 0.8254945981802276, 'eval_precision': 0.8234672304439746, 'eval_recall': 0.826963906581741, 'eval_runtime': 0.1054, 'eval_samples_per_second': 17941.324, 'eval_steps_per_second': 1129.042, 'epoch': 7.0}


 16%|█▌        | 3774/23650 [00:13<01:01, 321.25it/s]
 16%|█▌        | 3807/23650 [00:13<01:36, 206.52it/s]

{'eval_loss': 0.4192061722278595, 'eval_accuracy': 0.8202009518773136, 'eval_f1': 0.8227320125130344, 'eval_auc': 0.820265046008873, 'eval_precision': 0.8084016393442623, 'eval_recall': 0.8375796178343949, 'eval_runtime': 0.102, 'eval_samples_per_second': 18532.718, 'eval_steps_per_second': 1166.258, 'epoch': 8.0}


 18%|█▊        | 4249/23650 [00:15<00:57, 334.70it/s]
 18%|█▊        | 4283/23650 [00:15<01:35, 203.52it/s]

{'eval_loss': 0.4224974811077118, 'eval_accuracy': 0.8180856689582232, 'eval_f1': 0.8084632516703787, 'eval_auc': 0.8179109085661742, 'eval_precision': 0.8501170960187353, 'eval_recall': 0.7707006369426752, 'eval_runtime': 0.1074, 'eval_samples_per_second': 17611.348, 'eval_steps_per_second': 1108.276, 'epoch': 9.0}


 20%|█▉        | 4721/23650 [00:16<00:56, 335.99it/s]
 20%|██        | 4755/23650 [00:17<01:31, 205.66it/s]

{'eval_loss': 0.42359766364097595, 'eval_accuracy': 0.8291909042834479, 'eval_f1': 0.823593664664118, 'eval_auc': 0.8290848115907011, 'eval_precision': 0.8481439820022497, 'eval_recall': 0.8004246284501062, 'eval_runtime': 0.1063, 'eval_samples_per_second': 17781.958, 'eval_steps_per_second': 1119.013, 'epoch': 10.0}


 22%|██▏       | 5199/23650 [00:18<00:56, 324.48it/s]
 22%|██▏       | 5203/23650 [00:18<00:56, 324.48it/s]

{'eval_loss': 0.4150660037994385, 'eval_accuracy': 0.8244315177154945, 'eval_f1': 0.8173817381738175, 'eval_auc': 0.8242999111815095, 'eval_precision': 0.8481735159817352, 'eval_recall': 0.7887473460721869, 'eval_runtime': 0.1171, 'eval_samples_per_second': 16142.149, 'eval_steps_per_second': 1015.82, 'epoch': 11.0}


 24%|██▍       | 5675/23650 [00:20<00:52, 340.53it/s]
 24%|██▍       | 5676/23650 [00:20<00:52, 340.53it/s]

{'eval_loss': 0.44658663868904114, 'eval_accuracy': 0.8281332628239026, 'eval_f1': 0.8232735182164219, 'eval_auc': 0.8280428163291789, 'eval_precision': 0.8439241917502787, 'eval_recall': 0.8036093418259024, 'eval_runtime': 0.1131, 'eval_samples_per_second': 16720.132, 'eval_steps_per_second': 1052.192, 'epoch': 12.0}


 26%|██▌       | 6145/23650 [00:21<00:53, 325.36it/s]
 26%|██▌       | 6149/23650 [00:21<00:53, 325.36it/s]

{'eval_loss': 0.43658632040023804, 'eval_accuracy': 0.8191433104177683, 'eval_f1': 0.8097886540600667, 'eval_auc': 0.8189724796914397, 'eval_precision': 0.8504672897196262, 'eval_recall': 0.772823779193206, 'eval_runtime': 0.1083, 'eval_samples_per_second': 17453.615, 'eval_steps_per_second': 1098.35, 'epoch': 13.0}


 28%|██▊       | 6609/23650 [00:23<00:50, 337.21it/s]
 28%|██▊       | 6643/23650 [00:23<01:22, 207.10it/s]

{'eval_loss': 0.438464492559433, 'eval_accuracy': 0.8328926493918561, 'eval_f1': 0.8302900107411386, 'eval_auc': 0.8328472926021133, 'eval_precision': 0.8402173913043478, 'eval_recall': 0.8205944798301487, 'eval_runtime': 0.1082, 'eval_samples_per_second': 17472.032, 'eval_steps_per_second': 1099.509, 'epoch': 14.0}


 30%|██▉       | 7075/23650 [00:25<00:49, 336.54it/s]
 30%|███       | 7109/23650 [00:25<01:19, 207.43it/s]

{'eval_loss': 0.44370487332344055, 'eval_accuracy': 0.8365943945002644, 'eval_f1': 0.8344938403856453, 'eval_auc': 0.8365588763677936, 'eval_precision': 0.8421621621621621, 'eval_recall': 0.826963906581741, 'eval_runtime': 0.1039, 'eval_samples_per_second': 18204.545, 'eval_steps_per_second': 1145.606, 'epoch': 15.0}


 32%|███▏      | 7554/23650 [00:26<00:47, 340.83it/s]
 32%|███▏      | 7589/23650 [00:27<01:17, 208.12it/s]

{'eval_loss': 0.45774608850479126, 'eval_accuracy': 0.8323638286620836, 'eval_f1': 0.831651619755709, 'eval_auc': 0.8323595739397153, 'eval_precision': 0.8320935175345378, 'eval_recall': 0.8312101910828026, 'eval_runtime': 0.1092, 'eval_samples_per_second': 17321.01, 'eval_steps_per_second': 1090.005, 'epoch': 16.0}


 34%|███▍      | 8024/23650 [00:28<00:48, 320.72it/s]
 34%|███▍      | 8041/23650 [00:28<00:55, 279.33it/s]


{'eval_loss': 0.4620455503463745, 'eval_accuracy': 0.8318350079323109, 'eval_f1': 0.8312101910828026, 'eval_auc': 0.8318327035498312, 'eval_precision': 0.8312101910828026, 'eval_recall': 0.8312101910828026, 'eval_runtime': 0.1044, 'eval_samples_per_second': 18117.302, 'eval_steps_per_second': 1140.116, 'epoch': 17.0}
{'train_runtime': 28.7852, 'train_samples_per_second': 13136.946, 'train_steps_per_second': 821.602, 'train_loss': 0.3387091230684927, 'epoch': 17.0}


100%|██████████| 119/119 [00:00<00:00, 1092.29it/s]

{'eval_loss': 0.4040926694869995, 'eval_accuracy': 0.8254891591750396, 'eval_f1': 0.8252118644067797, 'eval_auc': 0.8254945981802276, 'eval_precision': 0.8234672304439746, 'eval_recall': 0.826963906581741, 'eval_runtime': 0.1119, 'eval_samples_per_second': 16892.273, 'eval_steps_per_second': 1063.025, 'epoch': 17.0}





In [33]:
# first CNN
# 'eval_loss': 0.4162973463535309, 'eval_accuracy': 0.8286620835536753, 'eval_f1': 0.8205980066445182


100%|██████████| 250/250 [00:00<00:00, 1069.73it/s]


{'eval_loss': 0.6834174990653992,
 'eval_accuracy': 0.522,
 'eval_f1': 0.0020876826722338207,
 'eval_auc': 0.5005224660397074,
 'eval_precision': 1.0,
 'eval_recall': 0.0010449320794148381,
 'eval_runtime': 0.2357,
 'eval_samples_per_second': 8485.118,
 'eval_steps_per_second': 1060.64,
 'epoch': 9.0}