# LSTM Implementation

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from transformers import TextClassificationPipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, Trainer, TrainingArguments
import torch
import torch.nn as nn
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR
from tqdm import tqdm
from torch.autograd import Variable

In [2]:
# Mount Google Drive (Colab only) so the paths under /content/drive used by the
# later cells can be read from and written to.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Data

In [3]:
# Read the train and test splits from Drive.
train = pd.read_csv('/content/drive/MyDrive/Zeta Test/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Zeta Test/test.csv')

# Split each frame into its text column and its label column.
train_text, train_labels = train['text'], train['category']
test_text, test_labels = test['text'], test['category']

# Plain-list copies of the labels.
train_labels_list = train_labels.tolist()
test_labels_list = test_labels.tolist()

### Randomly select 15% of the dataset to be noised

In [4]:
# Seed NumPy's global random generator for reproducibility.
np.random.seed(42)

# Dn is 15% of the training rows (rounded down), drawn without replacement.
noise_fraction = 0.15
sample_size = int(len(train) * noise_fraction)
sampled_data = train.sample(n=sample_size, replace=False)

# Preview the sampled subset.
sampled_data

Unnamed: 0,text,category
6883,Is it possible for me to change my PIN number?,change_pin
5836,I'm not sure why my card didn't work,declined_card_payment
8601,I don't think my top up worked,top_up_failed
2545,Can you explain why my payment was charged a fee?,card_payment_fee_charged
8697,How long does a transfer from a UK account tak...,balance_not_updated_after_bank_transfer
...,...,...
9001,Is there a charge for exchanging foreign curre...,exchange_charge
6840,Please tell me why the purchase I made online ...,reverted_card_payment?
967,Are you able to make exchanges to EUR?,fiat_currency_support
6463,How do a reverse a duplicated charge?,transaction_charged_twice


### Shuffle 10% of Labels in Dn to Create Noise

In [5]:
# Select 10% of Dn; only these rows have their label replaced.
sampled_rows = sampled_data.sample(frac=0.1, random_state=42)
noisy_index = sampled_rows.index

# 'k' is the selection flag (0 for every row until the selected ones are marked below).
sampled_data['k'] = 0

# 'l' preserves each row's label as it was before any shuffling.
sampled_data['l'] = sampled_data['category']

# Copy the selected labels into an array and permute them among themselves.
# NOTE(review): a permuted label can coincide with the row's original label,
# in which case the row is still flagged k == 1 below.
sampled_category_values = sampled_rows['category'].to_numpy(copy=True)
np.random.shuffle(sampled_category_values)

# Write the permuted labels back, flag the selected rows, and record the
# permuted label in "l'" (the column stays empty for unselected rows).
sampled_data.loc[noisy_index, 'category'] = sampled_category_values
sampled_data.loc[noisy_index, 'k'] = 1
sampled_data.loc[noisy_index, "l'"] = sampled_category_values

# Display the DataFrame to verify the changes
sampled_data.head(25)


Unnamed: 0,text,category,k,l,l'
6883,Is it possible for me to change my PIN number?,change_pin,0,change_pin,
5836,I'm not sure why my card didn't work,declined_card_payment,0,declined_card_payment,
8601,I don't think my top up worked,top_up_failed,0,top_up_failed,
2545,Can you explain why my payment was charged a fee?,card_payment_fee_charged,0,card_payment_fee_charged,
8697,How long does a transfer from a UK account tak...,balance_not_updated_after_bank_transfer,0,balance_not_updated_after_bank_transfer,
5573,Why am I getting declines when trying to make ...,declined_transfer,0,declined_transfer,
576,What is the $1 transaction on my account?,extra_charge_on_statement,0,extra_charge_on_statement,
6832,It looks like my card payment was sent back.,reverted_card_payment?,0,reverted_card_payment?,
7111,Why am I unable to transfer money when I was a...,beneficiary_not_allowed,0,beneficiary_not_allowed,
439,What if there is an error on the exchange rate?,card_payment_wrong_exchange_rate,0,card_payment_wrong_exchange_rate,


## Step 1: Define and Train Deep Model

In [6]:
# Use the GPU when one is present; otherwise fall back to the CPU so the
# notebook still runs end to end.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = 'bert-base-uncased'

from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, random_split, DataLoader

# Tokenize the Dn texts once and read both tensors from the same encoding
# (the two tensors come from one padded batch, so their shapes always agree).
tokenizer = BertTokenizer.from_pretrained(model_name)
input_text = np.array(sampled_data['text'])
encoded_text = tokenizer(input_text.tolist(), padding=True, truncation=True, return_tensors='pt')
input_ids = encoded_text['input_ids']
attention_mask = encoded_text['attention_mask']

# Encode the (partly noised) string labels of Dn as integer class ids.
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(sampled_data['category'])
labels_tensor = torch.tensor(labels_encoded)

# Row i of the dataset corresponds to row i of sampled_data (positional order).
train_dataset = TensorDataset(input_ids, attention_mask, labels_tensor)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
# from safetensors.torch import load_file
# from transformers import BertModel
# file_path = "/content/drive/MyDrive/Zeta Test/model.safetensors"
# loaded = load_file(file_path)

# model = BertModel.from_pretrained(loaded)

In [8]:
# Fine-tune BERT on Dn and record, for every sample and every epoch, the
# softmax probabilities the model produced when that sample was seen in training.

# Load pre-trained model
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=77).to(device_name)

# Define optimizer with Adam
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)

# Define loss function (cross-entropy loss for multi-class classification)
criterion = nn.CrossEntropyLoss()

# Define training parameters
num_epochs = 20
batch_size = 15

# The loader shuffles, so batches arrive in a different order every epoch.
# Carry each sample's row position alongside its tensors so the recorded
# probabilities can be written back to the row they belong to; appending them
# in batch order would misalign them with the DataFrame rows and across epochs.
indexed_dataset = TensorDataset(*train_dataset.tensors, torch.arange(len(train_dataset)))

# DataLoader for training set
train_loader = DataLoader(indexed_dataset, batch_size=batch_size, shuffle=True)

# Initialize a DataFrame to store the probabilities (one column per epoch,
# row i <-> sample i of train_dataset)
probabilities_df = pd.DataFrame()

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    # Slot i holds the probability vector of sample i for this epoch
    epoch_probs = [None] * len(indexed_dataset)

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids, attention_mask, labels, row_positions = batch
        input_ids, attention_mask, labels = input_ids.to(device_name), attention_mask.to(device_name), labels.to(device_name)

        optimizer.zero_grad()
        # Labels are not passed to the model: the loss is computed once, below.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)  # Calculate cross-entropy loss
        total_loss += loss.item()

        # Probabilities from logits, stored at each sample's original position
        probs = torch.softmax(logits.detach(), dim=1).cpu().numpy().tolist()
        for position, sample_probs in zip(row_positions.tolist(), probs):
            epoch_probs[position] = sample_probs

        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

        loss.backward()
        optimizer.step()

    # Calculate F1 score (over the training batches of this epoch)
    f1 = f1_score(all_labels, all_preds, average='macro')

    print(f"Epoch {epoch+1}, Loss: {total_loss}, F1 Score: {f1}")

    # Add the probabilities of this epoch to the DataFrame
    probabilities_df[f'Epoch_{epoch+1}'] = pd.Series(epoch_probs)

# Save the DataFrame to a CSV file
probabilities_df.to_csv('/content/drive/MyDrive/Zeta Test/probabilities.csv', index=False)

# Specify a directory to save the model
save_directory = "/content/drive/MyDrive/Zeta Test"

# Save the model
model.save_pretrained(save_directory)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 100/100 [00:07<00:00, 13.67it/s]


Epoch 1, Loss: 436.5207076072693, F1 Score: 0.005971877515000272


Epoch 2: 100%|██████████| 100/100 [00:06<00:00, 14.88it/s]


Epoch 2, Loss: 423.26161885261536, F1 Score: 0.022046677478987028


Epoch 3: 100%|██████████| 100/100 [00:06<00:00, 14.90it/s]


Epoch 3, Loss: 399.071165561676, F1 Score: 0.06981284015652453


Epoch 4: 100%|██████████| 100/100 [00:06<00:00, 14.87it/s]


Epoch 4, Loss: 372.1530222892761, F1 Score: 0.17719016905604015


Epoch 5: 100%|██████████| 100/100 [00:06<00:00, 14.85it/s]


Epoch 5, Loss: 345.0954554080963, F1 Score: 0.30569606315638237


Epoch 6: 100%|██████████| 100/100 [00:06<00:00, 14.87it/s]


Epoch 6, Loss: 319.6551296710968, F1 Score: 0.3826798591534969


Epoch 7: 100%|██████████| 100/100 [00:06<00:00, 14.84it/s]


Epoch 7, Loss: 294.4683165550232, F1 Score: 0.4725672088162636


Epoch 8: 100%|██████████| 100/100 [00:06<00:00, 14.85it/s]


Epoch 8, Loss: 272.5678584575653, F1 Score: 0.5251786916360897


Epoch 9: 100%|██████████| 100/100 [00:06<00:00, 14.85it/s]


Epoch 9, Loss: 249.65050554275513, F1 Score: 0.5747071425310109


Epoch 10: 100%|██████████| 100/100 [00:06<00:00, 14.86it/s]


Epoch 10, Loss: 230.51611244678497, F1 Score: 0.6144580418021963


Epoch 11: 100%|██████████| 100/100 [00:06<00:00, 14.84it/s]


Epoch 11, Loss: 211.00678145885468, F1 Score: 0.6357894690043469


Epoch 12: 100%|██████████| 100/100 [00:06<00:00, 14.82it/s]


Epoch 12, Loss: 193.98382830619812, F1 Score: 0.6726069907140575


Epoch 13: 100%|██████████| 100/100 [00:06<00:00, 14.83it/s]


Epoch 13, Loss: 177.41622054576874, F1 Score: 0.7100693410379564


Epoch 14: 100%|██████████| 100/100 [00:06<00:00, 14.83it/s]


Epoch 14, Loss: 161.97994911670685, F1 Score: 0.7239220674349007


Epoch 15: 100%|██████████| 100/100 [00:06<00:00, 14.86it/s]


Epoch 15, Loss: 147.69362074136734, F1 Score: 0.7660250268454898


Epoch 16: 100%|██████████| 100/100 [00:06<00:00, 14.82it/s]


Epoch 16, Loss: 134.0820425748825, F1 Score: 0.7987150696103676


Epoch 17: 100%|██████████| 100/100 [00:06<00:00, 14.82it/s]


Epoch 17, Loss: 122.16547805070877, F1 Score: 0.8066375653908314


Epoch 18: 100%|██████████| 100/100 [00:06<00:00, 14.76it/s]


Epoch 18, Loss: 110.0095887184143, F1 Score: 0.8537698528382989


Epoch 19: 100%|██████████| 100/100 [00:06<00:00, 14.81it/s]


Epoch 19, Loss: 99.92438042163849, F1 Score: 0.8673993733807359


Epoch 20: 100%|██████████| 100/100 [00:06<00:00, 14.79it/s]


Epoch 20, Loss: 89.78674864768982, F1 Score: 0.8947054823126404


In [9]:
#probabilities_df

In [10]:
# Put Dn on a fresh 0..N-1 index so it lines up positionally with
# probabilities_df, then append the per-epoch probability columns.
dn_rows = sampled_data.reset_index(drop=True)
df = pd.concat([dn_rows, probabilities_df], axis='columns')

# Persist Dn together with its per-epoch probabilities.
df.to_csv('/content/drive/MyDrive/Zeta Test/Dn.csv')

In [11]:
# Preview Dn with the per-epoch probability columns appended.
df

Unnamed: 0,text,category,k,l,l',Epoch_1,Epoch_2,Epoch_3,Epoch_4,Epoch_5,...,Epoch_11,Epoch_12,Epoch_13,Epoch_14,Epoch_15,Epoch_16,Epoch_17,Epoch_18,Epoch_19,Epoch_20
0,Is it possible for me to change my PIN number?,change_pin,0,change_pin,,"[0.017596982419490814, 0.011292591691017151, 0...","[0.015339870005846024, 0.018180595710873604, 0...","[0.013956976123154163, 0.01207716390490532, 0....","[0.016559969633817673, 0.007663059048354626, 0...","[0.009708592668175697, 0.021347247064113617, 0...",...,"[0.002108408370986581, 0.0427892804145813, 0.0...","[0.004714111797511578, 0.05084671080112457, 0....","[0.005150074604898691, 0.01221607718616724, 0....","[0.0011201190063729882, 0.09532483667135239, 0...","[0.0019628952722996473, 0.02156745083630085, 0...","[0.004532975144684315, 0.003180434927344322, 0...","[0.0011818245984613895, 0.00449873274192214, 0...","[0.0017907038563862443, 0.01809018664062023, 0...","[0.002720340620726347, 0.011524701490998268, 0...","[0.009413721971213818, 0.0012826001038774848, ..."
1,I'm not sure why my card didn't work,declined_card_payment,0,declined_card_payment,,"[0.016443105414509773, 0.012209547683596611, 0...","[0.01825544238090515, 0.015317553654313087, 0....","[0.025913454592227936, 0.008026418276131153, 0...","[0.011958416551351547, 0.009423889219760895, 0...","[0.00926258321851492, 0.00765965273603797, 0.0...",...,"[0.0050051892176270485, 0.0027034180238842964,...","[0.008617078885436058, 0.029112128540873528, 0...","[0.08637460321187973, 0.003748836927115917, 0....","[0.07775922119617462, 0.002238599117845297, 0....","[0.010216276161372662, 0.001649079960770905, 0...","[0.010580139234662056, 0.017141085118055344, 0...","[0.011088299565017223, 0.004380134865641594, 0...","[0.0012887087650597095, 0.038200102746486664, ...","[0.0023473058827221394, 0.007367720827460289, ...","[0.004136967472732067, 0.001312033855356276, 0..."
2,I don't think my top up worked,top_up_failed,0,top_up_failed,,"[0.01947029121220112, 0.01932305097579956, 0.0...","[0.01843833737075329, 0.015543715097010136, 0....","[0.016296086832880974, 0.01277829334139824, 0....","[0.005617840215563774, 0.014309159480035305, 0...","[0.020488377660512924, 0.013077552430331707, 0...",...,"[0.010101639665663242, 0.003454200690612197, 0...","[0.004760201554745436, 0.024325242266058922, 0...","[0.002811416517943144, 0.029746420681476593, 0...","[0.007566061336547136, 0.015182603150606155, 0...","[0.0019014391582459211, 0.0039322213269770145,...","[0.006252109073102474, 0.008859984576702118, 0...","[0.0013780605513602495, 0.07148285955190659, 0...","[0.006003143265843391, 0.0018175517907366157, ...","[0.0011451068567112088, 0.0032889654394239187,...","[0.003842446953058243, 0.001365426927804947, 0..."
3,Can you explain why my payment was charged a fee?,card_payment_fee_charged,0,card_payment_fee_charged,,"[0.021329037845134735, 0.014395676553249359, 0...","[0.016678880900144577, 0.01206692960113287, 0....","[0.01541356835514307, 0.009578829631209373, 0....","[0.031929612159729004, 0.0081322081387043, 0.0...","[0.009091791696846485, 0.02025308832526207, 0....",...,"[0.020257269963622093, 0.0069657485000789165, ...","[0.007275944110006094, 0.0035132700577378273, ...","[0.002273221965879202, 0.5553904175758362, 0.0...","[0.001729318406432867, 0.5659863352775574, 0.0...","[0.0028168915305286646, 0.0036087241023778915,...","[0.0014360223431140184, 0.02069280669093132, 0...","[0.007268973160535097, 0.0014645641203969717, ...","[0.00653681019321084, 0.0013003882486373186, 0...","[0.0006661086808890104, 0.018443893641233444, ...","[0.017832152545452118, 0.013556744903326035, 0..."
4,How long does a transfer from a UK account tak...,balance_not_updated_after_bank_transfer,0,balance_not_updated_after_bank_transfer,,"[0.022299177944660187, 0.01364618819206953, 0....","[0.024539802223443985, 0.014275139197707176, 0...","[0.011640390381217003, 0.014234712347388268, 0...","[0.016225948929786682, 0.015550513751804829, 0...","[0.016097847372293472, 0.005766901187598705, 0...",...,"[0.00667001074180007, 0.0037446231581270695, 0...","[0.006253272760659456, 0.0029494878835976124, ...","[0.008020875975489616, 0.0036042965948581696, ...","[0.001626369426958263, 0.0043387021869421005, ...","[0.007448111195117235, 0.0029312283731997013, ...","[0.001732499455101788, 0.6365871429443359, 0.0...","[0.0040011596865952015, 0.0018391526537016034,...","[0.007135485298931599, 0.002294597215950489, 0...","[0.005217259284108877, 0.007689596153795719, 0...","[0.0034161729272454977, 0.0009704101830720901,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,Is there a charge for exchanging foreign curre...,exchange_charge,0,exchange_charge,,"[0.016516827046871185, 0.016044171527028084, 0...","[0.015838032588362694, 0.010131028480827808, 0...","[0.010525675490498543, 0.02014286443591118, 0....","[0.013128588907420635, 0.02900848537683487, 0....","[0.008452466689050198, 0.024302976205945015, 0...",...,"[0.009038278833031654, 0.012758447788655758, 0...","[0.0023511445615440607, 0.014037592336535454, ...","[0.0019922375213354826, 0.01240798644721508, 0...","[0.14049369096755981, 0.002551162848249078, 0....","[0.007379357237368822, 0.010230351239442825, 0...","[0.0028513174038380384, 0.012512941844761372, ...","[0.010123146697878838, 0.000941310019697994, 0...","[0.00780617818236351, 0.0004544698167592287, 0...","[0.002054964890703559, 0.003138157306239009, 0...","[0.0028187562711536884, 0.018028704449534416, ..."
1496,Please tell me why the purchase I made online ...,reverted_card_payment?,0,reverted_card_payment?,,"[0.0174664668738842, 0.013958094641566277, 0.0...","[0.013852228410542011, 0.01329109724611044, 0....","[0.01888633705675602, 0.01140750665217638, 0.0...","[0.011055628769099712, 0.05144824832677841, 0....","[0.00907959882169962, 0.05813218653202057, 0.0...",...,"[0.012101280502974987, 0.008975489996373653, 0...","[0.0022915543522685766, 0.04788925498723984, 0...","[0.00636605080217123, 0.004554786719381809, 0....","[0.004954896867275238, 0.0032297272700816393, ...","[0.007391395512968302, 0.0027592587284743786, ...","[0.006036272272467613, 0.004965093918144703, 0...","[0.0061466628685593605, 0.00902782566845417, 0...","[0.0029079595115035772, 0.0013240952976047993,...","[0.013390796259045601, 0.008638406172394753, 0...","[0.0015030408976599574, 0.012513690628111362, ..."
1497,Are you able to make exchanges to EUR?,fiat_currency_support,0,fiat_currency_support,,"[0.01561709400266409, 0.012171845883131027, 0....","[0.01619589328765869, 0.010790304280817509, 0....","[0.017064524814486504, 0.012384545058012009, 0...","[0.015338740311563015, 0.008434603922069073, 0...","[0.01107538677752018, 0.005325980018824339, 0....",...,"[0.0035407678224146366, 0.07856910675764084, 0...","[0.006935542449355125, 0.005353423301130533, 0...","[0.003006214974448085, 0.5073041319847107, 0.0...","[0.008720815181732178, 0.002564074704423547, 0...","[0.006758305709809065, 0.014479408971965313, 0...","[0.029692694544792175, 0.0005663140909746289, ...","[0.008305145427584648, 0.0006959637394174933, ...","[0.010666299611330032, 0.00046680914238095284,...","[0.0032751166727393866, 0.00721946032717824, 0...","[0.018419764935970306, 0.003663411131128669, 0..."
1498,How do a reverse a duplicated charge?,transaction_charged_twice,0,transaction_charged_twice,,"[0.01685273088514805, 0.019437383860349655, 0....","[0.010195554234087467, 0.016119321808218956, 0...","[0.015715109184384346, 0.030460460111498833, 0...","[0.0072137825191020966, 0.022003935649991035, ...","[0.008087054826319218, 0.020213140174746513, 0...",...,"[0.22635097801685333, 0.0019353956449776888, 0...","[0.01958252303302288, 0.007858741097152233, 0....","[0.0024690176360309124, 0.01398796122521162, 0...","[0.003749852767214179, 0.012536104768514633, 0...","[0.0020905083511024714, 0.008422612212598324, ...","[0.028335444629192352, 0.001482372754253447, 0...","[0.0034820574801415205, 0.001880554249510169, ...","[0.00817971583455801, 0.009575441479682922, 0....","[0.016914237290620804, 0.0013406905345618725, ...","[0.02034101076424122, 0.013569142669439316, 0...."


In [12]:
import ast

class LSTM(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size=50, num_layers=2, num_classes=2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True: inputs are laid out as (batch, time, features).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for a (batch, time, features) input."""
        # Explicit all-zero initial hidden and cell states on the input's device.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        initial_hidden = torch.zeros(state_shape, device=x.device)
        initial_cell = torch.zeros(state_shape, device=x.device)

        sequence_out, _ = self.lstm(x, (initial_hidden, initial_cell))
        # Classify from the output of the final time step only.
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

def _parse_probability_cell(cell):
    """Return one per-epoch probability vector as a list (strings come from a CSV round-trip)."""
    return ast.literal_eval(cell) if isinstance(cell, str) else list(cell)


def main(df):
    """Train the LSTM noise detector on per-epoch class probabilities.

    Args:
        df: DataFrame with a 'k' column (1 = row selected for label noise,
            0 = untouched) and one probability-vector column per training
            epoch. Columns named 'Epoch_*' are used when present; otherwise
            the last 20 columns are taken, as before. Cells may be lists or
            their string form as read back from CSV.

    Returns:
        The trained LSTM module.
    """
    # Each row becomes a (num_epochs, num_classes) sequence of probabilities.
    epoch_columns = [col for col in df.columns if str(col).startswith('Epoch_')]
    prob_frame = df[epoch_columns] if epoch_columns else df.iloc[:, -20:]
    td = np.array([[_parse_probability_cell(cell) for cell in row]
                   for row in prob_frame.values.tolist()])

    print(td.shape)
    is_noisy = df['k'].values.astype(np.int64)
    td = torch.tensor(td, dtype=torch.float)
    print('Using input type with shape of', td.shape)

    # Run on the GPU when available, otherwise on the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # define model: one input feature per class probability (77 for this data)
    net = LSTM(td.shape[-1]).to(device)
    print('Training detector instanced by', net.__class__.__name__)

    # Convert to tensors
    train_x = td.float()
    train_y = torch.from_numpy(is_noisy).long()

    train_dataset = TensorDataset(train_x, train_y)
    # The tensors are already in memory, so no worker processes are needed.
    train_dataloader = DataLoader(train_dataset, batch_size=15, shuffle=True, num_workers=0)

    # define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(net.parameters(), lr=0.001)
    max_epoch = 8

    for epoch in range(max_epoch):
        net.train()
        loss_sigma = 0.0  # summed batch loss for this epoch
        correct = 0.0
        total = 0.0
        for train_data, train_label in train_dataloader:
            train_data, train_label = train_data.to(device), train_label.to(device)
            out = net(train_data)

            loss = criterion(out, train_label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Running training accuracy from the arg-max class.
            _, predicted = torch.max(out.data, 1)
            total += train_label.size(0)
            correct += (predicted == train_label).sum().item()
            loss_sigma += loss.item()

        print("Training: Epoch[{:0>3}/{:0>3}]  Loss: {:.4f} Acc:{:.2%}".format(
            epoch + 1, max_epoch, loss_sigma, correct / total))

    return net

# Reload Dn (rows plus per-epoch probabilities) from the CSV written earlier;
# after the round-trip the probability cells are strings, which main() parses.
dn_csv_path = '/content/drive/MyDrive/Zeta Test/Dn.csv'
df = pd.read_csv(dn_csv_path)

# Train the detector; as the last expression, the returned module is displayed.
main(df)

(1500, 20, 77)
Using input type with shape of torch.Size([1500, 20, 77])
Training detector instanced by LSTM




Training: Epoch[001/008]  Loss: 37.4203 Acc:90.00%
Training: Epoch[002/008]  Loss: 32.8367 Acc:90.00%
Training: Epoch[003/008]  Loss: 32.7224 Acc:90.00%
Training: Epoch[004/008]  Loss: 32.6507 Acc:90.00%
Training: Epoch[005/008]  Loss: 32.5546 Acc:90.00%
Training: Epoch[006/008]  Loss: 32.7536 Acc:90.00%
Training: Epoch[007/008]  Loss: 32.7115 Acc:90.00%
Training: Epoch[008/008]  Loss: 32.6206 Acc:90.00%


LSTM(
  (lstm): LSTM(77, 50, num_layers=2, batch_first=True)
  (fc): Linear(in_features=50, out_features=2, bias=True)
)

In [13]:
# Dc: every training row that was not drawn into Dn. The complement is taken by
# row index (sampled_data keeps the index labels of train), because matching on
# text would also drop unsampled rows that share a text with a sampled row.
not_sampled = train.loc[~train.index.isin(sampled_data.index)]
not_sampled

Unnamed: 0,text,category
1,What can I do if my card still hasn't arrived ...,card_arrival
2,I have been waiting over a week. Is the card s...,card_arrival
4,"How do I know if I will get my card, or if it ...",card_arrival
5,When did you send me my new card?,card_arrival
6,Do you have info about the card on delivery?,card_arrival
...,...,...
9997,I just moved to the US how do I get a card?,country_support
9998,You provide support in what countries?,country_support
10000,What countries are getting support?,country_support
10001,Are cards available in the EU?,country_support


In [14]:
# Tokenize the Dc texts once and read both tensors from the same encoding
# (the two tensors come from one padded batch, so their shapes always agree).
tokenizer = BertTokenizer.from_pretrained(model_name)
input_text = np.array(not_sampled['text'])
encoded_text = tokenizer(input_text.tolist(), padding=True, truncation=True, return_tensors='pt')
input_ids = encoded_text['input_ids']
attention_mask = encoded_text['attention_mask']

# Encode the string labels of Dc as integer class ids (a fresh encoder is fit here).
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(not_sampled['category'])
labels_tensor = torch.tensor(labels_encoded)

# Row i of the dataset corresponds to row i of not_sampled (positional order).
train_dataset = TensorDataset(input_ids, attention_mask, labels_tensor)

In [15]:
# Fine-tune a fresh BERT on Dc and record, for every sample and every epoch, the
# softmax probabilities the model produced when that sample was seen in training.

# Load pre-trained model
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=77).to(device_name)

# Define optimizer with Adam
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)

# Define loss function (cross-entropy loss for multi-class classification)
criterion = nn.CrossEntropyLoss()

# Define training parameters
num_epochs = 10
batch_size = 15

# The loader shuffles, so batches arrive in a different order every epoch.
# Carry each sample's row position alongside its tensors so the recorded
# probabilities can be written back to the row they belong to; appending them
# in batch order would misalign them with the DataFrame rows and across epochs.
indexed_dataset = TensorDataset(*train_dataset.tensors, torch.arange(len(train_dataset)))

# DataLoader for training set
train_loader = DataLoader(indexed_dataset, batch_size=batch_size, shuffle=True)

# Initialize a DataFrame to store the probabilities (one column per epoch,
# row i <-> sample i of train_dataset)
probabilities_df = pd.DataFrame()

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    # Slot i holds the probability vector of sample i for this epoch
    epoch_probs = [None] * len(indexed_dataset)

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids, attention_mask, labels, row_positions = batch
        input_ids, attention_mask, labels = input_ids.to(device_name), attention_mask.to(device_name), labels.to(device_name)

        optimizer.zero_grad()
        # Labels are not passed to the model: the loss is computed once, below.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)  # Calculate cross-entropy loss
        total_loss += loss.item()

        # Probabilities from logits, stored at each sample's original position
        probs = torch.softmax(logits.detach(), dim=1).cpu().numpy().tolist()
        for position, sample_probs in zip(row_positions.tolist(), probs):
            epoch_probs[position] = sample_probs

        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

        loss.backward()
        optimizer.step()

    # Calculate F1 score (over the training batches of this epoch)
    f1 = f1_score(all_labels, all_preds, average='macro')

    print(f"Epoch {epoch+1}, Loss: {total_loss}, F1 Score: {f1}")

    # Add the probabilities of this epoch to the DataFrame
    probabilities_df[f'Epoch_{epoch+1}'] = pd.Series(epoch_probs)

# Save the DataFrame to a CSV file
#probabilities_df.to_csv('/content/drive/MyDrive/Zeta Test/Dc.csv', index=False)

# Specify a directory to save the model
#save_directory = "/content/drive/MyDrive/Zeta Test"

# Save the model
#model.save_pretrained(save_directory)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 567/567 [00:49<00:00, 11.53it/s]


Epoch 1, Loss: 2161.8797369003296, F1 Score: 0.14420103614833446


Epoch 2: 100%|██████████| 567/567 [00:49<00:00, 11.53it/s]


Epoch 2, Loss: 1447.6734153032303, F1 Score: 0.5348586076406225


Epoch 3: 100%|██████████| 567/567 [00:49<00:00, 11.52it/s]


Epoch 3, Loss: 915.3776234388351, F1 Score: 0.7254103528048778


Epoch 4: 100%|██████████| 567/567 [00:49<00:00, 11.52it/s]


Epoch 4, Loss: 574.4323317408562, F1 Score: 0.8560723626763256


Epoch 5: 100%|██████████| 567/567 [00:49<00:00, 11.41it/s]


Epoch 5, Loss: 360.63695004582405, F1 Score: 0.9168653902953431


Epoch 6: 100%|██████████| 567/567 [00:49<00:00, 11.51it/s]


Epoch 6, Loss: 234.49342696368694, F1 Score: 0.9460634634212065


Epoch 7: 100%|██████████| 567/567 [00:49<00:00, 11.52it/s]


Epoch 7, Loss: 154.71659234166145, F1 Score: 0.9683760234965952


Epoch 8: 100%|██████████| 567/567 [00:49<00:00, 11.51it/s]


Epoch 8, Loss: 103.27357582747936, F1 Score: 0.9816761017820008


Epoch 9: 100%|██████████| 567/567 [00:49<00:00, 11.51it/s]


Epoch 9, Loss: 72.3300967104733, F1 Score: 0.9888379467007031


Epoch 10: 100%|██████████| 567/567 [00:49<00:00, 11.50it/s]

Epoch 10, Loss: 50.2845722027123, F1 Score: 0.9920276732107235





In [16]:
# Put Dc (the rows not sampled into Dn) on a fresh 0..N-1 index so it lines up
# positionally with probabilities_df, then append the per-epoch columns.
dc_rows = not_sampled.reset_index(drop=True)
df_clean = pd.concat([dc_rows, probabilities_df], axis='columns')

# Persist Dc together with its per-epoch probabilities.
df_clean.to_csv('/content/drive/MyDrive/Zeta Test/Dc.csv')

In [17]:
# Reload Dc from disk; the probability cells come back as strings.
df_clean = pd.read_csv('/content/drive/MyDrive/Zeta Test/Dc.csv')

# The last 10 columns are the per-epoch probability vectors. Parse every cell
# back into a list and stack the result into one float tensor.
epoch_cells = df_clean.iloc[:, -10:].values.tolist()
parsed_probabilities = [[ast.literal_eval(cell) for cell in row] for row in epoch_cells]
td_clean = torch.tensor(np.array(parsed_probabilities), dtype=torch.float)

def predict(net, input_data):
    """Return the arg-max class index for every row of input_data.

    Args:
        net: A torch module mapping a batch tensor to per-class scores.
        input_data: Input tensor; it is moved to the device the module's
            parameters live on, so this works on both GPU and CPU.

    Returns:
        A NumPy array of predicted class indices, one per input row.
    """
    net.eval()  # Set the model to evaluation mode

    # Follow the model's device instead of assuming a GPU is present.
    first_param = next(net.parameters(), None)
    if first_param is not None:
        input_data = input_data.to(first_param.device)

    with torch.no_grad():  # No gradients are needed for inference
        outputs = net(input_data)
        _, predicted = torch.max(outputs, 1)
    return predicted.cpu().numpy()

# Train the noise detector on `df` (the Dn frame reloaded from Dn.csv in an
# earlier cell) and keep the returned model for the prediction step.
net = main(df)

(1500, 20, 77)
Using input type with shape of torch.Size([1500, 20, 77])
Training detector instanced by LSTM




Training: Epoch[001/008]  Loss: 38.0017 Acc:86.33%
Training: Epoch[002/008]  Loss: 32.6428 Acc:90.00%
Training: Epoch[003/008]  Loss: 32.6883 Acc:90.00%
Training: Epoch[004/008]  Loss: 32.7544 Acc:90.00%
Training: Epoch[005/008]  Loss: 32.6372 Acc:90.00%
Training: Epoch[006/008]  Loss: 32.7326 Acc:90.00%
Training: Epoch[007/008]  Loss: 32.7065 Acc:90.00%
Training: Epoch[008/008]  Loss: 32.6575 Acc:90.00%


In [18]:
# Score every Dc row with the trained detector.
predictions = predict(net, td_clean)

# Store the detector output next to the original columns. DataFrame.insert
# raises if the column already exists, so overwrite it when this cell is re-run.
if 'predictions' in df_clean.columns:
    df_clean['predictions'] = predictions
else:
    df_clean.insert(3, 'predictions', predictions)
#pd.options.display.max_rows = 500
df_clean.to_csv('/content/drive/MyDrive/Zeta Test/predictions_df.csv')

# Other code below that might not work


---



In [None]:
# Fine-tune BERT on the training set and record the softmax probabilities the
# model produced during each epoch, one DataFrame column per epoch.
# `model_name`, `device_name`, `train_dataset` and `DataLoader` must already be
# defined/imported by earlier cells.

# Load pre-trained model with a 77-way classification head
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=77).to(device_name)

# Define optimizer (AdamW)
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)

# Define loss function (cross-entropy loss for multi-class classification)
criterion = nn.CrossEntropyLoss()

# Define training parameters
num_epochs = 40
batch_size = 15

# DataLoader for training set
# NOTE(review): with shuffle=True the sample order differs between epochs, so
# row i of two different Epoch_* columns below does not refer to the same
# sample. Track sample indices if the columns must be comparable row-wise.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Initialize a DataFrame to store the probabilities
probabilities_df = pd.DataFrame()

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # sum of the per-batch losses over the epoch
    all_preds = []
    all_labels = []

    epoch_probs = []  # one list of class probabilities per sample seen this epoch

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids, attention_mask, labels = (tensor.to(device_name) for tensor in batch)

        optimizer.zero_grad()
        # Labels are not passed to the model: the loss is computed exactly once,
        # by `criterion`, instead of a second time inside the forward pass.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)  # Calculate cross-entropy loss
        total_loss += loss.item()

        probs = torch.softmax(logits, dim=1)  # Calculate probabilities from logits
        epoch_probs.extend(probs.detach().cpu().numpy().tolist())  # one entry per sample

        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

        loss.backward()
        optimizer.step()

    # Macro F1 over the predictions made while training on this epoch's batches
    f1 = f1_score(all_labels, all_preds, average='macro')

    print(f"Epoch {epoch+1}, Loss: {total_loss}, F1 Score: {f1}")

    # Flatten the per-sample probability lists into one 1-D array, so each
    # DataFrame row holds a single probability value (not a per-sample vector).
    epoch_probs_flat = np.concatenate(epoch_probs).ravel()

    # Add the probabilities of this epoch to the DataFrame
    probabilities_df[f'Epoch_{epoch+1}'] = epoch_probs_flat

# Save the DataFrame to a CSV file
probabilities_df.to_csv('/content/drive/MyDrive/Zeta Test/probabilities.csv', index=False)

# Specify a directory to save the model
save_directory = "/content/drive/MyDrive/Zeta Test"

# Save the model
model.save_pretrained(save_directory)


In [None]:
# Fine-tune BERT on the training set and keep, for every epoch, the list of
# per-batch softmax probability arrays produced while training.
# `model_name`, `device_name`, `train_dataset` and `DataLoader` must already be
# defined/imported by earlier cells.

# Load pre-trained model with a 77-way classification head
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=77).to(device_name)

# Define optimizer (AdamW)
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)

# Define loss function (cross-entropy loss for multi-class classification)
criterion = nn.CrossEntropyLoss()

# Define training parameters
num_epochs = 40
batch_size = 15

# DataLoader for training set
# NOTE(review): shuffle=True means batches hold different samples each epoch,
# so probabilities from different epochs are not aligned sample-by-sample.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# One entry per epoch; each entry is that epoch's list of per-batch arrays
probabilities_per_epoch = []

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # sum of the per-batch losses over the epoch
    all_preds = []
    all_labels = []  # reset every epoch, so afterwards it holds the last epoch's labels

    epoch_probs = []  # one (samples in batch, classes) array per batch

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids, attention_mask, labels = (tensor.to(device_name) for tensor in batch)

        optimizer.zero_grad()
        # Labels are not passed to the model: the loss is computed exactly once,
        # by `criterion`, instead of a second time inside the forward pass.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)  # Calculate cross-entropy loss
        total_loss += loss.item()

        probs = torch.softmax(logits, dim=1)  # Calculate probabilities from logits
        epoch_probs.append(probs.detach().cpu().numpy())  # keep the batch as one array

        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

        loss.backward()
        optimizer.step()

    # Macro F1 over the predictions made while training on this epoch's batches
    f1 = f1_score(all_labels, all_preds, average='macro')

    print(f"Epoch {epoch+1}, Loss: {total_loss}, F1 Score: {f1}")

    # Append probabilities for this epoch to the list
    probabilities_per_epoch.append(epoch_probs)

# Stack everything as (epochs, batches, samples per batch, classes).
# NOTE(review): this only yields a regular 4-D array when every batch has the
# same number of rows. If len(train_dataset) is not a multiple of batch_size the
# final batch is shorter and the nesting is ragged (recent NumPy versions raise
# on that) - confirm the dataset size or concatenate per epoch instead.
probabilities_array = np.array(probabilities_per_epoch)


# Specify a directory to save the model
save_directory = "/content/drive/MyDrive/Zeta Test"

# Save the model
model.save_pretrained(save_directory)


In [None]:
# Sanity check on the recorded probabilities: print the array's shape, then
# display the largest class probability of one sample.
print(probabilities_array.shape)
# Indexing order is epoch -> batch -> sample: first epoch, batch index 40,
# first sample of that batch.
max(probabilities_array[0][40][0])

In [None]:
# Highest class probability of every recorded sample, walking the array in
# epoch -> batch -> sample order.
max_probs = [
    max(sample)
    for epoch_batches in probabilities_array
    for batch_samples in epoch_batches
    for sample in batch_samples
]

len(max_probs)

In [None]:
# probabilities_array is laid out as (epochs, batches, samples per batch, classes).
# The axis names below match that layout and avoid overwriting the `batch_size`
# training hyperparameter with the number of batches.
num_epochs, num_batches, samples_per_batch, num_classes = probabilities_array.shape

# Merge the batch and sample axes: (epochs, all samples, classes)
consolidated_probabilities = probabilities_array.reshape(num_epochs, -1, num_classes)

# Largest single probability seen in each epoch, across all samples and classes
max_probs_per_epoch = np.max(consolidated_probabilities, axis=(1, 2))

# Plotting
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(max_probs_per_epoch, color='blue', label='Max Probability per Epoch')
ax.set_title('Max Probabilities over Epochs')
ax.set_xlabel('Epoch')
ax.set_ylabel('Max Probability')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Only the last expression of a cell is displayed, so show both values together:
# the shape of the first recorded batch and the number of batches in the first epoch.
epoch_probs[0].shape, len(probabilities_per_epoch[0])

In [None]:
# Probability the model assigned to each sample's true label, per epoch.
# NOTE(review): `all_labels` holds the labels in the order of the LAST training
# epoch only. Because the loader shuffles, earlier epochs saw the samples in a
# different order, so their statistics are only meaningful if per-epoch labels
# are recorded during training - confirm before relying on them.
labels_in_order = np.asarray(all_labels)

# List to store probabilities of true label per epoch
true_label_probs_per_epoch = []

for batches_in_epoch in probabilities_per_epoch:
    # Join the per-batch (samples, classes) arrays into one (all samples, classes)
    # array so that row i lines up with labels_in_order[i].
    sample_probs = np.concatenate(batches_in_epoch, axis=0)
    # Pick, for every row, the column of its true label (raises if the number
    # of rows and labels differ).
    row_index = np.arange(len(sample_probs))
    true_label_probs_per_epoch.append(sample_probs[row_index, labels_in_order])

# Convert true_label_probs_per_epoch to a NumPy array for easier manipulation
true_label_probs_array = np.array(true_label_probs_per_epoch)

# Calculate mean probability of true label per epoch
mean_true_label_probs_per_epoch = np.mean(true_label_probs_array, axis=1)

# Calculate standard deviation of probability of true label per epoch
std_true_label_probs_per_epoch = np.std(true_label_probs_array, axis=1)

print("Mean probability assigned to the actual true label per epoch:")
print(mean_true_label_probs_per_epoch)

print("Standard deviation of probability assigned to the actual true label per epoch:")
print(std_true_label_probs_per_epoch)


In [None]:
# Probability the model assigned to each sample's true label, per epoch.
# NOTE(review): `all_labels` holds the labels in the order of the LAST training
# epoch only. Because the loader shuffles, earlier epochs saw the samples in a
# different order - confirm the alignment before relying on their statistics.

# List to store probabilities of true label per epoch
true_label_probs_per_epoch = []

for batches_in_epoch in probabilities_per_epoch:
    # Each epoch entry is a list of per-batch (samples, classes) arrays; join
    # them so that row i corresponds to all_labels[i].
    sample_probs = np.concatenate(batches_in_epoch, axis=0)

    true_label_probs = []
    for i, prob in enumerate(sample_probs):
        true_label_index = all_labels[i]  # Get the true label index of sample i
        true_label_prob = prob[true_label_index].item()  # Probability corresponding to the true label
        true_label_probs.append(true_label_prob)
    true_label_probs_per_epoch.append(true_label_probs)

# Convert true_label_probs_per_epoch to a NumPy array for easier manipulation
true_label_probs_array = np.array(true_label_probs_per_epoch)

# Calculate mean probability of true label per epoch
mean_true_label_probs_per_epoch = np.mean(true_label_probs_array, axis=1)

# Calculate standard deviation of probability of true label per epoch
std_true_label_probs_per_epoch = np.std(true_label_probs_array, axis=1)

print("Mean probability assigned to the actual true label per epoch:")
print(mean_true_label_probs_per_epoch)

print("Standard deviation of probability assigned to the actual true label per epoch:")
print(std_true_label_probs_per_epoch)


In [None]:
# Save the trained model into the shared Drive folder.
# NOTE(review): the training cells in this notebook already end with
# save_pretrained on this same directory, so this cell looks redundant - confirm
# before removing it.
model.save_pretrained("/content/drive/MyDrive/Zeta Test")

In [None]:
# Old model with validation set

# Load pre-trained model
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=77).to(device_name)

# Define optimizer with Adam
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)

# Define loss function (cross-entropy loss for multi-class classification)
criterion = nn.CrossEntropyLoss()

# Define training parameters
num_epochs = 100
batch_size = 16

# Split dataset into training and validation sets
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])

# DataLoader for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# List to store probabilities per epoch
probabilities_per_epoch = []

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    epoch_probs = []  # List to store probabilities for this epoch

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device_name), attention_mask.to(device_name), labels.to(device_name)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = criterion(outputs.logits, labels)  # Calculate cross-entropy loss
        total_loss += loss.item()

        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)  # Calculate probabilities from logits
        epoch_probs.append(probs.detach().cpu().numpy())  # Append probabilities to the list for this epoch

        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    all_preds_val = []
    all_labels_val = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device_name), attention_mask.to(device_name), labels.to(device_name)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_preds_val.extend(preds.cpu().tolist())
            all_labels_val.extend(labels.cpu().tolist())

    # Calculate F1 score
    f1 = f1_score(all_labels_val, all_preds_val, average='macro')

    print(f"Epoch {epoch+1}, Loss: {total_loss}, F1 Score: {f1}")

    # Append probabilities for this epoch to the list
    probabilities_per_epoch.append(epoch_probs)

# Convert probabilities_per_epoch to a NumPy array for easier manipulation
probabilities_array = np.array(probabilities_per_epoch)