In [1]:
# Importing libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)
import joblib

In [2]:
# We want to use the GPU if possible

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
device

'cuda'

In [4]:
# Load the data as saved in previous section, before tokenization

data = pd.read_pickle('data.pkl')

In [5]:
data.head()

Unnamed: 0,Id,Title,Body,Tag,NoTags
0,80,sqlstatement.execute() - multiple queries in o...,<p>i've written a database generation script i...,"[flex, actionscript-3, air]",3
1,90,good branching and merging tutorials for torto...,<p>are there any really good tutorials explain...,"[svn, tortoisesvn, branch, branching-and-merging]",4
2,120,asp.net site maps,<p>has anyone got experience creating <strong>...,"[sql, asp.net, sitemap]",3
3,180,function for creating color wheels,<p>this is something i've pseudo-solved many t...,"[algorithm, language-agnostic, colors, color-s...",4
4,260,adding scripting functionality to .net applica...,<p>i have a little game written in c#. it uses...,"[c#, .net, scripting, compiler-construction]",4


In [6]:
# We will combine the title and the body as we did in the previous implementations
# However, tokenization will be implemented as part of the training procedure with the data loader

data_only = data.copy()
data_only = data_only.drop(['Id', 'NoTags'], axis=1)
data_only['Text'] = data_only[['Title', 'Body']].apply("".join, axis=1)
data_only = data_only.drop(['Title', 'Body'], axis=1)
data_only.head()

Unnamed: 0,Tag,Text
0,"[flex, actionscript-3, air]",sqlstatement.execute() - multiple queries in o...
1,"[svn, tortoisesvn, branch, branching-and-merging]",good branching and merging tutorials for torto...
2,"[sql, asp.net, sitemap]",asp.net site maps<p>has anyone got experience ...
3,"[algorithm, language-agnostic, colors, color-s...",function for creating color wheels<p>this is s...
4,"[c#, .net, scripting, compiler-construction]",adding scripting functionality to .net applica...


In [7]:
# Define metric for validation purposes 

def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [8]:
most_common_tags = pd.read_pickle('most_common_tags.pkl')

In [9]:
# We want to keep only the top-50 tags due to resources limitations

df_new = data_only.copy()

for tag in most_common_tags:
    tmp_tag = []
    for inst in data_only['Tag']:
        if tag in inst:
            tmp_tag.append(int(1))
        else:
            tmp_tag.append(int(0))
    data_only[tag] = tmp_tag

In [10]:
data_only.head()

Unnamed: 0,Tag,Text,javascript,java,c#,php,android,jquery,python,html,...,spring,wordpress,eclipse,html5,multithreading,oracle,git,facebook,forms,bash
0,"[flex, actionscript-3, air]",sqlstatement.execute() - multiple queries in o...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[svn, tortoisesvn, branch, branching-and-merging]",good branching and merging tutorials for torto...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"[sql, asp.net, sitemap]",asp.net site maps<p>has anyone got experience ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[algorithm, language-agnostic, colors, color-s...",function for creating color wheels<p>this is s...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"[c#, .net, scripting, compiler-construction]",adding scripting functionality to .net applica...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Combine all labels in one list

source_col_loc = data_only.columns.get_loc('Text') # column position starts from 0

data_only['Target'] = data_only.iloc[:,source_col_loc+1:].apply(
    lambda x: ",".join(x.astype(str)), axis=1)

In [12]:
df = data_only[['Text', 'Target']].copy()

In [13]:
new_df = pd.DataFrame()
new_df['text'] = df['Text']
new_df['labels'] = df.iloc[:, 1:].values.tolist()

In [14]:
new_df.head()

Unnamed: 0,text,labels
0,sqlstatement.execute() - multiple queries in o...,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
1,good branching and merging tutorials for torto...,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
2,asp.net site maps<p>has anyone got experience ...,"[0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0..."
3,function for creating color wheels<p>this is s...,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
4,adding scripting functionality to .net applica...,"[0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0..."


In [None]:
# We want to keep only the questions that include one of the top 50 tags, due to resources limitations
# ATTENTION: time consuming process. You may continue to load the data in the following cells.

df2 = new_df.copy()

for ind in new_df.index:
    st = new_df.loc[ind,'labels'][0].split(",")
    if "1" not in st:
        df2 = df2.drop(index = ind)
        print(st, ind)

joblib.dump(df2, 'data_non_zero.pkl')

In [42]:
data_non_zero = pd.DataFrame(pd.read_pickle('data_non_zero.pkl').T, columns=['text', 'labels'])

In [43]:
data_non_zero.head()

Unnamed: 0,text,labels
0,asp.net site maps<p>has anyone got experience ...,"[0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0..."
1,adding scripting functionality to .net applica...,"[0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0..."
2,should i use nested classes in this case?<p>i ...,"[0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
3,homegrown consumption of web services<p>i've b...,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0..."
4,deploying sql server databases from test to li...,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0..."


In [None]:
# !!! This is also a time consuming cell.
# Even though we have attained all labels in one list, they are in the form of strings, yet the model needs it be
# integers

tm = pd.DataFrame()

for ind in data_non_zero.index:
    print(ind)
    tmp = data_non_zero['labels'][ind][0]
    tmp = tmp.split(",")
    tmp = list(map(int, tmp))
    print(tmp)
    tm[ind] = tmp

d = tm.T
dat = data_non_zero.copy()
for ind in dat.index:
    print(ind)
    l = dat.loc[ind, 'labels']
    #print(l)
    #for j in d.iloc[ind]:
    #    l.append(j)
    dat.loc[ind, 'labels'] = [eval(i) for i in l]

In [None]:
dat
data = dat.copy()

In [None]:
joblib.dump(data, 'data_intlabels.pkl')

In [4]:
data = pd.DataFrame(pd.read_pickle('data_intlabels.pkl').T, columns=['text', 'labels'])

In [5]:
data.head()

Unnamed: 0,text,labels
0,asp.net site maps<p>has anyone got experience ...,"[(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,..."
1,adding scripting functionality to .net applica...,"[(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,should i use nested classes in this case?<p>i ...,"[(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,..."
3,homegrown consumption of web services<p>i've b...,"[(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,deploying sql server databases from test to li...,"[(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [6]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 128
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

In [7]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index][0], dtype=torch.float)
        }

In [8]:
# Creating the dataset and dataloader for the neural network

train_size = 0.7
train_data=data.sample(frac=train_size,random_state=42)
test_data=data.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (986641, 2)
TRAIN Dataset: (690649, 2)
TEST Dataset: (295992, 2)


In [9]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [10]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the 
# final output for the model

class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 50)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = DistilBERTClass()
model.to(device)

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

In [11]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [12]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [13]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        #print(targets.shape)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        loss.backward()
        optimizer.step()

In [None]:
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
1it [00:00,  2.29it/s]

Epoch: 0, Loss:  0.7146694660186768


5003it [08:07, 10.50it/s]

Epoch: 0, Loss:  0.07199488580226898


10003it [16:07, 10.52it/s]

Epoch: 0, Loss:  0.03323834761977196


15003it [24:07, 10.54it/s]

Epoch: 0, Loss:  0.057408809661865234


20003it [32:07, 10.49it/s]

Epoch: 0, Loss:  0.034252118319272995


25003it [40:09, 10.51it/s]

Epoch: 0, Loss:  0.04090392589569092


30002it [48:11, 10.41it/s]

Epoch: 0, Loss:  0.030357569456100464


35003it [56:10, 10.61it/s]

Epoch: 0, Loss:  0.049699027091264725


40003it [1:04:08, 10.49it/s]

Epoch: 0, Loss:  0.030621962621808052


45003it [1:12:06, 10.56it/s]

Epoch: 0, Loss:  0.04128674790263176


50003it [1:20:06, 10.60it/s]

Epoch: 0, Loss:  0.02162994258105755


55003it [1:28:05, 10.66it/s]

Epoch: 0, Loss:  0.02819053828716278


60003it [1:36:05, 10.61it/s]

Epoch: 0, Loss:  0.044776801019907


65003it [1:44:01, 10.64it/s]

Epoch: 0, Loss:  0.050213154405355453


68298it [1:49:19,  9.48it/s]

In [None]:
def validation(testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
outputs, targets = validation(testing_loader)

final_outputs = np.array(outputs) >=0.5

In [None]:
# Validation of the model with a split 70/30 for training/testing

val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

acc = accuracy_score(np.array(targets), np.array(final_outputs))
f1 = f1_score(np.array(targets), np.array(final_outputs), average="micro")

print('Results:\nAccuracy = {:.2f}\nF1-score = {:.2f}\nHamming Loss = {:.2f}\nHamming Score = {:.2f}'.format(acc,\
                                                                        f1, val_hamming_loss, val_hamming_score))

In [None]:
# Saving the files for inference

output_model_file = 'pytorch_distilbert_70_batch8_lr15.bin'
output_vocab_file = 'vocab_distilbert_70_batch8_lr15.bin'

torch.save(model, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('Saved')

In [None]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 128
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 1
LEARNING_RATE = 1e-04
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

In [None]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index][0], dtype=torch.float)
        }

In [None]:
# Creating the dataset and dataloader for the neural network

train_size = 0.7
train_data=data.sample(frac=train_size,random_state=42)
test_data=data.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the 
# final output for the model

class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 50)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = DistilBERTClass()
model.to(device)

In [None]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [None]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        #print(targets.shape)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        loss.backward()
        optimizer.step()

In [None]:
for epoch in range(EPOCHS):
    train(epoch)

In [None]:
def validation(testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
outputs, targets = validation(testing_loader)

final_outputs = np.array(outputs) >=0.5

In [None]:
# Validation of the model with a split 70/30 for training/testing

val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

acc = accuracy_score(np.array(targets), np.array(final_outputs))
f1 = f1_score(np.array(targets), np.array(final_outputs), average="micro")

print('Results:\nAccuracy = {:.2f}\nF1-score = {:.2f}\nHamming Loss = {:.2f}\nHamming Score = {:.2f}'.format(acc,\
                                                                        f1, val_hamming_loss, val_hamming_score))

In [None]:
# Saving the files for inference

output_model_file = 'pytorch_distilbert_70_batch8_lr14.bin'
output_vocab_file = 'vocab_distilbert_70_batch8_lr14.bin'

torch.save(model, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('Saved')