In [None]:
%pip install - U pip setuptools wheel
%pip install tqdm
%pip install torch torchvision torchaudio
!export CUDA_PATH = "/opt/nvidia/cuda"
%pip install - U spacy[cuda11X, transformers]
%pip install transformers[sentencepiece]

In [1]:
!python3 -m spacy download en_core_web_lg -qq
!python3 -m spacy download en_core_web_trf -qq

2023-03-23 11:01:55.816882: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-23 11:01:57.369844: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-23 11:01:57.370214: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-23 11:01:57.370365: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least on

In [1]:
import json
from tqdm import tqdm
import spacy
from thinc.api import set_gpu_allocator, require_gpu
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score, jaccard_score

with open('selected_tags.json', 'r') as openfile:
    selected_tags = json.load(openfile)


2023-03-23 16:23:57.565946: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-23 16:23:59.241489: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-23 16:23:59.241867: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-23 16:23:59.242017: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least on

In [2]:
import pandas as pd
sample_train = pd.read_csv(
    './train.csv', converters={'tag_set': eval, 'tag_list': eval})
sample_test = pd.read_csv(
    './test.csv', converters={'tag_set': eval, 'tag_list': eval})
sample_validation = pd.read_csv(
    './validation.csv', converters={'tag_set': eval, 'tag_list': eval})


In [3]:


# Use the GPU, with memory allocations directed via PyTorch.
# This prevents out-of-memory errors that would otherwise occur from competing
# memory pools.
set_gpu_allocator("pytorch")
require_gpu(0)


def preprocess(texts, nlp):
    removal = ['ADV', 'PRON', 'CCONJ', 'PUNCT',
               'PART', 'DET', 'ADP', 'SPACE', 'NUM', 'SYM']
    tokens = []
    cleaned_texts = []
    print("preprocessing")
    for summary in tqdm(nlp.pipe(texts, disable=["tok2vec"]), total=len(texts)):
        question_tokens = []
        for token in summary:
            if token.pos_ not in removal and not token.is_stop and token.is_alpha:
                question_tokens.append(token.lemma_)
        cleaned_texts.append(" ".join(question_tokens))
    # question_tokens = [token.lemma_ for token in summary if token.pos_ not in removal and not token.is_stop and token.is_alpha]
    return cleaned_texts

def preprocess_transformers(texts):
    tokens = []
    removal = [ 'PUNCT', 'SPACE', 'NUM', 'SYM']
    cleaned_texts = []
    print("preprocessing")
    for summary in tqdm(nlp.pipe(texts, disable=["transformer", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"]), total=len(texts)):
        question_tokens = []
        for token in summary:
            if token.pos_ not in removal and token.is_alpha and len(question_tokens)<512:
                question_tokens.append(token.lower_)
        cleaned_texts.append(" ".join(question_tokens))
    # question_tokens = [token.lemma_ for token in summary if token.pos_ not in removal and not token.is_stop and token.is_alpha]
    return cleaned_texts


def convert(data, outfile, nlp):
    db = spacy.tokens.DocBin()
    docs = []
    print("converting")
    for doc, labels in tqdm(nlp.pipe(data.values, as_tuples=True), total=len(data)):
        for tag in selected_tags:
            doc.cats[tag] = tag in labels
        db.add(doc)
    db.to_disk(outfile)

def evaluate_predictions(y_val, y_pred_text, y_pred_code, mlb, model_name):
    y_union = (y_pred_text.astype(np.bool_) |
               y_pred_code.astype(np.bool_)).astype(np.int_)

    y_intersec = (y_pred_text.astype(np.bool_) &
                y_pred_code.astype(np.bool_)).astype(np.int_)
    results = {
        "text": {
            "roc_per_tags": list(
                zip(mlb.classes_, roc_auc_score(y_val, y_pred_text, average=None))),
            "roc_macro": roc_auc_score(y_val, y_pred_text, average='macro'),
            "jaccard": jaccard_score(y_val, y_pred_text, average='samples')
        },
        "code": {
            "roc_per_tags": list(zip(mlb.classes_, roc_auc_score(y_val, y_pred_code, average=None))),
            "roc_macro": roc_auc_score(y_val, y_pred_code, average='macro'),
            "jaccard": jaccard_score(y_val, y_pred_code, average='samples')
        },
        "union": {
            "roc_per_tags": list(zip(mlb.classes_, roc_auc_score(y_union, y_pred_code, average=None))),
            "roc_macro": roc_auc_score(y_union, y_pred_code, average='macro'),
            "jaccard":  jaccard_score(y_union, y_pred_code, average='samples')
        },
        "intersection": {
            "roc_per_tags": list(zip(mlb.classes_, roc_auc_score(y_intersec, y_pred_code, average=None))),
            "roc_macro": roc_auc_score(y_intersec, y_pred_code, average='macro'),
            "jaccard": jaccard_score(y_intersec, y_pred_code, average='samples')
        },
    }

    for i in ["text", "code", "union", "intersection"]:
        print(f"======= {i} ==========")
        print("Roc auc for each tag:")
        print(results[i]["roc_per_tags"])
        print(f"Roc auc macro average: {results[i]['roc_macro']:.3f}")
        print(f"Jaccard score sample average: {results[i]['jaccard']:.3f}")

    with open(f"{model_name}_results.json", "w") as outfile:
        json.dump(results, outfile)


### BOW

In [None]:
nlp = spacy.load('en_core_web_lg')

#### Text

In [None]:
train_df = sample_train[~sample_train.text.isna()].loc[:]
train_df["text_processed"] = preprocess(train_df.text, nlp)
convert(train_df.loc[:, ["text_processed", "tag_list"]],
        'text_train_bow.spacy', nlp)
test_df = sample_test[~sample_test.text.isna()].loc[:]
test_df["text_processed"] = preprocess(test_df.text, nlp)
convert(test_df.loc[:, ["text_processed", "tag_list"]],
        'text_test_bow.spacy', nlp)
validation_df = sample_validation[~sample_validation.text.isna()].loc[:]
validation_df["text_processed"] = preprocess(
    validation_df.text, nlp)
convert(validation_df.loc[:, ["text_processed", "tag_list"]],
        'text_validation_bow.spacy', nlp)


preprocessing


  0%|          | 0/105710 [00:00<?, ?it/s]

converting


  0%|          | 0/105710 [00:00<?, ?it/s]

preprocessing


  0%|          | 0/21648 [00:00<?, ?it/s]

converting


  0%|          | 0/21648 [00:00<?, ?it/s]

preprocessing


  0%|          | 0/14149 [00:00<?, ?it/s]

converting


  0%|          | 0/14149 [00:00<?, ?it/s]

In [None]:
!python3 -m spacy train config_BOW.cfg --output ./output_BOW --paths.train ./text_train_bow.spacy --paths.dev ./text_test_bow.spacy --gpu-id 0

2023-03-17 12:18:13.716779: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-17 12:18:15.191947: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-17 12:18:15.192320: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-17 12:18:15.192476: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least on

#### Code

In [None]:
train_df = sample_train[~sample_train.code.isna()].loc[:]
train_df["code_processed"] = preprocess(train_df.code, nlp)
convert(train_df.loc[:, ["code_processed", "tag_list"]],
        'code_train_bow.spacy', nlp)
test_df = sample_test[~sample_test.code.isna()].loc[:]
test_df["code_processed"] = preprocess(test_df.code, nlp)
convert(test_df.loc[:, ["code_processed", "tag_list"]],
        'code_test_bow.spacy', nlp)
validation_df = sample_validation[~sample_validation.code.isna()].loc[:]
validation_df["code_processed"] = preprocess(
    validation_df.code, nlp)
convert(validation_df.loc[:, ["code_processed", "tag_list"]],
        'code_validation_bow.spacy', nlp)


preprocessing


  0%|          | 0/81619 [00:00<?, ?it/s]

converting


  0%|          | 0/81619 [00:00<?, ?it/s]

preprocessing


  0%|          | 0/16800 [00:00<?, ?it/s]

converting


  0%|          | 0/16800 [00:00<?, ?it/s]

preprocessing


  0%|          | 0/10956 [00:00<?, ?it/s]

converting


  0%|          | 0/10956 [00:00<?, ?it/s]

In [None]:
!python3 -m spacy train config_BOW.cfg --output ./output_code_BOW --paths.train ./code_train_bow.spacy --paths.dev ./code_test_bow.spacy --gpu-id 0

2023-03-17 13:35:08.530678: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-17 13:35:10.172390: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-17 13:35:10.172770: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-17 13:35:10.172917: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least on

### Tok2Vec

#### Text

In [None]:
!python3 -m spacy train config_tok2vec.cfg --output ./output_tok2vec --paths.train ./text_train_bow.spacy --paths.dev ./text_test_bow.spacy --gpu-id 0

2023-03-17 12:20:20.144707: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-17 12:20:21.668324: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-17 12:20:21.671812: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-17 12:20:21.671976: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least on

#### Code

In [None]:
!python3 -m spacy train config_tok2vec.cfg --output ./output_code_tok2vec --paths.train ./code_train_bow.spacy --paths.dev ./code_test_bow.spacy --gpu-id 0

2023-03-17 13:34:36.026104: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-17 13:34:37.612346: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-17 13:34:37.612711: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-17 13:34:37.612871: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least on

### RoBertA


In [7]:
nlp = spacy.load("en_core_web_trf")

loading configuration file /tmp/tmp0lge8o9t/config.json
Model config RobertaConfig {
  "_name_or_path": "/tmp/tmp0lge8o9t/config.json",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json



#### Text

In [None]:
train_df = sample_train[~sample_train.text.isna()].loc[:]
train_df["text_processed"] = preprocess_transformers(train_df.text, nlp)
test_df = sample_test[~sample_test.text.isna()].sample(
    round(len(sample_test)/4))
test_df["text_processed"] = preprocess_transformers(
    test_df.text, nlp)
validation_df = sample_validation[~sample_validation.text.isna()].loc[:]
validation_df["text_processed"] = preprocess_transformers(
    validation_df.text, nlp)
convert(train_df.loc[:, ["text_processed", "tag_list"]],
        'text_train_transformer.spacy', nlp)
convert(test_df.loc[:, ["text_processed",
        "tag_list"]], 'text_test_transformer.spacy', nlp)
convert(validation_df.loc[:, [
        "text_processed", "tag_list"]], 'text_validation_transformer.spacy', nlp)


preprocessing


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5413/5413 [00:42<00:00, 127.96it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5322/5322 [01:00<00:00, 87.44it/s] 


In [None]:
!python3 -m spacy train config_transformer_filled.cfg --output ./output_transformer --paths.train ./text_train_transformer.spacy --paths.dev ./text_test_transformer.spacy --gpu-id 0

2023-03-16 16:03:36.530246: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-16 16:03:38.978276: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-16 16:03:38.978632: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-16 16:03:38.978786: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least on

#### Code

In [None]:
train_df = sample_train[~sample_train.code.isna()].loc[:]
train_df["code_processed"] = preprocess_transformers(train_df.code, nlp)
test_df = sample_test[~sample_test.code.isna()].sample(
    round(len(sample_test)/4))
test_df["code_processed"] = preprocess(test_df.code, nlp)
validation_df = sample_validation[~sample_validation.code.isna()].loc[:]
validation_df["code_processed"] = preprocess(
    validation_df.code, nlp)
convert(train_df.loc[:, ["code_processed", "tag_list"]],
        'code_train_transformer.spacy', nlp)
convert(test_df.loc[:, ["code_processed",
        "tag_list"]], 'code_test_transformer.spacy', nlp)
convert(validation_df.loc[:, [
        "code_processed", "tag_list"]], 'code_validation_transformer.spacy', nlp)


preprocessing


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 81619/81619 [13:16<00:00, 102.47it/s]
 21%|‚ñà‚ñà        | 15681/76005 [01:55<07:08, 140.73it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 76005/76005 [09:12<00:00, 137.64it/s]


In [None]:
!python3 -m spacy train config_transformer_filled.cfg --output ./output_code_transformer --paths.train ./code_train_transformer.spacy --paths.dev ./code_test_transformer.spacy --gpu-id 0

2023-03-17 09:46:03.044001: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-17 09:46:04.556747: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-17 09:46:04.557120: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-17 09:46:04.557277: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least on

### USE

In [4]:
import tensorflow as tf
# import tensorflow_hub as hub
import tensorflow.keras
import os
from tensorflow.keras import backend as K

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import metrics as kmetrics
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

# Bert
import transformers
from transformers import *

os.environ["TF_KERAS"]='1'



In [5]:
print(tf.__version__)
print(tensorflow.__version__)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print(tf.test.is_built_with_cuda())

2.11.0
2.11.0
Num GPUs Available:  1
True


In [6]:
train_df = sample_train.loc[:]
train_df.fillna("", inplace=True)
nlp = spacy.load("en_core_web_trf")
train_df["text_processed"] = preprocess_transformers(train_df.text)
train_df["code_processed"] = preprocess_transformers(train_df.code)


loading configuration file /tmp/tmpkge6q4gf/config.json
Model config RobertaConfig {
  "_name_or_path": "/tmp/tmpkge6q4gf/config.json",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


preprocessing


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 105715/105715 [00:41<00:00, 2525.98it/s]


preprocessing


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 105715/105715 [02:21<00:00, 748.86it/s]


In [7]:
import tensorflow_hub as hub
import torch
import gc
del nlp
gc.collect()
torch.cuda.empty_cache()


In [8]:

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

2023-03-23 16:02:03.580250: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-23 16:02:03.580890: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-23 16:02:03.581110: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-23 16:02:03.581260: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least on

In [9]:
def feature_USE_fct(sentences, b_size) :
    batch_size = b_size
    # time1 = time.time()
    
    for step in tqdm(range(len(sentences)//batch_size+1)):
        idx = step*batch_size
        feat = embed(sentences[idx:idx+batch_size])

        if step ==0 :
            features = feat
        else :
            features = np.concatenate((features,feat))

    # time2 = np.round(time.time() - time1,0)
    return features

In [10]:
batch_size = 10
text_embedded_use = feature_USE_fct(train_df["text_processed"].to_list(), batch_size)
code_embedded_use = feature_USE_fct(
    train_df["code_processed"].to_list(), batch_size)


  0%|          | 0/10572 [00:00<?, ?it/s]2023-03-23 16:02:07.606647: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10572/10572 [04:37<00:00, 38.07it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10572/10572 [04:29<00:00, 39.26it/s]


In [11]:
from sklearn.preprocessing import MultiLabelBinarizer

sample_train.tag_list = sample_train.tag_list.apply(
    lambda tag_list: [tag for tag in tag_list if tag in selected_tags])
sample_test.tag_list = sample_test.tag_list.apply(
    lambda tag_list: [tag for tag in tag_list if tag in selected_tags])
sample_validation.tag_list = sample_validation.tag_list.apply(
    lambda tag_list: [tag for tag in tag_list if tag in selected_tags])

mlb = MultiLabelBinarizer()
mlb.fit([selected_tags])
y_train = mlb.transform(sample_train.tag_list.values)
y_train_sets = sample_train.tag_list.apply(set)



In [14]:
from sklearn.datasets import make_multilabel_classification
import numpy as np
from xgboost import XGBClassifier

text_USE_xgb = XGBClassifier(tree_method="hist")
text_USE_xgb.fit(text_embedded_use, y_train)

code_USE_xgb = XGBClassifier(tree_method="hist")
code_USE_xgb.fit(code_embedded_use, y_train)


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [17]:
nlp = spacy.load("en_core_web_trf")


loading configuration file /tmp/tmprnhwyq9d/config.json
Model config RobertaConfig {
  "_name_or_path": "/tmp/tmprnhwyq9d/config.json",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


In [18]:
val_df = sample_validation.loc[:]
val_df.fillna("", inplace=True)

val_df["text_processed"] = preprocess_transformers(val_df.text)
val_df["code_processed"] = preprocess_transformers(val_df.code)
y_val = mlb.transform(val_df.tag_list.values)
y_val_sets = val_df.tag_list.apply(set)
del nlp
gc.collect()
torch.cuda.empty_cache()

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

val_text_embedded_use = feature_USE_fct(
    val_df["text_processed"].to_list(), batch_size)
val_code_embedded_use = feature_USE_fct(
    val_df["code_processed"].to_list(), batch_size)


preprocessing


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14152/14152 [00:07<00:00, 1889.15it/s]


preprocessing


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14152/14152 [00:20<00:00, 687.83it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1416/1416 [00:12<00:00, 116.62it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1416/1416 [00:09<00:00, 141.64it/s]


In [19]:
y_pred_text = text_USE_xgb.predict(val_text_embedded_use)
y_pred_code = code_USE_xgb.predict(val_code_embedded_use)

evaluate_predictions(y_val, y_pred_text, y_pred_code, mlb, "use")


Roc auc for each tag:
[('.net', 0.7121421294027646), ('android', 0.8668995046865263), ('c', 0.7801462618928802), ('c#', 0.7868051930953087), ('c++', 0.7396842721487711), ('css', 0.8624284280899274), ('html', 0.7622747779890774), ('ios', 0.8181335466102204), ('iphone', 0.6915743761553756), ('java', 0.769999120294799), ('javascript', 0.7866934028833994), ('jquery', 0.7923708044239223), ('node.js', 0.7837720707414619), ('objective-c', 0.7062156565932376), ('php', 0.8208277888464297), ('python', 0.8661887998264991)]
Roc auc macro average: 0.784
Jaccard score sample average: 0.546
Roc auc for each tag:
[('.net', 0.6079369866359082), ('android', 0.7267987716739797), ('c', 0.6997154406754416), ('c#', 0.72292088495904), ('c++', 0.6909361291254534), ('css', 0.7708856056792683), ('html', 0.696739141993119), ('ios', 0.723385505374633), ('iphone', 0.577980187239019), ('java', 0.6930168290270727), ('javascript', 0.7081876017386395), ('jquery', 0.669616830934715), ('node.js', 0.7314298867330988), ('

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Roberta XGB

In [4]:
nlp = spacy.load("en_core_web_trf")
train_df = sample_train.loc[:]
train_df.fillna("", inplace=True)
train_df["text_processed"] = preprocess_transformers(train_df.text)
train_df["code_processed"] = preprocess_transformers(train_df.code)


preprocessing


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 105715/105715 [00:41<00:00, 2545.90it/s]


preprocessing


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 105715/105715 [02:16<00:00, 772.66it/s]


In [6]:
train_df_text = train_df[train_df.text_processed != ""].loc[:]


In [10]:

rob_text_embeds = None

for doc in tqdm(nlp.pipe(train_df_text.text_processed.values), total=len(train_df_text)):
    doc_embed = doc.user_data[('._.', 'trf_data', None, None)
                              ].model_output.last_hidden_state.mean(axis=1)[0]
    if rob_text_embeds is None:
        rob_text_embeds = doc_embed.reshape(1, 768)
    else:
        rob_text_embeds = np.vstack([rob_text_embeds, doc_embed])
   

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20000/20000 [04:04<00:00, 81.78it/s]


In [22]:
rob_text_embeds[2].shape


(768,)

In [13]:

mlb = MultiLabelBinarizer()
mlb.fit([selected_tags])
y_train = mlb.transform(sample_train.tag_list.values)
y_train_sets = sample_train.tag_list.apply(set)




In [17]:
len(rob_text_embeds)



105715

In [18]:
from sklearn.datasets import make_multilabel_classification
import numpy as np
from xgboost import XGBClassifier

text_Roberta_xgb = XGBClassifier(tree_method="hist")
text_Roberta_xgb.fit(np.array(rob_text_embeds.get()), y_train)


AttributeError: 'list' object has no attribute 'get'

In [None]:

text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='tf')
output = model(encoded_input)


In [7]:
from transformers import RobertaTokenizer, TFRobertaModel
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
model = TFRobertaModel.from_pretrained('roberta-large')


2023-03-23 16:29:11.010007: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-23 16:29:11.010675: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-23 16:29:11.010891: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-23 16:29:11.011045: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least on

In [22]:

def feature_Roberta_fct(sentences, b_size) :
    batch_size = b_size
    # time1 = time.time()
    features = None
    for step in tqdm(range(len(sentences)//batch_size+1)):
        idx = step*batch_size
        tensors = tokenizer(sentences[idx:idx+batch_size], return_tensors='tf', padding=True,truncation=True)
        outputs = model(tensors)
        last_hidden_states = outputs.last_hidden_state
        batch_features = np.array(last_hidden_states).mean(axis=1)
        if features is None:
            features = batch_features
        else:
            features = np.concatenate([features, batch_features])

    # time2 = np.round(time.time() - time1,0)
    return features

In [24]:
batch_size = 10
text_embedded_use = feature_Roberta_fct(train_df["text_processed"].to_list(), batch_size)
code_embedded_use = feature_Roberta_fct(
    train_df["code_processed"].to_list(), batch_size)


  0%|          | 0/10572 [00:00<?, ?it/s]


ValueError: zero-dimensional arrays cannot be concatenated

### Bert XGB

In [4]:
train_df = sample_train.loc[:]
train_df.fillna("", inplace=True)
train_df["text_processed"] = preprocess_transformers(train_df.text)
train_df["code_processed"] = preprocess_transformers(train_df.code)


preprocessing


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 105715/105715 [00:41<00:00, 2534.19it/s]


preprocessing


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 105715/105715 [02:24<00:00, 730.41it/s]


In [11]:
import tensorflow_hub as hub
import torch
import gc
from time import time
if model:
    del model
gc.collect()
torch.cuda.empty_cache()

In [7]:
from time import time
# Fonction de pr√©paration des sentences
def bert_inp_fct(sentences, bert_tokenizer, max_length) :
    input_ids=[]
    token_type_ids = []
    attention_mask=[]
    bert_inp_tot = []

    for sent in sentences:
        bert_inp = bert_tokenizer.encode_plus(sent,
                                              add_special_tokens = True,
                                              max_length = max_length,
                                              padding='max_length',
                                              return_attention_mask = True, 
                                              return_token_type_ids=True,
                                              truncation=True,
                                              return_tensors="tf")
    
        input_ids.append(bert_inp['input_ids'][0])
        token_type_ids.append(bert_inp['token_type_ids'][0])
        attention_mask.append(bert_inp['attention_mask'][0])
        bert_inp_tot.append((bert_inp['input_ids'][0], 
                             bert_inp['token_type_ids'][0], 
                             bert_inp['attention_mask'][0]))

    input_ids = np.asarray(input_ids)
    token_type_ids = np.asarray(token_type_ids)
    attention_mask = np.array(attention_mask)
    
    return input_ids, token_type_ids, attention_mask, bert_inp_tot
    

# Fonction de cr√©ation des features
def feature_BERT_fct(model, model_type, sentences, max_length, b_size) :
    batch_size = b_size
    batch_size_pred = b_size
    bert_tokenizer = AutoTokenizer.from_pretrained(model_type)
    time1 = time()

    for step in range(len(sentences)//batch_size +1) :
        idx = step*batch_size
        input_ids, token_type_ids, attention_mask, bert_inp_tot = bert_inp_fct(sentences[idx:idx+batch_size], 
                                                                      bert_tokenizer, max_length)
    
        outputs = model.predict([input_ids, attention_mask, token_type_ids], batch_size=batch_size_pred)
        last_hidden_states = outputs.last_hidden_state

             
        if step ==0 :
            last_hidden_states_tot = last_hidden_states
            last_hidden_states_tot_0 = last_hidden_states
        else :
            last_hidden_states_tot = np.concatenate((last_hidden_states_tot,last_hidden_states))
    
    features_bert = np.array(last_hidden_states_tot).mean(axis=1)
    
    time2 = np.round(time() - time1,0)
    print("temps traitement : ", time2)
     
    return features_bert, last_hidden_states_tot

In [8]:
max_length = 512
batch_size = 10
model_type = 'bert-base-uncased'
model = TFAutoModel.from_pretrained(model_type)

loading configuration file config.json from cache at /home/aurelien/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file tf_model.h5 from cache at /home/aurelien/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb42

In [9]:
# Cr√©ation des features
train_df_text = train_df[train_df.text_processed != ""].loc[:]


In [14]:
train_df_text


Unnamed: 0,Title,Body,Tags,Score,ViewCount,AnswerCount,nbtags,tag_set,tag_list,text,code,text_processed,code_processed
0,Publish/Subscribe reliable messaging: Redis VS...,<h2>Background</h2>\n\n<p>I am making a publis...,<javascript><node.js><redis><rabbitmq><publish...,25,16250,3,5,"{javascript, publish-subscribe, rabbitmq, node...","[javascript, node.js, redis, rabbitmq, publish...",Background\n\n\nI am making a publish/subscrib...,,background i am making a publish subscribe typ...,
1,What is an undefined reference/unresolved exte...,<p>What are undefined reference/unresolved ext...,<c++><linker-errors><undefined-reference><c++-...,1757,871034,39,5,"{c++, unresolved-external, c++-faq, linker-err...","[c++, linker-errors, undefined-reference, c++-...",What are undefined reference/unresolved extern...,,what are undefined reference unresolved extern...,
2,How MediaCodec finds the codec inside the fram...,<p>I am trying to understanding how MediaCodec...,<android><android-internet><stagefright><openm...,7,4538,1,5,"{opencore, stagefright, android-internet, andr...","[android, android-internet, stagefright, openm...",I am trying to understanding how MediaCodec is...,device/ti/omap3evm/media_codecs.xml for an ex...,i am trying to understanding how mediacodec is...,device ti for an example mediacodec codec medi...
3,Can I just inject super class when use dagger2...,<p>I use Dagger2 for DI in my android applicat...,<java><android><dependency-injection><dagger><...,51,23085,3,5,"{dagger, dagger-2, dependency-injection, andro...","[java, android, dependency-injection, dagger, ...",I use Dagger2 for DI in my android application...,BaseActivity,i use for di in my android application i found...,baseactivity
4,NetworkSecurityConfig: No Network Security Con...,<p>I have some problem for android 7.0.0.</p>\...,<java><android><android-studio><android-volley>,10,30564,3,4,"{java, android-volley, android, android-studio}","[java, android, android-studio, android-volley]",I have some problem for android 7.0.0.\n\n\nI ...,"String url_goster = ""http://185.126.217.71/clo...",i have some problem for android i use volley l...,string requestqueue requestqueue stringrequest...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
105710,Fill image inside with color in fabric.js,"<p>Basically, I have a png transparent image a...",<javascript><jquery><css><canvas><fabricjs>,8,2873,1,5,"{javascript, canvas, css, fabricjs, jquery}","[javascript, jquery, css, canvas, fabricjs]","Basically, I have a png transparent image and ...",var canvas = this.__canvas = new fabric.Canvas...,basically i have a png transparent image and i...,var canvas new fabric f fabric function img va...
105711,jquery-ui sortable getting error - PUT 501 (No...,<p>I have a Rails app with jquery-ui sortable ...,<jquery><ruby-on-rails><apache><jquery-ui>,10,1275,5,4,"{apache, jquery-ui, jquery, ruby-on-rails}","[jquery, ruby-on-rails, apache, jquery-ui]",I have a Rails app with jquery-ui sortable lis...,PUT http://ndeavor.ameipro.com/workorders/2 50...,i have a rails app with jquery ui sortable lis...,put not implemented connectwith cursor move up...
105712,UnicodeEncodeError: 'ascii' codec can't encode...,<p>I'm trying to print a string from an <a hre...,<python><unicode><character-encoding><web-scra...,9,14333,1,4,"{web-scraping, unicode, python, character-enco...","[python, unicode, character-encoding, web-scra...",I'm trying to print a string from an \narchive...,print page['html']\nUnicodeEncodeError: 'ascii...,i trying to print a string from an archived we...,print unicodeencodeerror ascii codec ca encode...
105713,How can I rotate a UIImageView with respect to...,"<p>By default, a UIImageView will rotate only ...",<iphone><cocoa-touch><uikit><core-animation>,10,17594,1,4,"{core-animation, uikit, iphone, cocoa-touch}","[iphone, cocoa-touch, uikit, core-animation]","By default, a UIImageView will rotate only abo...",,by default a uiimageview will rotate only abou...,


In [10]:
features_bert, last_hidden_states_tot = feature_BERT_fct(model, model_type, train_df_text.text_processed.values,
                                                         max_length, batch_size)

loading configuration file config.json from cache at /home/aurelien/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/aurelien/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/vocab.t



KeyboardInterrupt: 

## Evaluation

### RoBertA spacy

In [4]:




sample_validation = pd.read_csv(
    './validation.csv', converters={'tag_set': eval, 'tag_list': eval})
sample_validation.text.fillna("", inplace=True)
sample_validation.code.fillna("", inplace=True)
sample_validation.tag_list = sample_validation.tag_list.apply(
    lambda tag_list: [tag for tag in tag_list if tag in selected_tags])

mlb = MultiLabelBinarizer()
mlb.fit([selected_tags])
y = mlb.transform(sample_validation.tag_list.values)
y_sets = sample_validation.tag_list.apply(set)


In [6]:
# del nlp_trf
# gc.collect()
# torch.cuda.empty_cache()
nlp = spacy.load("en_core_web_trf")
val_trf = sample_validation.loc[:]
val_trf["texts_processed"] = preprocess_transformers(val_trf.text)
val_trf['codes_processed'] = preprocess_transformers(val_trf.code)


preprocessing


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14152/14152 [00:07<00:00, 1858.39it/s]


preprocessing


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14152/14152 [00:20<00:00, 690.67it/s]


In [9]:

nlp_text = spacy.load("./output_transformer/model-best")
nlp_code = spacy.load("./output_code_transformer/model-best")
text_cats = []
code_cats = []
for summary in tqdm(nlp_text.pipe(val_trf["texts_processed"].values), total=len(val_trf)):
    text_cats.append(summary.cats)

for summary in tqdm(nlp_code.pipe(val_trf["codes_processed"].values), total=len(val_trf)):
    code_cats.append(summary.cats)

val_trf["text_cats"] = text_cats
val_trf[val_trf.text == ""].text_cats = dict.fromkeys(selected_tags, 0)
val_trf["code_cats"] = code_cats
val_trf[val_trf.code == ""].code_cats = dict.fromkeys(selected_tags, 0)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14152/14152 [02:21<00:00, 99.99it/s] 
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14152/14152 [01:40<00:00, 141.43it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_trf[val_trf.text == ""].text_cats = dict.fromkeys(selected_tags, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_trf[val_trf.code == ""].code_cats = dict.fromkeys(selected_tags, 0)


In [17]:
THRESHOLD = 0.5
y_pred_text = [[x for x in mlb.classes_ if y[x] > THRESHOLD]
               for y in val_trf.text_cats.values]
y_pred_code = [[x for x in mlb.classes_ if y[x] > THRESHOLD]
               for y in val_trf.code_cats.values]

y_pred_text = mlb.transform(y_pred_text)
y_pred_code = mlb.transform(y_pred_code)
evaluate_predictions(y, y_pred_text, y_pred_code, mlb, "roberta")


Roc auc for each tag:
[('.net', 0.7692095799440233), ('android', 0.9080808690792053), ('c', 0.8447096161377827), ('c#', 0.8388032570978465), ('c++', 0.7881483148459298), ('css', 0.913996074936601), ('html', 0.8416098388612389), ('ios', 0.8734087727851785), ('iphone', 0.7788454730954467), ('java', 0.7919521783959212), ('javascript', 0.8453633686623323), ('jquery', 0.8498724053753448), ('node.js', 0.8705711658530503), ('objective-c', 0.793235451364751), ('php', 0.8618515050250596), ('python', 0.9013762832571274)]
Roc auc macro average: 0.842
Jaccard score sample average: 0.643
Roc auc for each tag:
[('.net', 0.7118275048578196), ('android', 0.7014331326810528), ('c', 0.7336554822753112), ('c#', 0.7457083116089966), ('c++', 0.6881793645604458), ('css', 0.7673096924660345), ('html', 0.7189115131655768), ('ios', 0.7489205401739829), ('iphone', 0.7016473996089918), ('java', 0.6927924005318513), ('javascript', 0.7024452194357037), ('jquery', 0.6839158549562087), ('node.js', 0.7737853899124363

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### BOW Spacy

In [18]:

nlp_lg = spacy.load('en_core_web_lg')
val_bow = sample_validation.loc[:]
val_bow["texts_processed"] = preprocess(val_bow.text, nlp_lg)
val_bow['codes_processed'] = preprocess(val_bow.code, nlp_lg)

nlp_text = spacy.load("./output_BOW/model-best")
nlp_code = spacy.load("./output_code_BOW/model-best")
text_cats = []
code_cats = []
for summary in tqdm(nlp_text.pipe(val_bow["texts_processed"].values), total=len(val_bow)):
    text_cats.append(summary.cats)

for summary in tqdm(nlp_code.pipe(val_bow["codes_processed"].values), total=len(val_bow)):
    code_cats.append(summary.cats)

val_bow["text_cats"] = text_cats
val_bow[val_bow.text == ""].text_cats = dict.fromkeys(selected_tags, 0)
val_bow["code_cats"] = code_cats
val_bow[val_bow.code == ""].code_cats = dict.fromkeys(selected_tags, 0)


preprocessing


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14152/14152 [01:18<00:00, 181.37it/s]


preprocessing


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14152/14152 [01:33<00:00, 151.51it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14152/14152 [00:06<00:00, 2233.17it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14152/14152 [00:06<00:00, 2178.66it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_bow[val_bow.text == ""].text_cats = dict.fromkeys(selected_tags, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_bow[val_bow.code == ""].code_cats = dict.fromkeys(selected_tags, 0)


In [19]:
THRESHOLD = 0.5
y_pred_text = [[x for x in selected_tags if y[x] > THRESHOLD]
               for y in val_bow.text_cats.values]
y_pred_code = [[x for x in selected_tags if y[x] > THRESHOLD]
               for y in val_bow.code_cats.values]

y_pred_text = mlb.transform(y_pred_text)
y_pred_code = mlb.transform(y_pred_code)

evaluate_predictions(y, y_pred_text, y_pred_code, mlb, "bow")

Roc auc for each tag:
[('.net', 0.6488900967782152), ('android', 0.8301444824905723), ('c', 0.7419467792574461), ('c#', 0.7319272956228179), ('c++', 0.7019025172772958), ('css', 0.8399595601095461), ('html', 0.736338138683504), ('ios', 0.7948700679334377), ('iphone', 0.6635471627376747), ('java', 0.7256070039936248), ('javascript', 0.744058014673224), ('jquery', 0.7819770185228876), ('node.js', 0.776855033231991), ('objective-c', 0.646231952979652), ('php', 0.7956970717251355), ('python', 0.8240978956325864)]
Roc auc macro average: 0.749
Jaccard score sample average: 0.482
Roc auc for each tag:
[('.net', 0.6013185899615637), ('android', 0.7079430036168805), ('c', 0.67738271317357), ('c#', 0.7018641191800484), ('c++', 0.6596744189582685), ('css', 0.7413250204306504), ('html', 0.6589245157521024), ('ios', 0.6677736187345674), ('iphone', 0.5571416184495195), ('java', 0.6612724150810498), ('javascript', 0.684897964392756), ('jquery', 0.636890254178122), ('node.js', 0.734195023353506), ('ob

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
val_tok2vec = val_bow.loc[:]
nlp_text = spacy.load("./output_tok2vec/model-best")
nlp_code = spacy.load("./output_code_tok2vec/model-best")
text_cats = []
code_cats = []
for summary in tqdm(nlp_text.pipe(val_tok2vec["texts_processed"].values), total=len(val_tok2vec)):
    text_cats.append(summary.cats)

for summary in tqdm(nlp_code.pipe(val_tok2vec["codes_processed"].values), total=len(val_tok2vec)):
    code_cats.append(summary.cats)

val_tok2vec["text_cats"] = text_cats
val_tok2vec[val_tok2vec.text == ""].text_cats = dict.fromkeys(selected_tags, 0)
val_tok2vec["code_cats"] = code_cats
val_tok2vec[val_tok2vec.code == ""].code_cats = dict.fromkeys(selected_tags, 0)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14152/14152 [00:07<00:00, 1959.16it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14152/14152 [00:07<00:00, 1959.62it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_tok2vec[val_tok2vec.text == ""].text_cats = dict.fromkeys(selected_tags, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_tok2vec[val_tok2vec.code == ""].code_cats = dict.fromkeys(selected_tags, 0)


In [21]:
THRESHOLD = 0.5
y_pred_text = [[x for x in selected_tags if y[x] > THRESHOLD]
               for y in val_tok2vec.text_cats.values]
y_pred_code = [[x for x in selected_tags if y[x] > THRESHOLD]
               for y in val_tok2vec.code_cats.values]

y_pred_text = mlb.transform(y_pred_text)
y_pred_code = mlb.transform(y_pred_code)

evaluate_predictions(y, y_pred_text, y_pred_code, mlb, "tok2vec")


Roc auc for each tag:
[('.net', 0.6421738546586514), ('android', 0.8807337192195761), ('c', 0.7965499808922081), ('c#', 0.7380627650712426), ('c++', 0.718745238765006), ('css', 0.8528525867491494), ('html', 0.792017609560249), ('ios', 0.7830064177346702), ('iphone', 0.665392220285253), ('java', 0.771801953182504), ('javascript', 0.8119293888438004), ('jquery', 0.832020834035505), ('node.js', 0.8196238383189313), ('objective-c', 0.6038419139093959), ('php', 0.8302391235253427), ('python', 0.8801803381366468)]
Roc auc macro average: 0.776
Jaccard score sample average: 0.533
Roc auc for each tag:
[('.net', 0.6002668716090207), ('android', 0.7431834972600363), ('c', 0.7388924816042166), ('c#', 0.7260139695923068), ('c++', 0.697964738350655), ('css', 0.807689796929288), ('html', 0.7150599624483918), ('ios', 0.7110989711456355), ('iphone', 0.6276259641200366), ('java', 0.7048169043203596), ('javascript', 0.7214736670564063), ('jquery', 0.675731020901444), ('node.js', 0.7605714176105575), ('o

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
