# Predicting segment types of all segments

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import json
import random
from pprint import pprint
import os
import sys
import pickle

# Make the directory named by the AICOPE_PY_LIB environment variable importable.
# It is appended to sys.path only when set and not already listed there.
AICOPE_PY_LIB = os.environ.get("AICOPE_PY_LIB")
if AICOPE_PY_LIB and AICOPE_PY_LIB not in sys.path:
    sys.path.append(AICOPE_PY_LIB)

import importlib
import aicnlp

# Re-import aicnlp in case an older copy is already loaded in this kernel.
importlib.reload(aicnlp)

%config Completer.use_jedi = False
# Root directory of the pacsim data, located under the AICOPE_SCRATCH directory.
# NOTE: os.environ.get returns None when AICOPE_SCRATCH is unset, in which case
# the string concatenation below raises TypeError.
PACSIM_DATA = os.environ.get("AICOPE_SCRATCH") + "/pacsim"

In [13]:
# Load the title-id -> title mapping and derive the reverse (title -> title-id) lookup.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open(f"{PACSIM_DATA}/parts/tid2t.pickle", "rb") as f:
    tid2t = pickle.load(f)
t2tid = {title: tid for tid, title in tid2t.items()}

parts = pd.read_feather(f"{PACSIM_DATA}/parts/parts.feather")

# Rows whose stitle is not a key of the mapping get the sentinel tid -1.
parts["tid"] = parts["stitle"].apply(lambda stitle: t2tid.get(stitle, -1))
# Start the prediction column as a copy of the known ids; the -1 rows are filled in later.
parts["pred"] = parts["tid"]
parts.head(5)

Unnamed: 0,rid,pid,rord,srord,text,stext,title,stitle,label,tid,pred
0,0,0,0,0,different TITLE 8: Dolor adipisci labore modi...,Dolor adipisci labore modi porro consectetur ...,different TITLE 8:,different title #,12,13,13
1,0,0,0,1,OTHER title : Tempora adipisci ut quaerat nu...,Tempora adipisci ut quaerat numquam velit. Se...,OTHER title :,other title,21,22,22
2,0,0,0,2,Tempora quaerat ut sed. Neque sit sed dolorem....,Tempora quaerat ut sed. Neque sit sed dolorem....,,,-1,-1,-1
3,0,0,0,3,some OTHER title 8:\nEius quiquia quisquam dol...,\nEius quiquia quisquam dolore. Neque sit temp...,some OTHER title 8:,some other title #,16,17,17
4,1,0,1,0,A title 8: Porro aliquam velit voluptatem est...,Porro aliquam velit voluptatem est quaerat. A...,A title 8:,a title #,0,1,1


## Prediction

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
import datasets


# Tokenizer of the "ufal/robeczech-base" checkpoint; used by tokenize_function below.
tokenizer = AutoTokenizer.from_pretrained("ufal/robeczech-base")

def tokenize_function(ds):
    """Tokenize the ``text`` entry of ``ds`` with the module-level ``tokenizer``.

    Every sequence is padded to, and truncated at, 150 tokens. The tokenizer's
    return value is passed back unchanged.
    """
    texts = ds["text"]
    encoded = tokenizer(texts, padding="max_length", max_length=150, truncation=True)
    return encoded

2022-12-12 09:08:40.222598: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-12 09:08:40.729995: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-12-12 09:08:40.730047: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64


In [4]:
from tensorflow import keras
# Load the saved Keras model stored at parts/models/bilstm inside the pacsim data directory.
model = keras.models.load_model(f"{PACSIM_DATA}/parts/models/bilstm")



2022-12-12 09:08:44.108875: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-12 09:08:44.110885: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-12 09:08:44.111140: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-12 09:08:44.111474: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

In [5]:
def get_prediction(df, batch_size=32):
    """Tokenize the ``text`` column of ``df`` and run the module-level ``model`` on it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column. Other columns are not passed to the model.
    batch_size : int, optional
        Batch size forwarded to ``model.predict``; defaults to 32.

    Returns
    -------
    The value returned by ``model.predict`` for the tokenized ``input_ids``.
    """
    ds = datasets.Dataset.from_pandas(df)
    ds = ds.map(tokenize_function, batched=True, num_proc=1, desc="Tokenizing")
    # Only "input_ids" is passed to the model. Drop the helper columns, but only
    # those that are actually present: "__index_level_0__" exists only when the
    # frame's index was materialised as a column, and "tid" may be absent, so
    # removing them unconditionally raises for such frames.
    droppable = [
        name
        for name in ('__index_level_0__', 'text', 'tid')
        if name in ds.column_names
    ]
    ds = ds.remove_columns(droppable)
    return model.predict(ds["input_ids"], batch_size=batch_size)

In [6]:
# Rows without a known title id (tid == -1) are the ones that need a prediction.
unlabeled = parts.loc[parts["tid"] == -1, ["text", "tid"]]

# Predicting everything at once was observed to allocate extra memory at random,
# hence the chunked structure. With a chunk count of 1 the data is still
# processed in a single get_prediction call.
predictions = [get_prediction(chunk) for chunk in np.array_split(unlabeled, 1)]
    

The OrderedVocab you are attempting to save contains a hole for index 51959, your vocabulary could be corrupted !


Tokenizing:   0%|          | 0/4 [00:00<?, ?ba/s]



In [7]:
# Stack the per-chunk prediction arrays row-wise into a single array and show its shape.
pnp = np.vstack(predictions)
pnp.shape

(3156, 2078)

In [8]:
# Column index of the largest value in every row, taken as the predicted class of that segment.
predmax = pnp.argmax(axis=1)
predmax.shape

(3156,)

In [9]:
# Write predictions only into the rows selected by the same tid == -1 mask used for `unlabeled`.
# NOTE(review): the +1 presumably converts 0-based class indices into tids
# (the parts preview shows tid = label + 1) — confirm against the training code.
parts.loc[parts["tid"] == -1, "pred"] = predmax + 1

In [11]:
# Translate every id in `pred` back to its title through the tid -> title mapping.
parts["ptitle"] = parts["pred"].map(tid2t)
parts

Unnamed: 0,rid,pid,rord,srord,text,stext,title,stitle,label,tid,pred,ptitle
0,0,0,0,0,different TITLE 8: Dolor adipisci labore modi...,Dolor adipisci labore modi porro consectetur ...,different TITLE 8:,different title #,12,13,13,different title #
1,0,0,0,1,OTHER title : Tempora adipisci ut quaerat nu...,Tempora adipisci ut quaerat numquam velit. Se...,OTHER title :,other title,21,22,22,other title
2,0,0,0,2,Tempora quaerat ut sed. Neque sit sed dolorem....,Tempora quaerat ut sed. Neque sit sed dolorem....,,,-1,-1,2,other title #
3,0,0,0,3,some OTHER title 8:\nEius quiquia quisquam dol...,\nEius quiquia quisquam dolore. Neque sit temp...,some OTHER title 8:,some other title #,16,17,17,some other title #
4,1,0,1,0,A title 8: Porro aliquam velit voluptatem est...,Porro aliquam velit voluptatem est quaerat. A...,A title 8:,a title #,0,1,1,a title #
...,...,...,...,...,...,...,...,...,...,...,...,...
6710,1311,99,10,1,Non quaerat voluptatem est quiquia aliquam. Do...,Non quaerat voluptatem est quiquia aliquam. Do...,,,-1,-1,1,a title #
6711,1311,99,10,2,a OTHER TITLE 4:\nUt velit quisquam sed sed. A...,\nUt velit quisquam sed sed. Adipisci velit si...,a OTHER TITLE 4:,a other title #,7,8,8,a other title #
6712,1311,99,10,3,Consectetur ipsum non porro ipsum non. Ipsum a...,\nLabore est ut quisquam quiquia. Labore conse...,Consectetur ipsum non porro ipsum non. Ipsum a...,consectetur ipsum non porro ipsum non. ipsum a...,-1,-1,2,other title #
6713,1311,99,10,4,some TITLE 3: Voluptatem amet dolor non tempo...,Voluptatem amet dolor non tempora quisquam te...,some TITLE 3:,some title #,3,4,4,some title #


In [12]:
# Persist the frame, now including the pred and ptitle columns, as parts_pred.feather.
parts.to_feather(f"{PACSIM_DATA}/parts/parts_pred.feather")