# Predicting segment types of all segments

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import json
import random
from pprint import pprint
import os
import sys
import pickle

# Optional extra import location taken from the environment (presumably where
# the in-house `aicnlp` package lives — confirm). It is appended to sys.path
# only when it is set, non-empty and not already listed.
AICOPE_PY_LIB = os.environ.get("AICOPE_PY_LIB")
if AICOPE_PY_LIB:
    if AICOPE_PY_LIB not in sys.path:
        sys.path.append(AICOPE_PY_LIB)
import importlib
import aicnlp
importlib.reload(aicnlp)

%config Completer.use_jedi = False
PACSIM_DATA = os.environ.get("AICOPE_SCRATCH") + "/pacsim"

In [2]:
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# when it comes from a trusted source.
with open(f"{PACSIM_DATA}/parts/tid2t.pickle", "rb") as f:
    tid2t = pickle.load(f)

# Reverse lookup of tid2t (value -> key). If tid2t holds duplicate values, the
# key seen last wins.
t2tid = {title: tid for tid, title in tid2t.items()}

parts = pd.read_feather(f"{PACSIM_DATA}/parts/parts.feather")

# Look up the id of every row's "stitle"; rows whose stitle is not a key of
# t2tid receive the sentinel -1.
parts["tid"] = parts["stitle"].apply(lambda stitle: t2tid.get(stitle, -1))

# "pred" starts out as a copy of "tid"; the -1 rows are overwritten with model
# predictions further down in the notebook.
parts["pred"] = parts["tid"]
parts.head(1)

Unnamed: 0,rid,pid,rord,srord,text,stext,title,stitle,label,tid,pred
0,0,0,0,0,Konzilium.\n,Konzilium.\n,,,-1,-1,-1


## Prediction

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
import datasets


# Tokenizer loaded by the "ufal/robeczech-base" identifier; used by
# tokenize_function below.
# NOTE(review): presumably this must match the tokenizer the classification
# checkpoint was fine-tuned with — confirm.
tokenizer = AutoTokenizer.from_pretrained("ufal/robeczech-base")

def tokenize_function(ds):
    """Tokenize the "text" field of ``ds`` with the module-level ``tokenizer``.

    The tokenizer is called with padding to ``max_length`` (512) and with
    truncation enabled.

    Args:
        ds: Mapping-like object exposing a "text" entry (it is passed to
            ``Dataset.map(..., batched=True)`` in this notebook).

    Returns:
        Whatever the tokenizer call returns for ``ds["text"]``.
    """
    encode_options = {
        "padding": "max_length",
        "max_length": 512,
        "truncation": True,
    }
    return tokenizer(ds["text"], **encode_options)

2022-09-27 17:53:06.744105: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-27 17:53:07.176124: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-27 17:53:08.449195: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-09-27 17:53:08.449427: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

In [5]:
def get_prediction(df, num_proc=8):
    """Tokenize the texts in ``df`` and return the trainer's predictions for them.

    Relies on the notebook-level ``tokenize_function`` and ``trainer``; both
    must be defined before this function is called.

    Args:
        df: pandas DataFrame with a "text" column. Other known helper columns
            ("tid", and the "__index_level_0__" column that
            ``Dataset.from_pandas`` adds for a non-default index) are dropped
            before prediction.
        num_proc: Number of worker processes used for tokenization. Defaults
            to 8, the value that was previously hard-coded.

    Returns:
        The ``predictions`` attribute of ``trainer.predict`` for the rows of
        ``df``, in the same row order.
    """
    ds = datasets.Dataset.from_pandas(df)
    ds = ds.map(tokenize_function, batched=True, num_proc=num_proc, desc="Tokenizing")

    # Drop the non-model-input columns, but only those actually present:
    # "__index_level_0__" exists only when df carries a non-default index, and
    # remove_columns raises on names that are not in the dataset.
    droppable = ("__index_level_0__", "text", "tid")
    ds = ds.remove_columns([name for name in droppable if name in ds.column_names])
    return trainer.predict(ds).predictions

In [6]:
# Sequence-classification model restored from a checkpoint directory stored
# alongside the PACSIM data.
checkpoint_dir = f"{PACSIM_DATA}/parts/checkpoint-180000/"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)

# Evaluation-only settings: batch size per device, fp16 and log verbosity.
eval_options = dict(
    output_dir="mgpu/eval",
    per_device_eval_batch_size=16,
    fp16=True,
    log_level="info",
)
training_args = TrainingArguments(**eval_options)

# The Trainer is only used through trainer.predict (see get_prediction).
trainer = Trainer(model=model, args=training_args)
# Explicitly put the model on the CUDA device and hand that model to the trainer.
trainer.model = model.cuda()

Using cuda_amp half precision backend
CODECARBON : No CPU tracking mode found. Falling back on CPU constant mode.
CODECARBON : Failed to match CPU TDP constant. Falling back on a global constant.


In [None]:
# Predict in chunks: predicting everything at once was observed to randomly
# allocate more memory.
N_CHUNKS = 10

# Rows without a known tid (sentinel -1) are the ones that need a prediction.
unlabeled = parts.loc[parts["tid"] == -1, ["text", "tid"]]

# Split the row *positions* instead of the DataFrame itself: np.array_split on
# a DataFrame goes through the deprecated DataFrame.swapaxes. Chunk sizes and
# order are the same as np.array_split(unlabeled, N_CHUNKS) would produce, and
# .iloc slicing keeps the original index labels.
predictions = []
for positions in np.array_split(np.arange(len(unlabeled)), N_CHUNKS):
    if len(positions) == 0:
        # Fewer rows than chunks: nothing to predict for this chunk.
        continue
    chunk = unlabeled.iloc[positions[0]:positions[-1] + 1]
    predictions.append(get_prediction(chunk))
    

In [9]:
# Join the per-chunk prediction arrays row-wise into a single 2-D array
# (one row per unlabeled segment, in the original row order).
pnp = np.concatenate(predictions, axis=0)
pnp.shape

(1204940, 2078)

In [14]:
# For every row of pnp, the column index holding the largest value.
predmax = np.argmax(pnp, axis=1)
predmax.shape

(1204940,)

In [17]:
# Overwrite "pred" for exactly the rows that had no tid (sentinel -1); these
# are the same rows, in the same order, that were sent to the model.
# NOTE(review): the +1 presumably converts the 0-based class index into a
# 1-based tid — confirm against how the training labels were encoded.
is_unlabeled = parts["tid"] == -1
parts.loc[is_unlabeled, "pred"] = predmax + 1

In [19]:
# Translate every id in "pred" to its title via tid2t. With a plain dict,
# ids that are not keys of tid2t become NaN.
parts["ptitle"] = parts["pred"].map(tid2t)

In [23]:
# Persist the frame, now including the "pred" and "ptitle" columns, next to
# the input file.
parts_pred_path = f"{PACSIM_DATA}/parts/parts_pred.feather"
parts.to_feather(parts_pred_path)