This notebook will take the original dataset and transform the anchor tag and the search phrase using the Transformers library.  Each phrase will be run through a transformer model, those outputs will be concatenated together, and then the CPC demographic data will be added on as well.  This will give a tabular dataset upon which I will train an XGB model.  This notebook is intended to provide a baseline model for the competition.

# Plan of Action
* Load Data
* Establish Transformers model
* Transform anchor tag, search tag
* Concatenate anchor tag, search tag, demo data
* Write data to file
* Instantiate XGB Model
* Train XGB Model

### Load Data

In [2]:
import pandas as pd
from transformers import BertTokenizer, TFBertModel
import numpy as np

2022-05-02 21:26:23.731245: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [3]:
# Load the interim dataset whose CPC context has already been split into
# section/class/subclass/group/subgroup columns.
# NOTE(review): the preview output shows an 'Unnamed: 0' column, which suggests
# the CSV was written together with its index -- confirm whether index_col=0 is wanted.
data = pd.read_csv(
    filepath_or_buffer='/media/alec/bigdisk/phrase_to_phrase/data/interim/data_with_cpc_structure.csv',
)

In [4]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,id,anchor,target,context,score,section,class,subclass,group,subgroup
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A,4,7,,
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A,4,7,,
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A,4,7,,
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A,4,7,,
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A,4,7,,


In [5]:
# Fill CPC columns with a flag value (9999) wherever they are null.
# Each filled column is assigned back to the frame instead of calling
# `data[col].fillna(..., inplace=True)`: in-place mutation through a column
# selection is deprecated in pandas and leaves `data` untouched when
# Copy-on-Write is enabled.
to_fill = ['section','class','subclass','group','subgroup']
for col in to_fill:
    data[col] = data[col].fillna(9999)

In [6]:
# Preview the first five rows again to check that the null CPC values now carry the flag value.
data.head(n=5)

Unnamed: 0,id,anchor,target,context,score,section,class,subclass,group,subgroup
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A,4,7,9999.0,9999.0
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A,4,7,9999.0,9999.0
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A,4,7,9999.0,9999.0
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A,4,7,9999.0,9999.0
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A,4,7,9999.0,9999.0


### Establish Transformers Model

In [7]:
# Load the pretrained tokenizer and the TensorFlow BERT model from the same
# 'bert-base-uncased' checkpoint; both names are reused by later cells.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained("bert-base-uncased")
# Smoke test: push one placeholder sentence through tokenizer and model so the
# structure of `output` can be inspected in the next cell.
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='tf')
output = model(encoded_input)

2022-05-02 21:26:32.640074: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-05-02 21:26:32.641248: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-05-02 21:26:32.681031: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 21:26:32.681313: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1660 Ti computeCapability: 7.5
coreClock: 1.77GHz coreCount: 24 deviceMemorySize: 5.80GiB deviceMemoryBandwidth: 268.26GiB/s
2022-05-02 21:26:32.681343: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2022-05-02 21:26:32.683032: I tensorflow/stream_executor/platform/

In [8]:
# Shape of the vector kept per phrase: first sequence in the batch, first token
# position only (sliced as 0:1 so the axis is kept), every hidden dimension.
# NOTE(review): position 0 is presumably BERT's [CLS] token -- confirm.
output.last_hidden_state.numpy()[0,0:1,:].shape

(1, 768)

In [15]:
def encode_text2(df, column, tokenizer, model, batch_size):
    '''Encode a text column in batches and return the first-token vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the text to encode. It is not modified.
    column : str
        Name of the text column in ``df``.
    tokenizer : callable
        Called as ``tokenizer(list_of_str, return_tensors='tf', padding=True)``.
    model : callable
        Called on the tokenizer output; the result must expose
        ``last_hidden_state.numpy()``.
    batch_size : int
        Number of rows per batch; must be at least 1.

    Returns
    -------
    list of numpy.ndarray
        One array per batch, in row order, each being
        ``last_hidden_state[:, 0:1, :]`` for that batch. The list is empty
        when ``df`` has no rows.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    '''
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    texts = df[column]
    results = []
    # Slice by position rather than by label: ``.loc[n:n+batch_size]`` includes
    # both end points (so consecutive batches share a row) and depends on the
    # frame having a default 0..n-1 index.
    for start in range(0, len(texts), batch_size):
        batch = texts.iloc[start:start + batch_size].tolist()
        # Handle one batch end-to-end so the tokenizer outputs and model outputs
        # of all batches are never held in lists at the same time.
        batch_tokens = tokenizer(batch, return_tensors='tf', padding=True)
        hidden = model(batch_tokens).last_hidden_state.numpy()
        # Keep only the first token position; 0:1 preserves the sequence axis.
        results.append(hidden[:, 0:1, :])
    return results

In [50]:
def encode_text(df, column, tokenizer, model):
    '''Encode every value of a text column as a vector, one row at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the text. It is modified in place: a new column
        named ``f'{column}_vector'`` is added (or overwritten).
    column : str
        Name of the text column in ``df``.
    tokenizer : callable
        Called as ``tokenizer(text, return_tensors='tf')`` for each value.
    model : callable
        Called on the tokenizer output; the result must expose
        ``last_hidden_state.numpy()``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the new vector column whose
        entries are ``last_hidden_state.numpy()[0, 0:1, :]`` for each row.
    '''
    new_name = f'{column}_vector'
    result = []
    # Tokenize, run the model and extract the vector for one row before moving
    # on, so the tokenized inputs and full model outputs of the whole column
    # are never held in memory together.
    for text in df[column]:
        tokens = tokenizer(text, return_tensors='tf')
        hidden = model(tokens).last_hidden_state.numpy()
        # First sequence, first token position only (0:1 keeps that axis).
        result.append(hidden[0,0:1,:])
    df[new_name] = result
    return df
