This notebook takes the original dataset and transforms the anchor tag and the search phrase using the Transformers library. Each phrase is run through a transformer model, those outputs are concatenated together, and then the CPC demographic data is added on as well. This gives a tabular dataset on which I will train an XGB model. This notebook is intended to provide a baseline model for the competition.

# Plan of Action
* Load Data
* Establish Transformers model
* Transform anchor tag, search tag
* Dummy encode demographic data
* Concatenate anchor tag, search tag, demo data
* Write data to file
* Instantiate XGB Model
* Train XGB Model

### Load Data

In [1]:
import pandas as pd
from transformers import BertTokenizer, TFBertModel
import numpy as np

2022-05-03 08:31:59.934105: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [2]:
# Load the interim dataset that already carries the CPC structure columns.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# will only run on a machine with the same mount point -- consider a path
# relative to the project instead.
interim_csv_path = '/media/alec/bigdisk/phrase_to_phrase/data/interim/data_with_cpc_structure.csv'
data = pd.read_csv(interim_csv_path)

In [3]:
# Preview the first rows to check that the CSV loaded with the expected columns.
data.head()

Unnamed: 0,id,anchor,target,context,score,section,class,subclass,group,subgroup
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A,4,7,,
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A,4,7,,
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A,4,7,,
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A,4,7,,
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A,4,7,,


In [4]:
# fill CPC columns with flag value for null values
to_fill = ['section','class','subclass','group','subgroup']
# Assign the filled columns back to the frame instead of calling
# fillna(inplace=True) on data[col]: that form is a chained assignment on a
# selected column, which pandas warns about and which leaves `data` unchanged
# when Copy-on-Write is enabled.
data[to_fill] = data[to_fill].fillna(9999)

In [5]:
# Re-inspect the first rows after the null CPC fields were replaced by the 9999 flag.
data.head()

Unnamed: 0,id,anchor,target,context,score,section,class,subclass,group,subgroup
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A,4,7,9999.0,9999.0
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A,4,7,9999.0,9999.0
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A,4,7,9999.0,9999.0
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A,4,7,9999.0,9999.0
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A,4,7,9999.0,9999.0


### Establish Transformers Model

In [6]:
# Load the tokenizer and the TensorFlow BERT encoder from the same pretrained
# checkpoint so the token ids produced by one match the vocabulary of the other.
# Both statements are assignments, so the cell has no value to display and
# needs no trailing `;` to suppress output.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained("bert-base-uncased")

2022-05-03 08:32:08.842540: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-05-03 08:32:08.843343: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-05-03 08:32:08.880032: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-03 08:32:08.880173: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1660 Ti computeCapability: 7.5
coreClock: 1.77GHz coreCount: 24 deviceMemorySize: 5.80GiB deviceMemoryBandwidth: 268.26GiB/s
2022-05-03 08:32:08.880199: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2022-05-03 08:32:08.881594: I tensorflow/stream_executor/platform/

''

### Encode text as vector

In [7]:
def encode_text(df, column, tokenizer, model, batch_size):
    '''Embed a text column by batch processing it through a transformer model.

    Parameters
    ----------
    df : table-like object; ``df[column].tolist()`` must return the texts.
    column : name of the text column to encode.
    tokenizer : callable invoked as
        ``tokenizer(batch, padding=True, return_tensors='tf')``.
    model : callable that takes the tokenizer output and returns an object
        whose ``last_hidden_state.numpy()`` is a 3-D array indexed as
        (sample, token position, hidden unit).
    batch_size : number of texts sent through the model per call.

    Returns
    -------
    numpy.ndarray of shape (n_samples, 1, hidden_size) holding, for every
    text, the last hidden state at the first token position (hidden_size is
    768 for bert-base).

    Raises
    ------
    ValueError
        If the column contains no rows, since there is nothing to encode.
    '''
    text = df[column].tolist()
    if not text:
        raise ValueError(f'Column {column!r} contains no rows to encode')

    first_token_vectors = []
    # Stop the range at len(text) (not len(text) + 1): the extra step produced
    # an empty trailing batch whenever len(text) was a multiple of batch_size.
    for start in range(0, len(text), batch_size):
        chunk = text[start:start + batch_size]
        tokens = tokenizer(chunk, padding=True, return_tensors='tf')
        output = model(tokens)
        # Keep only the first-token slice of each batch so the full hidden
        # states of earlier batches can be released before the next one runs.
        first_token_vectors.append(output.last_hidden_state.numpy()[:, 0:1, :])
    return np.vstack(first_token_vectors)

In [8]:
# Embed the anchor and target phrases; each embedding array is stored in a
# new '<column>_vector' column.
to_encode = ['anchor', 'target']
for col in to_encode:
    print(f'Working on {col}')
    # Convert the array returned by encode_text to nested Python lists so that
    # every row of the frame carries its own vector.
    data[f'{col}_vector'] = encode_text(data, col, tokenizer, model, 512).tolist()
print(data.columns)


Working on anchor
Working on target
Index(['id', 'anchor', 'target', 'context', 'score', 'section', 'class',
       'subclass', 'group', 'subgroup', 'anchor_vector', 'target_vector'],
      dtype='object')


In [9]:
# Inspect the first rows now that the anchor_vector / target_vector columns exist.
data.head()

Unnamed: 0,id,anchor,target,context,score,section,class,subclass,group,subgroup,anchor_vector,target_vector
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A,4,7,9999.0,9999.0,"[[-0.1651368886232376, 0.13542886078357697, -0...","[[-0.17539000511169434, 0.3574599027633667, -0..."
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A,4,7,9999.0,9999.0,"[[-0.1651368886232376, 0.13542886078357697, -0...","[[-0.12764355540275574, -0.2909691333770752, -..."
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A,4,7,9999.0,9999.0,"[[-0.1651368886232376, 0.13542886078357697, -0...","[[-0.6683033108711243, 0.04491086304187775, 0...."
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A,4,7,9999.0,9999.0,"[[-0.1651368886232376, 0.13542886078357697, -0...","[[-0.24492047727108002, 0.213256374001503, -0...."
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A,4,7,9999.0,9999.0,"[[-0.1651368886232376, 0.13542886078357697, -0...","[[-0.2078469693660736, 0.1655207872390747, -0...."


### Encode demographic data as category codes

In [13]:
# Replace each CPC column with integer category codes; one-hot (dummy)
# encoding would create far too many columns.
cat_cols = ['section', 'class', 'subclass', 'group', 'subgroup']
for name in cat_cols:
    as_category = data[name].astype('category')
    data[name] = as_category.cat.codes
data.head()


Unnamed: 0,id,anchor,target,context,score,section,class,subclass,group,subgroup,anchor_vector,target_vector
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,0,4,7,0,0,"[[-0.1651368886232376, 0.13542886078357697, -0...","[[-0.17539000511169434, 0.3574599027633667, -0..."
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,0,4,7,0,0,"[[-0.1651368886232376, 0.13542886078357697, -0...","[[-0.12764355540275574, -0.2909691333770752, -..."
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,0,4,7,0,0,"[[-0.1651368886232376, 0.13542886078357697, -0...","[[-0.6683033108711243, 0.04491086304187775, 0...."
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,0,4,7,0,0,"[[-0.1651368886232376, 0.13542886078357697, -0...","[[-0.24492047727108002, 0.213256374001503, -0...."
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,0,4,7,0,0,"[[-0.1651368886232376, 0.13542886078357697, -0...","[[-0.2078469693660736, 0.1655207872390747, -0...."


In [17]:
# Persist the processed frame (text, CPC codes and embedding columns) without
# writing the row index.
processed_path = '../data/processed/processed_data_v1.parquet'
data.to_parquet(processed_path, index=False)