In [2]:
import os
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
from transformers import BertTokenizer, TFBertModel

# Function to read the CSV file
def read_tbl(file_path):
    df = pd.read_csv(file_path)
    return df

# Load the neural network model and tokenizer
nn_model = load_model('../../train/data/output/gist_classification_model.h5')
with open('../../train/data/output/tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Load the logistic regression model
with open('../../train/data/output/gist_classification_lr_model.pkl', 'rb') as handle:
    lr_model = pickle.load(handle)

# Load the BERT model and tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Read the input CSV file
df = read_tbl("../../ner/data/output/combined_output.csv")

# Preprocess texts for neural network model
texts = df['gist'].astype(str).tolist()
sequences = tokenizer.texts_to_sequences(texts)
maxlen = 100
data = pad_sequences(sequences, maxlen=maxlen)

# Predict using the neural network model
nn_predictions = nn_model.predict(data)
nn_predicted_labels = (nn_predictions > 0.5).astype("int32")

# Add predictions to the DataFrame
df['nn_relevant'] = nn_predicted_labels

# Save the DataFrame with neural network predictions
df.to_csv("../data/output/output_nn.csv", index=False)

# Preprocess texts for logistic regression model
input_ids = []
attention_masks = []

for text in texts:
    encoded_dict = bert_tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='tf'
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = np.concatenate(input_ids, axis=0)
attention_masks = np.concatenate(attention_masks, axis=0)

# Generate BERT embeddings
bert_outputs = bert_model(input_ids, attention_mask=attention_masks)
X = bert_outputs.last_hidden_state[:, 0, :].numpy()

# Use the pre-trained logistic regression model for predictions
lr_predicted_labels = lr_model.predict(X)

# Add predictions to the DataFrame
df['lr_relevant'] = lr_predicted_labels

# Save the DataFrame with logistic regression predictions
df.to_csv("../data/output/output_lr.csv", index=False)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


: 

In [None]:
# embedding_layer = model.get_layer('embedding_2')
# embeddings = embedding_layer.get_weights()[0]
# print("Embedding shape:", embeddings.shape)
# print("Sample embeddings:")
# print(embeddings[:5])