In [None]:
import pandas as pd

In [None]:
# Read the workbook at `file_path` into a DataFrame (pandas' default sheet
# selection applies).
file_path = "your_excel_file.xlsx"
data = pd.read_excel(io=file_path)



In [None]:
# Separate the text and the names
# Blank Excel cells are read as NaN and numeric-looking cells as numbers; the
# character-level tokenizers used later expect strings, so keep only rows where
# both columns are present and coerce the values to str. Dropping on both
# columns at once keeps `texts` and `names` aligned row-for-row.
labelled_rows = data.dropna(subset=['Text', 'Name'])
texts = labelled_rows['Text'].astype(str).tolist()
names = labelled_rows['Name'].astype(str).tolist()

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# Build one character-level tokenizer per column, then encode that column as
# lists of integer character ids.
def _fit_char_tokenizer(corpus):
    """Return a character-level Tokenizer fitted on ``corpus``."""
    fitted = Tokenizer(char_level=True)
    fitted.fit_on_texts(corpus)
    return fitted


text_tokenizer = _fit_char_tokenizer(texts)
text_sequences = text_tokenizer.texts_to_sequences(texts)

name_tokenizer = _fit_char_tokenizer(names)
name_sequences = name_tokenizer.texts_to_sequences(names)

In [None]:
# Right-pad both columns to a common length: the longest encoded text.
# The name sequences reuse that same length so inputs and targets line up.
max_length = max(map(len, text_sequences))
pad_options = {'maxlen': max_length, 'padding': 'post'}
text_sequences_padded = pad_sequences(text_sequences, **pad_options)
name_sequences_padded = pad_sequences(name_sequences, **pad_options)

In [None]:
# One-hot encode every padded name sequence, one row at a time.
# The class count is the name vocabulary size plus one for the padding id 0.
num_name_classes = len(name_tokenizer.word_index) + 1
name_sequences_onehot = []
for padded_name in name_sequences_padded:
    name_sequences_onehot.append(to_categorical(padded_name, num_classes=num_name_classes))

In [None]:
# Split the data into training and validation sets
# 80% of the rows go to training and 20% to validation. A fixed random_state
# makes the shuffle deterministic, so a Restart & Run All reproduces the same
# split (and therefore comparable training curves).
train_texts, val_texts, train_names, val_names = train_test_split(
    text_sequences_padded,
    np.array(name_sequences_onehot),
    test_size=0.2,
    random_state=42,
)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense

# Vocabulary sizes for the two character tokenizers; the +1 accounts for id 0,
# which the padding step uses and the tokenizers never assign to a character.
input_dim = len(text_tokenizer.word_index) + 1
output_dim = len(name_tokenizer.word_index) + 1

# Character-level sequence model: for each of the `max_length` input positions
# it outputs a softmax over the name-character vocabulary.
model = Sequential([
    # The Input layer already fixes the sequence length, so the Embedding layer
    # no longer passes the deprecated `input_length` argument.
    Input(shape=(max_length,)),
    Embedding(input_dim=input_dim, output_dim=64),
    Bidirectional(LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    LSTM(units=64, return_sequences=True, dropout=0.5, recurrent_dropout=0.5),
    TimeDistributed(Dense(output_dim, activation="softmax"))
])

# Targets are one-hot encoded per position, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [1]:
# Train for 30 epochs in mini-batches of 32, evaluating on the validation split
# after each epoch. Needs `model` and the train/validation arrays from the
# earlier cells to exist in the current kernel.
history = model.fit(
    x=train_texts,
    y=train_names,
    validation_data=(val_texts, val_names),
    batch_size=32,
    epochs=30,
)

NameError: name 'model' is not defined

In [2]:
import spacy
from spacy.tokens import Doc
from spacy.pipeline import EntityRecognizer
from thinc.layers import PyTorchWrapper
import torch

# Convert Keras model to PyTorch model
# Wraps every layer of the Keras `model` with thinc's PyTorchWrapper and chains
# the wrapped objects in a torch.nn.Sequential.
# NOTE(review): thinc documents PyTorchWrapper as wrapping a *PyTorch* module so
# that it exposes the Thinc Model API; it presumably does not convert Keras
# layers, and torch.nn.Sequential presumably expects torch modules — confirm
# this line does what the comment above says.
# NOTE(review): `torch_model` is not referenced again in the cells visible here.
torch_model = torch.nn.Sequential(*[PyTorchWrapper(layer) for layer in model.layers])

# Create a custom spaCy component
class KerasEntityRecognizer(EntityRecognizer):
    """EntityRecognizer subclass whose ``predict`` feeds padded ``doc.tensor`` arrays to ``self.model``.

    NOTE(review): nothing in the cells visible here instantiates this class or
    adds it to a pipeline; the function component registered as
    "keras_entity_recognizer" is what the later cells use — confirm whether
    this class is still needed.
    """

    def predict(self, docs):
        """Run ``self.model`` on the padded tensors of ``docs``.

        Args:
            docs: Iterable of objects exposing a ``tensor`` attribute
                (presumably spaCy ``Doc`` objects — confirm against caller).

        Returns:
            The output of ``self.model`` converted via ``.detach().numpy()``.
        """
        # Gather the per-document tensors.
        X = [doc.tensor for doc in docs]
        # Pad (or truncate) every tensor to the training sequence length.
        # NOTE(review): pad_sequences presumably casts to its default integer
        # dtype, which would truncate float values — confirm.
        X_padded = pad_sequences(X, maxlen=max_length, padding='post')
        # NOTE(review): assumes self.model is callable on a float32 torch tensor
        # and returns a torch tensor — TODO confirm.
        y_pred = self.model(torch.tensor(X_padded, dtype=torch.float32))
        return y_pred.detach().numpy()


NameError: name 'model' is not defined

In [None]:
from spacy.tokens import Span
from spacy.language import Language

@Language.component("keras_entity_recognizer")
def keras_entity_recognizer(doc):
    """Tag occurrences of the model-predicted name in ``doc`` as PERSON entities.

    The Keras model is trained on padded, one-hot encoded *name* character
    sequences, so each output position holds one character of the predicted
    name. The predictions are therefore decoded into a single string instead of
    being read as one label per spaCy token (which would misalign characters
    with tokens and raise IndexError for docs with more tokens than
    ``max_length``). Every case-insensitive occurrence of the decoded name in
    ``doc.text`` becomes a PERSON span.

    Side effects:
        * ``token._.name`` is set to the predicted name for tokens inside a
          matched span and to ``""`` for all other tokens. The ``name``
          extension must already be registered on ``Token``.
        * ``doc.ents`` is replaced by the non-overlapping merge of the new
          spans and the entities already on the doc; when spans overlap, the
          longest one is kept.

    Args:
        doc: The spaCy ``Doc`` handed over by the pipeline.

    Returns:
        The same ``Doc``, annotated in place.
    """
    # Imported locally so this component does not depend on other cells' imports.
    import re

    from spacy.util import filter_spans

    # Convert the document text into input for the model
    text_sequence = text_tokenizer.texts_to_sequences([doc.text])
    text_sequence_padded = pad_sequences(text_sequence, maxlen=max_length, padding='post')

    # Get the model predictions (one row of class probabilities per position)
    predictions = model.predict(text_sequence_padded)[0]

    # Decode the predicted characters; id 0 is padding and is skipped.
    name_indices = np.argmax(predictions, axis=-1)
    predicted_name = "".join(
        name_tokenizer.index_word.get(int(idx), "") for idx in name_indices if idx > 0
    ).strip()

    # Start from a clean slate so every token has a defined value.
    for token in doc:
        token._.name = ""

    # Locate the predicted name in the text. Matching ignores case because the
    # Keras tokenizer lower-cases its vocabulary by default.
    name_spans = []
    if predicted_name:
        pattern = re.escape(predicted_name)
        for match in re.finditer(pattern, doc.text, flags=re.IGNORECASE):
            # "expand" snaps character offsets outward to token boundaries.
            span = doc.char_span(
                match.start(), match.end(), label="PERSON", alignment_mode="expand"
            )
            if span is None:
                continue
            name_spans.append(span)
            for token in span:
                token._.name = predicted_name

    # Add entities to the doc. Assigning overlapping spans to doc.ents raises,
    # so merge with the existing entities and drop overlaps first. Our spans are
    # listed first so they win ties against equally long existing entities.
    if name_spans:
        doc.ents = filter_spans(name_spans + list(doc.ents))

    return doc

In [None]:
from spacy.tokens import Token

# Register the custom attribute
# set_extension raises if the extension already exists, which would break this
# cell on a second run in the same kernel; only register it when missing.
if not Token.has_extension('name'):
    Token.set_extension('name', default='')

# Load a spaCy model and add the custom component
# add_pipe appends by default, so the component runs after the stock pipeline.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("keras_entity_recognizer")

# Process a text using the spaCy pipeline
text = "Here's an example text about Albert Einstein."
doc = nlp(text)

# Print the extracted entities
print("Entities found:", [(ent.text, ent.label_) for ent in doc.ents])

In [None]:
from spacy import displacy

# Render the document's entities with displaCy, restricted to PERSON labels.
ent_options = {'ents': ['PERSON']}
displacy.render(doc, style='ent', options=ent_options)