In [121]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from tensorflow.keras.models import Model
import string
import re
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential

In [144]:
# Step 1: Load and Prepare Data
df = pd.read_csv('businessCard.csv', sep=',', names=['id', 'text', 'label'], encoding='latin-1', header=1)
df

Unnamed: 0,id,text,label
0,000.jpeg,.,O
1,000.jpeg,040-4852,B-PHONE
2,000.jpeg,8881,I-PHONE
3,000.jpeg,90309,B-PHONE
4,000.jpeg,52549,I-PHONE
...,...,...,...
10439,290.jpeg,,O
10440,290.jpeg,Richard,B-NAME
10441,290.jpeg,Pretorius,I-NAME
10442,290.jpeg,,O


In [145]:
whitespace = string.whitespace
punctuation = "!#$%&\'()*+:;<=>?[\\]^`{|}~"
tableWhitespace = str.maketrans('','',whitespace)
tablePunctuation = str.maketrans('','',punctuation)
def cleanText(txt):
    text = str(txt)
    text = text.lower()
    removewhitespace = text.translate(tableWhitespace)
    removepunctuation = removewhitespace.translate(tablePunctuation)
    
    return str(removepunctuation)

In [146]:
df['text'] = df['text'].apply(cleanText)

In [147]:
dataClean = df.query("text != '' ")
dataClean.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataClean.dropna(inplace=True)


In [148]:
dataClean.head(50)

Unnamed: 0,id,text,label
0,000.jpeg,.,O
1,000.jpeg,040-4852,B-PHONE
2,000.jpeg,8881,I-PHONE
3,000.jpeg,90309,B-PHONE
4,000.jpeg,52549,I-PHONE
5,000.jpeg,fi,O
6,000.jpeg,/laurelsoverseaseducation,O
7,000.jpeg,ûï@,O
8,000.jpeg,laurels,B-ORG
9,000.jpeg,overseas,I-ORG


In [114]:
data = dataClean

In [149]:
# Define the labels and map them to numerical values
label_mapping = {
    'O': 0,        # Not an entity
    'B-NAME': 1,   # Beginning of a name entity
    'I-NAME': 2,   # Inside of a name entity
    'B-DES': 3,    # Beginning of a description entity
    'I-DES': 4,    # Inside of a description entity
    'B-ORG': 5,    # Beginning of an organization entity
    'I-ORG': 6,    # Inside of an organization entity
    'B-PHONE': 7,  # Beginning of a phone entity
    'I-PHONE': 8,  # Inside of a phone entity
    'B-EMAIL': 9,  # Beginning of an email entity
    'I-EMAIL': 10, # Inside of an email entity
    'B-WEB': 11,   # Beginning of a web entity
    'I-WEB': 12    # Inside of a web entity
}

In [158]:
# Tokenize the text data
max_length = 128  # Define the desired sequence length
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df['text'])
word_index = tokenizer.word_index
X = tokenizer.texts_to_sequences(df['text'])
X = pad_sequences(X, maxlen=max_length, padding="post", truncating="post")

# Encode the labels as 2D arrays
y = [label_mapping[label] for label in df['label']]
y = to_categorical(y, num_classes=len(label_mapping))


In [159]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [160]:
# Build the LSTM NER model
model = Sequential([
    Embedding(len(word_index) + 1, 128, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    TimeDistributed(Dense(len(label_mapping), activation="softmax"))
])

In [162]:
# Encode the labels for each time step
from keras.utils import to_categorical

y_train_encoded = np.zeros((len(y_train), max_length, len(label_mapping)), dtype=int)
for i, sequence in enumerate(y_train):
    for j, label in enumerate(sequence):
        label_str = str(label)  # Convert label to string
        label_id = label_mapping.get(label_str, 0)  # Use 0 for unknown labels
        y_train_encoded[i, j, label_id] = 1

y_test_encoded = np.zeros((len(y_test), max_length, len(label_mapping)), dtype=int)
for i, sequence in enumerate(y_test):
    for j, label in enumerate(sequence):
        label_str = str(label)  # Convert label to string
        label_id = label_mapping.get(label_str, 0)  # Use 0 for unknown labels
        y_test_encoded[i, j, label_id] = 1


In [164]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])


In [165]:
# Train the model
model.fit(X_train, y_train_encoded, validation_data=(X_test, y_test_encoded), epochs=5, batch_size=32)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1b6f0ea3af0>