In [2]:
import numpy as np
import pandas as pd
import tensorflow
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential,Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import RMSprop

In [3]:
# Charger les données depuis un fichier CSV
df = pd.read_csv('NER dataset.csv',encoding='unicode_escape',skipinitialspace=True,skip_blank_lines=True)
df.head(13)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [4]:
# Définir une fonction pour créer des dictionnaires de mapping
def get_dict_map(data,col_name):
  # Initialiser les dictionnaires
  token2id = {}
  id2token = {}
  # Obtenir le vocabulaire unique
  vocab = list(set(data[col_name].to_list()))
  # Créer les dictionnaires de mapping
  id2token = {id:token for id,token in enumerate(vocab)}
  token2id = {token:id for id,token in enumerate(vocab)}
  # Retourner les dictionnaires
  return token2id,id2token

In [5]:
# Créer des dictionnaires de mapping pour les mots et les étiquettes
word2id,id2word = get_dict_map(df,'Word')
tag2id,id2tag = get_dict_map(df,'Tag')

In [6]:
tag2id

{'O': 0,
 'B-org': 1,
 'B-nat': 2,
 'B-geo': 3,
 'B-gpe': 4,
 'I-art': 5,
 'B-per': 6,
 'B-art': 7,
 'I-per': 8,
 'I-tim': 9,
 'I-org': 10,
 'I-eve': 11,
 'I-gpe': 12,
 'I-nat': 13,
 'B-tim': 14,
 'B-eve': 15,
 'I-geo': 16}

In [6]:
# Mapper les mots et les étiquettes à leurs identifiants correspondants
df['Word_id'] = df['Word'].map(word2id)
df['Tag_id'] = df['Tag'].map(tag2id)
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Word_id,Tag_id
0,Sentence: 1,Thousands,NNS,O,5116,16
1,,of,IN,O,29875,16
2,,demonstrators,NNS,O,974,16
3,,have,VBP,O,2645,16
4,,marched,VBN,O,3033,16


In [7]:
# Remplir les valeurs manquantes
df_fillnan = df.ffill(axis=0)
df_fillnan.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Word_id,Tag_id
0,Sentence: 1,Thousands,NNS,O,5116,16
1,Sentence: 1,of,IN,O,29875,16
2,Sentence: 1,demonstrators,NNS,O,974,16
3,Sentence: 1,have,VBP,O,2645,16
4,Sentence: 1,marched,VBN,O,3033,16


In [8]:
# Grouper les données par phrase
final_data = df_fillnan.groupby(['Sentence #'], as_index=False)[['Word', 'POS', 'Tag', 'Word_id', 'Tag_id']].agg(lambda x: list(x))
final_data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Word_id,Tag_id
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[5116, 29875, 974, 2645, 3033, 20053, 4066, 90...","[16, 16, 16, 16, 16, 16, 15, 16, 16, 16, 16, 1..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[22814, 17407, 4203, 9756, 11773, 9073, 27472,...","[13, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[18659, 1205, 32322, 18807, 8642, 8017, 14094,...","[16, 16, 7, 16, 16, 16, 16, 16, 15, 16, 16, 16..."
3,Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]","[8809, 16755, 1740, 32237, 32047, 4495, 16757,...","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]"
4,Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo...","[11306, 11692, 20344, 30656, 30920, 10231, 320...","[15, 16, 16, 11, 4, 16, 7, 16, 15, 16, 13, 16,..."


In [9]:
# Rembourrer les phrases les plus courtes pour uniformiser la taille des entrées
# Obtenir la taille maximale des phrases
max_len_statement = final_data['Word_id'].apply(len).max()
print(max_len_statement)
# Obtenir la taille maximale des étiquettes
max_len_tag = final_data['Tag_id'].apply(len).max()
print(max_len_tag)
# Rembourrer les phrases et les étiquettes
pad_statements = pad_sequences(final_data['Word_id'].to_list(), maxlen=max_len_statement, padding='post', value=word2id['The'])
pad_tags = pad_sequences(final_data['Tag_id'].to_list(), maxlen=max_len_tag, padding='post', value=tag2id['O'])
print('Statements after padding : \n',pad_statements)
print('Tags after padding : \n',pad_tags)

104
104
Statements after padding : 
 [[ 5116 29875   974 ...  9253  9253  9253]
 [22814 17407  4203 ...  9253  9253  9253]
 [18659  1205 32322 ...  9253  9253  9253]
 ...
 [10194 20551 25718 ...  9253  9253  9253]
 [31492  1009 22197 ...  9253  9253  9253]
 [ 9253 12572   111 ...  9253  9253  9253]]
Tags after padding : 
 [[16 16 16 ... 16 16 16]
 [13 16 16 ... 16 16 16]
 [16 16  7 ... 16 16 16]
 ...
 [16 15 16 ... 16 16 16]
 [16 16 16 ... 16 16 16]
 [16  2  6 ... 16 16 16]]


In [None]:
# Convertir les étiquettes de sortie en encodage one-hot
pad_tags = to_categorical(pad_tags)
print(pad_tags)
print(pad_tags.shape)

[[[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  ...
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  ...
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]]

 [[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]]

 ...

 [[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 1. 0.]
  [0. 0. 0. ... 0. 0. 1.]
  ...
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]]

 [[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  ...
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]]

 [[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]]]
(47959, 10

In [None]:
# Diviser les données en ensembles d'entraînement et de test
X_train,X_test,Y_train,Y_test = train_test_split(pad_statements,pad_tags,test_size=0.1,random_state=42)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(43163, 104)
(4796, 104)
(43163, 104, 17)
(4796, 104, 17)


In [15]:
# Définir les paramètres du modèle
shape = X_train[1].shape
input_dim = len(list(set(df['Word'].to_list())))+1
output_dim = 64
input_length = max_len_statement
output_units = len(id2tag)
print(shape)
print(input_dim)
print(input_length)
print(output_units)

(104,)
35179
104
17


In [13]:
# Créer le modèle
input_layer = Input(shape=shape)
embeddings = Embedding(input_dim=input_dim,output_dim=output_dim,input_length=input_length)(input_layer)
lstm1 = LSTM(units=output_dim,return_sequences=True)(embeddings)
lstm2 = LSTM(units=output_dim,return_sequences=True)(lstm1)
output = TimeDistributed(Dense(units=output_units,activation='softmax'))(lstm2)
model = Model(inputs=input_layer,outputs=output)
model.compile(optimizer=RMSprop(1e-3),loss='categorical_crossentropy',metrics=['accuracy'])
model.summary()
plot_model(model,show_shapes=True)



You must install pydot (`pip install pydot`) for `plot_model` to work.


In [14]:
# Entraîner le modèle
model.fit(x=X_train,y=Y_train,batch_size=4,validation_data=(X_test,Y_test))

[1m10791/10791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1161s[0m 106ms/step - accuracy: 0.9760 - loss: 0.1044 - val_accuracy: 0.9891 - val_loss: 0.0402


<keras.src.callbacks.history.History at 0x1e88c1a9d00>

In [16]:
# Évaluer le modèle
test_loss, test_accuracy = model.evaluate(X_test, Y_test)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Prédire sur l'ensemble de test
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_true = np.argmax(Y_test, axis=-1)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 35ms/step - accuracy: 0.9892 - loss: 0.0406
Test Loss: 0.0402
Test Accuracy: 0.9891
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 48ms/step


In [25]:
def predict_ner_tags(sentence, model, word2id, id2tag, max_len_statement):
    """
    Prédit les étiquettes NER pour une phrase donnée.

    Args:
        sentence: La phrase à prédire.
        model: Le modèle NER entraîné.
        word2id: Le dictionnaire de mapping mot à index.
        id2tag: Le dictionnaire de mapping index à étiquette.
        max_len_statement: La longueur maximale de la phrase.

    Returns:
        None (affiche les résultats).
    """
    # Diviser la phrase en tokens
    tokens = sentence.split()

    # Convertir les tokens en leurs indices correspondants
    token_indices = [word2id.get(word, word2id["The"]) for word in tokens]

    # Rembourrer les indices de tokens à la longueur maximale
    token_indices_padded = pad_sequences([token_indices], maxlen=max_len_statement, padding="post", value=word2id["The"])

    # Faire des prédictions
    predictions = model.predict(token_indices_padded)

    # Obtenir les indices prédits
    predicted_indices = np.argmax(predictions, axis=-1)[0]

    # Mapper les indices prédits aux étiquettes
    #predicted_tags = [id2tag[idx] for idx in predicted_indices[:len(tokens)]]

    # Afficher les résultats
    #for word, tag in zip(tokens, predicted_tags):
        #print(f"{word:15} {tag}")

    return list(zip(tokens, predicted_indices))

# Exemple d'utilisation
custom_sentence = "Google and Microsoft are competing in the AI market."
# En supposant que model, word2id, id2tag et max_len_statement sont définis
predictions = predict_ner_tags(custom_sentence, model, word2id, id2tag, max_len_statement)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step


In [27]:
id2tag

{0: 'I-geo',
 1: 'B-eve',
 2: 'B-org',
 3: 'I-tim',
 4: 'I-per',
 5: 'B-nat',
 6: 'I-org',
 7: 'B-tim',
 8: 'I-nat',
 9: 'I-art',
 10: 'I-gpe',
 11: 'B-per',
 12: 'B-art',
 13: 'B-gpe',
 14: 'I-eve',
 15: 'B-geo',
 16: 'O'}

In [28]:
tag_mapping= {
    0: 'Geographical Entity',
    1: 'Event',
    2: 'Organization' ,
    3: 'Time' ,
    4: 'Person' ,
    5: 'Nationality' ,
    6: 'Organization',
    7: 'Time' ,
    8: 'Nationality' ,
    9: 'Artifact' ,
    10: 'Geopolitical Entity' ,
    11: 'Person' ,
    12: 'Artifact',
    13: 'Geopolitical Entity',
    14: 'Event',
    15: 'Geographical Entity',
    16: 'Outside' 
}

In [29]:
def display_ner_predictions(sentence, predictions, tag_mapping):
    """
    Displays NER predictions in a user-friendly format.

    Args:
        sentence: The input sentence.
        predictions: A list of (word, tag_id) tuples.
        tag_mapping: A dictionary mapping tag IDs to readable tag names.
    """

    entities = []
    current_entity = []
    for word, tag_id in predictions:
        tag = tag_mapping.get(tag_id, "O")  # Get full tag name from mapping

        if tag != "Outside":  # Beginning of a new entity or inside an existing one
            if not current_entity or current_entity[0][1] == tag:
                current_entity.append((word, tag))
            else: # Different entity type
                entities.append((" ".join([w for w, _ in current_entity]), current_entity[0][1]))
                current_entity = [(word, tag)]
        else:  # Outside any entity (O tag)
            if current_entity:
                entities.append((" ".join([w for w, _ in current_entity]), current_entity[0][1]))
            current_entity = []

    if current_entity:
        entities.append((" ".join([w for w, _ in current_entity]), current_entity[0][1]))

    result = {}
    if entities:
        print("Entities:")
        for entity_text, entity_type in entities:
            result[entity_text] = entity_type
            print(f"- {entity_text} ({entity_type})")
    else:
        print("No entities found.")

    return result

In [30]:
custom_sentence = "Google and Microsoft are competing in the AI market."
print(display_ner_predictions(custom_sentence,predictions,tag_mapping))

Entities:
- Google (Organization)
- Microsoft (Organization)
{'Google': 'Organization', 'Microsoft': 'Organization'}


In [36]:
# Sauvegarder le modèle
model.save('ner_model.keras')

In [32]:
from tensorflow.keras.models import load_model

model1 = load_model('ner_model.keras')

In [33]:
custom_sentence = "Google and Microsoft are competing in the AI market."
predictions = predict_ner_tags(custom_sentence, model1, word2id, id2tag, max_len_statement)
print(display_ner_predictions(custom_sentence,predictions,tag_mapping))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Entities:
- Google (Organization)
- Microsoft (Organization)
{'Google': 'Organization', 'Microsoft': 'Organization'}
