In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout

In [38]:
data = pd.read_csv("ner.csv",encoding = "ISO-8859-1",nrows = 50000)
data.head()

Unnamed: 0.1,Unnamed: 0,lemma,next-lemma,next-next-lemma,next-next-pos,next-next-shape,next-next-word,next-pos,next-shape,next-word,...,prev-prev-lemma,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
0,0,thousand,of,demonstr,NNS,lowercase,demonstrators,IN,lowercase,of,...,__start2__,__START2__,wildcard,__START2__,wildcard,__START1__,1,capitalized,Thousands,O
1,1,of,demonstr,have,VBP,lowercase,have,NNS,lowercase,demonstrators,...,__start1__,__START1__,wildcard,__START1__,capitalized,Thousands,1,lowercase,of,O
2,2,demonstr,have,march,VBN,lowercase,marched,VBP,lowercase,have,...,thousand,NNS,capitalized,Thousands,lowercase,of,1,lowercase,demonstrators,O
3,3,have,march,through,IN,lowercase,through,VBN,lowercase,marched,...,of,IN,lowercase,of,lowercase,demonstrators,1,lowercase,have,O
4,4,march,through,london,NNP,capitalized,London,IN,lowercase,through,...,demonstr,NNS,lowercase,demonstrators,lowercase,have,1,lowercase,marched,O


In [39]:
dataset=data.drop(['Unnamed: 0', 'lemma', 'next-lemma', 'next-next-lemma', 'next-next-pos',
       'next-next-shape', 'next-next-word', 'next-pos', 'next-shape',
       'next-word', 'prev-iob', 'prev-lemma', 'prev-pos',
       'prev-prev-iob', 'prev-prev-lemma', 'prev-prev-pos', 'prev-prev-shape',
       'prev-prev-word', 'prev-shape', 'prev-word',"pos","shape"],axis=1)

In [40]:
words = list(set(dataset["word"].values))
words.append("ENDPAD")
tags = list(set(dataset["tag"].values))

In [41]:
n_words = len(words); n_words

7465

In [42]:
n_tags = len(tags); n_tags
tags

['B-art',
 'B-gpe',
 'B-per',
 'I-org',
 'B-tim',
 'I-art',
 'I-per',
 'B-org',
 'I-gpe',
 'B-nat',
 'I-tim',
 'B-geo',
 'I-nat',
 'I-geo',
 'I-eve',
 'O',
 'B-eve']

In [43]:
class SentenceGetter(object):
    
    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda s: [(w, t) for w,t in zip(s["word"].values.tolist(),
                                                        s["tag"].values.tolist())]
        self.grouped = self.dataset.groupby("sentence_idx").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [44]:
getter = SentenceGetter(dataset)
sentences = getter.sentences

In [45]:
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [46]:
from keras.preprocessing.sequence import pad_sequences
X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=140, sequences=X, padding="post",value=n_words - 1)
y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=140, sequences=y, padding="post", value=tag2idx["O"])

In [47]:
from keras.utils import to_categorical
y = [to_categorical(i, num_classes=n_tags) for i in y-1]

In [62]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [63]:
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

input = Input(shape=(140,))
model = Embedding(input_dim=n_words, output_dim=140, input_length=140)(input)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  # softmax output layer
model = Model(input, out)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [64]:
history = model.fit(X_train, np.array(y_train), batch_size=32, epochs=1, validation_split=0.2, verbose=1)



In [65]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
preds = []
p = np.argmax(y_pred, axis=-1)
for x in p[0]:
    preds.append(tags[x])




In [66]:
test_tags = []
t = np.argmax(y_test, axis=-1)
for x in t[0]:
    test_tags.append(tags[x])

In [67]:

print(classification_report(preds, test_tags))

              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00         0
       I-eve       1.00      0.98      0.99       140
       I-org       0.00      0.00      0.00         0
       I-tim       0.00      0.00      0.00         0

    accuracy                           0.98       140
   macro avg       0.25      0.24      0.25       140
weighted avg       1.00      0.98      0.99       140



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [54]:
from sklearn.metrics import accuracy_score

accuracy_score(preds, test_tags)

0.8857142857142857

In [55]:
from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters

def build_model(hp):
    input = Input(shape=(140,))
    model = Embedding(input_dim=n_words, output_dim=140, input_length=140)(input)
    model = Dropout(hp.Float('Dropout_rate',min_value=0,max_value=0.5,step=0.1))(model)
    model = Bidirectional(LSTM(units=hp.Int('lstm_units', min_value=32, max_value=128, step=32), return_sequences=True, recurrent_dropout=hp.Float('dropout', min_value=0.1, max_value=0.5, step=0.1)))(model)
    out = TimeDistributed(Dense(n_tags, activation=hp.Choice('dense_activation',values=['relu','sigmoid','softmax'])))(model) 
    model = Model(input, out)
    model.compile(optimizer="adam", loss=hp.Choice('loss_fn',values=['binary_crossentropy','categorical_crossentropy']), metrics=["accuracy"])
    model.summary()
    return model

# Initialize Keras Tuner RandomSearch
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=5,
    directory='tuner_dir',
    project_name='lstm_sentiment'
)

# Perform hyperparameter search
tuner.search(X_train, np.array(y_train), validation_split=0.2, epochs=3)

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best Hyperparameters:")
print(best_hps)

# Build the final model with the best hyperparameters
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, np.array(y_train), batch_size=32, epochs=5, validation_split=0.2, verbose=1)


Trial 5 Complete [00h 00m 08s]
val_accuracy: 0.9765507578849792

Best val_accuracy So Far: 0.9765507578849792
Total elapsed time: 00h 01m 03s
INFO:tensorflow:Oracle triggered exit
Best Hyperparameters:
<keras_tuner.engine.hyperparameters.hyperparameters.HyperParameters object at 0x7f6657181950>
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 140)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 140, 140)          1045100   
                                                                 
 dropout_1 (Dropout)         (None, 140, 140)          0         
                                                                 
 bidirectional_1 (Bidirecti  (None, 140, 256)          275456    
 onal)                                                           
                           

In [57]:
# Print the chosen activation function and loss function
best_activation = best_hps.get('dense_activation')
best_loss_function = best_hps.get('loss_fn')
best_units = best_hps.get('lstm_units')
best_dropout = best_hps.get('Dropout_rate')
print("Chosen number of LSTM units:", best_units)
print("Chosen dropout rate:", best_dropout)
print("Chosen Activation Function:", best_activation)
print("Chosen Loss Function:", best_loss_function)

Chosen number of LSTM units: 128
Chosen dropout rate: 0.1
Chosen Activation Function: softmax
Chosen Loss Function: categorical_crossentropy


In [58]:
y_pred = model.predict(X_test)
preds = []
p = np.argmax(y_pred, axis=-1)
for x in p[0]:
    preds.append(tags[x])




In [60]:
test_tags = []
t = np.argmax(y_test, axis=-1)
for x in t[0]:
    test_tags.append(tags[x])

In [61]:

print(classification_report(preds, test_tags))

              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00         0
       B-gpe       0.00      0.00      0.00         0
       I-art       0.00      0.00      0.00         0
       I-eve       1.00      0.89      0.94       140
       I-nat       0.00      0.00      0.00         0
       I-org       0.00      0.00      0.00         0
       I-tim       0.00      0.00      0.00         0

    accuracy                           0.89       140
   macro avg       0.14      0.13      0.13       140
weighted avg       1.00      0.89      0.94       140



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
