In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import nltk.corpus
import os, urllib.request
import zipfile
from nltk.tokenize import WhitespaceTokenizer
import sklearn
from tqdm import tqdm
from collections import OrderedDict
from sklearn.externals._packaging.version import List
from keras.preprocessing.text import Tokenizer
from keras import layers
from keras.layers import Dropout, Flatten
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential

In [None]:
# Fetch the NLTK packages this notebook relies on: the 'treebank' corpus
# (read through nltk.corpus.treebank in the cells below) and 'punkt'.
nltk.download('treebank')
nltk.download('punkt')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
class AvgSentenceEmbedding(tf.keras.layers.Layer):
    """Average the inputs over axis 1, counting only unmasked positions.

    The indexing below treats `inputs` as rank 3 and `mask` as rank 2 sharing
    the first two axes; the result drops axis 1.
    """

    def call(self, inputs, mask=None, training=False, **kwargs):
        """Return the masked mean of `inputs` along axis 1.

        Args:
            inputs: tensor to average.
            mask: optional mask; truthy entries mark the positions to keep.
                When omitted, every position contributes equally.
            training: accepted for Keras API compatibility; unused.

        Returns:
            Tensor with axis 1 reduced away. Rows whose mask is entirely
            false produce zeros instead of NaN.
        """
        if mask is None:
            # `mask` defaults to None; tf.cast(None, ...) would raise, so fall
            # back to a plain mean over all positions.
            return tf.reduce_mean(inputs, axis=1)

        float_mask = tf.cast(mask, inputs.dtype)
        masked_sum = tf.reduce_sum(inputs * float_mask[:, :, None], axis=1)
        kept_count = tf.reduce_sum(float_mask, axis=1)[:, None]
        # divide_no_nan yields 0 where kept_count is 0 (fully masked rows).
        return tf.math.divide_no_nan(masked_sum, kept_count)

In [None]:
def Encoding(df,Tags):
  """One-hot encode a label Series and return it next to the original labels.

  Args:
    df: pandas Series of labels (raw tag strings, or integer codes obtained
      by label-encoding those strings). It is not modified.
    Tags: the distinct tag names, in any order; one per distinct label.

  Returns:
    DataFrame with a fresh 0..n-1 index whose first column is the input
    labels, followed by one float indicator column per class.

  Raises:
    ValueError: if the number of tag names differs from the number of
      distinct labels found in `df`.
  """
  # reset_index without inplace returns a new object, so the caller's Series
  # (often a column of a larger frame) keeps its own index.
  labels = df.reset_index(drop=True)

  # np.unique returns the classes sorted, plus each row's position in that
  # sorted list - the same ordering LabelEncoder/OneHotEncoder produce.
  classes, codes = np.unique(np.asarray(labels).ravel(), return_inverse=True)

  # Indicator columns come out in *sorted* class order, so the names must be
  # sorted as well; using the caller's order would attach wrong tag names.
  column_names = sorted(Tags)
  if len(column_names) != len(classes):
    raise ValueError(
        "Encoding: got {} tag names for {} distinct labels".format(
            len(column_names), len(classes)))

  one_hot = np.eye(len(classes))[codes.ravel()]
  dfOneHot = pd.DataFrame(one_hot, columns=column_names)
  return pd.concat([labels, dfOneHot], axis=1)

In [None]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
  """Build an embedding matrix from a whitespace-separated vector file.

  Each non-blank line of the file is read as `word v1 v2 ...`.

  Args:
    filepath: path to the UTF-8 text file of pretrained vectors.
    word_index: mapping from word to its row index in the matrix.
    embedding_dim: number of leading vector components to keep per word.

  Returns:
    Array of shape (len(word_index) + 1, embedding_dim). Rows for words
    missing from the file (and any row no word maps to) stay all-zero.

  Raises:
    ValueError: if a matched word's vector has fewer than `embedding_dim`
      components or contains a non-numeric field.
  """
  vocab_size = len(word_index)+1
  embedding_matrix = np.zeros((vocab_size,embedding_dim))

  with open(filepath, encoding='utf-8') as f:
    for line in f:
      fields = line.split()
      if not fields:
        # Blank line (e.g. trailing newline at end of file): nothing to read.
        continue
      word, vector = fields[0], fields[1:]
      if word not in word_index:
        continue
      if len(vector) < embedding_dim:
        raise ValueError(
            "vector for {!r} has {} components, expected at least {}".format(
                word, len(vector), embedding_dim))
      idx = word_index[word]
      # Only the first embedding_dim components are parsed and stored.
      embedding_matrix[idx] = np.array(vector[:embedding_dim], dtype=np.float32)
  return embedding_matrix

In [None]:
# Peek at the corpus as (word, tag) pairs.
nltk.corpus.treebank.tagged_words()

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ...]

In [None]:
# Peek at the corpus as a flat list of word tokens (no tags).
nltk.corpus.treebank.words()

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', ...]

In [None]:
# Split the corpus by file: the first 100 file ids are used for training,
# the next 50 for validation and the remainder for testing.
# The listing is fetched once and sliced, instead of re-querying per split.
treebank_files = nltk.corpus.treebank.fileids()
train_files = treebank_files[:100]
val_files = treebank_files[100:150]
test_files = treebank_files[150:]
print(train_files)
print(val_files)
print(test_files)


['wsj_0001.mrg', 'wsj_0002.mrg', 'wsj_0003.mrg', 'wsj_0004.mrg', 'wsj_0005.mrg', 'wsj_0006.mrg', 'wsj_0007.mrg', 'wsj_0008.mrg', 'wsj_0009.mrg', 'wsj_0010.mrg', 'wsj_0011.mrg', 'wsj_0012.mrg', 'wsj_0013.mrg', 'wsj_0014.mrg', 'wsj_0015.mrg', 'wsj_0016.mrg', 'wsj_0017.mrg', 'wsj_0018.mrg', 'wsj_0019.mrg', 'wsj_0020.mrg', 'wsj_0021.mrg', 'wsj_0022.mrg', 'wsj_0023.mrg', 'wsj_0024.mrg', 'wsj_0025.mrg', 'wsj_0026.mrg', 'wsj_0027.mrg', 'wsj_0028.mrg', 'wsj_0029.mrg', 'wsj_0030.mrg', 'wsj_0031.mrg', 'wsj_0032.mrg', 'wsj_0033.mrg', 'wsj_0034.mrg', 'wsj_0035.mrg', 'wsj_0036.mrg', 'wsj_0037.mrg', 'wsj_0038.mrg', 'wsj_0039.mrg', 'wsj_0040.mrg', 'wsj_0041.mrg', 'wsj_0042.mrg', 'wsj_0043.mrg', 'wsj_0044.mrg', 'wsj_0045.mrg', 'wsj_0046.mrg', 'wsj_0047.mrg', 'wsj_0048.mrg', 'wsj_0049.mrg', 'wsj_0050.mrg', 'wsj_0051.mrg', 'wsj_0052.mrg', 'wsj_0053.mrg', 'wsj_0054.mrg', 'wsj_0055.mrg', 'wsj_0056.mrg', 'wsj_0057.mrg', 'wsj_0058.mrg', 'wsj_0059.mrg', 'wsj_0060.mrg', 'wsj_0061.mrg', 'wsj_0062.mrg', 'wsj_00

In [None]:
import re
def pre_process(df,string):
    """Clean the text column `string` of `df` and return it as a new Series.

    Per value: strips retweet prefixes, HTML numeric entities, @mentions,
    #hashtags, URLs, "&gt;", leading whitespace and newlines; then replaces
    whitespace runs and every character other than letters, digits,
    whitespace or ':' with one space; finally right-strips and lower-cases.
    Only trailing whitespace is stripped, so a value such as "-NONE-"
    becomes " none" (leading space kept).

    Args:
        df: DataFrame holding the column.
        string: name of the column to clean; its values must be str.

    Returns:
        Series of cleaned strings, aligned with `df`'s index.
    """
    # Raw strings avoid the invalid-escape warnings the plain literals caused.
    # NOTE(review): the original literal contained "\b" in a non-raw string,
    # i.e. a backspace character rather than a word boundary. It is kept as
    # \x08 so matching is unchanged; a true \b\s+ would glue adjacent words
    # together - confirm which was intended.
    noise = re.compile(
        r"RT @(.)+?:\s|(&#[0-9]+;)|@([\w\-]+)|(#)\S+|(http)s?\S+|&gt;|^\s+|\x08\s+|\n")
    filler = re.compile(r"\s\s+|[^a-zA-Z\d\s:]")

    def text_pre_process(text):
        ret = noise.sub("", text)
        ret = filler.sub(" ", ret).rstrip().lower()
        return ret

    return df[string].apply(text_pre_process)

In [None]:
# Build the training frame: one (Words, Tags) row per tagged token of every
# training file.
frames = [
  pd.DataFrame.from_records(nltk.corpus.treebank.tagged_words(file_id), columns=['Words','Tags'])
  for file_id in train_files
]
# ignore_index gives every row a unique label; without it each file restarts
# at 0 and the combined frame carries duplicate index labels.
train_df = pd.concat(frames, ignore_index=True)
train_df["Words"] = pre_process(train_df,"Words")
train_df["Tags"] = pre_process(train_df,"Tags")
# Rows whose word or tag cleaned down to '' (punctuation) are dropped.
# Done without chained `inplace=True`, which does not reliably write back to
# the frame under pandas copy-on-write.
train_df = train_df.replace('', np.nan).dropna()
# Exclude the fw, uh and ls tags.
train_df = train_df[~train_df.Tags.isin(["fw", "uh", "ls"])]
train_df.head()

Unnamed: 0,Words,Tags
0,pierre,nnp
1,vinken,nnp
3,61,cd
4,years,nns
5,old,jj


In [None]:
# Build the validation frame: one (Words, Tags) row per tagged token of every
# validation file.
frames = [
  pd.DataFrame.from_records(nltk.corpus.treebank.tagged_words(file_id), columns=['Words','Tags'])
  for file_id in val_files
]
# ignore_index gives every row a unique label; without it each file restarts
# at 0 and the combined frame carries duplicate index labels.
val_df = pd.concat(frames, ignore_index=True)
val_df["Words"] = pre_process(val_df,"Words")
val_df["Tags"] = pre_process(val_df,"Tags")
# Rows whose word or tag cleaned down to '' (punctuation) are dropped.
# Done without chained `inplace=True`, which does not reliably write back to
# the frame under pandas copy-on-write.
val_df = val_df.replace('', np.nan).dropna()
# Exclude the fw, uh and ls tags.
val_df = val_df[~val_df.Tags.isin(["fw", "uh", "ls"])]
val_df.head()

Unnamed: 0,Words,Tags
0,a,dt
1,house senate,nnp
2,conference,nn
3,approved,vbd
4,major,jj


In [None]:
# Build the test frame: one (Words, Tags) row per tagged token of every
# test file.
frames = [
  pd.DataFrame.from_records(nltk.corpus.treebank.tagged_words(file_id), columns=['Words','Tags'])
  for file_id in test_files
]
# ignore_index gives every row a unique label; without it each file restarts
# at 0 and the combined frame carries duplicate index labels.
test_df = pd.concat(frames, ignore_index=True)
test_df["Words"] = pre_process(test_df,"Words")
test_df["Tags"] = pre_process(test_df,"Tags")
# Rows whose word or tag cleaned down to '' (punctuation) are dropped.
# Done without chained `inplace=True`, which does not reliably write back to
# the frame under pandas copy-on-write.
test_df = test_df.replace('', np.nan).dropna()
# Exclude the fw, uh and ls tags.
test_df = test_df[~test_df.Tags.isin(["fw", "uh", "ls"])]
test_df.head()

Unnamed: 0,Words,Tags
0,intelogic,nnp
1,trace,nnp
2,inc,nnp
4,san,nnp
5,antonio,nnp


In [None]:
# Summary of the cleaned training frame: row count, number of distinct
# words/tags, and the most frequent value of each column.
train_df.describe()

Unnamed: 0,Words,Tags
count,44108,44108
unique,7792,34
top,the,nn
freq,2329,6117


In [None]:
# Distinct tags of each split, kept in order of first appearance
# (dict.fromkeys de-duplicates while preserving insertion order).
Tags = list(dict.fromkeys(train_df.Tags))
Tags_test = list(dict.fromkeys(test_df.Tags))
Tags_val = list(dict.fromkeys(val_df.Tags))

# Report every training tag that never occurs in the test split.
for tag in Tags:
  if tag not in Tags_test:
    print(tag)

print(Tags_val)
print(Tags_test)
print(Tags)

['dt', 'nnp', 'nn', 'vbd', 'jj', 'nns', 'in', 'jjr', 'cd', ' none', 'wdt', 'vbz', 'rb', 'cc', 'vbg', 'vbn', 'vbp', 'jjs', 'to', 'pos', 'rbr', 'md', 'vb', 'nnps', 'prp', ' lrb', ' rrb', 'wp', 'rbs', 'ex', 'wrb', ':', 'rp', 'pdt']
['nnp', 'vbd', ' none', 'prp', 'cd', 'nns', 'cc', 'in', 'jj', 'nn', 'dt', 'vbz', 'pos', 'to', 'md', 'vb', 'vbg', 'rb', 'vbn', 'wp', 'vbp', 'wrb', 'wdt', 'jjr', 'jjs', 'rp', 'nnps', 'rbr', 'ex', 'rbs', ':', ' lrb', ' rrb', 'pdt']
['nnp', 'cd', 'nns', 'jj', 'md', 'vb', 'dt', 'nn', 'in', 'vbz', 'vbg', 'cc', 'vbd', 'vbn', ' none', 'rb', 'to', 'prp', 'rbr', 'wdt', 'vbp', 'rp', 'jjs', 'pos', 'ex', 'wp', 'jjr', 'wrb', ':', 'nnps', ' lrb', ' rrb', 'pdt', 'rbs']


In [None]:
label_encoder = LabelEncoder()

# Fit the tag -> integer mapping once, on the training tags, and reuse it for
# the other splits. Re-fitting per split only yields matching codes when all
# splits happen to contain exactly the same tag set; with `transform`, a tag
# unseen in training raises instead of silently shifting the codes.
train_df["Tags"]=label_encoder.fit_transform(train_df["Tags"])
test_df["Tags"]=label_encoder.transform(test_df["Tags"])
val_df["Tags"]=label_encoder.transform(val_df["Tags"])

print(train_df["Tags"].unique())
print(test_df["Tags"].unique())
print(val_df["Tags"].unique())


[14  5 16  9 12 25  6 13  8 30 27  4 26 28  1 20 24 19 21 31 29 23 11 18
  7 32 10 33  3 15  0  2 17 22]
[14 26  1 19  5 16  4  8  9 13  6 30 18 24 12 25 27 20 28 32 29 33 31 10
 11 23 15 21  7 22  3  0  2 17]
[ 6 14 13 26  9 16  8 10  5  1 31 30 20  4 27 28 29 11 24 18 21 12 25 15
 19  0  2 32 22  7 33  3 23 17]


In [None]:
# Inputs are the cleaned words; targets are the integer tag codes expanded to
# one-hot columns by Encoding.
#
# The tag codes come from LabelEncoder, which numbers classes in sorted
# order, and the one-hot columns follow those codes. The column names are
# therefore passed sorted; the first-appearance order of Tags / Tags_val /
# Tags_test would attach the wrong tag name to each column.
X_train=train_df.Words
y_train = train_df.Tags
print(y_train.shape)
y_train = Encoding(y_train,sorted(Tags))
print(y_train.head())
X_val =val_df.Words
y_val = val_df.Tags
y_val = Encoding(y_val,sorted(Tags_val))
print(y_val.head())
X_test=test_df.Words
y_test = test_df.Tags
y_test = Encoding(y_test,sorted(Tags_test))
print(y_test.head())
# Keep only the indicator columns as targets (drop the integer code column).
y_train = y_train.drop(columns="Tags")
y_test = y_test.drop(columns="Tags")
y_val = y_val.drop(columns="Tags")

(44108,)
   Tags  nnp   cd  nns   jj   md   vb   dt   nn   in  ...   ex   wp  jjr  wrb  \
0    14  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1    14  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2     5  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3    16  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4     9  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   

     :  nnps   lrb   rrb  pdt  rbs  
0  0.0   0.0   0.0   0.0  0.0  0.0  
1  0.0   0.0   0.0   0.0  0.0  0.0  
2  0.0   0.0   0.0   0.0  0.0  0.0  
3  0.0   0.0   0.0   0.0  0.0  0.0  
4  0.0   0.0   0.0   0.0  0.0  0.0  

[5 rows x 35 columns]
   Tags   dt  nnp   nn  vbd   jj  nns   in  jjr   cd  ...  prp   lrb   rrb  \
0     6  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  ...  0.0   0.0   0.0   
1    14  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   0.0   0.0   
2    13  0.0  0.0  0.0  0

In [None]:
# Build the tokenizer vocabulary from the training words only; the
# validation and test words are not seen here.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [None]:
# Sanity check: first rows and shapes of the training inputs and targets
# (both should have the same number of rows).
print(X_train.head())
print(y_train.head())
print(X_train.shape)
print(y_train.shape)

0    pierre
1    vinken
3        61
4     years
5       old
Name: Words, dtype: object
   nnp   cd  nns   jj   md   vb   dt   nn   in  vbz  ...   ex   wp  jjr  wrb  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...  0.0  0.0  0.0  0.0   

     :  nnps   lrb   rrb  pdt  rbs  
0  0.0   0.0   0.0   0.0  0.0  0.0  
1  0.0   0.0   0.0   0.0  0.0  0.0  
2  0.0   0.0   0.0   0.0  0.0  0.0  
3  0.0   0.0   0.0   0.0  0.0  0.0  
4  0.0   0.0   0.0   0.0  0.0  0.0  

[5 rows x 34 columns]
(44108,)
(44108, 34)


In [None]:
num_words = 9000
# Keep only the vocabulary entries whose index is at most num_words.
tokenizer.word_index = {word: idx for word, idx in tokenizer.word_index.items() if idx <= num_words}
# Register the out-of-vocabulary slot only when the tokenizer actually has an
# OOV token. The tokenizer above is created without one, so an unconditional
# assignment would insert a `None` key, inflating every size derived from
# len(word_index).
# NOTE(review): if an OOV token is ever configured, check that num_words + 1
# still fits inside the embedding table (sized len(word_index) + 1 below).
if tokenizer.oov_token is not None:
  tokenizer.word_index[tokenizer.oov_token] = num_words +1

In [None]:
# Convert every split's words to lists of vocabulary indices, using the
# tokenizer fitted on the training words.
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(X_test)
# One more than the number of word_index entries; used below as the input
# size of the Embedding layer and as the row count of the embedding matrix.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
#GLOVE
# Download the GloVe 6B archive into ./Glove (once) and unpack it.
url = "https://nlp.stanford.edu/data/glove.6B.zip"

glove_path = os.path.join(os.getcwd(),"Glove")
glove_zip = os.path.join(glove_path, "glove.6B.zip")

os.makedirs(glove_path, exist_ok=True)

if not os.path.exists(glove_zip):
    # Download under a temporary name and rename when finished, so an
    # interrupted transfer never leaves a truncated file at glove_zip that
    # later runs would mistake for a complete archive.
    partial_zip = glove_zip + ".part"
    urllib.request.urlretrieve(url, partial_zip)
    os.replace(partial_zip, glove_zip)
    print("Successful download")

with zipfile.ZipFile(glove_zip, 'r') as zip_ref:
    # Extract only members that are absent or whose size on disk differs from
    # the archive entry, instead of unpacking the whole archive on every run.
    pending = []
    for info in zip_ref.infolist():
        if info.is_dir():
            continue
        target = os.path.join(glove_path, info.filename)
        if not os.path.isfile(target) or os.path.getsize(target) != info.file_size:
            pending.append(info.filename)
    if pending:
        zip_ref.extractall(path=glove_path, members=pending)
        print("Successful extraction")

Successful extraction


In [None]:
glove_file = os.path.join(os.getcwd(),"Glove", "glove.6B.50d.txt")

# Read the 50-dimensional GloVe file into `vocabulary`: word -> float array.
print ("Loading Glove Model")
vocabulary = {}
with open(glove_file, encoding="utf8" ) as f:
    # Iterate the file lazily; readlines() would hold every line in memory in
    # addition to the parsed vectors.
    for line in f:
        splits = line.split()
        if not splits:
            # Blank line: no word to index.
            continue
        vocabulary[splits[0]] = np.array([float(val) for val in splits[1:]])
print ("Done.",len(vocabulary.keys())," words loaded!")

Loading Glove Model
Done. 400000  words loaded!


In [None]:
import gensim
import gensim.downloader as gloader

def load_embedding_model(model_type:str,embedding_dimension: int=50)->gensim.models.keyedvectors.KeyedVectors:
  """Load a pretrained embedding model through the gensim downloader.

  Args:
    model_type: embedding family; only 'glove' is supported (compared
      case-insensitively, surrounding whitespace ignored).
    embedding_dimension: vector size, used to build the downloader name
      "glove-wiki-gigaword-<dimension>".

  Returns:
    The model object returned by gensim.downloader.load.

  Raises:
    ValueError: if `model_type` is unsupported, or if the downloader rejects
      the resulting model name (e.g. an unavailable dimension).
  """
  if model_type.strip().lower() != 'glove':
    # Fail early with an accurate message instead of asking the downloader
    # for an empty model name and then blaming the embedding dimension.
    raise ValueError("Unsupported embedding model type: {!r} (expected 'glove')".format(model_type))
  download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)

  try:
    emb_model = gloader.load(download_path)
  except ValueError:
    print("Invalid embedding model name! Check embedding dimension:")
    print("Glove: 50,100, 200,300")
    raise

  return emb_model
embedding_model = load_embedding_model("glove",50)

In [None]:
# Bring every index sequence to a common length of max_len, padding at the
# end (padding='post').
# NOTE(review): each row of X_* originates from a single Words entry, so most
# of the max_len positions are presumably padding - confirm this is intended.
max_len = 50
X_train = pad_sequences(X_train,padding='post',maxlen=max_len)
X_val = pad_sequences(X_val,padding='post',maxlen=max_len)
X_test = pad_sequences(X_test,padding='post',maxlen=max_len)

In [None]:
# Length of the second axis of the padded training inputs.
input_dim = X_train.shape[1]

In [None]:
embedding_dim = 300
# Read the 300-d vectors from the same ./Glove directory the download cell
# extracts into, rather than a hard-coded absolute path that only exists on
# one machine layout.
embedding_matrix = create_embedding_matrix(os.path.join(os.getcwd(), "Glove", "glove.6B.300d.txt"), tokenizer.word_index, embedding_dim)

In [None]:
# Share of embedding rows that received a pretrained vector: a row counts as
# covered when at least one of its components is non-zero.
row_is_covered = np.count_nonzero(embedding_matrix, axis=1) > 0
nonzero_elements = np.count_nonzero(row_is_covered)
embedding_accuracy = nonzero_elements/vocab_size
print('embedding accuracy: ' + str(embedding_accuracy))

embedding accuracy: 0.985885372112917


In [None]:
# Embedding initialised from the pretrained matrix (and fine-tuned, since
# trainable=True) -> bidirectional LSTM -> softmax over the 34 tag classes.
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, weights = [embedding_matrix], input_length = max_len, trainable=True))
model.add(layers.Bidirectional(layers.LSTM(128)))
model.add(layers.Dense(34,activation='softmax'))
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# (it is what triggered the warning captured under this cell).
# NOTE(review): 0.1 is far above Adam's documented default of 0.001 - confirm
# the value is intended.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')
model.summary()

  super(Adam, self).__init__(name, **kwargs)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 300)           2104200   
                                                                 
 bidirectional (Bidirectiona  (None, 256)              439296    
 l)                                                              
                                                                 
 dense (Dense)               (None, 34)                8738      
                                                                 
Total params: 2,552,234
Trainable params: 2,552,234
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 10 epochs in batches of 32, evaluating on the validation split
# passed as validation_data; `result` keeps the object returned by fit.
result = model.fit(X_train, y_train, epochs=10, verbose = True, validation_data=(X_val,y_val), batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Model outputs for the test inputs: one row per test example, one softmax
# score per tag class.
y_pred = model.predict(X_test)



In [None]:
# Show the raw prediction scores.
print(y_pred)

In [None]:
# Sanity check: predictions and one-hot targets must have identical shapes
# before they can be compared by the metrics below.
print(y_pred.shape)
print(y_test.shape)

(14636, 34)
(14636, 34)


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
# Sweep decision thresholds: binarise the scores at each cut-off and report
# macro-averaged precision, recall and F1 against the one-hot test targets.
# Note that zero_division=True is passed to precision_score only.
thresholds=[0.1,0.2,0.25,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
report_line = "Threshold: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}"
for cutoff in thresholds:
    # Work on a copy so the raw scores in y_pred survive the sweep.
    at_or_above = y_pred >= cutoff
    below = y_pred < cutoff
    predicted = y_pred.copy()
    predicted[at_or_above] = 1
    predicted[below] = 0

    precision = precision_score(y_test, predicted, average='macro', zero_division=True)
    recall = recall_score(y_test, predicted, average='macro')
    f1 = f1_score(y_test, predicted, average='macro')

    print(report_line.format(cutoff, precision, recall, f1))

Threshold: 0.1000, Precision: 0.6210, Recall: 0.5519, F1-measure: 0.4719
Threshold: 0.2000, Precision: 0.6210, Recall: 0.5519, F1-measure: 0.4719
Threshold: 0.2500, Precision: 0.6210, Recall: 0.5519, F1-measure: 0.4719
Threshold: 0.3000, Precision: 0.6210, Recall: 0.5519, F1-measure: 0.4719
Threshold: 0.4000, Precision: 0.6210, Recall: 0.5519, F1-measure: 0.4719
Threshold: 0.5000, Precision: 0.6210, Recall: 0.5519, F1-measure: 0.4719
Threshold: 0.6000, Precision: 0.6210, Recall: 0.5519, F1-measure: 0.4719
Threshold: 0.7000, Precision: 0.6210, Recall: 0.5519, F1-measure: 0.4719
Threshold: 0.8000, Precision: 0.6210, Recall: 0.5519, F1-measure: 0.4719
Threshold: 0.9000, Precision: 0.6210, Recall: 0.5519, F1-measure: 0.4719


In [None]:
# Binarise the prediction scores in place at the chosen cut-off:
# scores at or above it become 1, scores below it become 0.
Threshold = 0.1
selected = y_pred >= Threshold
rejected = y_pred < Threshold
y_pred[selected] = 1
y_pred[rejected] = 0

In [None]:
from sklearn.metrics import classification_report
# The target columns follow the label encoder's class order, which is the
# sorted order of the tag strings, so the display names must be sorted too.
# `Tags` itself is in first-appearance order and would label every row of the
# report with the wrong tag.
print(classification_report(y_test,y_pred, target_names=sorted(Tags),zero_division=True))

              precision    recall  f1-score   support

         nnp       1.00      0.00      0.00        18
          cd       0.81      0.98      0.89       996
         nns       1.00      1.00      1.00        18
          jj       0.02      1.00      0.03        27
          md       0.98      1.00      0.99       353
          vb       0.58      0.42      0.49       858
          dt       0.98      0.96      0.97      1335
          nn       0.83      1.00      0.91         5
          in       0.95      0.61      0.74      1630
         vbz       0.12      0.62      0.20       918
         vbg       0.73      0.46      0.56        59
          cc       1.00      0.00      0.00        31
         vbd       0.93      0.95      0.94       167
         vbn       0.46      0.25      0.32      2305
        none       0.29      0.27      0.28      1504
          rb       0.00      0.00      0.00        44
          to       0.30      0.38      0.34       941
         prp       1.00    