In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np 
import pandas as pd
import nltk
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Precision, Recall, Accuracy
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Concatenate, TimeDistributed
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.metrics import classification_report

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
def readFile(path):
    file = open(path,'r+')
    text = file.readlines()
    print(len(text))
    return text

In [4]:
def posTagging(text):
    tagged_list = []
    for txt in text:
        tokenized = sent_tokenize(txt)
        for i in tokenized:
            wordsList = nltk.word_tokenize(i)
            wordsList = [w.lower() for w in wordsList] 
            tagged = nltk.pos_tag(wordsList)
            tagged_list.append(tagged)
    return tagged_list

In [5]:
def getTokensAndLabels(text):
    tokens = []
    labels = []
    for txt in text:
        t = []
        l = []
        for w in txt:
            t.append(w[0])
            l.append(w[1])
#             print(t)
        count = t.count('<')
        while count>0:
            count -= 1
            ind = [i for i,x in enumerate(t) if x=='<'][0]
#             print(ind)
            if t[ind+1]=='unk' and t[ind+2]=='>':
                t[ind] = '<unk>'
                del t[ind+1]
                del t[ind+1]
                l[ind] = nltk.pos_tag(['<unk>'])[0][1]
                del l[ind+1]
                del l[ind+1]
        tokens.append(t)
        labels.append(l)
    return tokens, labels

In [6]:
def getVocab(text, vocab):
    for txt in text:
        for w in txt:
            vocab.add(w)
    return vocab

In [7]:
def charVec(text):
    char_arr = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',
                '0','1','2','3','4','5','6','7','8','9',
                '-',',',';','.','!','?',':','’','’’','/','\\','|','_','@','#','$',
                '%','^','&','*','˜','‘','+','-','=','(',')','[',']','{','}', "'", '"']
    char_vector = []
    for txt in text:
#         print(txt)
        vec = []
        for word in txt:
#             print(word)
            if word == '<unk>':
                vec.append([len(char_arr)])
            else:
                v = []
                for c in word:
                    v.append(char_arr.index(c))
                vec.append(v)
        char_vector.append(vec)
    return char_vector

In [8]:
def postagVec(postags, labels):
    pos_dict = {}
    ind = 0
    for tag in postags:
        pos_dict[tag] = ind
        ind += 1
    postag_vector = []
    for l in labels:
        pos_vec = []
        for tag in l:
            pos_vec.append(pos_dict[tag])
        postag_vector.append(pos_vec)
    return postag_vector

In [17]:
def wordVec(text, vocab):
    word_dict = {}
    ind = 0
    for word in vocab:
        word_dict[word] = ind
        ind += 1
    word_vector = []
    for txt in text:
        w_vec = []
        for word in txt:
            w_vec.append(word_dict[word])
        word_vector.append(w_vec)
    return word_vector

In [9]:
def padding(seq, maxlen=150):
    final = []
    for lis in seq:
#         print(lis, '\n***\n')
        # padding
        if len(lis)<maxlen:
            pad = []
            for i in range(maxlen-len(lis)):
                if type(lis[0]) == int:
                    pad.append(0)
                else:
                    pad.append([0 for i in range(len(lis[0]))])
            for i in range(len(lis)):
                pad.append(lis[i])
            final.append(pad)
        #truncating
        else:
            trunc = []
            for i in range(maxlen):
                trunc.append(lis[i])
            final.append(trunc)
    return final

In [10]:
## loading files

train = readFile('/content/drive/MyDrive/NLP_lab7/ptbdataset/ptb.train.txt')
test = readFile('/content/drive/MyDrive/NLP_lab7/ptbdataset/ptb.test.txt')
valid = readFile('/content/drive/MyDrive/NLP_lab7/ptbdataset/ptb.valid.txt')

42068
3761
3370


In [11]:
## pos tagging

train_tagged = posTagging(train)
test_tagged = posTagging(test)
valid_tagged = posTagging(valid)

In [12]:
train_tokens, train_postags = getTokensAndLabels(train_tagged)
test_tokens, test_postags = getTokensAndLabels(test_tagged)
valid_tokens, valid_postags = getTokensAndLabels(valid_tagged)

In [13]:
## get vocab

vocab = set()
vocab = getVocab(train_tokens, vocab)
vocab = getVocab(test_tokens, vocab)
vocab = getVocab(valid_tokens, vocab)
vocab = list(vocab)
vocab.sort()
len(vocab)

10004

In [14]:
## pos tags

postags = set()
postags = getVocab(train_postags, postags)
postags = list(postags)
postags.sort()
len(postags)

39

In [15]:
# pos tags

## convert to vectors

pos_seq_train = postagVec(postags, train_postags)
pos_seq_test = postagVec(postags, test_postags)
pos_seq_valid = postagVec(postags, valid_postags)

## padding

pos_seq_train = np.array(padding(pos_seq_train), dtype='float32')
pos_seq_test = np.array(padding(pos_seq_test), dtype='float32')
pos_seq_valid = np.array(padding(pos_seq_valid), dtype='float32')

## saving as csv

pd.DataFrame(pos_seq_train).to_csv('/content/drive/MyDrive/NLP_lab7/seq_data/pos_seq_train.csv', index=False)
pd.DataFrame(pos_seq_test).to_csv('/content/drive/MyDrive/NLP_lab7/seq_data/pos_seq_test.csv', index=False)
pd.DataFrame(pos_seq_valid).to_csv('/content/drive/MyDrive/NLP_lab7/seq_data/pos_seq_valid.csv', index=False)

In [18]:
# words

## convert to vectors 

word_seq_train = wordVec(train_tokens, vocab)
word_seq_test = wordVec(test_tokens, vocab)
word_seq_valid = wordVec(valid_tokens, vocab)

## padding

word_seq_train = np.array(padding(word_seq_train), dtype='float32')
word_seq_test = np.array(padding(word_seq_test), dtype='float32')
word_seq_valid = np.array(padding(word_seq_valid), dtype='float32')

## saving as csv

pd.DataFrame(pos_seq_train).to_csv('/content/drive/MyDrive/NLP_lab7/seq_data/word_seq_train.csv', index=False)
pd.DataFrame(pos_seq_test).to_csv('/content/drive/MyDrive/NLP_lab7/seq_data/word_seq_test.csv', index=False)
pd.DataFrame(pos_seq_valid).to_csv('/content/drive/MyDrive/NLP_lab7/seq_data/word_seq_valid.csv', index=False)

In [19]:
# characters 

## convert to vectors

char_seq_train = charVec(train_tokens)
char_seq_test = charVec(test_tokens)
char_seq_valid = charVec(valid_tokens)

## padding 

for i in range(len(char_seq_train)):
    char_seq_train[i] = padding(char_seq_train[i], maxlen=40)
char_seq_train = np.array(padding(char_seq_train), dtype='float32')

for i in range(len(char_seq_test)):
    char_seq_test[i] = padding(char_seq_test[i], maxlen=40)
char_seq_test = np.array(padding(char_seq_test), dtype='float32')

for i in range(len(char_seq_valid)):
    char_seq_valid[i] = padding(char_seq_valid[i], maxlen=40)
char_seq_valid = np.array(padding(char_seq_valid), dtype='float32')

## saving as csv

pd.DataFrame(np.ravel(char_seq_train)).to_csv('/content/drive/MyDrive/NLP_lab7/seq_data/char_seq_train.csv', index=False)
pd.DataFrame(np.ravel(char_seq_test)).to_csv('/content/drive/MyDrive/NLP_lab7/seq_data/char_seq_test.csv', index=False)
pd.DataFrame(np.ravel(char_seq_valid)).to_csv('/content/drive/MyDrive/NLP_lab7/seq_data/char_seq_valid.csv', index=False)

## dimensions to reshape when loading 
print(char_seq_train.shape)
print(char_seq_test.shape)
print(char_seq_valid.shape)

## saving this to text file
dim = np.array(['char train shape = (42503, 150, 40)', 
                'char test shape = (3793, 150, 40)', 
                'char valid shape = (3395, 150, 40)', 
                'reshape command example = np.reshape(np.ravel(char_seq_train), (42503, 150, 40)).shape'])
np.savetxt('char_dim.txt', dim, delimiter=',', fmt='%s')

(42503, 150, 40)
(3793, 150, 40)
(3395, 150, 40)


In [20]:
def getPosCat(pos_tag_seq):
    pos_tag_seq_cat = []
    n = 40
    for pos_tag in pos_tag_seq:
        pos_tag_seq_cat.append(to_categorical(pos_tag, num_classes = n+1))
    return np.array(pos_tag_seq_cat, dtype = 'float32')

In [21]:
X_train_char = char_seq_train
X_train_word = word_seq_train
y_train = getPosCat(pos_seq_train)

X_test_char = char_seq_test
X_test_word = word_seq_test
y_test = getPosCat(pos_seq_test)

X_valid_char = char_seq_valid
X_valid_word = word_seq_valid
y_valid = getPosCat(pos_seq_valid)

In [22]:
def ConstituencyParser(word_seq_len, char_seq_len, lstm_units, embedding_dim, char_vocab_size, word_vocab_size, pos_vocab_size):
    char_size_seq = Input(shape=(word_seq_len, char_seq_len))
    word_size_seq = Input(shape=(word_seq_len,))
    char_vector = Embedding(char_vocab_size, embedding_dim, input_length=char_seq_len)(char_size_seq)
    char_lstm = TimeDistributed(LSTM(lstm_units))(char_vector)
    word_vector = Embedding(word_vocab_size, embedding_dim, input_length=word_seq_len)(word_size_seq)
    concat_layer = Concatenate(axis=-1)([word_vector, char_lstm])
    bilstm = Bidirectional(LSTM(lstm_units, return_sequences=True))(concat_layer)
    mlp_1 = Dense(lstm_units, activation="relu")(bilstm)
    mlp_2 = Dense(pos_vocab_size+1)(mlp_1)
    output = Dense(pos_vocab_size+1, activation="softmax")(mlp_2)
    
    return Model(inputs=[word_size_seq, char_size_seq], outputs=output)

In [23]:
char_vocab_size = len(np.unique(char_seq_train)) + 1
word_vocab_size = len(vocab) + 1
pos_vocab_size = len(postags) + 1
embedding_dim = 256

In [None]:
char_seq_train.shape

(42503, 150, 40)

In [24]:
s_maxlen = char_seq_train.shape[1]
w_maxlen = char_seq_train.shape[2]

In [25]:
model = ConstituencyParser(s_maxlen, w_maxlen, 128, embedding_dim, char_vocab_size, word_vocab_size, pos_vocab_size)
model.compile(optimizer=Adam(), loss="categorical_crossentropy", metrics=[Accuracy(), Precision(), Recall()])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 150, 40)]    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 150, 40, 256) 12032       input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 150, 256)     2561280     input_2[0][0]                    
______________________________________________________________________________________________

In [26]:
history = model.fit(
    [X_train_word, X_train_char], 
    y_train, 
    batch_size=128, 
    epochs=10,
    validation_data=([X_valid_word, X_valid_char], y_valid)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
scores = model.evaluate([X_test_word, X_test_char],y_test)



In [30]:
f1_score = 2* (scores[2]*scores[3])/(scores[2] + scores[3])
print("Precision: {}, Recall: {}, F1 score: {}".format(scores[2],scores[3],f1_score))

Precision: 0.995675265789032, Recall: 0.9950505495071411, F1 score: 0.9953628096259406
