# Assignment 2

# Task 1
ref:https://becominghuman.ai/part-of-speech-tagging-tutorial-with-the-keras-deep-learning-library-d7f93fa05537

The data is split into a training set, a testing set, and a validation set. Each of these sets is then split into tags and a feature vector for each word.

An n-gram-style context window is used: for each word, the previous 3 words and the next 2 words are included as features.

The feature set also includes the prefixes and suffixes of each word, up to 3 characters long.

The features are returned in a dictionary.

This feature dictionary is then vectorized using the DictVectorizer

The neural network model uses a sigmoid activation function in the hidden layer, and a softmax activation is applied at the output layer.
A test accuracy of about 96.9% is observed.

In [1]:
import numpy as np
import nltk
from nltk.corpus import treebank
import random
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasClassifier

Using TensorFlow backend.


In [9]:
#Loading data
# Tagged sentences from the NLTK treebank sample, mapped to the universal tagset.
sentences = treebank.tagged_sents(tagset='universal')

# Distinct tags, collected from the SAME universal-tagged sentences used for
# training. (Previously this walked treebank.tagged_sents() a second time with
# the default tagset, so `tags` did not describe the labels in `sentences`.)
tags = {tag for sentence in sentences for _, tag in sentence}

In [10]:
#Split data into Training set , Testing set and Validation set
TRAIN_FRACTION = .80          # leading share of the sentences used for training + validation
VALIDATION_FRACTION = .0005   # leading share of that portion held out for validation

# Everything after the first cut-off is the test set.
train_test_cutoff = int(TRAIN_FRACTION * len(sentences))
testing_sentences = sentences[train_test_cutoff:]
train_and_validation_sentences = sentences[:train_test_cutoff]

# The validation sentences are taken from the front of the training portion;
# the remainder is what the model is actually trained on.
train_val_cutoff = int(VALIDATION_FRACTION * len(train_and_validation_sentences))
validation_sentences = train_and_validation_sentences[:train_val_cutoff]
training_sentences = train_and_validation_sentences[train_val_cutoff:]


In [11]:
#data preprocessing
def untag(tagged_sentence):
    """Return the words of a tagged sentence, dropping the tags.

    tagged_sentence: iterable of (word, tag) pairs.
    Returns a new list containing the words in their original order.
    """
    return [token for token, _ in tagged_sentence]

def add_basic_features(sentence_terms, index):
    """Build the feature dictionary for one word of a sentence.

    sentence_terms: list of the words (strings) of the sentence.
    index: position of the word to describe within sentence_terms.

    Returns a dict with:
      * sentence length and the word itself,
      * position flags (first / last word),
      * casing flags,
      * prefixes and suffixes of up to 3 characters,
      * the 3 previous and 2 next words ('' where the window runs off
        either end of the sentence).

    Raises IndexError if index is outside sentence_terms.
    """
    term = sentence_terms[index]
    last_index = len(sentence_terms) - 1
    # Slices instead of term[0] / term[-1] so an empty token yields '' rather
    # than raising IndexError; for non-empty tokens the values are identical.
    first_char = term[:1]
    return {
        'nb_terms': len(sentence_terms),
        'term': term,
        'is_first': index == 0,
        'is_last': index == last_index,
        # True whenever upper-casing leaves the first character unchanged, so
        # digits and punctuation also count as "capitalized".
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': term.upper() == term,
        'is_all_lower': term.lower() == term,
        'prefix-1': first_char,
        'prefix-2': term[:2],
        'prefix-3': term[:3],
        'suffix-1': term[-1:],
        'suffix-2': term[-2:],
        'suffix-3': term[-3:],
        'prev_word1': '' if index == 0 else sentence_terms[index - 1],
        'prev_word2': '' if index in (0, 1) else sentence_terms[index - 2],
        'prev_word3': '' if index in (0, 1, 2) else sentence_terms[index - 3],
        'next_word1': '' if index == last_index else sentence_terms[index + 1],
        'next_word2': '' if index in (last_index, last_index - 1) else sentence_terms[index + 2]
    }
def transform_to_dataset(tagged_sentences):
    """Turn tagged sentences into per-word feature dicts and tag labels.

    tagged_sentences: iterable of sentences, each a sequence of (word, tag) pairs.

    Returns (X, y): X is a list with one feature dict per word (built by
    add_basic_features) and y is the parallel list of tags, both in corpus order.
    """
    X, y = [], []
    for pos_tags in tagged_sentences:
        # Strip the tags once per sentence rather than once per word; the
        # word list is only read by add_basic_features, so it can be shared.
        sentence_terms = untag(pos_tags)
        for index, (_, class_) in enumerate(pos_tags):
            X.append(add_basic_features(sentence_terms, index))
            y.append(class_)
    return X, y

# Per-word feature dicts and tags for each of the three splits.
X_train, y_train = transform_to_dataset(training_sentences)
X_test, y_test = transform_to_dataset(testing_sentences)
X_val, y_val = transform_to_dataset(validation_sentences)

# For every word of the validation set: val_posts gets a copy of the word list
# of the sentence that word belongs to (one entry per word, not per sentence),
# and val_tags gets the word's gold tag.
val_posts = [untag(tagged) for tagged in validation_sentences for _ in tagged]
val_tags = [gold_tag for tagged in validation_sentences for _, gold_tag in tagged]

In [12]:
#creating vectors

# NOTE: this cell rebinds X_* / y_* from lists to arrays, so it must be re-run
# together with the cell that builds the feature dicts, not on its own.

# NOTE(review): the vectorizer and the label encoder are fitted on the
# training, testing and validation data together, so the feature space also
# covers features seen only in the test set - confirm this is intended.
dict_vectorizer = DictVectorizer(sparse=False)
dict_vectorizer.fit(X_train + X_test + X_val)
X_train = dict_vectorizer.transform(X_train)
X_test = dict_vectorizer.transform(X_test)
X_val = dict_vectorizer.transform(X_val)

label_encoder = LabelEncoder()
label_encoder.fit(y_train + y_test + y_val)

# Pass the class count explicitly: without it to_categorical infers the width
# from the largest label present in each array, so a split that happens not to
# contain the highest-numbered tag would get a narrower one-hot matrix than
# the others.
num_classes = len(label_encoder.classes_)
y_train = np_utils.to_categorical(label_encoder.transform(y_train), num_classes=num_classes)
y_test = np_utils.to_categorical(label_encoder.transform(y_test), num_classes=num_classes)
y_val = np_utils.to_categorical(label_encoder.transform(y_val), num_classes=num_classes)


In [13]:
# Feed-forward tagger: one 512-unit sigmoid hidden layer followed by a softmax
# output with one unit per tag.
# The output width is taken from the one-hot labels instead of a hard-coded 12,
# so the model keeps matching the labels if the tagset changes.
num_tags = y_train.shape[1]

model = Sequential()
model.add(Dense(512, input_dim=X_train.shape[1]))
model.add(Activation('sigmoid'))
model.add(Dense(num_tags))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

# validation_split=0.1 holds out part of the training data for the
# per-epoch validation metrics reported during fitting.
history = model.fit(X_train, y_train, 
                    batch_size=256, 
                    epochs=3, 
                    verbose=1, 
                    validation_split=0.1)

# score[0] is the loss and score[1] the accuracy on the test set.
score = model.evaluate(X_test, y_test, 
                       batch_size=10, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Train on 72557 samples, validate on 8062 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.09513067156635531
Test accuracy: 0.9689106192499097


In [14]:
#validating the output
# (word, gold tag) for every validation word, flattened in the same
# sentence-then-word order in which transform_to_dataset produced the rows of
# X_val. Indexing only the first sentence with the global word index
# (val_posts[0][i]) breaks as soon as the validation set holds more than one
# sentence.
val_words_and_tags = [(word, tag)
                      for sentence in validation_sentences
                      for word, tag in sentence]
text_labels = label_encoder.classes_

for i in range(len(X_val)):
    word, actual_label = val_words_and_tags[i]
    prediction = model.predict(np.array([X_val[i]]))
    prob = prediction[0]
    predicted_label = text_labels[np.argmax(prob)]
    print("....")
    print("text: "+str(word))
    print('Actual label:' + actual_label)
    print("Predicted label: " + predicted_label)  
    print("Probabilities of labels: " + str(prob)) 


....
text: Pierre
Actual label:NOUN
Predicted label: NOUN
Probabilities of labels: [0.00361503 0.18973994 0.04224168 0.14558423 0.00351953 0.05036765
 0.5019615  0.01056448 0.01418394 0.00225018 0.02883826 0.00713357]
....
text: Vinken
Actual label:NOUN
Predicted label: NOUN
Probabilities of labels: [3.9041904e-04 2.6865397e-03 5.3626078e-04 7.5662555e-03 1.2549205e-04
 1.8793099e-04 9.7823763e-01 1.6962015e-03 2.4471979e-04 2.2549546e-04
 7.5228475e-03 5.8017467e-04]
....
text: ,
Actual label:.
Predicted label: .
Probabilities of labels: [9.9797279e-01 2.5965649e-04 9.7500430e-05 2.3769135e-04 4.3621196e-05
 7.0192844e-05 1.9327119e-04 4.3456105e-04 8.5083899e-05 8.3963067e-05
 3.8148288e-04 1.4018241e-04]
....
text: 61
Actual label:NUM
Predicted label: NUM
Probabilities of labels: [2.0273016e-03 3.9523072e-03 1.5476816e-03 1.9835664e-03 4.7425559e-04
 9.2269573e-04 1.2278978e-03 9.7583789e-01 5.5424764e-04 7.8292296e-04
 1.2641661e-03 9.4250720e-03]
....
text: years
Actual label:NOUN

# Task 2
ref:https://cloud.google.com/blog/products/gcp/intro-to-text-classification-with-keras-automatically-tagging-stack-overflow-posts

The corpus is created by merging utterances tagged as greeting, goodbye, and action directive from the NPS Chat Corpus and the Switchboard Dialog Act Corpus.

example of data from the corpus:<br>
  greeting: hola<br>
  goodbye: ok..i'm gone...again...cya later <br>
  request: Can you hang on just a minute? 
  
The data is loaded from the corpus into a list of dictionaries (with keys `class` and `sentence`, whose values are the dialogue act and the text).
This list of dictionaries is split into a training set, a testing set, and a validation set; each of these sets is then split into separate text and tag lists, and the texts are converted to vectors using the bag-of-words model.

A sequential model is used with relu activation function in the hidden layers and the softmax activation in the output layer.

A test accuracy of 92.6% is observed.


In [5]:
import random
import nltk
import sklearn
import numpy as np
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from sklearn.preprocessing import LabelBinarizer

In [27]:
#Data Loading and Preprocessing
#Loading the Data
# Each corpus line has the form "<label>:<text>"; only the first ':' separates
# the label from the text, so the text itself may contain further colons.
data = []
post=[]
tag=[]
vocabulary = set()   # distinct whitespace-separated tokens over the whole corpus
with open("myCorpus.txt","r") as corpus:
    for line in corpus:
        label, separator, text = line.partition(":")
        if not separator:
            # No "label:" prefix (e.g. a blank line) - skip it instead of
            # failing with IndexError.
            continue
        data.append({"class":label,"sentence":text})
        # A set makes the membership test O(1); the list-based
        # "if word not in words" scan was quadratic in the vocabulary size.
        vocabulary.update(text.split())

words = list(vocabulary)

#splitting it into Training set , Testing set and Validation set
# Shuffle with a dedicated, seeded generator so the split (and therefore the
# reported accuracy and validation examples) is the same on every run; the
# global random state is left untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(data)

# 80% of the shuffled data is used for training (+ validation), the rest for testing.
train_size = int(len(data) * .8)
train=data[:train_size]
test=data[train_size:]

# A small slice from the front of the training data is held out as the
# validation set and removed from the training set.
val_size=int(.002*len(train))
validation=train[:val_size]
train=train[val_size:]

def _texts_and_labels(records):
    """Split corpus records into parallel lists of sentences and class labels.

    records: iterable of dicts with the keys 'sentence' and 'class'.
    Returns (texts, labels), two lists in the same order as records.
    """
    texts = [record['sentence'] for record in records]
    labels = [record['class'] for record in records]
    return texts, labels


# One helper call per split instead of three copies of the same loop.
train_posts, train_tags = _texts_and_labels(train)
test_posts, test_tags = _texts_and_labels(test)
val_posts, val_tags = _texts_and_labels(validation)

#creating vectors for each set using the bag-of-words model
# The tokenizer is capped at the corpus vocabulary size and fitted on the
# training posts only; the same fitted tokenizer then converts every split.
vocab_size = len(words)
tokenize = Tokenizer(num_words=vocab_size)
tokenize.fit_on_texts(train_posts)

x_train, x_test, x_val = (
    tokenize.texts_to_matrix(posts)
    for posts in (train_posts, test_posts, val_posts)
)

# Labels are binarized with an encoder fitted on the training tags only.
encoder = LabelBinarizer()
encoder.fit(train_tags)
y_train, y_test, y_val = (
    encoder.transform(labels)
    for labels in (train_tags, test_tags, val_tags)
)



In [21]:
#Building a model
# One 512-unit ReLU hidden layer over the bag-of-words vector, followed by a
# softmax output with one unit per label column.
# The output width is read from the encoded labels instead of a hard-coded 3,
# so the model keeps matching y_train if the set of dialogue acts changes.
num_labels = y_train.shape[1]

model = Sequential()
model.add(Dense(512, input_shape=(vocab_size,)))
model.add(Activation('relu'))
model.add(Dense(num_labels))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

# validation_split=0.1 holds out part of the training data for the
# per-epoch validation metrics reported during fitting.
history = model.fit(x_train, y_train, 
                    batch_size=10, 
                    epochs=3, 
                    verbose=1, 
                    validation_split=0.1)

# score[0] is the loss and score[1] the accuracy on the test set.
score = model.evaluate(x_test, y_test, 
                       batch_size=10, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Train on 1608 samples, validate on 179 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.2229078364315293
Test accuracy: 0.9263392761349678


In [28]:
#Validating results
# Predict each validation post on its own and print it next to its gold label.
text_labels = encoder.classes_
for i, features in enumerate(x_val):
    prediction = model.predict(np.array([features]))
    best_class = np.argmax(prediction[0])
    predicted_label = text_labels[best_class]
    print("....")
    print("text: "+val_posts[i])
    print('Actual label:' + val_tags[i])
    print("Predicted label: " + predicted_label)    


....
text:  hi 

Actual label:greeting
Predicted label: greeting
....
text:  you want to go ahead and tell me your favorite team, or  who you think will be doing well this year. 

Actual label:request
Predicted label: request
....
text:  Byebye. 

Actual label:goodbye
Predicted label: goodbye
....
text:  Okay. 

Actual label:goodbye
Predicted label: request
....
text:  nice talking to you, Linda. 

Actual label:goodbye
Predicted label: request
....
text:  hey  

Actual label:greeting
Predicted label: request
....
text:  Hi  sweetie

Actual label:greeting
Predicted label: greeting
....
text:   Me too.  

Actual label:goodbye
Predicted label: request
