# Developing a Question Tagging system.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, nltk
import gensim
import codecs
import spacy
import pickle
from sklearn.metrics import confusion_matrix, accuracy_score, average_precision_score
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
from sklearn import linear_model, svm
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer



In [2]:
f_train = open('LabelledData (1).txt', 'r+')
data = pd.DataFrame(f_train.readlines(), columns = ['Question'])

In [3]:
data.head()

Unnamed: 0,Question
0,how did serfdom develop in and then leave russ...
1,what films featured the character popeye doyle...
2,how can i find a list of celebrities ' real na...
3,what fowl grabs the spotlight after the chines...
4,"what is the full form of .com ? ,,, what\n"


# Splitting the Data into Question and Category

In [4]:
data['Category'] = data.Question.apply(lambda x: x.split(',,,', 1)[1])
data['Question'] = data.Question.apply(lambda x: x.split(',,,', 1)[0])
data['Category'].unique()

array([' unknown\n', ' what\n', ' when\n', ' who\n', '  what\n', '  who\n',
       ' affirmation\n'], dtype=object)

Since the categories have Newline character and variable number of blank whitespaces , we will be creating a chomp method to remove them.

In [5]:
def chomp(x):
    x = x.lstrip()
    x = x.rstrip()
    if x.endswith("\r\n"):x[:-2]
    if x.endswith("\n"): return x[:-1]
    return x

In [6]:
for i in range (0, data.shape[0]):
    data.at[i, 'Category'] = chomp(data.at[i, 'Category'])

In [7]:
resultant_data = pd.DataFrame({"Question": data['Question'], "Category": data['Category']})
resultant_data.to_csv('Question Labeling.csv', index=False)

In [8]:
data.head()

Unnamed: 0,Question,Category
0,how did serfdom develop in and then leave russ...,unknown
1,what films featured the character popeye doyle ?,what
2,how can i find a list of celebrities ' real na...,unknown
3,what fowl grabs the spotlight after the chines...,what
4,what is the full form of .com ?,what


In [9]:
data.describe()

Unnamed: 0,Question,Category
count,1483,1483
unique,1476,5
top,what is the speed of the mississippi river ?,what
freq,3,609


So we observe that the data set contains 7 duplicate entries and a total of 5 classes.

# Splitting the Data
Creating the train and test set with a ratio of 80:20.<br>
In the blocks below, we will further observe the properties of the train and test sets.

In [10]:
classVar = data['Category']
data_without_class = data.drop('Category', axis = 1) 
X_train, X_test, y_train, y_test = train_test_split(data_without_class, classVar, test_size=0.2, random_state=42)

In [11]:
X_train.head()

Unnamed: 0,Question
381,what is the abbreviation of the company name `...
532,what is the telephone number for the universit...
672,who sings the song `` drink to me with thine e...
482,when is the tulip festival in michigan ?
405,what is the difference between a generator and...


In [12]:
X_test.head()

Unnamed: 0,Question
123,how long does it take for your blood to make o...
432,who is stein eriksen ?
1032,who lives at 39 stone canyon way ?
529,how many innings are there in a regulation sof...
1416,is it accurate ?


In [13]:
y_train.head()

381    what
532    what
672     who
482    what
405    what
Name: Category, dtype: object

In [14]:
y_test.head()

123         unknown
432             who
1032            who
529         unknown
1416    affirmation
Name: Category, dtype: object

In [15]:
X_train.describe()

Unnamed: 0,Question
count,1186
unique,1183
top,who is desmond tutu ?
freq,2


In [16]:
y_train.describe()

count     1186
unique       5
top       what
freq       493
Name: Category, dtype: object

In [17]:
X_test.describe()

Unnamed: 0,Question
count,297
unique,296
top,what is the speed of the mississippi river ?
freq,2


In [18]:
y_test.describe()

count      297
unique       5
top       what
freq       116
Name: Category, dtype: object

In [19]:
y_train.unique()

array(['what', 'who', 'unknown', 'affirmation', 'when'], dtype=object)

In [20]:
total_test_variables = y_train.append(y_test)

In [21]:
total_test_variables.values

array(['what', 'what', 'who', ..., 'what', 'affirmation', 'when'], dtype=object)

In [22]:
print(y_train.shape)
print(y_test.shape)
print(total_test_variables.shape)

(1186,)
(297,)
(1483,)


# Label Encoding
We will further encode the class variables in both the train and test sets.

In [23]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(pd.Series(total_test_variables).values)
y_train = le.transform(y_train.values)
print(y_train.shape)

(1186,)


In [24]:
y_test  = le.transform(y_test.values)
np.unique(y_test)
print(y_test.shape)

(297,)


# Pre-processing the Questions data

In [25]:
all_corpus = X_train.append(X_test)
print(all_corpus.shape)

(1483, 1)


In [26]:
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# dot_words = []
# for i in all_corpus.index:
#     for word in all_corpus.at[i, 'Question'].split():
#         if '.' in word and len(word)>2:
#             dot_words.append(word)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\I327950\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\I327950\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [27]:
def text_clean(corpus, keep_list):
    '''
    Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)
    
    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained
            even after the cleaning process
    
    Output : Returns the cleaned text corpus
    
    '''
    cleaned_corpus = pd.Series()
    for i in all_corpus.index:
        qs = []
        for word in all_corpus.at[i, 'Question'].split():
            if word not in keep_list:
                p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
                p1 = p1.lower()
                qs.append(p1)
            else : qs.append(word)
        cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
    return cleaned_corpus

In [28]:
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together
    
    Output : Returns the processed text corpus
    
    '''
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)
    
    if remove_stopwords == True:
        wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
        stop = set(stopwords.words('english'))
        for word in wh_words:
            stop.remove(word)
        corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        lem = WordNetLemmatizer()
        corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    
    if stemming == True:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language = 'english')
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
        else :
            stemmer = PorterStemmer()
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    
    corpus = [' '.join(x) for x in corpus]
        

    return corpus

In [29]:
common_dot_words = ['U.S.', 'St.', 'Mr.', 'Mrs.', 'D.C.']
#all_corpus
all_corpus = preprocess(all_corpus, keep_list = common_dot_words, remove_stopwords = True)

# Splitting the combined corpus into train and test data again

In [30]:
train_corpus = all_corpus[0:X_train.shape[0]]
test_corpus = all_corpus[X_train.shape[0]:]
print(len(train_corpus))
print(len(test_corpus))

1186
297


# Loading the English model for Spacy.
NLTK version for the same performs too slowly, hence opting for Spacy.

In [31]:
nlp = spacy.load('en')

# Feature Extraction and Engineering

Creating list of Named Entitites, Lemmas, POS Tags, Syntactic Dependency Relation and Orthographic Features using shape from the train corpus.<br>
Later, these would be used as features for our model.

In [32]:
all_ner = []
all_lemma = []
all_tag = []
all_dep = []
all_shape = []
for row in train_corpus:
    doc = nlp(row)
    present_lemma = []
    present_tag = []
    present_dep = []
    present_shape = []
    present_ner = []
    for token in doc:
        present_lemma.append(token.lemma_)
        present_tag.append(token.tag_)
        present_dep.append(token.dep_)
        present_shape.append(token.shape_)
    all_lemma.append(" ".join(present_lemma))
    all_tag.append(" ".join(present_tag))
    all_dep.append(" ".join(present_dep))
    all_shape.append(" ".join(present_shape))
    for ent in doc.ents:
        present_ner.append(ent.label_)
    all_ner.append(" ".join(present_ner))

# Converting the Features obtained above into vectors using CountVectorizer.

In [33]:
count_vec_ner = CountVectorizer(ngram_range=(1, 2)).fit(all_ner)
ner_ft = count_vec_ner.transform(all_ner)
count_vec_lemma = CountVectorizer(ngram_range=(1, 2)).fit(all_lemma)
lemma_ft = count_vec_lemma.transform(all_lemma)
count_vec_tag = CountVectorizer(ngram_range=(1, 2)).fit(all_tag)
tag_ft = count_vec_tag.transform(all_tag)
count_vec_dep = CountVectorizer(ngram_range=(1, 2)).fit(all_dep)
dep_ft = count_vec_dep.transform(all_dep)
count_vec_shape = CountVectorizer(ngram_range=(1, 2)).fit(all_shape)
shape_ft = count_vec_shape.transform(all_shape)

# Stacking up the features obtained together

In [34]:
#x_all_ft_train = hstack([ner_ft, lemma_ft, tag_ft, dep_ft, shape_ft])
x_all_ft_train = hstack([ner_ft, lemma_ft, tag_ft])

In [35]:
x_all_ft_train

<1186x7619 sparse matrix of type '<class 'numpy.int64'>'
	with 21215 stored elements in COOrdinate format>

# Converting from COOrdinate format to Compressed Sparse Row format for easier mathematical computations.

In [36]:
x_all_ft_train = x_all_ft_train.tocsr()
x_all_ft_train

<1186x7619 sparse matrix of type '<class 'numpy.int64'>'
	with 21215 stored elements in Compressed Sparse Row format>

# Obtaining the Features from test data and transforming it using the vectorizers obtained from the training set

In [37]:
all_test_ner = []
all_test_lemma = []
all_test_tag = []
all_test_dep = []
all_test_shape = []
for row in test_corpus:
    doc = nlp(row)
    present_lemma = []
    present_tag = []
    present_dep = []
    present_shape = []
    present_ner = []
    #print(row)
    for token in doc:
        present_lemma.append(token.lemma_)
        present_tag.append(token.tag_)
        #print(present_tag)
        present_dep.append(token.dep_)
        present_shape.append(token.shape_)
    all_test_lemma.append(" ".join(present_lemma))
    all_test_tag.append(" ".join(present_tag))
    all_test_dep.append(" ".join(present_dep))
    all_test_shape.append(" ".join(present_shape))
    for ent in doc.ents:
        present_ner.append(ent.label_)
    all_test_ner.append(" ".join(present_ner))

In [38]:
ner_test_ft = count_vec_ner.transform(all_test_ner)
lemma_test_ft = count_vec_lemma.transform(all_test_lemma)
tag_test_ft = count_vec_tag.transform(all_test_tag)
dep_test_ft = count_vec_dep.transform(all_test_dep)
shape_test_ft = count_vec_shape.transform(all_test_shape)

In [39]:
#x_all_ft_test = hstack([ner_test_ft, lemma_test_ft, tag_test_ft, dep_test_ft, shape_test_ft])
x_all_ft_test = hstack([ner_test_ft, lemma_test_ft, tag_test_ft])

In [40]:
x_all_ft_test

<297x7619 sparse matrix of type '<class 'numpy.int64'>'
	with 3829 stored elements in COOrdinate format>

In [41]:
x_all_ft_test = x_all_ft_test.tocsr()
x_all_ft_test

<297x7619 sparse matrix of type '<class 'numpy.int64'>'
	with 3829 stored elements in Compressed Sparse Row format>

# Model Training
Literature shows that Linear SVM performs the best on such use cases.(http://www.aclweb.org/anthology/C08-1061)

In [42]:
model = svm.LinearSVC()

In [43]:
model.fit(x_all_ft_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

 Now, we will test the preformance of the model on the test set.

In [44]:
preds = model.predict(x_all_ft_test)

In [45]:
accuracy_score(y_test, preds)

0.96296296296296291

# We received an amazing accuracy of 96.296%.

We should now check where exactly the model failed to provide correct results.

In [46]:
ct = 0
for i in range(0,y_test.shape[0]):
    if(y_test[i]!=preds[i]):
        ct = ct + 1
        index = X_test.index[i]
        print(ct, X_test.at[index, 'Question'], le.classes_[y_test[i]], le.classes_[preds[i]])

1 when is boxing day ?  what when
2 what year did hitler die ?  when what
3 what did richard feynman say upon hearing he would receive the nobel prize in physics ?  unknown what
4 what singer 's theme song was when the moon comes over the mountain ?  when what
5 give a reason for american indians oftentimes dropping out of school . ?  unknown affirmation
6 who is the man behind the pig-the man who pulls the strings and speaks for miss piggy ?  unknown who
7 name the various costumed personas of dr. henry pym .  unknown affirmation
8 which water filter cap replacement (white color) should i use for this filter ?  affirmation unknown
9 when the tutankhamun exhibit was on display in the u.s. , what moving company transported it ?  when what
10 who was the star of the 1965 broadway hit golden boy ?  what who
11 define the pheonix club ?  unknown affirmation


The model didn't give the correct output for the 11 questions mentioned above out of a total of 297 questions.

However, closer examination of the original categories and assigned categories reveals that <b> 1, 3, 4, 6, 8, 10 </b> seemed to have incorrect tags, whereas the predicted tags seems more suitable.

# Tagging a Test Set.
As mentioned in the question, we will try tagging the question set mentioned in the Problem Statement. http://cogcomp.cs.illinois.edu/Data/QA/QC/train_1000.label

The question are in a <b>CoarseClass:FineClass Question </b> format. <br>
Hence, we will perform the necessary splitting as well in the next block.

In [47]:
f_tagging_1000 = open('Tagging_1000.txt', 'r+')
tagging_1000 = pd.DataFrame(f_tagging_1000.readlines(), columns = ['Question'])
tagging_1000['Question'] = tagging_1000.Question.apply(lambda x: x.split(' ', 1)[1])

In [48]:
tagging_1000.head()

Unnamed: 0,Question
0,How did serfdom develop in and then leave Russ...
1,What films featured the character Popeye Doyle...
2,How can I find a list of celebrities ' real na...
3,What fowl grabs the spotlight after the Chines...
4,What is the full form of .com ?\n


In [49]:
tagging_1000.describe()

Unnamed: 0,Question
count,1000
unique,996
top,What is the speed of the Mississippi River ?\n
freq,3


<b>We will check the properties of our training set and see if there is any overlap between the training set and this test/validation set.</b>

In [50]:
data_without_class.head()

Unnamed: 0,Question
0,how did serfdom develop in and then leave russ...
1,what films featured the character popeye doyle ?
2,how can i find a list of celebrities ' real na...
3,what fowl grabs the spotlight after the chines...
4,what is the full form of .com ?


In [51]:
data_without_class.describe()

Unnamed: 0,Question
count,1483
unique,1476
top,what is the speed of the mississippi river ?
freq,3


# Observations
* It's observed that the questions in the heads of both the sets are similar.
* 1476 and 996 unique questions are present in the Train set and Validation Set.
* A comparison should be made to check their overlap and for this purpose we will define a chompe_lower function.

In [52]:
def chomp_lower(data_without_class, tagging):
    for i in range(0, data_without_class.shape[0]):
        data_without_class.at[i, 'Question'] = data_without_class.at[i, 'Question'].lower()
        data_without_class.at[i, 'Question'] = chomp(data_without_class.at[i, 'Question'])
    for i in range(0, tagging.shape[0]):
        tagging.at[i, 'Question'] = tagging.at[i, 'Question'].lower()
        tagging.at[i, 'Question'] = chomp(tagging.at[i, 'Question'])
    return data_without_class, tagging

In [53]:
data_without_class, tagging_1000 = chomp_lower(data_without_class, tagging_1000)
total_data = data_without_class.append(tagging_1000)

In [54]:
total_data.describe()

Unnamed: 0,Question
count,2483
unique,1497
top,what is the speed of the mississippi river ?
freq,6


* The appended train and validation sets have a total of 1497 uniques questions. <br>
* Out of these, 1476 unique question were already present in the train set, hence the validation set is contribution only 21 uniques questions.<br>
* Further observation is required.

In [55]:
drop_dupli = total_data.drop_duplicates()
drop_dupli.describe()
drop_dupli.sort_values(['Question'])
drop_dupli.to_csv('Train_vaild_total.csv', index=False)

Observation of the appended CSV file shows that the 21 unique questions contributed by the validation set are actually present in the train set and the are classified unique because they have punctuations and within question whitespaces at different locations.<br>
Hence, there is no point validating on the mentioned dataset as it completely overlaps the training set.

So, we will try out the http://cogcomp.cs.illinois.edu/Data/QA/QC/train_3000.label containing 3000 questions. <br>
Similar pre-processing as above would be performed as data is in same format.

In [56]:
f_tagging_3000 = open('tagging_3000.txt', 'r+')
tagging_3000 = pd.DataFrame(f_tagging_3000.readlines(), columns = ['Question'])
tagging_3000['Question'] = tagging_3000.Question.apply(lambda x: x.split(' ', 1)[1])

In [57]:
data_without_class, tagging_3000 = chomp_lower(data_without_class, tagging_3000)

We will again check for overlap.

In [58]:
total_data = data_without_class.append(tagging_3000)

In [59]:
total_data.describe()

Unnamed: 0,Question
count,4483
unique,3241
top,what is the speed of the mississippi river ?
freq,6


In [60]:
tagging_3000.describe()

Unnamed: 0,Question
count,3000
unique,2976
top,what is the speed of the mississippi river ?
freq,3


Way less overlap between the mentioned sets, hence we can go ahead with this dataset. <br>
We will extract the features of the validation dataset and fit them to the CountVectorizer's obtained during training.

In [61]:
all_tagging_ner = []
all_tagging_lemma = []
all_tagging_tag = []
all_tagging_dep = []
all_tagging_shape = []
for i in range(0, tagging_3000.shape[0]):
    doc = nlp(tagging_3000.at[i, 'Question'])
    present_lemma = []
    present_tag = []
    present_dep = []
    present_shape = []
    present_ner = []
    #print(row)
    for token in doc:
        present_lemma.append(token.lemma_)
        present_tag.append(token.tag_)
        #print(present_tag)
        present_dep.append(token.dep_)
        present_shape.append(token.shape_)
    all_tagging_lemma.append(" ".join(present_lemma))
    all_tagging_tag.append(" ".join(present_tag))
    all_tagging_dep.append(" ".join(present_dep))
    all_tagging_shape.append(" ".join(present_shape))
    for ent in doc.ents:
        present_ner.append(ent.label_)
    all_tagging_ner.append(" ".join(present_ner))

In [62]:
ner_tagging_ft = count_vec_ner.transform(all_tagging_ner)
lemma_tagging_ft = count_vec_lemma.transform(all_tagging_lemma)
tag_tagging_ft = count_vec_tag.transform(all_tagging_tag)
dep_tagging_ft = count_vec_dep.transform(all_tagging_dep)
shape_tagging_ft = count_vec_shape.transform(all_tagging_shape)

In [63]:
#x_all_tagging_ft = hstack([ner_tagging_ft, lemma_tagging_ft, tag_tagging_ft, dep_tagging_ft, shape_tagging_ft])
x_all_tagging_ft = hstack([ner_tagging_ft, lemma_tagging_ft, tag_tagging_ft])

In [64]:
x_all_tagging_ft_csr = x_all_tagging_ft.tocsr()

In [65]:
x_all_tagging_ft_csr

<3000x7619 sparse matrix of type '<class 'numpy.int64'>'
	with 56033 stored elements in Compressed Sparse Row format>

Now, we will perform the tagging for the validation data.

In [66]:
preds_tagged = model.predict(x_all_tagging_ft_csr)

# Storing the tagged results in a CSV file

In [67]:
tagged = pd.DataFrame({"Question": tagging_3000['Question'],"Category": le.classes_[preds_tagged]})
tagged.to_csv('Tagged.csv', index=False)

# We will also atore the model.

In [68]:
filename = 'tagging_model.pkl'
pickle.dump(model, open(filename, 'wb'))

# Conclusion

* We were able to create a Question Tagger which performed with an accuracy of 95.96%.
* We were also able to tag new questions based on the model and store them into an appropriate data structure.
* We relied on Named Entitites, Lemmas, POS Tags, Syntactic Dependency Relation and Orthography as Features.

# Future Work/Experimentation
* We can try other Machine Learning algorithms such as Naive Bayes', Classification Trees.
* Ensembles such as Random Forests and Boosting algortihms can also be tried.
* Deep Learning based LSTMs/RNNs can also be checked for trying to improve accuracy.
* Usage of Embeddings from Word2Vec, Glove are also a possibility.

# References
Classifying What-type Questions by Head Noun Tagging<br>
Fangtao Li, Xian Zhang, Jinhui Yuan, Xiaoyan Zhu<br>
State Key Laboratory on Intelligent Technology and Systems<br>
Tsinghua National Laboratory for Information Science and Technology<br>
Department of Computer Sci. and Tech., Tsinghua University, Beijing 100084, China<br>
zxy-dcs@tsinghua.edu.cn<br>
(http://www.aclweb.org/anthology/C08-1061)