In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, nltk
import gensim
import codecs
from sner import Ner
import spacy
from sklearn.metrics import confusion_matrix, accuracy_score, average_precision_score
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.internals import find_jars_within_path
from nltk.tag import StanfordPOSTagger
from nltk.tag import StanfordNERTagger
import spacy
from sklearn import linear_model
from sklearn import svm
from sklearn.metrics import fbeta_score, accuracy_score
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer



In [2]:
f_train = open('traininig_dataset (1) (1).txt', 'r+')
f_test = open('validation_dataset (1) (1).txt', 'r+')

train = pd.DataFrame(f_train.readlines(), columns = ['Question'])
test = pd.DataFrame(f_test.readlines(), columns = ['Question'])

In [3]:
train['QType'] = train.Question.apply(lambda x: x.split(' ', 1)[0])
train['Question'] = train.Question.apply(lambda x: x.split(' ', 1)[1])
train['QType-Coarse'] = train.QType.apply(lambda x: x.split(':')[0])
train['QType-Fine'] = train.QType.apply(lambda x: x.split(':')[1])
test['QType'] = test.Question.apply(lambda x: x.split(' ', 1)[0])
test['Question'] = test.Question.apply(lambda x: x.split(' ', 1)[1])
test['QType-Coarse'] = test.QType.apply(lambda x: x.split(':')[0])
test['QType-Fine'] = test.QType.apply(lambda x: x.split(':')[1])

In [4]:
train.head()

Unnamed: 0,Question,QType,QType-Coarse,QType-Fine
0,How did serfdom develop in and then leave Russ...,DESC:manner,DESC,manner
1,What films featured the character Popeye Doyle...,ENTY:cremat,ENTY,cremat
2,How can I find a list of celebrities ' real na...,DESC:manner,DESC,manner
3,What fowl grabs the spotlight after the Chines...,ENTY:animal,ENTY,animal
4,What is the full form of .com ?\n,ABBR:exp,ABBR,exp


In [5]:
test.describe()

Unnamed: 0,Question,QType,QType-Coarse,QType-Fine
count,500,500,500,500
unique,500,42,6,39
top,How cold should a refrigerator be ?\n,DESC:def,DESC,def
freq,1,123,138,123


In [6]:
test.head()

Unnamed: 0,Question,QType,QType-Coarse,QType-Fine
0,How far is it from Denver to Aspen ?\n,NUM:dist,NUM,dist
1,"What county is Modesto , California in ?\n",LOC:city,LOC,city
2,Who was Galileo ?\n,HUM:desc,HUM,desc
3,What is an atom ?\n,DESC:def,DESC,def
4,When did Hawaii become a state ?\n,NUM:date,NUM,date


In [7]:
train.append(test).describe()

Unnamed: 0,Question,QType,QType-Coarse,QType-Fine
count,5952,5952,5952,5952
unique,5871,50,6,47
top,How deep is a fathom ?\n,HUM:ind,ENTY,ind
freq,3,1017,1344,1017


As can be observed, the train set consists of some duplicate question (81 to be exact). <br>
The number of unique Coarse:Fine classes is 50 whereas entries corresponding to 42 are present in the test set. <br>
The number of fine classes overall is 47 whereas entries corresponding to 39 are present in test.

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(pd.Series(train.QType.tolist() + test.QType.tolist()).values)
train['QType'] = le.transform(train.QType.values)
test['QType'] = le.transform(test.QType.values)
le2 = LabelEncoder()
le2.fit(pd.Series(train['QType-Coarse'].tolist() + test['QType-Coarse'].tolist()).values)
train['QType-Coarse'] = le2.transform(train['QType-Coarse'].values)
test['QType-Coarse'] = le2.transform(test['QType-Coarse'].values)
le3 = LabelEncoder()
le3.fit(pd.Series(train['QType-Fine'].tolist() + test['QType-Fine'].tolist()).values)
train['QType-Fine'] = le3.transform(train['QType-Fine'].values)
test['QType-Fine'] = le3.transform(test['QType-Fine'].values)

In [10]:
train.head()

Unnamed: 0,Question,QType,QType-Coarse,QType-Fine
0,How did serfdom develop in and then leave Russ...,4,1,23
1,What films featured the character Popeye Doyle...,9,2,8
2,How can I find a list of celebrities ' real na...,4,1,23
3,What fowl grabs the spotlight after the Chines...,6,2,1
4,What is the full form of .com ?\n,1,0,16


In [11]:
all_corpus = pd.Series(train.Question.tolist() + test.Question.tolist()).astype(str)

Obtaining Dotwords.<br>
Also, performing text cleaning and pre-processing in the next two blocks

In [15]:
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

dot_words = []
for row in all_corpus:
    for word in row.split():
        if '.' in word and len(word)>2:
            dot_words.append(word)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\I327950\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\I327950\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
def text_clean(corpus, keep_list):
    '''
    Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)
    
    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained
            even after the cleaning process
    
    Output : Returns the cleaned text corpus
    
    '''
    cleaned_corpus = pd.Series()
    for row in corpus:
        qs = []
        for word in row.split():
            if word not in keep_list:
                p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
                p1 = p1.lower()
                qs.append(p1)
            else : qs.append(word)
        cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
    return cleaned_corpus

In [17]:
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together
    
    Output : Returns the processed text corpus
    
    '''
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)
    
    if remove_stopwords == True:
        wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
        stop = set(stopwords.words('english'))
        for word in wh_words:
            stop.remove(word)
        corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        lem = WordNetLemmatizer()
        corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    
    if stemming == True:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language = 'english')
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
        else :
            stemmer = PorterStemmer()
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    
    corpus = [' '.join(x) for x in corpus]
        

    return corpus

In [18]:
common_dot_words = ['U.S.', 'St.', 'Mr.', 'Mrs.', 'D.C.']
all_corpus = preprocess(all_corpus, keep_list = common_dot_words, remove_stopwords = True)

Loading the English model for Spacy.<br>
NLTK version for the same performs too slowly, hence opting for Spacy.

In [19]:
nlp = spacy.load('en')

Creating list of Named Entitites, Lemmas, POS Tags, Syntactic Dependency Relation and Orthographic Features using shape.<br>
Later, these would be used as features for our model.

In [20]:
all_ner = []
all_lemma = []
all_tag = []
all_dep = []
all_shape = []
for row in all_corpus:
    doc = nlp(row)
    present_lemma = []
    present_tag = []
    present_dep = []
    present_shape = []
    present_ner = []
    #print(row)
    for token in doc:
        present_lemma.append(token.lemma_)
        present_tag.append(token.tag_)
        #print(present_tag)
        present_dep.append(token.dep_)
        present_shape.append(token.shape_)
    all_lemma.append(" ".join(present_lemma))
    all_tag.append(" ".join(present_tag))
    all_dep.append(" ".join(present_dep))
    all_shape.append(" ".join(present_shape))
    for ent in doc.ents:
        present_ner.append(ent.label_)
    all_ner.append(" ".join(present_ner))

Converting the attributes obtained above into vectors using CountVectorizer.

In [21]:
count_vec_ner = CountVectorizer(ngram_range=(1, 2)).fit(all_ner)
ner_ft = count_vec_ner.transform(all_ner)
count_vec_lemma = CountVectorizer(ngram_range=(1, 2)).fit(all_lemma)
lemma_ft = count_vec_lemma.transform(all_lemma)
count_vec_tag = CountVectorizer(ngram_range=(1, 2)).fit(all_tag)
tag_ft = count_vec_tag.transform(all_tag)
count_vec_dep = CountVectorizer(ngram_range=(1, 2)).fit(all_dep)
dep_ft = count_vec_dep.transform(all_dep)
count_vec_shape = CountVectorizer(ngram_range=(1, 2)).fit(all_shape)
shape_ft = count_vec_shape.transform(all_shape)

Combining the features obtained into 1 matrix

In [22]:
x_all_ft = hstack([ner_ft, lemma_ft, tag_ft, dep_ft, shape_ft])

In [23]:
x_all_ft

<5952x29210 sparse matrix of type '<class 'numpy.int64'>'
	with 184291 stored elements in COOrdinate format>

Converting from COOrdinate format to Compressed Sparse Row format for easier mathematical computations.

In [24]:
x_all_ft_csr = x_all_ft.tocsr()
x_all_ft_csr

<5952x29210 sparse matrix of type '<class 'numpy.int64'>'
	with 184291 stored elements in Compressed Sparse Row format>

Splitting obtained matrix to original test and train features

In [25]:
x_all_ft_train = x_all_ft_csr[0:train.shape[0],:]
x_all_ft_train

<5452x29210 sparse matrix of type '<class 'numpy.int64'>'
	with 172919 stored elements in Compressed Sparse Row format>

In [26]:
x_all_ft_test = x_all_ft_csr[train.shape[0]:,:]
x_all_ft_test

<500x29210 sparse matrix of type '<class 'numpy.int64'>'
	with 11372 stored elements in Compressed Sparse Row format>

Literature study over the years has shown Linear SVM performs best in this Use Case.

In [27]:
model = svm.LinearSVC()

First Modelling for Coarse Classes

In [28]:
model.fit(x_all_ft_train, train['QType-Coarse'])

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [29]:
preds = model.predict(x_all_ft_test)

In [30]:
preds

array([5, 4, 3, 1, 5, 5, 3, 1, 1, 1, 4, 1, 5, 3, 5, 5, 4, 3, 1, 5, 3, 1, 4,
       1, 1, 3, 1, 2, 4, 1, 5, 4, 1, 5, 5, 5, 4, 5, 5, 5, 2, 1, 1, 1, 3, 2,
       5, 1, 5, 3, 1, 3, 3, 1, 1, 1, 5, 4, 4, 5, 4, 3, 4, 2, 4, 3, 2, 1, 5,
       4, 5, 5, 4, 3, 4, 1, 2, 5, 5, 3, 1, 5, 3, 5, 5, 1, 1, 3, 1, 4, 2, 1,
       5, 5, 4, 4, 5, 1, 1, 3, 1, 3, 1, 3, 4, 1, 5, 2, 5, 4, 2, 1, 4, 2, 4,
       3, 5, 1, 5, 4, 5, 2, 1, 3, 1, 3, 1, 5, 1, 5, 5, 3, 1, 1, 1, 1, 4, 3,
       3, 1, 1, 2, 4, 2, 1, 2, 3, 2, 1, 1, 2, 3, 1, 5, 3, 4, 4, 1, 2, 4, 1,
       1, 5, 4, 2, 2, 5, 1, 4, 3, 5, 5, 5, 1, 4, 4, 4, 5, 2, 5, 4, 1, 4, 1,
       2, 3, 3, 1, 4, 1, 1, 4, 5, 5, 1, 4, 2, 3, 2, 2, 3, 4, 3, 2, 1, 4, 3,
       5, 1, 1, 5, 5, 1, 4, 1, 2, 1, 2, 5, 1, 1, 5, 1, 1, 4, 2, 5, 1, 4, 3,
       5, 3, 1, 5, 2, 1, 4, 1, 4, 5, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 3, 1, 2,
       2, 1, 4, 4, 2, 1, 4, 3, 3, 5, 2, 5, 1, 1, 4, 5, 1, 2, 3, 1, 3, 1, 2,
       1, 5, 0, 2, 4, 3, 0, 1, 4, 1, 1, 1, 1, 1, 4, 2, 5, 2, 1, 1, 2, 5, 1,
       2, 0,

In [31]:
accuracy_score(test['QType-Coarse'], preds)

0.878

Glad to announce, Feature Engineering has enabled us to achieve an Accuracy of 87.8% on the validation set.<br>
The obtained accuracy is way higher than the 73% accuracy obtained without feature engineering

Next, we will obtain accuracies for Coarse:Fine combinations

In [32]:
model.fit(x_all_ft_train, train['QType'])

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [33]:
preds = model.predict(x_all_ft_test)

In [34]:
accuracy_score(test['QType'], preds)

0.80400000000000005

Woah, up to 80.4% accuracy from 68% obtained earlier when modelled without Feature Engineering.

Finally, we would evaluate our performance for the fine classes

In [35]:
model.fit(x_all_ft_train, train['QType-Fine'])

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [36]:
preds = model.predict(x_all_ft_test)

In [37]:
accuracy_score(test['QType-Fine'], preds)

0.80800000000000005

Not bad, We haved achieved an accuracy of 80.8% over the Fine Classes.

# Conclusion

We achieved great accuracies using Feature Engineering as compared to accuracies obtained without feature engineering.
(The notebook for models obtained without feature engineering is not being shared and one can try implementing it easily).

Experimenting with informer hypernyms can further help in accuracy improvement as suggested in https://nlp.stanford.edu/courses/cs224n/2010/reports/olalerew.pdf