To classify sentences into one of several Categories, I am using a Supervised Learning mechanism, developing a model trained on a set of pre-classified sentences.

Insted of  counting specific types of words (i.e. "question words") in the features I am considering Part-Of-Speech patterns in a sentence.

In [1]:

import numpy as np
import pandas as pd

CODE_LOC = 'C://Users/Aayush/Nltk-Sentence Classification'   # !! Modify to path to "features.py" folder lcoation
DATA_LOC = 'C://Users/Aayush/Nltk-Sentence Classification/sentences.csv'  # !! Modify this to the CSV data location

sentences = pd.read_csv(filepath_or_buffer = DATA_LOC)   

In [2]:
sentences.head(10)

Unnamed: 0,SENTENCE,CLASS
0,"Sorry, I don't know about the weather.",S
1,That is a tricky question to answer.,C
2,What does OCM stand for,Q
3,MAX is a Mobile Application Accelerator,S
4,Can a dog see in colour?,Q
5,how are you,C
6,If you deploy a MySQL database in the Oracle c...,Q
7,who is dominic Fakename,Q
8,what's the weather like today?,C
9,Can the OCM host non Oracle software stacks?,Q


The code snippet below is an example of taking a sentence and extracting sets of POS-tag Triples from it. We can use this approach for building up features from a sentence by counting occurances of triple-patterns (or other POS-tag patterns).

In [4]:
# Extract some patterns of PoS sequences
import nltk
from nltk import word_tokenize

list_of_triple_strings = []  # triple sequence of PoS tags
sentence = "Can a dog see in colour?"

sentenceParsed = word_tokenize(sentence)
pos_tags = nltk.pos_tag(sentenceParsed)
pos = [ i[1] for i in pos_tags ]
print("Words mapped to Part of Speech Tags:",pos_tags)
print("PoS Tags:", pos)

n = len(pos)
for i in range(0,n-3):
    t = "-".join(pos[i:i+3]) # pull out 3 list item from counter, convert to string
    list_of_triple_strings.append(t)
    
print("sequences of triples:", list_of_triple_strings)

['MD', 'DT', 'NN', 'NN', 'IN', 'NN', '.']
Words mapped to Part of Speech Tags: [('Can', 'MD'), ('a', 'DT'), ('dog', 'NN'), ('see', 'NN'), ('in', 'IN'), ('colour', 'NN'), ('?', '.')]
PoS Tags: ['MD', 'DT', 'NN', 'NN', 'IN', 'NN', '.']
sequences of triples: ['MD-DT-NN', 'DT-NN-NN', 'NN-NN-IN', 'NN-IN-NN']


In [5]:
import sys
sys.path.append(CODE_LOC)  
import features           

sentence = "Can  \',a dog see 744}  in colour?"

sentence = features.strip_sentence(sentence)
print(sentence)
pos = features.get_pos(sentence)
print(pos)
triples = features.get_triples(pos)

print(triples)

Can  a dog see 744  in colour
[('Can', 'MD'), ('a', 'DT'), ('dog', 'NN'), ('see', 'NN'), ('744', 'CD'), ('in', 'IN'), ('colour', 'NN')]
['MD-DT-NN', 'DT-NN-NN', 'NN-NN-CD', 'NN-CD-IN', 'CD-IN-NN']


In [6]:
#### Bespoke Features Generator Example - Get a Python Dictionary of Features ####
sentences = ["Can a dog see in colour?",
             "Hey, How's it going?",
             "Oracle 12.2 will be released for on-premises users on 15 March 2017",
             "When will Oracle 12 be released"]
id = 1
for s in sentences:
    features_dict = features.features_dict(str(id),s)
    features_string,header = features.get_string(str(id),s)
    print(features_dict)
    #print(features_string)
    id += 1

{'id': '1', 'qMark': 1, 'wordCount': 6, 'stemmedCount': 4, 'qVerbCombo': 1, 'verbBeforeNoun': 1, 'VBG': 0, 'VBZ': 0, 'NNP': 0, 'NN': 3, 'NNS': 0, 'NNPS': 0, 'PRP': 0, 'CD': 0, 'stemmedEndNN': 0, 'startTuple0': 0, 'endTuple0': 1, 'endTuple1': 0, 'endTuple2': 0, 'qTripleScore': 0, 'sTripleScore': 0, 'class': 'X'}
{'id': '2', 'qMark': 1, 'wordCount': 4, 'stemmedCount': 3, 'qVerbCombo': 1, 'verbBeforeNoun': 0, 'VBG': 1, 'VBZ': 0, 'NNP': 2, 'NN': 0, 'NNS': 0, 'NNPS': 0, 'PRP': 1, 'CD': 0, 'stemmedEndNN': 0, 'startTuple0': 0, 'endTuple0': 0, 'endTuple1': 0, 'endTuple2': 0, 'qTripleScore': 0, 'sTripleScore': 0, 'class': 'X'}
{'id': '3', 'qMark': 0, 'wordCount': 12, 'stemmedCount': 8, 'qVerbCombo': 1, 'verbBeforeNoun': 0, 'VBG': 0, 'VBZ': 0, 'NNP': 1, 'NN': 1, 'NNS': 1, 'NNPS': 0, 'PRP': 0, 'CD': 3, 'stemmedEndNN': 0, 'startTuple0': 0, 'endTuple0': 0, 'endTuple1': 0, 'endTuple2': 0, 'qTripleScore': 0, 'sTripleScore': 2, 'class': 'X'}
{'id': '4', 'qMark': 0, 'wordCount': 6, 'stemmedCount': 4, '

In [7]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

FNAME = 'featuresDump.csv' # !! Modify this to the CSV data location

df = pd.read_csv(filepath_or_buffer = FNAME, )   
print(str(len(df)), "rows loaded")


df.columns = df.columns[:].str.strip()
df['class'] = df['class'].map(lambda x: x.strip())

width = df.shape[1]


100 rows loaded


In [8]:
#split into test and training (is_train: True / False col)
np.random.seed(seed=1)
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
train, test = df[df['is_train']==True], df[df['is_train']==False]
print(str(len(train)), " rows split into training set,", str(len(test)), "split into test set.")

features = df.columns[1:width-1]  #remove the first ID col and last col=classifier
print("FEATURES = {}".format(features))

77  rows split into training set, 23 split into test set.
FEATURES = Index(['wordCount', 'stemmedCount', 'stemmedEndNN', 'CD', 'NN', 'NNP', 'NNPS',
       'NNS', 'PRP', 'VBG', 'VBZ', 'startTuple0', 'endTuple0', 'endTuple1',
       'endTuple2', 'verbBeforeNoun', 'qMark', 'qVerbCombo', 'qTripleScore',
       'sTripleScore'],
      dtype='object')


In [9]:
# Fit an RF Model for "class" given features
clf = RandomForestClassifier(n_jobs=2, n_estimators = 100)
clf.fit(train[features], train['class'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
# Predict against test set
preds = clf.predict(test[features])
predout = pd.DataFrame({ 'id' : test['id'], 'predicted' : preds, 'actual' : test['class'] })

In [11]:
## Cross-check accuracy ##
print(pd.crosstab(test['class'], preds, rownames=['actual'], colnames=['preds']))
print("\n",pd.crosstab(test['class'], preds, rownames=['actual']
                       , colnames=['preds']).apply(lambda r: round(r/r.sum()*100,2), axis=1) )

from sklearn.metrics import accuracy_score
print("\n\nAccuracy Score: ", round(accuracy_score(test['class'], preds),3) ) # https://en.wikipedia.org/wiki/Jaccard_index

preds   C  Q   S
actual          
C       3  0   0
Q       0  8   1
S       1  0  10

 preds        C      Q      S
actual                      
C       100.00   0.00   0.00
Q         0.00  88.89  11.11
S         9.09   0.00  90.91


Accuracy Score:  0.913


In [16]:
# load in some pre-formated FAQ data in a CSV
FNAME = 'pythonFAQ.csv' # !! Modify this to the CSV data location

import csv
import hashlib 

import features

fin = open(FNAME, 'rt')
reader = csv.reader(fin)

keys = ["id",
"wordCount",
"stemmedCount",
"stemmedEndNN",
"CD",
"NN",
"NNP",
"NNPS",
"NNS",
"PRP",
"VBG",
"VBZ",
"startTuple0",
"endTuple0",
"endTuple1",
"endTuple2",
"verbBeforeNoun",
"qMark",
"qVerbCombo",
"qTripleScore",
"sTripleScore",
"class"]

rows = []

next(reader)  #Assume we have a header 
for line in reader:
    sentence = line[0]  
    c = line[1]        #class-label
    id = hashlib.md5(str(sentence).encode('utf-8')).hexdigest()[:16] # generate a unique ID
    
    f = features.features_dict(id,sentence, c)
    row = []
    
    for key in keys:
        value = f[key]
        row.append(value)
    rows.append(row)
    
    
faq = pd.DataFrame(rows, columns=keys)
fin.close()

[['e8af070019393e21', 3, 2, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 'Q'], ['cb2bfe367a7f5bfe', 6, 4, 0, 0, 0, 3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 2, 'Q'], ['30c3a9b0a23a6365', 9, 5, 1, 0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 2, 'Q'], ['8f285b6ab8472a48', 8, 5, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 'Q'], ['ce34f3fa53325140', 5, 3, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 'Q'], ['8c07862921357acb', 8, 6, 1, 0, 3, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 'Q'], ['3bc89e0e912eb455', 10, 6, 1, 0, 2, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 'Q'], ['1fe20f36f120eda2', 7, 5, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 'Q'], ['6f861882dbd754e0', 9, 6, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 'Q'], ['4b26e82ce2c72a57', 10, 6, 1, 0, 2, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 'Q'], ['93443ca750695c0b', 10, 7, 1, 0, 3, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 'Q'], ['33a7957acaea7387', 10, 7, 1, 0, 0, 1,

In [17]:
# Predict against FAQ test set
featureNames = faq.columns[1:width-1]  #remove the first ID col and last col=classifier
faqPreds = clf.predict(faq[featureNames])

predout = pd.DataFrame({ 'id' : faq['id'], 'predicted' : faqPreds, 'actual' : faq['class'] })

In [18]:
## Cross-check accuracy ##
print(pd.crosstab(faq['class'], faqPreds, rownames=['actual'], colnames=['preds']))

print("\n",pd.crosstab(faq['class'], faqPreds, rownames=['actual'],
                       colnames=['preds']).apply(lambda r: round(r/r.sum()*100,2), axis=1) )


preds    C   Q   S
actual            
C       12   5   2
Q        0  14   2
S        0   3  13

 preds       C      Q      S
actual                     
C       63.16  26.32  10.53
Q        0.00  87.50  12.50
S        0.00  18.75  81.25


In [19]:
print("Accuracy Score:", round(accuracy_score(faq['class'], faqPreds) ,3) )

Accuracy Score: 0.765


In [23]:
textout = {'Q': "QUESTION", 'C': "CHAT", 'S':"STATEMENT"}

# mySentence = "Scikit-learn is a popular Python library for Machine Learning."
#mySentence = "The cat is dead"
mySentence = "Is the cat dead"

myFeatures = features.features_dict('1',mySentence, 'X')
values=[]
for key in keys:
    values.append(myFeatures[key])

s = pd.Series(values)
width = len(s)
myFeatures = s[1:width-1]  #All but the last item (this is the class for supervised learning mode)
predict = clf.predict([myFeatures])

print("\n\nPrediction is: ", textout[predict[0].strip()])



Prediction is:  QUESTION
