In [1]:
data_file='../data/merged.csv'

In [2]:
import os
import sys
import pandas as pd
import numpy as np
from __future__ import division
import inspect
from swa import Transcript
from os import listdir
from time import time
from re import findall, sub
from sklearn import tree
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from collections import Counter
from nltk import word_tokenize
import pydotplus
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
import enchant
import string
import itertools
from stemming.porter2 import stem
import pickle

In [3]:
class BagOfWords:
    def __init__(self):
        self.space = Counter()

    def populateSpace(self, data):
        # generate space for bag of words on all the data
        # space => {'unique_word': (unique_index_for_word, <number_of_occurences>)..}
        # data
        for utter in data:
            # utter.text
            for token in utter.tokens:
               if token in ['{', '}', '[', ']', '/']:  # ignore literals
                   continue
               if self.space[token] == 0:
                    self.space[token] = [len(self.space), 1]
               else:
                    self.space[token][1] += 1
        #print self.space


    def featurize(self, utterances):
        speec_acts=[]
        utter_text=[]
        feature_vectors=[]
        feature_vectors_bow = open("pkl/feature_vectors_bow.pkl","wb")
        speec_acts_bow = open("pkl/speec_acts_bow.pkl","wb")
        utter_text_bow = open("pkl/utter_text_bow.pkl","wb")
        counter=0
        print "-----------bow space---------", len(self.space)
        # form feature vector for sentences
        for utter in utterances:
            #print utter
            #utterTokens = word_tokenize(utter.text)
            feature_vector_utter = [0] * len(self.space)
            
            for utterToken in utter.tokens:
                if utterToken in ['{', '}', '[', ']', '/']:  # ignore literals
                   continue
                if self.space[utterToken] != 0:
                    feature_vector_utter[self.space[utterToken][0]] = 1     # get the unique index of the word in space
            counter+=1
            if(counter % 1000==0):
                print counter
            
            speec_acts.append(utter.act_tag)
            utter_text.append(utter.text)
            feature_vectors.append(feature_vector_utter)

        pickle.dump(speec_acts, speec_acts_bow)
        pickle.dump(utter.text, utter_text_bow)
        pickle.dump(feature_vectors, feature_vectors_bow)
            
        feature_vectors_bow.close()
        speec_acts_bow.close()
        utter_text_bow.close()
            #speec_acts.append(utter.act_tag)
            #utter_text.append(utter.text)
            #feature_vectors.append(feature_vector_utter)
            #print feature_vectors
        #return feature_vectors, speec_acts, utter_text

In [4]:
class Feature:
    def __init__(self, utterance, previousUtterance_act_tag):
        self.utterance = utterance
        self.previousUtterance_act_tag = previousUtterance_act_tag
        self.featureHeaders = [
            'question_mark',        # check for presence of question mark
            'wh_question',          # check for presence of wh- question words
            'i_dont_know',          # check for presence of phrase 'i don't know'
            'no_words',             # check for presence of "No" words
            'yes_words',            # check for presence of "Yes" words
            'do_words',             # check for presence of tense of "do" - did, does
            'non_verbal',           # check for presence of non-verbal words, < action >
            'UH_count',             # check for presence of Interjection (UH) Parts of speech in the sentence
            #'CC_count',             # check for presence of co-ordinating conjunction (CC)
            'thanking_words',       # check for presence of words expressing "Thanks"
            'apology_words',        # check for presence of words
            #'sub_utterance_index',  # add sub-utterance index
            #'utterance_index',      # add utterance index
            #'utterance_count'       # add conversation length
            'qrr_sequence'          # check for presence of speech tag "q<x>" in previous utterance and current occurence
        ]

        self.featureKeys = {
            "question_mark" :       '?',
            "wh_question"   :       ['who', 'which', 'where', 'what', 'how'],
            "i_dont_know"   :       ["i don't know"],
            "no_words"      :       ["no", "nah"],
            "yes_words"     :       ["yes", "yeah"],
            "do_words"      :       ["do", "did", "does"],
            "non_verbal"    :       '^<.*?>',
            "UH_count"      :       '/UH',
            #"CC_count"      :       '/CC',
            "thanking_words":       ['thank', 'thanks', 'thank you'],
            "apology_words" :       ['sorry', 'apology'],
            "qrr_sequence"  :       ['qw', 'qh', 'qo', 'qr']
        }

    def qrr_sequence(self):
        if len(self.previousUtterance_act_tag) != 0 and (self.previousUtterance_act_tag in self.featureKeys[inspect.currentframe().f_code.co_name]):
            return 1
        return 0

    def question_mark(self):
        if self.featureKeys[inspect.currentframe().f_code.co_name] in self.utterance.text:
            return 1
        return 0

    def wh_question(self):
        tag_word_count = 0
        for tag_word in self.featureKeys[inspect.currentframe().f_code.co_name]:
            if findall('\\b'+tag_word+'\\b', self.utterance.text):
                tag_word_count += 1
        return tag_word_count

    def i_dont_know(self):
        tag_word_count = 0
        for tag_word in self.featureKeys[inspect.currentframe().f_code.co_name]:
            if findall('\\b'+tag_word+'\\b', self.utterance.text):
                tag_word_count += 1
        return tag_word_count

    def no_words(self):
        tag_word_count = 0
        for tag_word in self.featureKeys[inspect.currentframe().f_code.co_name]:
            if findall('\\b'+tag_word+'\\b', self.utterance.text):
                tag_word_count += 1
        return tag_word_count

    def yes_words(self):
        tag_word_count = 0
        for tag_word in self.featureKeys[inspect.currentframe().f_code.co_name]:
            if findall('\\b'+tag_word+'\\b', self.utterance.text):
                tag_word_count += 1
        return tag_word_count

    def do_words(self):
        tag_word_count = 0
        for tag_word in self.featureKeys[inspect.currentframe().f_code.co_name]:
            if findall('\\b'+tag_word+'\\b', self.utterance.text):
                tag_word_count += 1
        return tag_word_count

    def non_verbal(self):
        # search for string <abcde>,
        #  ^ -> start of sentence, non-greedy pattern <.*?>
        return len(findall(self.featureKeys[inspect.currentframe().f_code.co_name], self.utterance.text))

    def UH_count(self):
        # maybe, check for length of text; if length less than 2 then return true? - Skepticism :-/
        if len(self.utterance.pos.split()) < 3 and \
                self.featureKeys[inspect.currentframe().f_code.co_name] in self.utterance.pos:
            return 1
        return 0

    def CC_count(self):
        if len(self.utterance.pos.split()) < 3 and \
                self.featureKeys[inspect.currentframe().f_code.co_name] in self.utterance.pos:
            return 1
        return 0

    def thanking_words(self):
        tag_word_count = 0
        for tag_word in self.featureKeys[inspect.currentframe().f_code.co_name]:
            if findall('\\b'+tag_word+'\\b', self.utterance.text):
                tag_word_count += 1
        return tag_word_count

    def apology_words(self):
        tag_word_count = 0
        for tag_word in self.featureKeys[inspect.currentframe().f_code.co_name]:
            if findall('\\b'+tag_word+'\\b', self.utterance.text):
                tag_word_count += 1
        return tag_word_count

    def sub_utterance_index(self):
            return self.utterance.subutterance_index

    def utterance_index(self):
            return self.utterance.utterance_index

    def utterance_count(self):
            return self.utterance.utterance_count

In [5]:
class Classifier:
    def __init__(self, dataset, datasetPath):
        self.dataName = dataset
        self.datasetPath = datasetPath
        self.data = []
        self.totalDataCount = 0
        self.trainData = []
        self.testData = []
        self.trainPercentage = 90
        self.testPercentage = 10
        self.speech_acts_class = [
            'sd',
            'b',
            'sv',
            #'aa',
            'qy',
            'x',
            'ny',
            'qw',
            'nn',
            'h',
            'qy^d',
            #'qw^d',
            'fa',
            'ft'
        ]
        self.speech_acts_class = self.speechActDictify()

    def speechActDictify(self):
        speech_acts_class = Counter()
        for speech_act in self.speech_acts_class:
            speech_acts_class[speech_act] = 1

        return speech_acts_class

    def getData(self):
        # list directories for dataset files
        #for dir in listdir(self.datasetPath):
         #   if dir.startswith('.'):
          #      continue

            #print dir
           # for file in listdir(self.datasetPath + dir):
            #    if file.startswith('.'):
             #       continue
               # dataFile = self.datasetPath + dir + '/' + file
                trans = Transcript(self.datasetPath)
                #getdata=open("getdata.pkl","wb")
                #pickle.dump(trans.utterances, getdata)
                self.data.extend(trans.utterances)
                self.totalDataCount += len(trans.utterances)

    def getTrainAndTestData(self):
        self.trainData = self.data[:int(self.trainPercentage/100 * self.totalDataCount)]
        self.testData = self.data[-int(self.testPercentage/100 * self.totalDataCount):]

    def featurize(self, utterances):
        #feature_vectors = []
        #speec_acts = []
        #utter_text = []
        feature_vectors1 = open("feature_vectors.pkl","wb")
        speec_acts1 = open("speec_acts.pkl","wb")
        utter_text1 = open("utter_text.pkl","wb")
        
        previousUtter = ''
        # form feature vector for sentences
        for utter in utterances:
            feature = Feature(utter, previousUtter)
            #feature_vector = {}
            feature_vector_utter = []
            for headers in feature.featureHeaders:
                #feature_vector[headers] = getattr(feature, headers)()
                feature_vector_utter.append(getattr(feature, headers)())
            pickle.dump(utter.act_tag, speec_acts1)
            pickle.dump(utter.text, utter_text1)
            pickle.dump(feature_vector_utter, feature_vectors1)
            
            #speec_acts.append(utter.act_tag)
            #utter_text.append(utter.text)
            #feature_vectors.append(feature_vector_utter)
            previousUtter = utter.act_tag
            #feature_vectors.append([feature_vector[key] for key in feature_vector])
            #print utter.text, feature_vector
        feature_vectors1.close()
        speec_acts1.close()
        utter_text1.close()
        #return feature_vectors, speec_acts, utter_text

    def normalizeSpeechAct(self, speechActs):
        # normalize speech_acts
        for speechActIndex in range(len(speechActs)):
            trimSpeechAct = sub('\^2|\^g|\^m|\^r|\^e|\^q|\^d', '',speechActs[speechActIndex])
            if self.speech_acts_class[speechActs[speechActIndex]] != 1 or \
                trimSpeechAct in ['sd', 'sv'] or \
                    self.speech_acts_class[trimSpeechAct] != 1:
                 #speechActs[speechActIndex] = 'other'
                 speechActs[speechActIndex] = 's'

    def normalizeSpeechActTest(self, speechActs):
        # normalize speech_acts
        for speechActIndex in range(len(speechActs)):
            trimSpeechAct = sub('\^2|\^g|\^m|\^r|\^e|\^q|\^d', '',speechActs[speechActIndex])
            if trimSpeechAct in ['sd', 'sv']:
                speechActs[speechActIndex] = 's'
            elif self.speech_acts_class[speechActs[speechActIndex]] != 1 or \
                    self.speech_acts_class[trimSpeechAct] != 1:
                 speechActs[speechActIndex] = 'rest'

    def normalizePrediction(self, predicted_speech_act, labelledSpeechAct):
        for i in range(len(labelledSpeechAct)):
            if labelledSpeechAct[i] == 'rest' and predicted_speech_act[i] == 's':
                predicted_speech_act[i] = 'rest'

    def combineFeatureVectors(self, feature_vectors_bow, feature_vectors_cust):
        feature_vectors = []
        for i in range(len(feature_vectors_bow)):
            #feature_vectors.append(feature_vectors_bow[i] + feature_vectors_cust[i])
            feature_vectors.append(list(itertools.chain(feature_vectors_bow[i],feature_vectors_cust[i])))
        return feature_vectors

    def findmajorityclass(self,speech_act):
        class_dist=Counter(speech_act)
        majority_class=class_dist.most_common(1)
    	print "Majority class", majority_class
    	count=majority_class[0]
    	print "Majority percentage: ",100*count[1]/len(speech_act)

In [6]:
classifier = Classifier('swa', data_file)
bagofwords = BagOfWords()

In [7]:
bagofwords = BagOfWords()

In [8]:
classifier.getData()

////////////////////storing the train and the test data in pickle file//////////////////

In [9]:
classifier.getTrainAndTestData()
value=[]
for i in range(0,len(classifier.trainData)):   
    temp=[]
    #train_data[1]#utterance.text
    for t in classifier.trainData[i].tokens:
        if t in ['{', '}', '[', ']', '/','+','>','<','-']:
            continue
        else:
            temp.append(t)
    value.append(' '.join(temp))
train_data = open("pkl/train_data.pkl","wb")
pickle.dump(value,train_data)
train_data.close()
value=[]
for i in range(0,len(classifier.testData)):   
    temp=[]
    #train_data[1]#utterance.text
    for t in classifier.testData[i].tokens:
        if t in ['{', '}', '[', ']', '/','+','>','<','-']:
            continue
        else:
            temp.append(t)
    value.append(' '.join(temp))
test_data = open("pkl/test_data.pkl","wb")
pickle.dump(value,test_data)
test_data.close()

In [14]:
value=[]
for i in range(0,len(classifier.trainData)):   
    value.append(classifier.trainData[i].act_tag)
train_target = open("pkl/train_target.pkl","wb")
pickle.dump(value,train_target)
train_target.close()
value=[]
for i in range(0,len(classifier.testData)):   
    value.append(classifier.testData[i].act_tag)
test_target = open("pkl/test_target.pkl","wb")
pickle.dump(value,test_target)
test_target.close()

In [38]:
class garzon(object):
    def __init__(self,data):
        t=[]
        tag=[]
        for token in data:
            b=token.pos
            tag1=token.act_tag
            t.append(token.pos)
            tag.append(tag1)
        print len(tag)
    
        #without=b.translate(None,string.punctuation)

        ngrams=[]
        for i in range(0,len(t)):
            without=[]
            a=t[i].split('/')
            question=False
            for word in a:
                #add question mark or use any other method to check for question mark
                #print word.translate(None,string.punctuation).split(' ')
                without.append(word.translate(None,string.punctuation).split(' '))
        #print anand


            flat_list = [item for sublist in without for item in sublist]
            my_list = [x for x in flat_list if not x == '']
            my_list_1=[my_list[i] for i in range(0,len(my_list),2)]
            my_list_3=[stem(my_list_1[i]) for i in range(0,len(my_list_1))]
            my_list_2=[my_list[i] for i in range(1,len(my_list),2)]
            if(not my_list_3 or not my_list_2):
                my_list_3=['NA']
                my_list_2=['NA']
                
            word_pos=zip(my_list_3,my_list_2)
            ngrams.append(word_pos)
        print len(ngrams)
        zipped=zip(ngrams,tag)
        #print zipped
        w,tags=zip(*zipped)
        self.w=w
        for i in range(0,len(self.w)):
            if(i==34):
                print self.w[34]
        
    def featurize(self):
        feat=[]
        for i in range(0,len(self.w)):
            feature=[0] * 5
            #print i
            word,pos=zip(*self.w[i])
            pos=list(pos)
            word=list(word)
            if ('PRP' in pos):
                feature[0]=1
            if((pos[0] in ('MD','VB','VBD','VBG','VBN','VBP','VBZ')) and word[0] != 'JOIN' and word[0] != 'PART'):
                feature[1]=1
            for i in range(0,len(pos)-1):
                if pos[i] == 'TO' and (pos[i+1] == 'VB' or pos[i+1] == 'VBP'):
                    feature[2]=1
            if pos[0] in ('WP','WRB','WDT'):
                feature[3]=1
            if pos[len(pos)-1] in ('NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'):
                feature[4]=1
            feat.append(feature)
    
        return feat
    #o,p=zip(*w[1])
    #print p
    
    #bigrams


In [39]:
classifier.getTrainAndTestData()

In [40]:
bagofwords.populateSpace(classifier.trainData)

In [41]:
bagofwords.featurize(classifier.trainData)

-----------bow space--------- 5514
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000


In [41]:
feature_vectors_bow = []
infile = open('pkl/feature_vectors_bow.pkl', 'rb')
while 1:
    try:
        feature_vectors_bow.append(pickle.load(infile))
    except (EOFError):
        break
infile.close()

In [43]:
len(feature_vectors_bow[1])

0

In [12]:
feature_vectors_cust, speech_acts, utter_text = classifier.featurize(classifier.trainData)
print len(feature_vectors_cust)

12910


In [13]:
feature_vectors_garzon=garzon(classifier.trainData).featurize()
print len(feature_vectors_garzon)

12910
12910
[('NA', 'NA')]
12910


//concatenate feature vectors

In [14]:
feature_vectors = classifier.combineFeatureVectors(feature_vectors_bow, feature_vectors_cust)
feature_vectors = classifier.combineFeatureVectors(feature_vectors, feature_vectors_garzon)

//defines the 's' class which is undefined----go through this again

In [15]:
classifier.normalizeSpeechAct(speech_acts)

//see if this is creates problems

In [16]:
classifier.findmajorityclass(speech_acts)

Majority class [('s', 10145)]
Majority percentage:  78.5824941905


In [17]:
clf = tree.DecisionTreeClassifier()
#clf = RandomForestClassifier()

In [18]:
len(feature_vectors[1])

5530

In [19]:
clf = clf.fit(feature_vectors, speech_acts)

//test data

In [20]:
feature_vectors_bow, labelled_speech_acts, utter_text = bagofwords.featurize(classifier.testData)
feature_vectors_cust, speech_acts, utter_text = classifier.featurize(classifier.testData)
feature_vectors_garzon=garzon(classifier.testData).featurize()
feature_vectors = classifier.combineFeatureVectors(feature_vectors_bow, feature_vectors_cust)
feature_vectors = classifier.combineFeatureVectors(feature_vectors,feature_vectors_garzon)

1434
1434
[('Uhhuh', 'UH')]


In [21]:
classifier.normalizeSpeechActTest(labelled_speech_acts)

In [22]:
predicted_speech_act = clf.predict(feature_vectors)

In [23]:
len(feature_vectors)

1434

In [24]:
classifier.normalizePrediction(predicted_speech_act, labelled_speech_acts)

In [25]:
correctResult = Counter()
wrongResult = Counter()

i=0
for i in range(len(predicted_speech_act)):
    if predicted_speech_act[i] == labelled_speech_acts[i]:
        correctResult[predicted_speech_act[i]] += 1
    else:
        wrongResult[predicted_speech_act[i]] += 1
print i

total_correct = sum([correctResult[i] for i in correctResult])
total_wrong = len(predicted_speech_act) - total_correct

print "total_correct", total_correct
print "total wrong", total_wrong
print "accuracy", (total_correct/len(predicted_speech_act)) * 100

print "Classification_report:\n", classification_report(labelled_speech_acts, predicted_speech_act)#, target_names=target_names)
print "accuracy_score:", round(accuracy_score(labelled_speech_acts, predicted_speech_act), 2)
print "b: Acknowledge (Backchannel), fa: Apology, ft: Thanking, h: Hedge, nn: No answers, ny: Yes answers, qw: Wh-Question, qy: Yes-No-Question, qy^d: Declarative Yes-No-Question, rest, s: statement opinion & statement non-opinion, x: Non-verbal"

1433
total_correct 1249
total wrong 185
accuracy 87.0990237099
Classification_report:
             precision    recall  f1-score   support

          b       0.70      0.88      0.78       239
          h       0.37      0.64      0.47        11
         nn       1.00      0.11      0.20         9
         ny       0.00      0.00      0.00        19
         qw       0.57      0.57      0.57        14
         qy       0.76      0.73      0.75        30
       qy^d       0.11      0.25      0.15         4
       rest       1.00      0.82      0.90       493
          s       0.91      0.97      0.94       592
          x       1.00      0.96      0.98        23

avg / total       0.88      0.87      0.87      1434

accuracy_score: 0.87
b: Acknowledge (Backchannel), fa: Apology, ft: Thanking, h: Hedge, nn: No answers, ny: Yes answers, qw: Wh-Question, qy: Yes-No-Question, qy^d: Declarative Yes-No-Question, rest, s: statement opinion & statement non-opinion, x: Non-verbal


//accuracy score decreases significantly if sv and sd both are used

//features usage

In [26]:
print "features usage"
print "---------------------------------------"
feature_vectors_cust, speech_acts, utter_text = classifier.featurize(classifier.data)
total={}
for headers in range(0,len(feature_vectors_cust[1])):
    total[headers]=0
    for i in range(0,len(feature_vectors_cust)):
        if(feature_vectors_cust[i][headers]==1):
            total[headers]+=1
print "question-mark: ", total[0]
print "wh_question: ",total[1]
print "i_dont_know: ",total[2]
print "no_words: ",total[3]
print "yes_words: ",total[4]
print "do_words: ",total[5]
print "non_verbal: ",total[6]
print "UH_count: ",total[7]
print "thanking_words: ",total[8]
print "apology_words: ",total[9]
print "qrr_sequence: ",total[10]

features usage
---------------------------------------
question-mark:  607
wh_question:  996
i_dont_know:  154
no_words:  229
yes_words:  1485
do_words:  857
non_verbal:  458
UH_count:  1762
thanking_words:  4
apology_words:  8
qrr_sequence:  252
