In [None]:
!pip install --upgrade pip

In [None]:
!pip install pattern

In [None]:
!pip install https://github.com/clips/pattern/archive/development.zip

In [None]:
!pip install pixiedust

In [1]:
# import pixiedust

from pattern.en import parse, pprint, parsetree, wordnet

from pattern.vector import Document, Model, TFIDF
from pattern.vector import stem, PORTER, LEMMA
from pattern.vector import SVM, RADIAL, gridsearch, kfoldcv, count
from pattern.vector import distance, COSINE, tfidf

# For Latent Semantic Analysis (LSA)
import numpy as np
from numpy.linalg import svd
from numpy import dot, diag


import gensim
from gensim.models import doc2vec, KeyedVectors
from gensim.models.doc2vec import Doc2Vec, LabeledSentence

import pyemd

import re
import math
import operator
from collections import Counter



In [2]:
qlines = list()
alines = list()
# corpus = list()
documents = list()
qaDocs = list()
# container = list()
container = dict()

In [3]:
def readAndCreateCorpus():
    with open("FAQs.txt") as f:
        for line in f:
            if(re.match(r"^[0-9]{1,}.", line)):
                question = re.sub(r"^[0-9]{1,}. (.*)", "\\1", line)
                qlines.append(question)
                pass
            elif(re.match(r"^[A].", line)):
                answer = re.sub(r"^[A]. (.*)", "\\1", line)
                alines.append(answer)
                pass
            pass
        pass
    
    for i in range(len(qlines)):
        qa = qlines[i] + alines[i]
        # corpus.append(qa)
        # documents.append({"Q" : qlines[i], "A" : alines[i]})
        documents.append(qa)
        pass
    pass

In [4]:
readAndCreateCorpus()

In [8]:
parsetree(documents[0], lemmata = True)

[Sentence('What/WP/O/O/what ways/NNS/B-NP/O/way do/VBP/B-VP/O/do you/PRP/B-NP/O/you support/VB/B-VP/O/support payment/NN/B-NP/O/payment ?/./O/O/?'),
 Sentence('You/PRP/B-NP/O/you can/MD/B-VP/O/can pay/VB/I-VP/O/pay using/VBG/I-VP/O/used a/DT/B-NP/O/a credit/NN/I-NP/O/credit or/CC/O/O/or debit/NN/B-NP/O/debit card/NN/I-NP/O/card ././O/O/.'),
 Sentence('You/PRP/B-NP/O/you can/MD/B-VP/O/can also/RB/I-VP/O/also use/VB/I-VP/O/use your/PRP$/B-NP/O/your GameLink/NN/I-NP/O/gamelink points/NNS/I-NP/O/point to/TO/B-VP/O/to pay/VB/I-VP/O/pay for/IN/B-PP/B-PNP/for items/NNS/B-NP/I-PNP/item ././O/O/.')]

In [9]:
def createContainerAlt():
    for i in range(len(documents)):
        qaDocs.append(Document(documents[i], stemmer = LEMMA, name = "DOC {}".format(i)))
        
        # pos = parse(documents[i], lemmata = True)
        parse_tree = parsetree(documents[i], lemmata = True)
        
        DOC_WORDS = list()
        
        word = None
        pos = None
        lemma = None
        stemmed = None
        wordnetted = dict()
        
        for sentence in parse_tree:
            
            heads = list()
            for chunk in sentence.chunks:        
                heads.append(chunk.head)
                pass
            
            container.update({"DOC {}".format(i) : {"heads" : heads}})
            # print(container.get("DOC {}".format(i)))
            
            for w in sentence:
                word = w.string
                pos = w.pos
                lemma = w.lemma
                stemmed = stem(word, stemmer = LEMMA)

                hypernyms = list()
                hyponyms = list()
                holonyms = list()
                meronyms = list()

                synsetList = wordnet.synsets(word)
                for synset in synsetList:
                    hypernyms.append(synset.hypernyms())
                    hyponyms.append(synset.hyponyms())
                    holonyms.append(synset.holonyms())
                    meronyms.append(synset.meronyms())
                    pass

                wordnetted = {"hypernyms" : hypernyms,
                 "hyponyms" : hyponyms,
                 "holonyms" : holonyms,
                 "meronyms" : meronyms}
                
                DOC_WORDS.append({word : {"pos" : pos, "stemmed" : stemmed, "lemma" : lemma, "wordneted" : wordnetted}})
                pass
            pass
        
        # container.append({"DOC {}".format(i) : {"word" : word, "pos" : pos, "stemmed" : stemmed, "lemma" : lemma, "deptree" : parse_tree, "wordneted" : wordnetted}})
        # container.append({"DOC {}".format(i) : {"deptree" : parse_tree, "DOC_WORDS" : DOC_WORDS}})
        # container.update({"DOC {}".format(i) : {"deptree" : parse_tree, "DOC_WORDS" : DOC_WORDS}})
        container.get("DOC {}".format(i)).update({"deptree" : parse_tree, "DOC_WORDS" : DOC_WORDS})
        
        pass
    pass

In [10]:
createContainerAlt()

In [11]:
container

{'DOC 0': {'heads': [Word('You/PRP'),
   Word('use/VB'),
   Word('points/NNS'),
   Word('pay/VB'),
   Word('for/IN'),
   Word('items/NNS')],
  'deptree': [Sentence('What/WP/O/O/what ways/NNS/B-NP/O/way do/VBP/B-VP/O/do you/PRP/B-NP/O/you support/VB/B-VP/O/support payment/NN/B-NP/O/payment ?/./O/O/?'),
   Sentence('You/PRP/B-NP/O/you can/MD/B-VP/O/can pay/VB/I-VP/O/pay using/VBG/I-VP/O/used a/DT/B-NP/O/a credit/NN/I-NP/O/credit or/CC/O/O/or debit/NN/B-NP/O/debit card/NN/I-NP/O/card ././O/O/.'),
   Sentence('You/PRP/B-NP/O/you can/MD/B-VP/O/can also/RB/I-VP/O/also use/VB/I-VP/O/use your/PRP$/B-NP/O/your GameLink/NN/I-NP/O/gamelink points/NNS/I-NP/O/point to/TO/B-VP/O/to pay/VB/I-VP/O/pay for/IN/B-PP/B-PNP/for items/NNS/B-NP/I-PNP/item ././O/O/.')],
  'DOC_WORDS': [{'What': {'pos': 'WP',
     'stemmed': 'what',
     'lemma': 'what',
     'wordneted': {'hypernyms': [],
      'hyponyms': [],
      'holonyms': [],
      'meronyms': []}}},
   {'ways': {'pos': 'NNS',
     'stemmed': 'way',
   

In [12]:
def createInputContainerAlt(inputSentence):
    inputContainer = dict()
    # pos = parse(inputSentence, lemmata = True)
    parse_tree = parsetree(inputSentence, lemmata = True)
    
    DOC_WORDS = list()

    word = None
    pos = None
    lemma = None
    stemmed = None
    wordnetted = dict()

    for sentence in parse_tree:
        
        heads = list()
        for chunk in sentence.chunks:        
            heads.append(chunk.head)
            pass
        
        inputContainer.update({"Input" : {"heads" : heads}})
        
        for w in sentence:
            
            # print(w.__dict__)
            # print(dir(w))
            
            word = w.string
            pos = w.pos
            lemma = w.lemma
            stemmed = stem(word, stemmer = LEMMA)
            
            """
            print("word : {}".format(word))
            print("pos : {}".format(pos))
            print("lemma : {}".format(lemma))
            print("stemmed : {}".format(stemmed))
            print("\n")
            """

            hypernyms = list()
            hyponyms = list()
            holonyms = list()
            meronyms = list()

            synsetList = wordnet.synsets(word)
            for synset in synsetList:
                hypernyms.append(synset.hypernyms())
                hyponyms.append(synset.hyponyms())
                holonyms.append(synset.holonyms())
                meronyms.append(synset.meronyms())
                pass

            wordnetted = {"hypernyms" : hypernyms,
             "hyponyms" : hyponyms,
             "holonyms" : holonyms,
             "meronyms" : meronyms}
            
            DOC_WORDS.append({word : {"pos" : pos, "stemmed" : stemmed, "lemma" : lemma, "wordneted" : wordnetted}})
            pass
        pass
    
    # inputContainer.update({"Input" : {"deptree" : parse_tree, "DOC_WORDS" : DOC_WORDS}})
    inputContainer.get("Input").update({"deptree" : parse_tree, "DOC_WORDS" : DOC_WORDS})
    
    return inputContainer
    pass

In [13]:
def toList(part, wordList):
    parts = list()
    
    for words in wordList:
        for w in words:
            parts.append((w, words[w][part]))
            pass
        pass
    
    return parts
    pass

In [14]:
toList("pos", container["DOC 1"]["DOC_WORDS"])

[('Can', 'MD'),
 ('I', 'PRP'),
 ('pre-order', 'VBP'),
 ('games', 'NNS'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('site', 'NN'),
 ('?', '.'),
 ('Yes', 'UH'),
 (',', ','),
 ('you', 'PRP'),
 ('can', 'MD'),
 ('definitely', 'RB'),
 ('pre-order', 'JJ'),
 ('games', 'NNS'),
 ('on', 'IN'),
 ('our', 'PRP$'),
 ('site', 'NN'),
 ('.', '.'),
 ('Some', 'DT'),
 ('games', 'NNS'),
 ('also', 'RB'),
 ('offer', 'VBP'),
 ('pre-order', 'NN'),
 ('bonuses', 'NNS'),
 ('and', 'CC'),
 ('discounts', 'NNS'),
 ('.', '.')]

In [None]:
toList("pos", inputContainer["Input"]["DOC_WORDS"])

In [None]:
Counter(toList("pos", inputContainer["Input"]["DOC_WORDS"]))

In [None]:
toList("wordneted", container["DOC 1"]["DOC_WORDS"])

In [None]:
conParts = toList("wordneted", container["DOC 1"]["DOC_WORDS"])
conParts
# [conPart[1] for conPart in conParts]

In [15]:
def getSimilarity(part, containerDict, inputDict):
    
    """
    print(part)
    print("\n")
    print(containerDict["DOC_WORDS"])
    print("\n")
    print(inputDict["DOC_WORDS"])
    print("\n\n")
    """
    
    similarity = 0
    
    conParts = toList(part, containerDict["DOC_WORDS"])
    inParts = toList(part, inputDict["DOC_WORDS"])
    
    # conCountPart = [conPart[1] for conPart in conParts]
    # inCountPart = [inPart[1] for inPart in inParts]
    
    
    # Calculate Cosine Similarity
    def get_cosine(vec1, vec2):
        intersection = set(vec1.keys()) & set(vec2.keys())
        numerator = sum([vec1[x] * vec2[x] for x in intersection])
        
        sum1 = sum([vec1[x]**2 for x in vec1.keys()])
        sum2 = sum([vec2[x]**2 for x in vec2.keys()])
        denominator = math.sqrt(sum1) * math.sqrt(sum2)
        
        if not denominator:
            return 0.0
            pass
        else:
            return float(numerator) / denominator
            pass
        pass
        
    
    if (part == "wordneted"):
        
        hyperSim = hypoSim = holoSim = meroSim = 0
        
        wordnetParts = ["hypernyms", "hyponyms", "holonyms", "meronyms"]
        
        # print(conCountPart)
        print(inCountPart)
        
        conCounter = Counter(conCountPart)
        # inCounter = Counter(inCountPart)
        
        # print(get_cosine(conCounter, inCounter))
        
        pass
    else:
        # conCounter = Counter(conCountPart)
        # inCounter = Counter(inCountPart)
        
        conCounter = Counter(conParts)
        inCounter = Counter(inParts)

        """
        print(conCounter)
        print("\n")
        print(inCounter)
        print("\n\n")
        """
        
        similarity = get_cosine(conCounter, inCounter)
        pass
    
    # similarity = distance(conCounter, inCounter, method=COSINE)
    return similarity
    pass

In [16]:
getSimilarity("pos", container["DOC 49"], inputContainer["Input"])

NameError: name 'inputContainer' is not defined

In [None]:
inputAlt1 = createInputContainerAlt("What ways do you support payment?")
inputAlt2 = createInputContainerAlt("What ways do you support payment?")
getSimilarity("pos", inputAlt1["Input"], inputAlt2["Input"])

In [None]:
getSimilarity("pos", container["DOC 49"], inputContainer["Input"])

In [78]:
def testSimilarity(inputContainer):
    availableParts = ["pos", "stemmed", "lemma"]
    wordNetSimList = list()
    
    for availablePart in availableParts:
        print("TESTING FOR {}".format(availablePart))
        
        simList = list()
        for doc in container:
            similarity = getSimilarity(availablePart, container[doc], inputContainer["Input"])
            simList.append((doc, similarity))
            totalSimilarity = getWordnetSimilarity(container[doc], inputContainer["Input"])
            wordNetSimList.append((doc, totalSimilarity))
            pass
        print(sorted(simList, key = lambda x : x[1], reverse = True)[:10])
        # simList.clear()
        del simList
        print("\n\n\n")
        pass
    
    print("TESTING USING WORDNET : {}".format(sorted(set(wordNetSimList), key = lambda x : x[1], reverse = True)[:10]))
    del wordNetSimList
    print("\n\n\n")
    pass

In [None]:
in1 = createInputContainerAlt("What ways do you support payment?")
# in2 = createInputContainerAlt("Do you support online payment?")
in2 = createInputContainerAlt("Do you support online payment?")

in1 = createInputContainerAlt("What ways do you support payment?")
in2 = createInputContainerAlt("Is this a different support sentence?")

In [None]:
getSimilarity("pos", in1["Input"], in2["Input"])

In [None]:
getSimilarity("lemma", in1["Input"], in2["Input"])

In [None]:
from pattern.vector import COSINE, EUCLIDEAN, MANHATTAN, HAMMING
import math
q = {('What', 'WP'): 1, ('ways', 'NNS'): 1, ('do', 'VBP'): 1, ('you', 'PRP'): 1, ('support', 'VB'): 1, ('payment', 'NN'): 1, ('?', '.'): 1}

def get_cosine(vec1, vec2):
     intersection = set(vec1.keys()) & set(vec2.keys())
     numerator = sum([vec1[x] * vec2[x] for x in intersection])

     sum1 = sum([vec1[x]**2 for x in vec1.keys()])
     sum2 = sum([vec2[x]**2 for x in vec2.keys()])
     denominator = math.sqrt(sum1) * math.sqrt(sum2)

     if not denominator:
        return 0.0
     else:
        return float(numerator) / denominator

get_cosine(q, q)

In [None]:
document.vector

In [None]:
bagofwords = [Counter(re.findall(r'\w+', txt)) for txt in documents]

In [None]:
# Calculating the match score for the input query against each of the FAQs

def match_qa_pair(input_bag):
    match_score = dict()
    for word in input_bag:
        for w in word:
            for i in range(len(bagofwords)):
                if( w in bagofwords[i].keys()):
                    match_score[i] = match_score.get(i, 0) + 1
                    pass
                else:
                    match_score[i] = 0
                    pass
                pass
            pass
        pass
    
    #print(match_score)
    maximum = max(match_score.items(), key=lambda k : k[1])
    #print(maximum[0])
    for key in match_score:
        if key == maximum[0]:
            print("The matching question is: " + qlines[key])
            print("The matching answer is: " + alines[key])
            pass
        pass
    pass


In [None]:
document = Document(sentence, stemmer = LEMMA)
pos = parse(sentence, lemmata = True)
parse_tree = parsetree(sentence, lemmata = True)

In [None]:
faqDoc = Document(" ".join(corpus))

In [None]:
len(set(faqDoc.vector))

In [None]:
# pprint(parse(sentence, relations=True, lemmata=True))
# pprint(parsetree(sentence, relations=True, lemmata=True))

In [None]:
for word in sentence.split():
    synsetList = wordnet.synsets(word)
    
    print(word)
    for synset in synsetList:
        print("Definition : {}".format(synset.gloss))
        print("Synonyms : {}".format(synset.synonyms))
        print("Hypernyms : {}".format(synset.hypernyms()))
        print("Hyponyms : {}".format(synset.hyponyms()))
        print("Holonyms : {}".format(synset.holonyms()))
        print("Meronyms : {}".format(synset.meronyms()))
        pass
    
    print("\n")
    pass

In [None]:
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
def prettyContainerDoc(qa):
    pprint(qa.get("deptree"))
    
    for doc in qa["DOC_WORDS"]:
        # print(doc.keys())
        for d in doc.keys():
            # print("""
            # {:-^13} : {}
            # {:-^13} : {}
            # {:-^13} : {}
            # {:-^13} : {}
            # """.format("Word", d,
              #       "POS Tags", doc[d]["pos"],
               #      "Stem", doc[d]["stemmed"],
                #     "Lemma", doc[d]["lemma"]))
                
            print("""
            {:-^13} : {}
            {:-^13} : {}
            """.format("Word", d,
                     "Stem", doc[d]["stemmed"]))
            
            print("""
            {:-^13} : {}
            {:-^13} : {}
            {:-^13} : {}
            {:-^13} : {}
            """.format("Hypernyms", doc[d]["wordneted"]["hypernyms"],
                       "Hyponyms", doc[d]["wordneted"]["hyponyms"],
                       "Holonyms", doc[d]["wordneted"]["holonyms"],
                       "Meronyms", doc[d]["wordneted"]["meronyms"]))

            pass
        pass
    
    
    print("""{:-^13} : {}""".format("WM Distance", qa.get("WM Distance")))
    
    pass

In [None]:
for qa in container:
    prettyContainerDoc(container[qa])
    pass

In [None]:
wmDistDict = dict()

punctuation = " .,;:!?()[]{}`''\"@#$^&*+-|=~_"

# inSentence = "Can I play games when I am offline?"
inSentence = "Can I play games without an internet connection?"

for doc in range(len(documents)):
    distance = model.wmdistance(documents[doc], inSentence)
    wmDistDict.update({"DOC {}".format(doc) : distance})
    pass

# print('distance = %.3f' % distance)

sorted(wmDistDict.items(), key = operator.itemgetter(1))

In [None]:
def calcWMDistance(inSentence):
    wmDistDict = dict()
    
    for doc in range(len(documents)):
        distance = model.wmdistance(documents[doc], inSentence)
        wmDistDict.update({"DOC {}".format(doc) : distance})
        
        container["DOC {}".format(doc)].update({"WM Distance" : distance})
        pass
    
    return wmDistDict
    pass

In [None]:
wmDistance = calcWMDistance(inSentence)
sorted(wmDistance.items(), key = operator.itemgetter(1))

In [None]:
container["DOC 0"]["heads"]

In [None]:
pt = parsetree(inSentence, lemmata = True)

In [None]:
pt.xml

In [68]:
in1 = createInputContainerAlt("What ways do you support payment?")
in1 = createInputContainerAlt("Can I transfer my achievements to a new account?")

In [41]:
def getWordnetSimilarity(containerDict, inputDict):
    wordnetSimilarityDict = dict()
    totalSimilarity = 0
    
    containerHeads = containerDict["heads"]
    inputHeads = inputDict["heads"]
    
    for containerHead in containerHeads:
        for inputHead in inputHeads:
            containerHeadSynsets = wordnet.synsets(containerHead)
            inputHeadSynsets = wordnet.synsets(inputHead)
            
            for containerHeadSynset in containerHeadSynsets:
                for inputHeadSynset in inputHeadSynsets:
                    sim = wordnet.similarity(containerHeadSynset, inputHeadSynset)
                    wordnetSimilarityDict.update({"{}_{}".format(containerHead, inputHead) : sim})
                    totalSimilarity += sim
                    pass
                pass
            pass
        pass
    
    return totalSimilarity
    pass

In [32]:
getWordnetSimilarity(container["DOC 8"], in1["Input"])

({'games_developers': -0.0,
  'games_have': -0.0,
  'games_servers': -0.0,
  'set_developers': 0.5086478278513146,
  'set_have': 0.21294500526058824,
  'set_servers': 0.30217415785648744,
  'server_developers': 0.25168769069964214,
  'server_have': 0.1566331713346424,
  'server_servers': 1.0},
 15.547386199483935)

In [33]:
getWordnetSimilarity(container["DOC 9"], in1["Input"])

({'have_developers': 0.16372559722590763,
  'have_have': 1.0,
  'have_servers': 0.1566331713346424,
  'server_developers': 0.25168769069964214,
  'server_have': 0.1566331713346424,
  'server_servers': 1.0,
  'verification_developers': -0.0,
  'verification_have': -0.0,
  'verification_servers': -0.0,
  'support_developers': -0.0,
  'support_have': -0.0,
  'support_servers': -0.0},
 13.76820096024762)

In [34]:
getWordnetSimilarity(container["DOC 10"], in1["Input"])

({'developers_developers': 1.0,
  'developers_have': 0.16372559722590763,
  'developers_servers': 0.25168769069964214,
  'servers_developers': 0.25168769069964214,
  'servers_have': 0.1566331713346424,
  'servers_servers': 1.0},
 12.257911935859914)

In [53]:
getWordnetSimilarity(container["DOC 13"], in1["Input"])

41.191798494141324

In [79]:
testSimilarity(in1)

TESTING FOR pos
[('DOC 33', 0.5773502691896257), ('DOC 51', 0.48989794855663554), ('DOC 12', 0.4183300132670378), ('DOC 13', 0.4110541536602924), ('DOC 34', 0.37953605763829484), ('DOC 46', 0.3794733192202055), ('DOC 17', 0.369012483211554), ('DOC 2', 0.3627381250550058), ('DOC 24', 0.35445877847928325), ('DOC 23', 0.3499999999999999)]




TESTING FOR stemmed
[('DOC 33', 0.5773502691896257), ('DOC 51', 0.48989794855663554), ('DOC 13', 0.4110541536602924), ('DOC 12', 0.404145188432738), ('DOC 34', 0.37953605763829484), ('DOC 46', 0.3794733192202055), ('DOC 17', 0.3614031611621005), ('DOC 2', 0.3580574370197164), ('DOC 24', 0.35445877847928325), ('DOC 14', 0.3464101615137754)]




TESTING FOR lemma
[('DOC 33', 0.5773502691896257), ('DOC 51', 0.48989794855663554), ('DOC 13', 0.4110541536602924), ('DOC 12', 0.404145188432738), ('DOC 34', 0.37953605763829484), ('DOC 46', 0.3794733192202055), ('DOC 17', 0.3614031611621005), ('DOC 2', 0.3580574370197164), ('DOC 24', 0.35445877847928325), ('DO