In [None]:
!pip install --upgrade pip

In [None]:
!pip install pattern

In [None]:
!pip install https://github.com/clips/pattern/archive/development.zip

In [None]:
!pip install pixiedust

In [1]:
# import pixiedust

from pattern.en import parse, pprint, parsetree, wordnet

from pattern.vector import Document, Model, TFIDF
from pattern.vector import stem, PORTER, LEMMA
from pattern.vector import SVM, RADIAL, gridsearch, kfoldcv, count
from pattern.vector import distance, COSINE, tfidf

# For Latent Semantic Analysis (LSA)
import numpy as np
from numpy.linalg import svd
from numpy import dot, diag


import gensim
from gensim.models import doc2vec, KeyedVectors
from gensim.models.doc2vec import Doc2Vec, LabeledSentence

import pyemd

import re
import math
import operator
from collections import Counter



In [2]:
def readAndCreateCorpus():
    with open("FAQs.txt") as f:
        for line in f:
            if(re.match(r"^[0-9]{1,}.", line)):
                question = re.sub(r"^[0-9]{1,}. (.*)", "\\1", line)
                qlines.append(question)
                pass
            elif(re.match(r"^[A].", line)):
                answer = re.sub(r"^[A]. (.*)", "\\1", line)
                alines.append(answer)
                pass
            pass
        pass
    
    for i in range(len(qlines)):
        qa = qlines[i] + alines[i]
        # corpus.append(qa)
        # documents.append({"Q" : qlines[i], "A" : alines[i]})
        documents.append(qa)
        pass
    pass

In [3]:
def createContainer():
    for i in range(len(documents)):
        qaDocs.append(Document(documents[i], stemmer = LEMMA, name = "DOC {}".format(i)))
        
        # pos = parse(documents[i], lemmata = True)
        parse_tree = parsetree(documents[i], lemmata = True)
        
        DOC_WORDS = list()
        
        word = None
        pos = None
        lemma = None
        stemmed = None
        wordnetted = dict()
        
        for sentence in parse_tree:
            
            heads = list()
            for chunk in sentence.chunks:        
                heads.append(chunk.head)
                pass
            
            container.update({"DOC {}".format(i) : {"heads" : heads}})
            
            for w in sentence:
                word = w.string
                pos = w.pos
                lemma = w.lemma
                stemmed = stem(word, stemmer = LEMMA)

                hypernyms = list()
                hyponyms = list()
                holonyms = list()
                meronyms = list()

                synsetList = wordnet.synsets(word)
                for synset in synsetList:
                    hypernyms.append(synset.hypernyms())
                    hyponyms.append(synset.hyponyms())
                    holonyms.append(synset.holonyms())
                    meronyms.append(synset.meronyms())
                    pass

                wordnetted = {"hypernyms" : hypernyms,
                 "hyponyms" : hyponyms,
                 "holonyms" : holonyms,
                 "meronyms" : meronyms}
                
                DOC_WORDS.append({word : {"pos" : pos, "stemmed" : stemmed, "lemma" : lemma, "wordneted" : wordnetted}})
                pass
            pass
        
        container.get("DOC {}".format(i)).update({"deptree" : parse_tree, "DOC_WORDS" : DOC_WORDS})
        pass
    pass

In [169]:
# Calculating the match score for the input query against each of the FAQs
def match_qa_pair(input_bag):
    match_score = dict()
    
    for word in re.findall(r"\w+", input_bag):
        for i in range(len(bagofwords)):
            if word in bagofwords[i].keys():
                match_score[i] = match_score.get(i, 0) + 1
                pass
            else:
                match_score[i] = 0
                pass
            pass
        pass
    
    # print(match_score)
    
    """
    maximum = max(match_score.items(), key=lambda k : k[1])
    # print(maximum[0])
    
    for key in match_score:
        if key == maximum[0]:
            print("The matching question is: " + qlines[key])
            print("The matching answer is: " + alines[key])
            pass
        pass
    """
    
    # print(sorted(match_score.items(), key=operator.itemgetter(1), reverse = True)[:10])
    
    for key in sorted(match_score.keys(), key = lambda x : str(match_score[x]), reverse = True)[:10]:
        print("The matching question is: " + qlines[key])
        print("The matching answer is: " + alines[key])
        print("\n\n")
        pass    
    pass

In [4]:
def createInputContainer(inputSentence):
    inputContainer = dict()
    # pos = parse(inputSentence, lemmata = True)
    parse_tree = parsetree(inputSentence, lemmata = True)
    
    DOC_WORDS = list()

    word = None
    pos = None
    lemma = None
    stemmed = None
    wordnetted = dict()

    for sentence in parse_tree:
        
        heads = list()
        for chunk in sentence.chunks:        
            heads.append(chunk.head)
            pass
        
        inputContainer.update({"Input" : {"heads" : heads}})
        
        for w in sentence:
            word = w.string
            pos = w.pos
            lemma = w.lemma
            stemmed = stem(word, stemmer = LEMMA)

            hypernyms = list()
            hyponyms = list()
            holonyms = list()
            meronyms = list()

            synsetList = wordnet.synsets(word)
            for synset in synsetList:
                hypernyms.append(synset.hypernyms())
                hyponyms.append(synset.hyponyms())
                holonyms.append(synset.holonyms())
                meronyms.append(synset.meronyms())
                pass

            wordnetted = {"hypernyms" : hypernyms,
             "hyponyms" : hyponyms,
             "holonyms" : holonyms,
             "meronyms" : meronyms}
            
            DOC_WORDS.append({word : {"pos" : pos, "stemmed" : stemmed, "lemma" : lemma, "wordneted" : wordnetted}})
            pass
        pass
    
    inputContainer.get("Input").update({"deptree" : parse_tree, "DOC_WORDS" : DOC_WORDS})
    
    return inputContainer
    pass

In [5]:
def toList(part, wordList):
    parts = list()
    
    for words in wordList:
        for w in words:
            parts.append((w, words[w][part]))
            pass
        pass
    
    return parts
    pass

In [6]:
def getSimilarity(part, containerDict, inputDict):
    similarity = 0
    
    conParts = toList(part, containerDict["DOC_WORDS"])
    inParts = toList(part, inputDict["DOC_WORDS"])
        
    # Calculate Cosine Similarity
    def get_cosine(vec1, vec2):
        intersection = set(vec1.keys()) & set(vec2.keys())
        numerator = sum([vec1[x] * vec2[x] for x in intersection])
        
        sum1 = sum([vec1[x]**2 for x in vec1.keys()])
        sum2 = sum([vec2[x]**2 for x in vec2.keys()])
        denominator = math.sqrt(sum1) * math.sqrt(sum2)
        
        if not denominator:
            return 0.0
            pass
        else:
            return float(numerator) / denominator
            pass
        pass
            
    conCounter = Counter(conParts)
    inCounter = Counter(inParts)
    
    similarity = get_cosine(conCounter, inCounter)
    return similarity
    pass

In [179]:
def testSimilarity(inputContainer):
    availableParts = ["pos", "stemmed", "lemma"]
    wordNetSimList = list()
    
    for availablePart in availableParts:
        print("TESTING FOR {}".format(availablePart))
        
        simList = list()
        for doc in container:
            similarity = getSimilarity(availablePart, container[doc], inputContainer["Input"])
            simList.append((doc, similarity))
            
            totalWordnetSimilarity = getWordnetSimilarity(container[doc], inputContainer["Input"])
            wordNetSimList.append((doc, totalWordnetSimilarity))
            pass
        
        print(sorted(simList, key = lambda x : x[1], reverse = True)[:10])
        
        # simList.clear()
        del simList
        
        print("\n\n\n")
        pass
    
    print("TESTING FOR wordnet")
    print(sorted(set(wordNetSimList), key = lambda x : x[1], reverse = True)[:10])
        
    print("\n\n\n")
    pass

In [9]:
def prettyContainerDoc(qa):
    pprint(qa.get("deptree"))
    
    for doc in qa["DOC_WORDS"]:
        # print(doc.keys())
        for d in doc.keys():
            # print("""
            # {:-^13} : {}
            # {:-^13} : {}
            # {:-^13} : {}
            # {:-^13} : {}
            # """.format("Word", d,
              #       "POS Tags", doc[d]["pos"],
               #      "Stem", doc[d]["stemmed"],
                #     "Lemma", doc[d]["lemma"]))
                
            print("""
            {:-^13} : {}
            {:-^13} : {}
            """.format("Word", d,
                     "Stem", doc[d]["stemmed"]))
            
            print("""
            {:-^13} : {}
            {:-^13} : {}
            {:-^13} : {}
            {:-^13} : {}
            """.format("Hypernyms", doc[d]["wordneted"]["hypernyms"],
                       "Hyponyms", doc[d]["wordneted"]["hyponyms"],
                       "Holonyms", doc[d]["wordneted"]["holonyms"],
                       "Meronyms", doc[d]["wordneted"]["meronyms"]))

            pass
        pass
    
    
    print("""{:-^13} : {}""".format("WM Distance", qa.get("WM Distance")))
    
    pass

In [10]:
def calcWMDistance(inSentence):
    wmDistDict = dict()
    
    for doc in range(len(documents)):
        distance = model.wmdistance(documents[doc], inSentence)
        wmDistDict.update({"DOC {}".format(doc) : distance})
        
        container["DOC {}".format(doc)].update({"WM Distance" : distance})
        pass
    
    return wmDistDict
    pass

In [11]:
def getWordnetSimilarity(containerDict, inputDict):
    wordnetSimilarityDict = dict()
    totalSimilarity = 0
    
    containerHeads = containerDict["heads"]
    inputHeads = inputDict["heads"]
    
    for containerHead in containerHeads:
        for inputHead in inputHeads:
            containerHeadSynsets = wordnet.synsets(containerHead)
            inputHeadSynsets = wordnet.synsets(inputHead)
            
            for containerHeadSynset in containerHeadSynsets:
                for inputHeadSynset in inputHeadSynsets:
                    sim = wordnet.similarity(containerHeadSynset, inputHeadSynset)
                    wordnetSimilarityDict.update({"{}_{}".format(containerHead, inputHead) : sim})
                    totalSimilarity += sim
                    pass
                pass
            pass
        pass
    
    return totalSimilarity
    pass

In [19]:
parsetree("Hello", lemmata = True)

[Sentence('Hello/UH/O/O/hello')]

In [25]:
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [134]:
inSentence = input()

How many systems can I link simultaneously with my account?


In [135]:
qlines = list()
alines = list()
documents = list()
qaDocs = list()
container = dict()

In [136]:
readAndCreateCorpus()

In [137]:
createContainer()

In [138]:
bagofwords = [Counter(re.findall(r'\w+', txt)) for txt in documents]

In [139]:
match_qa_pair(inSentence)

The matching question is: How many systems can I link simultaneously with my account?

The matching answer is: You can link upto 3 systems with a single account.



In [119]:
for qa in container:
    prettyContainerDoc(container[qa])
    pass

          WORD   TAG    CHUNK   ROLE   ID     PNP    LEMMA     
                                                               
          What   WP     -       -      -      -      what      
          ways   NNS    NP      -      -      -      way       
            do   VBP    VP      -      -      -      do        
           you   PRP    NP      -      -      -      you       
       support   VB     VP      -      -      -      support   
       payment   NN     NP      -      -      -      payment   
             ?   .      -       -      -      -      ?         

          WORD   TAG    CHUNK   ROLE   ID     PNP    LEMMA    
                                                              
           You   PRP    NP      -      -      -      you      
           can   MD     VP      -      -      -      can      
           pay   VB     VP ^    -      -      -      pay      
         using   VBG    VP ^    -      -      -      used     
             a   DT     NP      -      -     


            ----Word----- : I
            ----Stem----- : i
            

            --Hypernyms-- : [[Synset('chemical_element.n.01'), Synset('halogen.n.01')], [Synset('digit.n.01')], [Synset('letter.n.02')]]
            --Hyponyms--- : [[Synset('iodine-125.n.01'), Synset('iodine-131.n.01')], [Synset('monad.n.02'), Synset('singleton.n.01')], []]
            --Holonyms--- : [[], [], [Synset('roman_alphabet.n.01')]]
            --Meronyms--- : [[], [], []]
            

            ----Word----- : edit
            ----Stem----- : edit
            

            --Hypernyms-- : []
            --Hyponyms--- : []
            --Holonyms--- : []
            --Meronyms--- : []
            

            ----Word----- : maps
            ----Stem----- : map
            

            --Hypernyms-- : [[Synset('representation.n.02')], [Synset('mathematical_relation.n.01')]]
            --Hyponyms--- : [[Synset('chart.n.02'), Synset('choropleth_map.n.01'), Synset('contour_map.n.01'), Synset('plat.n

            

            ----Word----- : of
            ----Stem----- : of
            

            --Hypernyms-- : []
            --Hyponyms--- : []
            --Holonyms--- : []
            --Meronyms--- : []
            

            ----Word----- : service
            ----Stem----- : service
            

            --Hypernyms-- : [[Synset('work.n.01')], [Synset('aid.n.02')], [Synset('religious_ceremony.n.01')], [Synset('company.n.01')], [Synset('employment.n.02')], [Synset('force.n.04')], [], [Synset('helpfulness.n.01')], [Synset('tableware.n.01')], [Synset('coupling.n.03')], [Synset('activity.n.01')], [Synset('tennis_stroke.n.01')], [Synset('delivery.n.01')], [Synset('care.n.06')], [Synset('accommodation.n.05')]]
            --Hyponyms--- : [[Synset('consulting_service.n.01'), Synset('facility.n.05'), Synset('national_service.n.01'), Synset('utility.n.03')], [Synset('childcare.n.01'), Synset('community_service.n.01'), Synset('community_service.n.02'), Synset('help_desk.n.01'

In [140]:
wmDistance = calcWMDistance(inSentence)
sorted(wmDistance.items(), key = operator.itemgetter(1))[:10]

[('DOC 19', 0.27055288860296917),
 ('DOC 17', 0.7870174806979101),
 ('DOC 20', 0.8688072545395806),
 ('DOC 14', 0.9044911523019552),
 ('DOC 24', 0.9424520374228315),
 ('DOC 18', 0.947977346033127),
 ('DOC 43', 0.9533117972694977),
 ('DOC 0', 0.9570868387800772),
 ('DOC 50', 0.9657944299630516),
 ('DOC 49', 0.9888544324659416)]

In [141]:
inputContainer = createInputContainer(inSentence)

In [142]:
testSimilarity(inputContainer)

TESTING FOR pos
[('DOC 19', 0.8257228238447705), ('DOC 18', 0.42640143271122094), ('DOC 17', 0.351839597706834), ('DOC 14', 0.3302891295379082), ('DOC 34', 0.3289758474798845), ('DOC 49', 0.30151134457776363), ('DOC 20', 0.2934695928267111), ('DOC 33', 0.27524094128159016), ('DOC 12', 0.2279211529192759), ('DOC 47', 0.22613350843332272)]




TESTING FOR stemmed
[('DOC 19', 0.8528028654224417), ('DOC 18', 0.42640143271122094), ('DOC 17', 0.3876574430285532), ('DOC 14', 0.3302891295379082), ('DOC 34', 0.3289758474798845), ('DOC 49', 0.30151134457776363), ('DOC 20', 0.28603877677367767), ('DOC 33', 0.27524094128159016), ('DOC 47', 0.22613350843332272), ('DOC 13', 0.22395700438234173)]




TESTING FOR lemma
[('DOC 19', 0.8528028654224417), ('DOC 18', 0.42640143271122094), ('DOC 17', 0.3876574430285532), ('DOC 14', 0.3302891295379082), ('DOC 34', 0.3289758474798845), ('DOC 49', 0.30151134457776363), ('DOC 20', 0.28603877677367767), ('DOC 33', 0.27524094128159016), ('DOC 47', 0.2261335084333

In [143]:
questions = ["How do I pay for games?",
"How many genres of games do you have?",
"Do I need the client to play games?",
"How can I use your streaming service?",
"Can achievements be purchased on the store?",
"Are game saves automatically synced with the server?",
"Can I buy concept art for different games?",
"Can I back up my games?",
"Do you have an in-game chat service?",
"Do you have level and map editors for games?"]

In [252]:
for question in questions:    
    print(question + "\n")
    
    #match_qa_pair(question)
    
    wmDistance = calcWMDistance(question)
    #print(sorted(wmDistance.items(), key = operator.itemgetter(1))[:10])
    
    inputContainer = createInputContainer(question)
    
    #testSimilarity(inputContainer)
    
    """
    posWeight = 0.1
    stemmedWeight = 0.3
    lemmaWeight = 0.3
    wmWeight = 0.0
    """
    
    
    posWeight = 0.1
    stemmedWeight = 0.3
    lemmaWeight = 0.3
    wmWeight = 0.01
    
    pslWeights = PSL(inputContainer, posWeight, stemmedWeight, lemmaWeight)
    
    # print(pslWeights)
    
    totalWeights = dict()
    for psl, wm in zip(sorted(pslWeights.items()), sorted(wmDistance.items())):
        # print(psl[1], wm[1] * 0.1)
        totalWeights.update({psl[0] : (psl[1] + (wm[1] * wmWeight))})
        pass
    
    print(sorted(totalWeights.items(), key = operator.itemgetter(1), reverse = True)[:10])
    # print(totalWeights)
    
    print("\n\n\n\n\n")
    pass

How do I pay for games?

[('DOC 38', 0.13102062512018384), ('DOC 26', 0.12729014352924567), ('DOC 25', 0.12508798877921026), ('DOC 7', 0.1084954584912395), ('DOC 36', 0.1055567603354672), ('DOC 43', 0.10512767744923524), ('DOC 0', 0.10503793464248438), ('DOC 32', 0.10460229142786791), ('DOC 21', 0.1042822657802534), ('DOC 46', 0.10423922457025568)]






How many genres of games do you have?

[('DOC 6', 0.17603893984336785), ('DOC 36', 0.1271282630679571), ('DOC 5', 0.12569159583350703), ('DOC 44', 0.12370976680039912), ('DOC 7', 0.11886419553175484), ('DOC 4', 0.11383554456353635), ('DOC 26', 0.11076117669088217), ('DOC 39', 0.10676612249830392), ('DOC 18', 0.10636010355136906), ('DOC 11', 0.09731614252145468)]






Do I need the client to play games?

[('DOC 28', 0.21600764735153632), ('DOC 21', 0.17813896749195365), ('DOC 29', 0.16926287741138335), ('DOC 44', 0.1624825113551426), ('DOC 24', 0.15091267748562975), ('DOC 3', 0.1462807867981468), ('DOC 13', 0.13989742775684721), ('DOC 

In [223]:
def PSL(inputContainer, posWeight, stemmedWeight, lemmaWeight):
    weightedSum = dict()
    
    availableParts = ["pos", "stemmed", "lemma"]
    wordNetSimList = list()
    
    for availablePart in availableParts:        
        simList = list()
        for doc in container:
            similarity = getSimilarity(availablePart, container[doc], inputContainer["Input"])
            
            if availablePart == "pos":
                similarity *= posWeight
                pass
            elif availablePart == "stemmed":
                similarity *= stemmedWeight
                pass
            elif availablePart == "lemma":
                similarity *= lemmaWeight
                pass
            
            simList.append((doc, similarity))
            
            # totalWordnetSimilarity = getWordnetSimilarity(container[doc], inputContainer["Input"])
            totalWordnetSimilarity = 0.0
            wordNetSimList.append((doc, totalWordnetSimilarity))
            
            weightedSum.update({doc : similarity + totalWordnetSimilarity})
            pass
                
        del simList
        pass
    
    # print(weightedSum)
    
    return weightedSum
    pass