In [1]:
import gensim
import os
import collections
import smart_open
import random

In [2]:
# Locate the Lee corpus files inside the installed gensim package.
# os.path.join builds the paths portably instead of hand-joining with os.sep.
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
lee_test_file = os.path.join(test_data_dir, 'lee.cor')

In [3]:
def read_corpus(fname, tokens_only=False):
    """Lazily read a one-document-per-line corpus file.

    Args:
        fname: Path of the corpus file; it is decoded as ISO-8859-1.
        tokens_only: When True, yield plain token lists. When False
            (the default), yield ``TaggedDocument`` objects tagged with the
            zero-based line number.

    Yields:
        One item per line of the file, in file order.
    """
    # smart_open.smart_open() is deprecated and warns on every call;
    # smart_open.open() is its drop-in replacement for text reading.
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [4]:
# Materialise the generators as lists so the corpora can be indexed and
# iterated more than once.
train_corpus = list(read_corpus(lee_train_file))
# tokens_only=True yields plain token lists (no tags) for the test file.
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [5]:
# Display the first two training documents.
train_corpus[:2]

[TaggedDocument(words=['hundreds', 'of', 'people', 'have', 'been', 'forced', 'to', 'vacate', 'their', 'homes', 'in', 'the', 'southern', 'highlands', 'of', 'new', 'south', 'wales', 'as', 'strong', 'winds', 'today', 'pushed', 'huge', 'bushfire', 'towards', 'the', 'town', 'of', 'hill', 'top', 'new', 'blaze', 'near', 'goulburn', 'south', 'west', 'of', 'sydney', 'has', 'forced', 'the', 'closure', 'of', 'the', 'hume', 'highway', 'at', 'about', 'pm', 'aedt', 'marked', 'deterioration', 'in', 'the', 'weather', 'as', 'storm', 'cell', 'moved', 'east', 'across', 'the', 'blue', 'mountains', 'forced', 'authorities', 'to', 'make', 'decision', 'to', 'evacuate', 'people', 'from', 'homes', 'in', 'outlying', 'streets', 'at', 'hill', 'top', 'in', 'the', 'new', 'south', 'wales', 'southern', 'highlands', 'an', 'estimated', 'residents', 'have', 'left', 'their', 'homes', 'for', 'nearby', 'mittagong', 'the', 'new', 'south', 'wales', 'rural', 'fire', 'service', 'says', 'the', 'weather', 'conditions', 'which', '

In [6]:
# Create the Doc2Vec model; vocabulary building and training happen in later cells.
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=50,
    min_count=2,
    epochs=40,
)

In [7]:

# Build the model's vocabulary from the training documents.
model.build_vocab(train_corpus)

In [8]:
# Train on the tagged corpus, reusing the example count and epoch count stored
# on the model; %time reports how long the call took.
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 6.85 s, sys: 263 ms, total: 7.12 s
Wall time: 3.14 s


In [10]:
# Infer a document vector for a short, already-tokenised sentence.
model.infer_vector(
    ['only', 'you', 'can', 'prevent', 'forest', 'fires', 'in', 'india']
)

array([-0.04081251,  0.09907233, -0.14071181,  0.03392525, -0.2349259 ,
       -0.05982138, -0.21663159,  0.0523302 , -0.29451266, -0.31753203,
        0.04433207, -0.19574216, -0.19006261,  0.1650976 ,  0.03685386,
        0.14444134,  0.26979125, -0.18260871, -0.04855327,  0.05406506,
       -0.23888269, -0.04999337, -0.27730277, -0.05390741, -0.17177421,
       -0.13757473,  0.03903996, -0.07838061, -0.24421807,  0.21927716,
       -0.1069326 ,  0.16755913,  0.2672514 ,  0.37260506, -0.00315433,
        0.17631668,  0.02131809,  0.00920411, -0.00371837, -0.11262745,
       -0.04888618, -0.09076045, -0.02059839, -0.13992186, -0.07845638,
       -0.14535038, -0.05908379, -0.13469926,  0.17294142, -0.03269662],
      dtype=float32)

In [11]:

# For every training document, infer a fresh vector and rank all trained
# document vectors by similarity to it.
ranks = []
second_ranks = []
for doc_id, tagged_doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

    # Position at which the document's own tag appears in the ranking.
    ranked_ids = [tag for tag, _ in sims]
    ranks.append(ranked_ids.index(doc_id))

    # Keep the second entry of the ranking as a (tag, similarity) pair.
    second_ranks.append(sims[1])

In [12]:
# Tally how often each rank position occurred.
collections.Counter(ranks)

Counter({0: 292, 1: 8})

In [16]:
# Similarity between two token lists, computed from the model's word vectors.
# The model-level n_similarity is deprecated and warns on every call; the
# supported entry point is on model.wv.
model.wv.n_similarity(
    ['only', 'you', 'can', 'prevent', 'forest', 'fires', 'in', 'india'],
    ['only', 'you', 'can', 'prevent', 'forest', 'fires', 'in', 'australia'],
)

  """Entry point for launching an IPython kernel.


0.9664531

In [17]:
from gensim.utils import simple_preprocess

In [30]:
# Display every word in the trained vocabulary as a set.
set(model.wv.vocab)

{'embryos',
 'end',
 'gathering',
 'agencies',
 'nato',
 'vast',
 'breach',
 'government',
 'moroccan',
 'streets',
 'match',
 'boat',
 'talks',
 'cent',
 'terrorists',
 'hopman',
 'true',
 'legislation',
 'unindicted',
 'flying',
 'five',
 'palace',
 'yes',
 'settlers',
 'senior',
 'ring',
 'succeed',
 'inquiry',
 'season',
 'development',
 'address',
 'stepped',
 'engulfed',
 'ended',
 'nasa',
 'number',
 'bid',
 'evans',
 'winners',
 'trapping',
 'managers',
 'all',
 'practice',
 'lights',
 'st',
 'promised',
 'harris',
 'fighters',
 'marked',
 'tanks',
 'cooperate',
 'ashes',
 'locals',
 'couple',
 'were',
 'dominated',
 'typhoid',
 'taking',
 'new',
 'ripped',
 'mixed',
 'stop',
 'szondy',
 'prove',
 'sweet',
 'collins',
 'aiming',
 'potentially',
 'strike',
 'term',
 'hiv',
 'so',
 'acting',
 'emails',
 'announce',
 'veteran',
 'norwegian',
 'for',
 'reducing',
 'rose',
 'apparently',
 'detained',
 'passenger',
 'his',
 'easy',
 'targets',
 'cigarettes',
 'emotional',
 'staff',
 

In [36]:
def tidy_sentence(sentence, vocabulary):
    """Tokenise ``sentence`` and drop every token not found in ``vocabulary``.

    Args:
        sentence: Raw text handed to ``simple_preprocess``.
        vocabulary: Container supporting ``in``; tokens absent from it are
            discarded.

    Returns:
        list: The surviving tokens, in their original order.
    """
    kept = []
    for token in simple_preprocess(sentence):
        if token in vocabulary:
            kept.append(token)
    return kept

In [37]:
# Consumer-survey blurb reduced to its in-vocabulary tokens.
sen1 = tidy_sentence(
    'A consumer survey will help you understand your customers’ likes, dislikes, and where you need to make improvements. For example, what does the average customer think about your prices? Too high? Just right? How well is your staff doing on customer service, or how well does your client success team understand the growing needs of your customers—and prospects?',
    model.wv.vocab,
)

In [38]:
# Employee-survey blurb reduced to its in-vocabulary tokens.
sen2 = tidy_sentence(
    'Employee surveys are valuable for learning about several topics, all of them directly related to employee morale, satisfaction, and involvement with the company.Health benefits, wellness programs, compensation, managers’ performance, career development, work environment: These are all areas you can investigate in depth with a well-designed employee satisfaction survey.',
    model.wv.vocab,
)

In [41]:
# Healthcare-survey blurb reduced to its in-vocabulary tokens.
sen3 = tidy_sentence(
    'Provide better care for your patients. From collecting patient feedback to getting input from staff, our healthcare surveys can get you the answers you need to make improvements across your organization. You can even collect protected health information by turning on HIPAA-compliant features.',
    model.wv.vocab,
)

In [42]:
# Similarity between the token lists sen1 and sen3. The model-level
# n_similarity is deprecated and warns; use the word-vector API on model.wv.
model.wv.n_similarity(sen1, sen3)

  """Entry point for launching an IPython kernel.


0.966503

In [43]:
# Similarity between the token lists sen2 and sen3. The model-level
# n_similarity is deprecated and warns; use the word-vector API on model.wv.
model.wv.n_similarity(sen2, sen3)

  """Entry point for launching an IPython kernel.


0.89485955

In [44]:
import json

In [46]:
# Load the survey definitions. JSON text is UTF-8 by specification, so pin the
# encoding instead of relying on the platform default (the descriptions
# contain non-ASCII punctuation).
with open('surveys.json', 'r', encoding='utf-8') as f:
    datastore = json.load(f)

In [48]:
# Number of top-level entries loaded from surveys.json.
len(datastore)

10

In [54]:
# Place each survey's description at the list position given by its
# 1-based numeric key.
l = [None] * len(datastore)
for survey_key in datastore:
    l[int(survey_key) - 1] = datastore[survey_key]["description"]

In [55]:
# Display the collected survey descriptions.
l

['The general purpose of customer satisfaction surveys is to assess how satisfied your customers are with different aspects of your product/service. Identifying unhappy customers is as important as identifying extremely happy ones (potential advocates).Having (and maintaining) happy customers improves the odds of repeat purchases as well as raises your general customer lifetime value.Unhappy customers, on the other hand, can be a lot more harmful than you think. Ask them to rate their satisfaction with a specific aspect on a 1 to 10 scale.Using scales to measure customer satisfaction makes it a lot more practical to measure changes in satisfaction over time. That’s even more important when trying to measure the impact of a specific initiative.Measuring and tracking your overall customer satisfaction is the first and most important step towards creating amazing experiences. Keep that in mind, always, and you’ll be on the right path towards developing an awesome product/service.',
 'An E

In [71]:
# Free-text description that later cells tokenise and compare with the survey descriptions.
desc='To know how the employees feel about their job. To see if the employees are satisfied with the work environment and th get some feedback to improve'

In [72]:
# Reduce every survey description to its in-vocabulary tokens.
tidied = [tidy_sentence(text, model.wv.vocab) for text in l]
    

In [73]:
# Tokenise the query description, rebinding desc from a string to a token list.
# The isinstance guard keeps the cell safe to re-run: once desc is a token
# list it is left untouched instead of being fed back into tidy_sentence.
if isinstance(desc, str):
    desc = tidy_sentence(desc, model.wv.vocab)

In [74]:
# Display the tokenised description.
desc

['to',
 'know',
 'how',
 'the',
 'employees',
 'feel',
 'about',
 'their',
 'job',
 'to',
 'see',
 'if',
 'the',
 'employees',
 'are',
 'satisfied',
 'with',
 'the',
 'work',
 'environment',
 'and',
 'th',
 'get',
 'some',
 'to',
 'improve']

In [83]:
# Similarity between the tokenised description and the first tidied survey
# description. The model-level n_similarity is deprecated and warns; use the
# word-vector API on model.wv.
model.wv.n_similarity(desc, tidied[0])

  """Entry point for launching an IPython kernel.


0.9391995

In [70]:
# Display the in-vocabulary tokens of the sixth survey description.
tidied[5]

['is',
 'information',
 'provided',
 'by',
 'about',
 'whether',
 'they',
 'are',
 'satisfied',
 'or',
 'with',
 'or',
 'service',
 'and',
 'about',
 'general',
 'experience',
 'they',
 'had',
 'with',
 'company',
 'their',
 'opinion',
 'is',
 'for',
 'experience',
 'and',
 'your',
 'actions',
 'to',
 'their',
 'needs',
 'for',
 'software',
 'and',
 'this',
 'is',
 'important',
 'since',
 'you',
 'need',
 'to',
 'be',
 'aware',
 'of',
 'what',
 'your',
 'users',
 'like',
 'and',
 'what',
 'the',
 'to',
 'their']

In [85]:
import gensim.corpora.wikicorpus as wiki

In [86]:
# Display the imported module object, which shows where it was loaded from.
wiki

<module 'gensim.corpora.wikicorpus' from '/home/infyblr/anaconda3/envs/surveybrain/lib/python3.6/site-packages/gensim/corpora/wikicorpus.py'>