# Doc2Vec Model

In [29]:
!pip install python-Levenshtein
!pip install gensim

Collecting python-Levenshtein
  Downloading python-Levenshtein-0.12.2.tar.gz (50 kB)
[K     |████████████████████████████████| 50 kB 8.4 MB/s  eta 0:00:01
Building wheels for collected packages: python-Levenshtein
  Building wheel for python-Levenshtein (setup.py) ... [?25ldone
[?25h  Created wheel for python-Levenshtein: filename=python_Levenshtein-0.12.2-cp36-cp36m-linux_x86_64.whl size=155943 sha256=8ec53e88e3aefbd11f14763e33224cf57c50b4f3176b180c65333123879f89b8
  Stored in directory: /home/ec2-user/.cache/pip/wheels/4a/a4/bf/d761b0899395c75fa76d003d607b3869ee47f5035b8afc30a2
Successfully built python-Levenshtein
Installing collected packages: python-Levenshtein
Successfully installed python-Levenshtein-0.12.2
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [31]:
import logging
import boto3
import gensim
import os
import smart_open
import Levenshtein
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

lee_train_file = 's3://awstranscribe-tests/doc2vec/corpus.cor'
lee_test_file = 's3://awstranscribe-tests/transcribeOutputs/proc_files/no_IPA/test_file.csv'

### Read and process text

In [13]:
def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
                
train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))
print(train_corpus[:2])

[TaggedDocument(words=['alo', 'buenas', 'tardes', 'hablã³', 'con', 'don', 'jorge', 'cadenas', 'kilos', 'favor', 'diga', 'sã', 'no', 'alo', 'no', 'se', 'debemos', 'disculpe', 'pero', 'usted', 'don', 'jorge', 'cadenas', 'quiero'], tags=[0]), TaggedDocument(words=['alo', 'buenas', 'tardes', 'hablã³', 'con', 'don', 'hã', 'ctor', 'arriagada', 'cabello', 'favor', 'diga', 'sã', 'no'], tags=[1])]


### Training the model

In [14]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(train_corpus)

2021-02-17 12:23:23,868 : INFO : collecting all words and their counts
2021-02-17 12:23:23,869 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-02-17 12:23:23,870 : INFO : collected 300 word types and 50 unique tags from a corpus of 50 examples and 1937 words
2021-02-17 12:23:23,871 : INFO : Loading a fresh vocabulary
2021-02-17 12:23:23,872 : INFO : effective_min_count=2 retains 161 unique words (53% of original 300, drops 139)
2021-02-17 12:23:23,872 : INFO : effective_min_count=2 leaves 1798 word corpus (92% of original 1937, drops 139)
2021-02-17 12:23:23,874 : INFO : deleting the raw counts dictionary of 300 items
2021-02-17 12:23:23,875 : INFO : sample=0.001 downsamples 74 most-common words
2021-02-17 12:23:23,876 : INFO : downsampling leaves estimated 788 word corpus (43.9% of prior 1798)
2021-02-17 12:23:23,877 : INFO : estimated required memory for 161 words and 50 dimensions: 154900 bytes
2021-02-17 12:23:23,878 : INFO : resetting layer wei

In [27]:
type(model.docvecs)

gensim.models.keyedvectors.Doc2VecKeyedVectors

### Accesing the model

In [22]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

2021-02-17 12:31:04,423 : INFO : precomputing L2-norms of doc weight vectors


In [23]:
import collections

counter = collections.Counter(ranks)
print(counter)

Counter({15: 5, 6: 3, 42: 3, 41: 3, 39: 3, 24: 2, 17: 2, 12: 2, 1: 2, 21: 2, 43: 2, 0: 1, 31: 1, 23: 1, 18: 1, 40: 1, 33: 1, 8: 1, 10: 1, 19: 1, 22: 1, 45: 1, 47: 1, 4: 1, 14: 1, 34: 1, 9: 1, 5: 1, 29: 1, 30: 1, 13: 1, 49: 1})


In [24]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Document (49): «alo buenos dã as hablã³ con doã roxana reta mal verdejo»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (3, 0.255410760641098): «alo buenos dã as hablã³ con don ronnie muã oz araya sã don ronnie habla con su ejecutiva virtual para su seguridad esta conversaciã³n podrã ser grabada le estamos llamando de socofin por encargo de banco de chile para informarle que se encuentra en pago su tarjeta de crã dito con fecha cuatro de noviembre de dos mil veinte si usted ya cancelã³ por favor omitir este mensaje que tenga un buen dã»

SECOND-MOST (29, 0.21810851991176605): «alo buenas tardes hablã³ con don jorge hernã ndez haedo sã disculpe tenemos un mensaje importante es usted don jorge hernã ndez si disculpe tenemos un mensaje importante es usted don jorge hernã ndez favor diga sã no si don jorge habla con su ejecutiva virtual para su seguridad esta conversaciã³n podrã ser grabada le estamos llamando de socofin por encargo de banco de chile para i

In [28]:
# Pick a random document from the corpus and infer a vector from the model
import random
doc_id = random.randint(0, len(train_corpus) - 1)

# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (2): «alo buenas tardes alo hablã³ con doã miriam acostara ya favor indica sã no»

Similar Document (40, 0.29418909549713135): «alo buenos dã as hablã³ con don josã juan churrianero iba bien favor responder sã no no»



In [38]:
nice = 'alo? Buenas tardes. Habló con Don Jorge. Cadenas, kilos favor Diga sí o no. alo? No se debemos. Disculpe. Pero usted, Don Jorge Cadenas, quiero'
bad = 'a lo buenas tardes. Habló con Don Jorge. Cadenas, kilos Favor, Diga sí o no. a No se debemos. Disculpe. Pero usted, Don Jorge Cadenas, quiero'

## Lev / lawrgo de palabra  
print(len(nice))
Levenshtein.distance(nice, bad)  / len(nice)

143


0.055944055944055944