## Import packages:

In [5]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import nltk 
import gensim
import re
import datetime
import collections
import random
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import os

%matplotlib inline

## Load dataset:

In [6]:
# Load the corpus; the file is tab-separated despite its .csv extension.
train = pd.read_csv("../Dataset/cleaned_data.csv", sep="\t")

In [7]:
# Preview the first rows to check that the columns were parsed as expected.
train.head()

Unnamed: 0.1,Unnamed: 0,body
0,FM20160429032ACC9MUHD,acea essere pronto giocare partita banda ultra...
1,FM20160501023ACLAzwID,unico certezza momento essere prossimo tappa s...
2,FM20160503035ACaNSzJD,potere aspettare mese luglio formalizzazione p...
3,FM20160503035ACDx00JD,fondo chiuso aiutare piccolo medio impresa ita...
4,FM20160428033ACi2kvGD,volto provocazione sortire effetto sperare ext...


In [8]:
# (rows, columns) of the loaded frame.
train.shape

(9283, 2)

In [9]:
def tagDocuments(text):
    """Lazily convert an iterable of raw strings into gensim ``TaggedDocument`` objects.

    Each string is tokenised with ``gensim.utils.simple_preprocess`` and tagged
    with its zero-based position in ``text``. A progress line is printed for
    every 1000th item, starting with the first.

    Args:
        text: iterable of document strings.

    Yields:
        gensim.models.doc2vec.TaggedDocument built from the token list and a
        one-element tag list ``[position]``.
    """
    for position, raw_text in enumerate(text):
        if position % 1000 == 0:
            print("> Iteration: " + str(position))
        tokens = list(gensim.utils.simple_preprocess(raw_text))
        yield gensim.models.doc2vec.TaggedDocument(tokens, [position])

## Extract train data from dataframe into list:

In [10]:
# Drain the tagging generator into a list so the corpus can be iterated more
# than once, printing wall-clock timestamps before and after.
print("> START %s" % datetime.datetime.now())
body_texts = train['body'].tolist()
train_corpus = list(tagDocuments(body_texts))
print("> END %s" % datetime.datetime.now())

len(train_corpus)

> START 2017-05-22 21:45:05.700830
> Iteration: 0
> Iteration: 1000
> Iteration: 2000
> Iteration: 3000
> Iteration: 4000
> Iteration: 5000
> Iteration: 6000
> Iteration: 7000
> Iteration: 8000
> Iteration: 9000
> END 2017-05-22 21:45:07.769022


9283

## Prepare model:

In [11]:
import logging

# Route INFO-level log records (timestamp : level : message) to the notebook output.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [12]:
# Hyperparameters are kept as notebook-level names so they can be tuned in one place.
window = 20
size = 20        # vector dimensionality (the build log reports "20 dimensions")
dm_concat = 1    # NOTE(review): presumably enables concatenation of context vectors; confirm in gensim docs
alpha = 0.02     # NOTE(review): presumably the initial learning rate; confirm in gensim docs
model = gensim.models.doc2vec.Doc2Vec(
    size=size,
    window=window,
    min_count=10,
    dm_concat=dm_concat,
    alpha=alpha,
    iter=55,
)

In [13]:
# Build the model vocabulary from the tagged corpus.
model.build_vocab(train_corpus)

2017-05-22 21:45:17,769 : INFO : collecting all words and their counts
2017-05-22 21:45:17,773 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-05-22 21:45:18,109 : INFO : collected 28585 word types and 9283 unique tags from a corpus of 9283 examples and 1175199 words
2017-05-22 21:45:18,110 : INFO : Loading a fresh vocabulary
2017-05-22 21:45:18,142 : INFO : min_count=10 retains 7397 unique words (25% of original 28585, drops 21188)
2017-05-22 21:45:18,144 : INFO : min_count=10 leaves 1123040 word corpus (95% of original 1175199, drops 52159)
2017-05-22 21:45:18,168 : INFO : deleting the raw counts dictionary of 28585 items
2017-05-22 21:45:18,171 : INFO : sample=0.001 downsamples 31 most-common words
2017-05-22 21:45:18,172 : INFO : downsampling leaves estimated 1036816 word corpus (92.3% of prior 1123040)
2017-05-22 21:45:18,173 : INFO : estimated required memory for 7397 words and 20 dimensions: 29295060 bytes
2017-05-22 21:45:18,201 : INFO : usi

In [14]:
# File name (relative to the working directory) under which the model is saved and loaded.
model_name = "doc2vec_model"

## Train model:

In [15]:
if not os.path.exists(str(os.getcwd()) + '/' + model_name):
    %time model.train(train_corpus)
    model.save(model_name)
else:
    model = gensim.models.doc2vec.Doc2Vec.load(model_name)

2017-05-22 21:45:24,639 : INFO : loading Doc2Vec object from doc2vec_model
2017-05-22 21:45:24,893 : INFO : loading wv recursively from doc2vec_model.wv.* with mmap=None
2017-05-22 21:45:24,893 : INFO : setting ignored attribute syn0norm to None
2017-05-22 21:45:24,894 : INFO : loading docvecs recursively from doc2vec_model.docvecs.* with mmap=None
2017-05-22 21:45:24,895 : INFO : setting ignored attribute cum_table to None
2017-05-22 21:45:24,897 : INFO : loaded doc2vec_model


In [12]:
# Self-similarity check: infer a vector for every training document and find
# where the document itself lands in the similarity ranking of that vector.
ranks = [] # for each document, the position at which it appears in the list of documents most similar to itself

# Entry at index 1 of each similarity list, kept as a (doc id, similarity) pair.
# NOTE(review): this is the closest *other* document only when the document
# ranks itself first (rank 0); confirm that is the intended meaning.
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    # Request the full ranking (topn = number of doc vectors), presumably so
    # that the .index() lookup below always finds doc_id.
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    
    second_ranks.append(sims[1])
# doc_id and sims keep their last-iteration values; the inspection cell that
# prints MOST/MEDIAN/LEAST similar documents reads them.

2017-05-22 19:35:57,959 : INFO : precomputing L2-norms of doc weight vectors


## Evaluate model (raw):

In [13]:
def score(listOfRanks, max_rank=2):
    """Return the fraction of ranks that lie in ``0..max_rank`` (inclusive).

    With the default ``max_rank=2`` this is a top-3 hit rate: a document counts
    as correct when it appears among the three best matches for its own
    inferred vector. Pass ``max_rank=0`` for strict top-1 accuracy.

    The previous condition ``e >= 0 | e <= 2`` was parsed as
    ``e >= (0 | e) <= 2`` because ``|`` binds tighter than comparisons, so the
    lower bound was never checked and negative values were counted as correct.

    Args:
        listOfRanks: sequence of integer ranks (0 = best).
        max_rank: highest rank still counted as correct.

    Returns:
        float in [0, 1]; ``0.0`` for an empty sequence instead of raising
        ZeroDivisionError.
    """
    total = len(listOfRanks)
    if total == 0:
        return 0.0
    correct = sum(1 for rank in listOfRanks if 0 <= rank <= max_rank)
    return correct / total

In [14]:
# Fraction of training documents found at rank 0-2 for their own inferred vector.
score(ranks)

0.9598190240224066

## Model inspection

In [15]:
# Infer a vector for an ad-hoc, hand-tokenised sentence.
model.infer_vector(['oggi', 'voglio', 'comprare', 'una', 'azione', 'di', 'mediaset'])

array([-0.17726685, -0.00284267, -0.04112091, -0.18763816,  0.26730201,
       -0.01805161, -0.15042204,  0.0606822 ,  0.00520629, -0.30798909,
        0.26504353,  0.0281793 , -0.09364226, -0.02050012, -0.15537296,
        0.01127828, -0.08701586,  0.06999869, -0.22491051, -0.13964911], dtype=float32)

In [16]:
# Count stored for the token 'gruppo' in the model vocabulary (presumably its corpus frequency).
model.wv.vocab['gruppo'].count

9215

In [17]:
# Uses doc_id and sims as left behind by the last iteration of the ranking loop.
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

# Positions in the similarity ranking to display: best, middle and worst match.
positions = (('MOST', 0), ('MEDIAN', len(sims) // 2), ('LEAST', len(sims) - 1))
for label, index in positions:
    hit = sims[index]  # (doc id, similarity) pair
    print(u'%s %s: «%s»\n' % (label, hit, ' '.join(train_corpus[hit[0]].words)))

Document (9282): «toscano aeroporto apprendere indiscrezione stampa commissione valutazione impatto ambientale ministero ambiente avere esprimere parere favorevole masterplan aeroporto amerigo vespucci firenze prevedere altro cosa realizzazione cosiddetto pista parallelo nuovo terminal aeroportuale quotare mercato telematico azionario organizzare gestire borsa italiano riservare qualsiasi commento merito fino pubblicazione parere relativo documentazione parte ministero riferire nota»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/c,d20,n5,w20,mc10,s0.001,t3):

MOST (9282, 0.8520050048828125): «toscano aeroporto apprendere indiscrezione stampa commissione valutazione impatto ambientale ministero ambiente avere esprimere parere favorevole masterplan aeroporto amerigo vespucci firenze prevedere altro cosa realizzazione cosiddetto pista parallelo nuovo terminal aeroportuale quotare mercato telematico azionario organizzare gestire borsa italiano riservare qualsiasi commento merito fino pubbl

In [18]:
# Pick a random document from the training corpus.
# random.randrange excludes the upper bound, so the index is always valid;
# random.randint(0, n) includes n and could raise IndexError on the lookups below.
doc_id = random.randrange(len(train_corpus))

# Print the chosen document next to the entry recorded for it in second_ranks,
# which is a (doc id, similarity) pair.
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (7542): «nuovo ingresso prelios agency trattare antonio chiatellino nominare head of capital leasing chiatellino avere recentemente lavorare banca esperia interno prelios agency ulteriormente team dedicare offerta servizio advisory brokerage investitore pubblico privato fondo immobiliare operatore istituzionale»

Similar Document (2220, 0.7440720796585083): «milanoil consiglio amministrazione idea capital funds sgr avere deliberare terzo closing fondo idea taste of italy primo fondo privato equity italiano specializzare settore agroalimentare ammontare complessivo pari milione euro board avere deliberare inizio attività investimento fondo idea ccr corporate credit recovery particolare essere_stare esaminare esaminare investimento quattro società già presente portafoglio comparto credito fondo spiegare nota conferma strategia originariamente delineare prevedere rilancio sviluppo azienda fare parte comparto infine assemblea socio idea capital funds sgr avere aumentare nume