# Doc2vec

In [None]:
# Importing dependencies

# spaCy's Italian language class (used here to split the text into sentences)
from spacy.lang.it import Italian


# gensim's classes for computing doc2vec embeddings
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
# gensim crude tokenizer that ignores one-letter words and punctuation
from gensim.utils import simple_preprocess

In [None]:
# Ask the OS how many CPU cores this machine has; the count is later passed
# to gensim as the number of worker threads.
import multiprocessing

num_cores = multiprocessing.cpu_count()
num_cores  # bare last expression -> displayed as the cell's output

In [None]:
# English-language alternative sample, kept for reference:
# text = """Doha was founded in the 1820s as an offshoot of Al Bidda. It was officially declared as the country's capital in 1971, when Qatar gained independence from being a British protectorate.[4] As the commercial capital of Qatar and one of the emergent financial centers in the Middle East, Doha is considered a beta-level global city by the Globalization and World Cities Research Network. Doha accommodates Education City, an area devoted to research and education, and Hamad Medical City, an administrative area of medical care. It also includes Doha Sports City, or Aspire Zone, an international sports destination that includes Khalifa International Stadium, a stadium for the 2022 FIFA World Cup; Hamad Aquatic Centre; and the Aspire Dome. """

# Italian sample text. Source: https://it.wikipedia.org/wiki/Doha (as of November 2022).
# Adjacent string literals inside the parentheses are joined into a single string.
text = (
    "Doha è la capitale e la città più popolata dello stato del Qatar. Si trova sul "
    "Golfo Persico e aveva nel 2015 una popolazione di 956 460 abitanti. È la più grande città "
    "del Qatar e, con la sua area urbana e suburbana, ospita circa il 60% della popolazione "
    "dello stato. A Doha vi è la sede del governo del Qatar, il cui capo è l'emiro Tamim bin "
    "Hamad al-Thani. Nei pressi della città sorge Education City, una zona in cui si sono "
    "insediati diversi campus universitari e istituti dedicati alla ricerca e all'innovazione. "
    "La città presenta un carattere cosmopolita. A Doha è stanziato il principale quartier "
    "generale del Comando centrale militare USA, il più grande dell'intero Medio Oriente. "
)

# Blank Italian pipeline with only a sentence splitter attached
nlp = Italian()
nlp.add_pipe("sentencizer")

# Run the pipeline and show one detected sentence per line
doc = nlp(text)
for sentence in doc.sents:
    print(sentence)

In [None]:
# Build the training corpus: a list of TaggedDocument objects, each pairing the
# tokens of one sentence with a list of tags (here just the sentence index).
# The comprehension uses its own variable (`sent`), so the `text` string defined
# earlier is not overwritten by a spaCy span and no loop variables leak into the notebook.
#   your homework belongs here (it should be one line, or you can plug it directly within TaggedDocument)
training_corpus = [
    TaggedDocument(words=simple_preprocess(sent.text), tags=[i])
    for i, sent in enumerate(doc.sents)
]
print(training_corpus)

In [None]:
# Instantiating and training the object, all at once: passing the corpus to the
# constructor builds the vocabulary and trains the model in the same call.
model = Doc2Vec(
    training_corpus,
    vector_size=100,    # dimensions of the document vectors
    window=2,           # context window size
    min_count=1,        # keep every token: the corpus is only a handful of sentences
    workers=num_cores,  # use the detected core count instead of a hard-coded 4
)

## Inferring a vector for a new document

In [None]:
# Embed an unseen sentence: tokenise it with the same gensim tokenizer used for
# the training sentences, then ask the model to infer its vector.
# (your homework also belongs here: swapping the tokenizer is, again, a one-line change)
new_doc_tokens = simple_preprocess('Doha è la più grande città del Qatar')
model.infer_vector(new_doc_tokens)

In [None]:
# Step-by-step version: instantiate the model, build the vocabulary, then train.
# Originally written for an older gensim; updated to the current parameter names
# (`epochs` replaces both `iter` in the constructor and `steps` in infer_vector).
# NOTE: this cell rebinds `training_corpus` and `model`, replacing the Italian ones.

corpus = [
    "The faster Harry got to the store, the faster and faster Harry would get home.",
    "Harry is hairy and faster than Jill.",
    "Jill is not as hairy as Harry.",
]

# One TaggedDocument per sentence, tagged with its position in `corpus`
training_corpus = [
    TaggedDocument(words=simple_preprocess(sentence), tags=[i])
    for i, sentence in enumerate(corpus)
]
print(training_corpus)

# Instantiating the object (no corpus is passed, so nothing is trained yet)
model = Doc2Vec(vector_size=100,    # dimensions of the vectors
                min_count=2,        # min frequency for the tokens
                workers=num_cores,
                epochs=10)          # number of training passes over the corpus
# Compiling the vocabulary
model.build_vocab(training_corpus)
# Training: without this call the model is never fitted to the corpus
model.train(training_corpus, total_examples=model.corpus_count, epochs=model.epochs)

model.infer_vector(simple_preprocess('Indeed Jill is the fastest'), epochs=10)


# Homework

1. Plug in the proper Italian tokenisation from spaCy (instead of gensim's crude `simple_preprocess` tokenizer).