# Doc2Vec Model

In [7]:
import re
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from scipy import spatial
from pmaw import PushshiftAPI
from pathlib import Path
from typing import List

In [8]:
Word = str
Sentence = List[Word]

reddit_data_path = Path("data/reddit_data.txt")
model_path = "doc2vec_model"

In [9]:
#scrapping reddit data and store it in a txt file

def get_reddit_data(nbPost: int) -> None:
    api = PushshiftAPI()
    comments = api.search_submissions(subreddit="Politiquefrancaise", limit=nbPost, TimeoutError=1000)
    
    comment_list = [comment['selftext'] for comment in comments]
    with open(reddit_data_path, "w", encoding="utf-8") as f:
        for comment in comment_list:
            #remove urls
            comment = re.sub(r'http\S+', '', comment)
            f.write(comment)
    return

if not reddit_data_path.exists():
    print("Creating reddit data file")
    get_reddit_data(10000)
else:
    print("Data already exists")


Data already exists


For the doc2vec model, input data should be in format of iterable TaggedDocuments"

Each TaggedDocument instance comprises words and tags

Hence, each document (i.e., a sentence or paragraph) should have a unique tag which is identifiable

In [10]:


#télcharge "quinze essais politiques" dans data et en fait une liste
# opening the file in read mode

def read_text_data(file_path: Path, encoding="") -> List[Sentence]:
    if encoding != "":
        with open(file_path, "r", encoding=encoding) as f:
            corpus = f.read()
    else:
        with open(file_path, "r") as f:
            corpus = f.read()

    corpus = corpus.replace('\n', ' ') # put text on one line
    corpus = corpus.replace('.', ',') # text separated by commas will be considered as different sentences

    sentences_list = corpus.split(",") # split the text into a list of sentences
    
    for i in range(len(sentences_list)):
        sentences_list[i] = sentences_list[i].lower() # change all characters to lower
        sentences_list[i] = re.sub(r'[^\w\s]','',sentences_list[i]) # remove punctuation
        sentences_list[i] = sentences_list[i].split() # split every sentence into a list of words
        
    return sentences_list





     


In [11]:
def tag_text(sentences: List[Sentence]) -> List[TaggedDocument]:
    """converting each sentence into a TaggedDocument"""
    tagged_docs = []
    for i in range(len(sentences)):
        tagged_docs.append(TaggedDocument(words = sentences[i], tags = ['sent{}'.format(i)]))
    return tagged_docs
corpus = read_text_data(reddit_data_path, encoding="utf-8")
print(f"The corpus 2 has {len(corpus)} sentences")
for sentence in corpus[:10]:
    print(sentence)
tagged_c1 = tag_text(corpus)
print(tagged_c1[0])

The corpus 2 has 4098 sentences
['la', 'france', 'connaîtra', 'une', 'augmentation', 'rapide', 'des', 'cas', 'de', 'contamination', 'a', 'la', 'covid19', 'dans', 'les', 'prochains', 'jours']
['a', 'déclaré', 'hier', 'gabriel', 'attal']
['porteparole', 'du', 'gouvernement', 'français']
['ajoutant', 'que', 'rien', 'nindique', 'que', 'le', 'nombre', 'de', 'cas', 'vas', 'diminuer']
['gabriel', 'attal', 'a', 'déclaré', 'que', 'les', 'nouveaux', 'cas', 'avaient', 'atteint', 'des', 'niveaux', 'extrêmement', 'élevés', 'dans', 'la', 'région', 'iledefrance', 'entourant', 'paris', 'et', 'certaines', 'autres', 'régions', 'du', 'pays']
['tout', 'en', 'notant', 'que', 'la', 'situation', 'dans', 'les', 'hôpitaux', 'pourrait', 'se', 'détériorer', 'dans', 'les', 'semaines', 'à', 'venir']
['les', 'réactions', 'se', 'poursuivent', 'toujours', 'en', 'france', 'suite', 'aux', 'déclarations', 'faites', 'mardi', 'par', 'le', 'président', 'emmanuel', 'macron', 'au', 'journal', 'le', 'parisien']
['dans', 'lesq

In [28]:
corpus = tagged_c1

model = Doc2Vec(documents = corpus, vector_size = 100, min_count = 1)
model.init_sims(replace = True)

model.save(model_path)
model = Doc2Vec.load(model_path)


  model.init_sims(replace = True)


In [33]:
v1 = model.infer_vector(['votez macron'])    # in doc2vec, infer_vector() function is used to infer the vector embedding of a document
v2 = model.infer_vector(['macron quitte gouvernement'])
v3= model.infer_vector(['zemmour président'])
# define a function that computes cosine similarity between two words
def cosine_similarity(v1, v2):
    return 1 - spatial.distance.cosine(v1, v2)
print(cosine_similarity(v1, v2))
print(cosine_similarity(v1, v3))
print(cosine_similarity(v2, v3))

#V2 and v3 should be the most similar vectors but it is not the case





-0.14759613573551178
-0.004457784816622734
-0.0378972664475441


# Comparison with spacy

In [34]:
import spacy
# Load the spacy model that you have installed
nlp = spacy.load('fr_core_news_sm')# python -m spacy download fr_core_news_sm
# process a sentence using the model
doc1 = nlp("votez macron")
doc2 = nlp("Macron quitte gouvernement")
doc3 = nlp("zemmour président")
# It's that simple - all of the vectors and words are assigned after this point
# Get the vector for 'text':
vs1=doc1.vector
vs2=doc2.vector
vs3=doc3.vector
print(cosine_similarity(vs1, vs2))
print(cosine_similarity(vs1, vs3))
print(cosine_similarity(vs2, vs3))

#Spacy is able to see similarity between v2 and V3


0.2963140308856964
0.3943708837032318
0.5074030756950378
96
