# Doc2Vec Model

In [37]:
import re
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from scipy import spatial
from pmaw import PushshiftAPI
from pathlib import Path
from typing import List

In [39]:
Word = str
Sentence = List[Word]

reddit_data_path = Path("data/reddit_data.txt")
essais_politiques_path = Path("data/quinze_essais_politiques.txt")
model_path = "models/doc2vec_model"

In [40]:
#scrapping reddit data and store it in a txt file

def get_reddit_data(nbPost: int) -> None:
    api = PushshiftAPI()
    comments = api.search_submissions(subreddit="Politiquefrancaise", limit=nbPost, TimeoutError=1000)
    
    comment_list = [comment['selftext'] for comment in comments]
    with open(reddit_data_path, "w", encoding="utf-8") as f:
        for comment in comment_list:
            #remove urls
            comment = re.sub(r'http\S+', '', comment)
            f.write(comment)
    return

if not reddit_data_path.exists():
    print("Creating reddit data file")
    get_reddit_data(10000)
else:
    print("Data already exists")


Data already exists


In [59]:
#télcharge "quinze essais politiques" dans data et en fait une liste
# opening the file in read mode

def read_text_data(file_path: Path, encoding="") -> List[Sentence]:
    if encoding != "":
        with open(file_path, "r", encoding=encoding) as f:
            corpus = f.read()
    else:
        with open(file_path, "r") as f:
            corpus = f.read()

    corpus = corpus.replace('\n', ' ') # put text on one line
    corpus = corpus.replace('.', ',') # text separated by commas will be considered as different sentences

    sentences_list = corpus.split(",") # split the text into a list of sentences
    
    for i in range(len(sentences_list)):
        sentences_list[i] = sentences_list[i].lower() # change all characters to lower
        sentences_list[i] = re.sub(r'[^\w\s]','',sentences_list[i]) # remove punctuation
        sentences_list[i] = sentences_list[i].split() # split every sentence into a list of words
        
    return sentences_list

corpus1 = read_text_data(essais_politiques_path)
corpus2 = read_text_data(reddit_data_path, encoding="utf-8")

print(f"The corpus 1 has {len(corpus1)} sentences")
for sentence in corpus1[:10]:
    print(sentence)
    
print(f"The corpus 2 has {len(corpus2)} sentences")
for sentence in corpus2[:10]:
    print(sentence)

The corpus 1 has 4723 sentences
['avertissement', 'du', 'traducteur', 'retour', 'à', 'la', 'table', 'des', 'matières', 'ce', 'fichier']
['malgré', 'son', 'titre']
['ne', 'comporte', 'pas', 'tous', 'les', 'essais', 'des', 'political', 'discourses', '1', 'de', '1752', 'edinburgh']
['kincaid', 'et', 'donaldson', 'dont', 'la', 'plupart', 'ont', 'déjà', 'été', 'traduits', 'par', 'nos', 'soins', 'et', 'intégrés', 'au', 'fichier', 'essais', 'économiques', 'de', 'hume', 'paru', 'en', 'août', '2007', 'aux', 'classiques', 'des', 'sciences', 'sociales']
['ce', 'présent', 'fichier', 'contient', 'un', 'choix', 'dessais', 'qui', 'traitent', 'plus', 'directement', 'de', 'questions', 'politiques']
['david', 'hume', 'essai', 'sur', 'la', 'liberté', 'de', 'la', 'presse']
['of', 'the', 'liberty', 'of', 'the', 'press', 'in', 'essays']
['moral', 'and', 'political', '1', 'volume', 'edinburgh']
['a']
['kincaid', '1741', 'traduction', 'de', 'philippe', 'folliot']
The corpus 2 has 4085 sentences
['le', 'présid

For the doc2vec model, input data should be in format of iterable TaggedDocuments"

Each TaggedDocument instance comprises words and tags

Hence, each document (i.e., a sentence or paragraph) should have a unique tag which is identifiable

In [112]:
def tag_text(sentences: List[Sentence]) -> List[TaggedDocument]:
    """converting each sentence into a TaggedDocument"""
    tagged_docs = []
    for i in range(len(sentences)):
        tagged_docs.append(TaggedDocument(words = sentences[i], tags = ['sent{}'.format(i)]))
    return tagged_docs

tagged_c1 = tag_text(corpus2)
print(tagged_c1[0])

TaggedDocument<['le', 'président', 'emmanuel', 'macron', 'a', 'appelé', 'dans', 'un', 'discours', 'devant', 'le', 'parlement', 'européen', 'à', 'proposer', 'une', 'nouvelle', 'alliance', 'avec', 'les', 'pays', 'africains', 'en', 'matière', 'dinvestissements'], ['sent0']>


In [113]:
corpus = tagged_c1

model = Doc2Vec(documents = corpus, vector_size = 10, min_count = 1)
model.init_sims(replace = True)

model.save(model_path)
model = Doc2Vec.load(model_path)


  model.init_sims(replace = True)


In [124]:
v1 = model.infer_vector(['front national'])    # in doc2vec, infer_vector() function is used to infer the vector embedding of a document
v2 = model.infer_vector(['insoumis'])    # in doc2vec, infer_vector() function is used to infer the vector embedding of a document
# define a function that computes cosine similarity between two words
def cosine_similarity(v1, v2):
    return 1 - spatial.distance.cosine(v1, v2)
cosine_similarity(v1, v2)






0.18243521451950073