# Biblioteki

In [1]:
import multiprocessing, spacy, os, random
import pandas as pd

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from IPython.display import clear_output
from itertools import product
from tqdm import tqdm

cores = multiprocessing.cpu_count()
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Czyste dane

In [2]:
def getCleanData():
    clean_paths = os.listdir('new_clean_data')
    clean_data = pd.DataFrame()

    for path in tqdm(clean_paths, desc='Reading Data'):
        clean_articles = pd.read_csv(f'new_clean_data/{path}', sep=',')
        clean_data = pd.concat([clean_data, clean_articles], ignore_index=True)

    return clean_data

# Oznaczone dokumenty

In [3]:
def getTaggedDocuments(clean_data):
    tqdm.pandas(desc="Tagging documents")

    def process_article(article):
        
        return TaggedDocument(str(article["clean_abstract"]).split(), tags=[str(article["pmid"])])

    documents = clean_data.progress_apply(process_article, axis=1)

    return documents

# Trening

In [5]:
clean_data = getCleanData()
documents = getTaggedDocuments(clean_data)

vector_size_list = [200, 300]
min_count_list = [20, 25]
window_list = [10, 12]
sample_list = [5e-5, 1e-4]
alpha_list = [0.03, 0.035]
min_alpha_list = [0.00001, 0.000001]

model = Doc2Vec(vector_size=200, min_count=20, window=4, sample=1e-4,
                alpha=0.03, min_alpha=0.00001, workers=cores-1)

print("Building vocabulary...")
model.build_vocab(documents)

batch_size = 1000
num_batches = len(documents) // batch_size + 1

for i in tqdm(range(num_batches), desc='Training'):
    batch = documents[i*batch_size:(i+1)*batch_size]
    model.train(batch, total_examples=len(batch), epochs=80)

model.save(f"models/main80.model")

Reading Data: 100%|██████████| 1166/1166 [00:19<00:00, 61.26it/s]
Tagging documents: 100%|██████████| 552462/552462 [00:14<00:00, 37364.86it/s]


Building vocabulary...


Training: 100%|██████████| 553/553 [3:04:15<00:00, 19.99s/it]  
