# Import libraries

In [None]:
import os, csv, gensim, spacy, re, time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
from pprint import pprint
from collections import defaultdict 
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords

import locale
# Force UTF-8 as the reported preferred encoding for the whole session.
# The replacement keeps the optional ``do_setlocale`` parameter of the real
# ``locale.getpreferredencoding`` so callers that pass it (positionally or by
# keyword) keep working instead of raising TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

from IPython.display import HTML, display

def progress(value, max=100):
    """Build an HTML ``<progress>`` bar for display in the notebook.

    Args:
        value: Current progress value, written into the ``value`` attribute.
        max: Value that represents completion, written into the ``max``
            attribute. The name shadows the builtin but is kept because
            callers may pass it by keyword.

    Returns:
        An ``IPython.display.HTML`` object wrapping the progress element.
    """
    # The stray comma that followed the max attribute has been removed:
    # HTML attributes are separated by whitespace only.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >
        </progress>
    """.format(value=value, max=max))

# Load the spaCy 'en_core_web_sm' pipeline with the parser and NER
# components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Load data

In [None]:
data_dir = 'drive/MyDrive/inzynierka/data'

# Only the directory entries from position 1050 onward are loaded.
# NOTE(review): os.listdir returns names in arbitrary order, so this slice is
# not guaranteed to select the same files on every run - confirm whether the
# listing should be sorted first.
paths = os.listdir(data_dir)
paths = paths[1050:]

# Size the progress bar from the actual number of files instead of a
# hard-coded count.
total = len(paths)
out = display(progress(0, total), display_id=True)

# Collect the per-file frames and concatenate once at the end; concatenating
# inside the loop re-copies all previously loaded rows on every iteration.
frames = []
for index, path in enumerate(paths):
    frames.append(
        pd.read_csv(f'{data_dir}/{path}', sep=';', quoting=csv.QUOTE_NONE)
    )
    out.update(progress(index + 1, total))

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Discard the 'year', 'title' and 'headings' columns (whichever of them are
# present) and keep everything else, then preview the first rows.
df = df.drop(columns=['year', 'title', 'headings'], errors='ignore')
df.head()

Unnamed: 0,pmid,abstract
0,1523516,Twenty-two patients with neurologic deficit d...
1,1523518,Twenty-four patients undergoing anterior and ...
2,1523519,Thirty-eight patients with unstable thoracolu...
3,1523520,"Between 1981 and 1990, twenty-two patients wi..."
4,1523574,Gallbladder function and lipid composition of...


In [None]:
# Count the missing values in each column.
df.isna().sum()

pmid            0
abstract    75656
dtype: int64

In [None]:
# Drop rows with any missing value and renumber the index from zero.
df = df.dropna().reset_index(drop=True)
# Only the last expression of a cell is rendered automatically, so the null
# check has to be displayed explicitly to be visible.
display(df.isnull().sum())
df.shape

(183664, 2)

# Cleaning

In [None]:
# Load the spaCy 'en_core_web_sm' pipeline with the NER and parser
# components disabled.
# NOTE(review): the same pipeline is already loaded earlier in this file;
# this reload looks redundant unless the cell is run on its own - confirm.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def cleaning(doc):
    """Join the lemmas of the non-stop-word tokens of ``doc``.

    Args:
        doc: Iterable of tokens exposing ``lemma_`` and ``is_stop``.

    Returns:
        The lemmas joined by single spaces, or ``None`` when two or fewer
        tokens remain after stop-word removal.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Documents with at most two remaining tokens are discarded.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [None]:
# Keep the ids in a plain list, in the same row order as the abstracts.
pmids = list(df['pmid'])

# Lazy generator: each abstract has every run of characters other than
# letters and apostrophes replaced by a single space, then is lower-cased.
brief_cleaning = (
    re.sub("[^A-Za-z']+", ' ', str(abstract)).lower()
    for abstract in df['abstract']
)

In [None]:
import gc

# Remove the name `df` and run a garbage-collection pass before the
# lemmatisation step.
del df
gc.collect()

21360

In [None]:
# Stream the normalised abstracts through the spaCy pipeline and apply
# cleaning() to every resulting document; entries are None where cleaning()
# rejected the document.
txt = list(
    map(
        cleaning,
        nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1),
    )
)

In [None]:
# Local import: word_tokenize is only imported further down in this file, so
# without this line the cell fails with NameError when run in order.
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

# txt is a list holding one cleaned string (or None) per abstract, while
# word_tokenize expects a single string, so each document is tokenised
# separately and the tokens are gathered into one flat list.
word_tokens = []
for doc in txt:
    if doc is not None:
        word_tokens.extend(word_tokenize(doc))

# NOTE(review): filtered_sentence is not referenced anywhere else in the
# visible file - confirm whether this cell is still needed.
filtered_sentence = [w for w in word_tokens if w not in stop_words]

In [None]:
# `df` was deleted earlier in this file, so the ids must come from the
# `pmids` list that was saved before the deletion; it has one entry per
# element of `txt`, in the same order.
df_clean = pd.DataFrame({'pmid': pmids, 'abstract': txt})

# Rebind `df` to an empty frame.
df = pd.DataFrame()

# Remove abstracts rejected by cleaning() (None) and exact duplicate rows.
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

# Bigrams

In [None]:
from gensim.models.phrases import Phrases, Phraser

# Split every cleaned abstract on whitespace into a list of tokens.
sent = list(map(str.split, df_clean['abstract']))

In [None]:
# Build a gensim Phrases model over the tokenised abstracts.
phrases = Phrases(sent, min_count=30, progress_per=10000)

In [None]:
# Wrap the Phrases model in a Phraser, which is what gets applied to the
# corpus below.
bigram = Phraser(phrases)

In [None]:
# Apply the phrase model to the whole tokenised corpus.
sentences = bigram[sent]

# Most frequent

In [None]:
# Count how many times each token occurs across all sentences.
# The loop variables are deliberately not named `sent`: that name holds the
# tokenised corpus used by the phrase-model cells, and rebinding it here
# would leave it pointing at the last sentence only.
word_freq = defaultdict(int)
for tokens in sentences:
    for token in tokens:
        word_freq[token] += 1
len(word_freq)

In [None]:
# The ten tokens with the highest counts, most frequent first.
[
    word
    for word, _ in sorted(
        word_freq.items(), key=lambda item: item[1], reverse=True
    )[:10]
]

# Train

In [None]:
import multiprocessing
from nltk.tokenize import word_tokenize

from gensim.models import Doc2Vec

In [None]:
# Number of CPUs reported by the system; used below to size the worker pool.
cores = multiprocessing.cpu_count()

In [None]:
# One TaggedDocument per cleaned abstract: the words are the
# whitespace-separated tokens and the single tag is the PMID as a string.
documents = [
    TaggedDocument(words=abstract.split(), tags=[str(pmid)])
    for abstract, pmid in zip(df_clean['abstract'], df_clean['pmid'])
]

# NOTE(review): w2v_model is not referenced again in the visible file (the
# model saved below is `model`) - confirm whether this one is still needed.
w2v_model = Doc2Vec(documents,
                    min_count=20,
                    window=2,
                    vector_size=300,
                    sample=6e-5,
                    alpha=0.03,
                    min_alpha=0.0007,
                    negative=20,
                    # Leave one core free, but never ask for zero workers:
                    # cores - 1 is 0 on a single-core machine.
                    workers=max(1, cores - 1))

In [None]:
# Untrained Doc2Vec model; the vocabulary is built and training is run in
# the following cells.
model = Doc2Vec(
    vector_size=30,
    min_count=2,
    epochs=80,
)

In [None]:
# Build the model vocabulary from the tagged documents.
model.build_vocab(documents)

In [None]:
# Train on the tagged documents. The epoch count is read from the model so
# it cannot drift from the value the model was constructed with.
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# Save the trained model to disk.
model.save("drive/MyDrive/inzynierka/dv2.model")

In [None]:
# Load the saved model from disk, rebinding `model` to the loaded instance.
model = Doc2Vec.load("drive/MyDrive/inzynierka/dv2.model")

In [None]:
# The training documents were stripped of non-letter characters, lower-cased,
# lemmatised and had stop words removed, so the query goes through the same
# steps; otherwise tokens such as 'legs' or 'and' may not match the
# vocabulary the model was trained on.
query_text = 'brain effect legs and arms'
query_doc = nlp(re.sub("[^A-Za-z']+", ' ', query_text).lower())
query = [token.lemma_ for token in query_doc if not token.is_stop]

# Infer a vector for the query and fetch the 100 most similar document tags.
new_vector = model.infer_vector(query)
sims = model.dv.most_similar(positive=[new_vector], topn=100)

In [None]:
# Show the result of the similarity query above.
sims