In [7]:
import sys
# Make the sibling ``scdv`` directory importable for the `from scdv import ...`
# and `from baseline import ...` cells below.
# A forward slash is used because the former '..\scdv' literal contained the
# invalid escape sequence `\s` and only resolved as a path on Windows; every
# other path in this notebook already uses '/'.
sys.path.append('../scdv')

In [36]:
import nltk
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
from nltk import word_tokenize
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

In [8]:
from scdv import SCDV
from baseline import BaselineEmbedding

In [14]:
# Restore the two previously trained embedding models from disk.
# NOTE(review): the ".pkl" suffix suggests these are pickle files; if so, only
# load artifacts from a trusted source. Confirm what SCDV.load does internally.
scdv_model = SCDV.load("../saved_models/bbc_word2vec_sg_5_100.pkl")
# NOTE(review): the baseline is also loaded through SCDV.load although
# BaselineEmbedding is imported and otherwise unused in this notebook; confirm
# that SCDV.load yields the intended object for this file.
baseline_model = SCDV.load("../saved_models/baseline_bbc_word2vec_sg_100.pkl")

In [27]:
data_path = "../data/bbc/all/"

# Read every .txt file below `data_path` into `documents` and tokenise each one
# into `document_words` (same index in both lists).
p = Path(data_path)
# glob() yields files in an OS-dependent order; sorting keeps document indices
# stable from one run (and one machine) to the next.
files = sorted(p.glob("**/*.txt"))

documents = list()
for file in tqdm(files):
    try:
        # errors="replace" keeps a document that contains a few bytes that are
        # not valid UTF-8 instead of losing the whole file.
        text = file.read_text(encoding="utf8", errors="replace").strip()
    except OSError as error:
        # Previously a bare `except: pass` fell through to the append below,
        # which silently stored the *previous* file's text a second time (or
        # raised NameError on the first file). Skip the file and say so.
        print(f"Skipping unreadable file {file}: {error}")
        continue
    documents.append(text)

document_words = [word_tokenize(document) for document in tqdm(documents)]

  0%|          | 0/2225 [00:00<?, ?it/s]

  0%|          | 0/2225 [00:00<?, ?it/s]

In [33]:
def get_qd_score(model, query_idx, document_idx, query_unigram_probabilities, document_unigram_probabilities):
    """Sum, over the query's tokens, of P(token | query) * P(token | document).

    The tokens are read from the module-level ``query_words[query_idx]``; a
    token that occurs several times in the query contributes once per
    occurrence. The document-side probability comes from
    ``get_probability_wd``.

    Args:
        model: Embedding model, forwarded unchanged to ``get_probability_wd``.
        query_idx: Index of the query in ``query_words`` and in
            ``query_unigram_probabilities``.
        document_idx: Index of the document being scored.
        query_unigram_probabilities: Indexable by ``query_idx``; each entry
            maps a query token to its probability.
        document_unigram_probabilities: Forwarded to ``get_probability_wd``.

    Returns:
        The accumulated score (``0`` when the query has no tokens).
    """
    total = 0
    for token in query_words[query_idx]:
        query_probability = query_unigram_probabilities[query_idx][token]
        document_probability = get_probability_wd(token, document_idx, document_unigram_probabilities, model)
        total += query_probability * document_probability
    return total

def get_scores(model, query_idx, document_idx, query_unigram_probabilities, document_unigram_probabilities):
    """Compute the interpolated and the plain query-document score.

    Reads the module-level ``query_vectors``, ``document_vectors`` and
    ``args.lambda_``.

    Args:
        model: Object providing ``similarity(vector_a, vector_b)``.
        query_idx: Index of the query.
        document_idx: Index of the document.
        query_unigram_probabilities: Forwarded to ``get_qd_score``.
        document_unigram_probabilities: Forwarded to ``get_qd_score``.

    Returns:
        ``(score_pv, score_qd)`` where ``score_qd`` is the result of
        ``get_qd_score`` and ``score_pv`` mixes it with the query/document
        vector similarity: ``(1 - lambda_) * score_qd + lambda_ * similarity``.
    """
    similarity = model.similarity(query_vectors[query_idx], document_vectors[document_idx])
    score_qd = get_qd_score(
        model,
        query_idx,
        document_idx,
        query_unigram_probabilities,
        document_unigram_probabilities,
    )
    mixing_weight = args.lambda_
    score_pv = (1 - mixing_weight) * score_qd + mixing_weight * similarity
    return score_pv, score_qd

def get_probability_pv(word, document_idx, model):
    """Softmax-style probability of ``word`` under a document's vector.

    The numerator is ``exp(similarity(word vector, document vector))``; the
    denominator adds up the same quantity for every token of the document
    (module-level ``document_words[document_idx]``, repeats included). The
    document vector is read from the module-level ``document_vectors``.

    Args:
        word: Word whose probability is wanted.
        document_idx: Index of the document.
        model: Object providing ``get_word_vector(word)`` and
            ``similarity(vector_a, vector_b)``.

    Returns:
        ``numerator / denominator`` as described above.
    """
    target_vector = model.get_word_vector(word)
    document_vector = document_vectors[document_idx]
    numerator = np.exp(model.similarity(target_vector, document_vector))

    # Open question carried over from the original author: should this
    # normaliser run over model.vocabulary instead of the document's tokens?
    denominator = 0
    for document_word in document_words[document_idx]:
        document_word_vector = model.get_word_vector(document_word)
        denominator += np.exp(model.similarity(document_vector, document_word_vector))

    return numerator / denominator

def get_probability_wd(word, document_idx, document_unigram_probabilities, model):
    """Interpolate the unigram and the vector-based probability of a word.

    Args:
        word: Word to score.
        document_idx: Index of the document.
        document_unigram_probabilities: Indexable by ``document_idx``; each
            entry is a dict from word to probability. Words missing from the
            dict count as probability ``0``.
        model: Forwarded to ``get_probability_pv``.

    Returns:
        ``(1 - lambda_) * P_lm + lambda_ * P_pv`` with ``lambda_`` taken from
        the module-level ``args.lambda_``.
    """
    probability_lm = document_unigram_probabilities[document_idx].get(word, 0)
    language_model_part = (1 - args.lambda_) * probability_lm
    vector_part = args.lambda_ * get_probability_pv(word, document_idx, model)
    return language_model_part + vector_part

def make_sparse_document_vectors(document_vectors, p=0.5):
    """Zero out small-magnitude entries of a document-vector matrix, in place.

    The cut-off is ``p * t`` where ``t`` is the average of ``|a_min|`` and
    ``|a_max|``; ``a_min`` / ``a_max`` are the means of the per-column minima /
    maxima. Every entry whose absolute value is strictly below the cut-off is
    set to ``0``.

    Args:
        document_vectors: NumPy array indexed as ``[row, column]``. It is
            modified in place.
        p: Fraction of ``t`` used as the cut-off.

    Returns:
        The same array object that was passed in.
    """
    column_count = document_vectors.shape[1]
    columns = [document_vectors[:, column] for column in range(column_count)]

    mean_of_minima = np.mean([np.min(values) for values in columns])
    mean_of_maxima = np.mean([np.max(values) for values in columns])

    half_range = (np.abs(mean_of_minima) + np.abs(mean_of_maxima)) / 2
    threshold = p * half_range

    below_threshold = np.abs(document_vectors) < threshold
    document_vectors[below_threshold] = 0
    return document_vectors

In [34]:
# Embed every document with both models. The tokens computed once in
# `document_words` are reused instead of running word_tokenize over the whole
# corpus two more times; the resulting vectors are the same.
scdv_document_vectors = np.asarray(
    [scdv_model.get_document_vector(words) for words in tqdm(document_words)]
)
# Only the SCDV vectors are sparsified (small-magnitude entries set to zero).
scdv_document_vectors = make_sparse_document_vectors(scdv_document_vectors)

baseline_document_vectors = np.asarray(
    [baseline_model.get_document_vector(words) for words in tqdm(document_words)]
)

  0%|          | 0/2225 [00:00<?, ?it/s]

  0%|          | 0/2225 [00:00<?, ?it/s]

In [39]:
# Fit one order-3 MLE language model per document and cache, for every token
# of that document, the model's context-free score.
# `document_lm[i]` and `document_unigram_probabilities[i]` both belong to
# `document_words[i]`.
document_lm = []
document_unigram_probabilities = []
for document in tqdm(document_words):
    train, vocab = padded_everygram_pipeline(3, [document])
    lm = MLE(3)
    lm.fit(train, vocab)
    document_lm.append(lm)

    unigram_probabilities = {word: lm.score(word) for word in document}
    document_unigram_probabilities.append(unigram_probabilities)

  0%|          | 0/2225 [00:00<?, ?it/s]

In [29]:
# Example free-text query string used to try out the retrieval code.
query = "India"

In [None]:
def _score_documents(model, document_vectors, query_words, query_vector,
                     unigram_probabilities, scores_lm, lambda_):
    """Score every document against one tokenised query with one model.

    Returns a list with one ``(score_pv, score_qd, score_lm, document_idx)``
    tuple per document, in document order.
    """
    scores = []
    for document_idx in tqdm(range(len(document_words))):
        document_vector = document_vectors[document_idx]
        document_probabilities = document_unigram_probabilities[document_idx]

        # Softmax normaliser over the document's tokens. It does not depend on
        # the query word, so it is computed once per document.
        normaliser = 0
        for document_word in document_words[document_idx]:
            normaliser += np.exp(
                model.similarity(document_vector, model.get_word_vector(document_word))
            )

        score_qd = 0
        for word in query_words:
            probability_lm = document_probabilities.get(word, 0)
            if normaliser:
                probability_pv = np.exp(
                    model.similarity(model.get_word_vector(word), document_vector)
                ) / normaliser
            else:
                # Document without tokens: nothing to normalise over.
                probability_pv = 0
            probability_wd = (1 - lambda_) * probability_lm + lambda_ * probability_pv
            score_qd += unigram_probabilities[word] * probability_wd

        similarity = model.similarity(query_vector, document_vector)
        score_pv = (1 - lambda_) * score_qd + lambda_ * similarity
        scores.append((score_pv, score_qd, scores_lm[document_idx], document_idx))
    return scores


def get_query_results(query, lambda_=None):
    """Score all loaded documents against a free-text query with both models.

    The previous version of this cell could not run: it fitted the query
    language model on the raw string (i.e. on characters), referenced several
    undefined names (``total_documents``, ``logging``, ``query_idx``,
    ``model``, ``query_unigram_probabilities``, ``scores``), indexed the token
    list with ``query_idx`` and returned nothing.

    The scoring formulas are the ones used by ``get_scores`` /
    ``get_qd_score`` / ``get_probability_wd`` / ``get_probability_pv``. They
    are applied here with explicit state because those helpers read
    module-level names (``query_words``, ``query_vectors``,
    ``document_vectors``) that this notebook does not define.

    Args:
        query: Query text; it is tokenised with ``word_tokenize``.
        lambda_: Interpolation weight between the language-model part and the
            vector part. When omitted, ``args.lambda_`` is used, as in the
            other scoring helpers; pass it explicitly if no ``args`` object
            exists in the session.

    Returns:
        ``{"scdv": [...], "baseline": [...]}``. Each list holds one
        ``(score_pv, score_qd, score_lm, document_idx)`` tuple per document,
        in document order (not sorted). Both lists are empty when the query
        has no tokens.
    """
    query_words = word_tokenize(query)
    if not query_words:
        return {"scdv": [], "baseline": []}
    if lambda_ is None:
        lambda_ = args.lambda_

    # Unigram model of the query, fitted on its tokens.
    train, vocab = padded_everygram_pipeline(1, [query_words])
    lm = MLE(1)
    lm.fit(train, vocab)
    unigram_probabilities = {word: lm.score(word) for word in query_words}

    query_vector_scdv = np.asarray(scdv_model.get_document_vector(query_words))
    query_vector_baseline = np.asarray(baseline_model.get_document_vector(query_words))

    # Language-model score of the query's last word given the words before it.
    # The context is cut to the last (order - 1) tokens because an n-gram
    # model holds no counts for longer contexts.
    last_word = query_words[-1]
    scores_lm = []
    for document_model in document_lm:
        context_size = document_model.order - 1
        context = query_words[:-1][-context_size:] if context_size > 0 else []
        scores_lm.append(document_model.score(last_word, context))

    return {
        "scdv": _score_documents(
            scdv_model, scdv_document_vectors, query_words, query_vector_scdv,
            unigram_probabilities, scores_lm, lambda_,
        ),
        "baseline": _score_documents(
            baseline_model, baseline_document_vectors, query_words, query_vector_baseline,
            unigram_probabilities, scores_lm, lambda_,
        ),
    }
    
    
    
    
    
