In [1]:
import pandas as pd
import json
import helpers
from tqdm.notebook import tqdm
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp
import os 
import math
from functools import partial
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text

DATA_DIR = 'data'

stemmer = PorterStemmer()
# Load the data from files
with open(f'{DATA_DIR}/corpus.jsonl', 'r') as f:
    corpus_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}

with open(f'{DATA_DIR}/queries.jsonl', 'r') as f:
    queries_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}

train_data = pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')
test_data = pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')

# Rename corpus-id to document_id and query-id to query_id in both train and test data
train_data = train_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
test_data = test_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
# Make sure that the document_id and query_id are int64
train_data['document_id'] = train_data['document_id'].astype('int64')
train_data['query_id'] = train_data['query_id'].astype('int64')
CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, lambda x: strip_short(s=x,minsize=1), strip_multiple_whitespaces, remove_stopwords, stem_text]

In [2]:
# Create a df from the corpus data
corpus_df = pd.DataFrame.from_dict(corpus_data, orient='index', columns=['text'])
# Create a df from the queries data
queries_df = pd.DataFrame.from_dict(queries_data, orient='index', columns=['text'])

In [3]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors

# Load the pre-trained GloVe vectors
glove_file = f'{DATA_DIR}/glove.840B/glove.840B.300d.txt'
google_word2vec = f'{DATA_DIR}/GoogleNews-vectors-negative300.bin'
word_vectors = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [4]:
# save the word vectors to a file
# word_vectors.save_word2vec_format(f'{DATA_DIR}/glove_word2vec.bin', binary=True)

In [4]:
def embeddings(word):
    if word in word_vectors.key_to_index:
        return word_vectors.get_vector(word)
    else:
        return np.zeros(300)

In [5]:
# Define a function to compute the document vectors
def compute_doc_vectors(docs, word_vectors):
    doc_vectors = []
    for doc in tqdm(docs):
        wv = [embeddings(x) for x in doc]
        if len(wv) == 0:
            doc_vectors.append(np.zeros(word_vectors.vector_size))
        if len(wv) > 0:
            wv = np.array(wv)
            #print('wv shape ', wv.shape)
            doc_vector = wv.mean(axis=0)
            doc_vectors.append(doc_vector)
    return np.array(doc_vectors)

# Define a function to compute the document vectors
def compute_doc_vectors_idf(docs, word_vectors, idfs):
    doc_vectors = []
    for doc in tqdm(docs):
        wv = [word_vectors.get_vector(word) for word in doc if word in word_vectors.key_to_index]
        weights = [idfs[word] for word in doc if word in word_vectors.key_to_index]
        if len(wv) == 0:
            doc_vectors.append(np.zeros(word_vectors.vector_size))
        if len(wv) > 0:
            wv = np.array(wv)
            #print('wv shape ', wv.shape)
            doc_vector = np.average(wv, axis=0, weights=weights)
            doc_vectors.append(doc_vector)
    return np.array(doc_vectors)

# Define a function to perform document retrieval
def retrieve_documents(query, docs, k=10):
    query_vector = compute_doc_vectors([query])[0]
    doc_vectors = compute_doc_vectors(docs)
    sim_scores = cosine_similarity(query_vector.reshape(1, -1), doc_vectors)
    sim_scores = sim_scores[0]
    top_k_indexes = np.argsort(sim_scores)[::-1][:k]
    return [(docs[i], sim_scores[i]) for i in top_k_indexes]

In [6]:
from nltk.tokenize.toktok import ToktokTokenizer
import re
tokenizer = ToktokTokenizer()
import nltk
import itertools
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
stopword_list = nltk.corpus.stopwords.words('english')
def data_clean(text):
    pattern = r'[^a-zA-Z0-9\s]'
    text = re.sub(pattern,'',' '.join(text))
    tokens = [token.strip() for token in text.split()]
    filtered = [token for token in tokens if token.lower() not in stopword_list]
    filtered = ' '.join(filtered)
    return filtered

In [21]:
# from gensim.utils import simple_preprocess

# documents = corpus_df['text'].tolist()
# documents = [x.strip() for x in documents]
# documents = [preprocess_string(x, CUSTOM_FILTERS) for x in tqdm(documents,total=len(documents))]
# # save documents to a pickle file
# import pickle
# with open(f'{DATA_DIR}/documents_preprocess_string_no_stopwords.pkl', 'wb') as f:
#     pickle.dump(documents, f)

# -----------------------------------------------------------------
# documents = corpus_df['text'].tolist()
# documents = [x.strip() for x in documents]
# documents = [data_clean(nltk.word_tokenize(x)) for x in tqdm(documents,total=len(documents))]

# import pickle
# with open(f'{DATA_DIR}/documents_preprocess_custom.pkl', 'wb') as f:
#     pickle.dump(documents, f)

  0%|          | 0/1471406 [00:00<?, ?it/s]

In [8]:
# load documents from pkl file
import pickle
with open(f'{DATA_DIR}/documents_preprocess_string_no_stopwords.pkl', 'rb') as f:
    documents = pickle.load(f)

In [14]:
# Compute the number of documents that contain each word
doc_freqs = {}
for doc in tqdm(documents):
    for word in set(doc):
        doc_freqs[word] = doc_freqs.get(word, 0) + 1

  0%|          | 0/1471406 [00:00<?, ?it/s]

In [15]:
num_docs = len(documents)
# idfs = {word: math.log(num_docs / freq) for word, freq in doc_freqs.items()}
idfs = {word: math.log((num_docs - freq + 0.5)/(freq + 0.5)+1) for word, freq in doc_freqs.items()}

In [16]:
doc_vecs = compute_doc_vectors(docs=documents, word_vectors=word_vectors)

  0%|          | 0/1471406 [00:00<?, ?it/s]

In [17]:
doc_vecs_idf = compute_doc_vectors_idf(docs=documents, word_vectors=word_vectors, idfs=idfs)

  0%|          | 0/1471406 [00:00<?, ?it/s]

In [11]:
total_in = 0
total_not_in = 0
seen_words = set()
words_not_in = set()
words_in = set()

for doc in tqdm(documents, total=len(documents)):
    for word in doc:
        if word in word_vectors.key_to_index:
            if word not in seen_words:
                total_in += 1
                seen_words.add(word)
                words_in.add(word)
        else:
            if word not in seen_words:
                seen_words.add(word)
                total_not_in += 1
                words_not_in.add(word)

print(total_in, total_not_in, total_in/(total_in+total_not_in))

  0%|          | 0/1471406 [00:00<?, ?it/s]

238588 450133 0.34642184571110796


In [19]:
# get the first query
query = queries_df.iloc[0].text
query = query.strip()
query = preprocess_string(query, CUSTOM_FILTERS)
print(query)
# retrieve the top 10 documents
query_vector = compute_doc_vectors([query], word_vectors=word_vectors)[0]

['immedi', 'impact', 'success', 'manhattan', 'project']


  0%|          | 0/1 [00:00<?, ?it/s]

In [20]:
sim_scores = cosine_similarity(query_vector.reshape(1, -1), doc_vecs)
sim_scores.shape

(1, 1471406)

In [21]:
# get the top 10 documents
top_k_indexes = np.argsort(sim_scores[0])[::-1][:10]
print(len(top_k_indexes))
for i in top_k_indexes:
    # get the index from corpus_df
    print(f'Document id ', corpus_df.iloc[i].name)
    # get the text from corpus_df
    print(corpus_df.iloc[i].text)
    # print score
    print('Score: ', sim_scores[0][i])


10
Document id  7243450
The project was given its name due to the fact that at least 10 of the sites used for the research were located in Manhattan. Following is a timeline of the key events related to the development of the atomic bomb and the Manhattan Project. Manhattan Project Timeline
Score:  0.7496238455407507
Document id  7408257
This is due to the fact that each change to the scope of the project will have an impact on the deadlines of the deliverables, so the changes may increase project cost by increasing the effort needed for the project.
Score:  0.7419746842421292
Document id  5775868
The project risk plan balances the investment of the mitigation against the benefit for the project. The project team often develops an alternative method for accomplishing a project goal when a risk event has been identified that may frustrate the accomplishment of that goal. These plans are called contingency plans.
Score:  0.7390905693414698
Document id  1597822
Downtown Columbus Strategic

In [22]:
query_vector_idf = compute_doc_vectors_idf([query], word_vectors=word_vectors, idfs=idfs)[0]

  0%|          | 0/1 [00:00<?, ?it/s]

In [23]:
sim_scores = cosine_similarity(query_vector_idf.reshape(1, -1), doc_vecs_idf)
sim_scores.shape

(1, 1471406)

In [24]:
# get the top 10 documents
top_k_indexes = np.argsort(sim_scores[0])[::-1][:10]
print(len(top_k_indexes))
for i in top_k_indexes:
    # get the index from corpus_df
    print(f'Document id ', corpus_df.iloc[i].name)
    # get the text from corpus_df
    print(corpus_df.iloc[i].text)
    # print score
    print('Score: ', sim_scores[0][i])


10
Document id  3827898
It is the respon-sibility of organizational strategic leadership toconsider the external and internal business environ-ment and make sense of complexity when creatingthe organizationâs vision, mission and strategies,and planning their implementation.ork is more than just a pay-check for many. People love having a largerpurpose for what they do and they want tofeel pride in their work. A good vision cangive larger meaning to work by clarifyingits purpose, its interrelationship with otherwork and its impact on the organization asa whole.
Score:  0.6017698813646459
Document id  8841437
This is a very special organizati on, with a truly wonderful vision of how they want to continue the Laura Ingalls Wilder and Little House on the Prairie story for future generation s! The leadership a... re personally involved on a daily basis with a vested interest in the successful implementa tion of the mission, and the impact that this historic site can have.
Score:  0.5990484

In [57]:
import gensim.downloader
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [60]:
glove_vectors = gensim.downloader.load('conceptnet-numberbatch-17-06-300')



In [13]:
doc_vecs = compute_doc_vectors(docs=documents, word_vectors=glove_vectors)

  0%|          | 0/1471406 [00:00<?, ?it/s]

In [64]:
glove_vectors.get_vector('hello')

KeyError: "Key 'hello' not present"

In [61]:
total_in = 0
total_not_in = 0
seen_words = set()

for doc in tqdm(documents, total=len(documents)):
    for word in doc:
        if word in glove_vectors.key_to_index:
            if word not in seen_words:
                total_in += 1
                seen_words.add(word)
        else:
            if word not in seen_words:
                seen_words.add(word)
                total_not_in += 1

print(total_in, total_not_in, total_in/(total_in+total_not_in))

  0%|          | 0/1471406 [00:00<?, ?it/s]

0 1246359 0.0


In [15]:
doc_vecs_idf = compute_doc_vectors_idf(docs=documents, word_vectors=glove_vectors, idfs=idfs)

  0%|          | 0/1471406 [00:00<?, ?it/s]

In [17]:
# get the first query
query = queries_df.iloc[0].text
query = preprocess_string(query, CUSTOM_FILTERS)
print(query)

['immediate', 'impact', 'success', 'manhattan', 'project']


In [18]:
query_vector = compute_doc_vectors([query], word_vectors=glove_vectors)[0]

  0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
query_vector_idf = compute_doc_vectors_idf([query], word_vectors=glove_vectors, idfs=idfs)[0]

  0%|          | 0/1 [00:00<?, ?it/s]

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

sim = cosine_similarity(query_vector.reshape(1, -1), doc_vecs)
sim_idf = cosine_similarity(query_vector_idf.reshape(1, -1), doc_vecs_idf)

In [25]:
# get the top 10 documents
top_k_indexes = np.argsort(sim[0])[::-1][:10]
print(len(top_k_indexes))
for i in top_k_indexes:
    # get the index from corpus_df
    print(f'Document id ', corpus_df.iloc[i].name)
    # get the text from corpus_df
    print(corpus_df.iloc[i].text)
    # print score
    print('Score: ', sim[0][i])


10
Document id  1541595
The process of project management is an integrative oneâan action (or failure to take action) in one area will usually affect other areas. For example, a scope change will almost always affect cost and schedule estimates, but it may also have an impact on other factors as diverse as team morale and product quality.
Score:  0.8968486818440411
Document id  4314198
The objectives of both project management. and the project are different and the control of time, cost. and progress, which are often the project management. objectives, should not be confused with measuring project. success.
Score:  0.894768419850468
Document id  7408257
This is due to the fact that each change to the scope of the project will have an impact on the deadlines of the deliverables, so the changes may increase project cost by increasing the effort needed for the project.
Score:  0.891107974098197
Document id  7408263
This characteristic reflects the advanced stage at which project control

In [26]:
# get the top 10 documents
top_k_indexes = np.argsort(sim_idf[0])[::-1][:10]
print(len(top_k_indexes))
for i in top_k_indexes:
    # get the index from corpus_df
    print(f'Document id ', corpus_df.iloc[i].name)
    # get the text from corpus_df
    print(corpus_df.iloc[i].text)
    # print score
    print('Score: ', sim_idf[0][i])


10
Document id  1541595
The process of project management is an integrative oneâan action (or failure to take action) in one area will usually affect other areas. For example, a scope change will almost always affect cost and schedule estimates, but it may also have an impact on other factors as diverse as team morale and product quality.
Score:  0.8880611690199094
Document id  1676113
Before you can achieve success, you need to define what success means to you. Unless you have a clear vision of what success is to you, you cannot work towards it. Success means different things to different people. For some, monetary reward is a measure of success. Yet others have multiple definition of success. 1. The Different Areas. They measure it across a few areas. It can be career, health, spiritual, emotional, time or financial.
Score:  0.8854146720214371
Document id  601542
Earned value management (EVM), or Earned value project/performance management (EVPM) is a project management technique f