In [None]:
# https://towardsdatascience.com/detecting-document-similarity-with-doc2vec-f8289a9a7db7

In [2]:
from sklearn import datasets

In [None]:
categories = ["soc.religion.christian", "sci.space", "talk.politics.mideast", "rec.sport.baseball"]
cat_dict = {} # contains raw training dat organized by category
cat_dict_test = {} # contains raw test data organized by category
for cat in categories:
    cat_dict[cat] = datasets.fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=[cat]).data
    cat_dict_test[cat] = datasets.fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=[cat]).data

cat_dict["soc.religion.christian"]


In [None]:
print(cat_dict['soc.religion.christian'][0])

In [None]:
import gensim

def tokenize(text, stopwords, max_len=20):
    return [token for token in gensim.utils.simple_preprocess(text, max_len=max_len) if token not in stopwords]

cat_dict_tagged_train = {} # contains clean tagged training data organized by category
cat_dict_test_clean = {} # contains un-tagged test dat orgainized by categroy

offset = 0 # used for managing IDs of tagged documents
for k, v in cat_dict.items():
    cat_dict_tagged_train[k] = [gensim.models.doc2vec.TaggedDocument(tokenize(text, [], max_len=200), [i+offset]) for i, text in enumerate(v)]
    offset += len(v)
    
offset = 0
for k, v in cat_dict_test.items():
    cat_dict_test_clean[k] = [tokenize(text, [], max_len=200) for i, text in enumerate(v)]
    offset += len(v)
    
# Eventually contains final versions of the training data to actually train the model
train_corpus = [taggeddoc for taggeddoc_list in list(cat_dict_tagged_train.values()) for taggeddoc in taggeddoc_list]
train_corpus[0]

In [12]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=30, min_count=2, epochs=40, window=2)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

# look at https://en.wikipedia.org/wiki/Hyperparameter_optimization for hyperparameter tuning

In [14]:
metadata = {}
infered_vector_test = {} # contains, categor-wise, inferred doc vecs for each document in the test set
for cat, docs in cat_dict_test_clean.items():
    infered_vector_test[cat] = [model.infer_vector(doc) for doc in list(docs)]
    metadata[cat] = len(infered_vector_test[cat])
print(infered_vector_test['soc.religion.christian'][0])
print(metadata)

[-1.1941751   1.8838603   0.75064266  1.3050057   0.21004878 -0.588853
 -0.8507837  -0.5707667  -1.7714983   2.4439347  -1.3262669   1.2783276
 -0.7765641  -2.2415097  -1.5915937   0.11307847  1.9322265  -0.6980872
  0.78167206 -0.5227096   0.24200228 -1.304511   -2.1284945   0.78711414
 -0.09894901 -1.5077336  -2.9868999   1.0842909   1.0127064  -1.0165315 ]
{'soc.religion.christian': 398, 'sci.space': 394, 'talk.politics.mideast': 376, 'rec.sport.baseball': 397}


In [19]:
import csv

def write_to_csv(input, output_file, delimeter='\t'):
    with open(output_file, "w") as f:
        writer = csv.writer(f, delimiter=delimeter)
        writer.writerows(input)
        
veclist_metadata = []
veclist = []

for cat in cat_dict.keys():
    for tag in [cat]*metadata[cat]:
        veclist_metadata.append([tag])
    for vec in infered_vector_test[cat]:
        veclist.append(list(vec))
        
write_to_csv(veclist, "doc2vec_20Newsgroups_vectors.csv")
write_to_csv(veclist_metadata, "doc2vec_20Newsgroups_vectors_metadata.csv")

In [21]:
import random
    
cat_id = {id:cat for id, cat in enumerate(categories)} # Give each category a numerical id
test_doc_pairs = {tuple(sorted([id,id2])):[] for id in cat_id for id2 in cat_id}
for pair_id in test_doc_pairs:
    # Create same-category doc pairs, e.g. (C3, C3)
    if pair_id[0] == pair_id[1]:
        test_doc_pairs[pair_id] = [(doc, cat_dict_test_clean[cat_id[pair_id[0]]][i]) for doc_index, doc in enumerate(list(cat_dict_test_clean[cat_id[pair_id[0]]])) for i in range(doc_index+1, len(list(cat_dict_test_clean[cat_id[pair_id[0]]])))]
    #Create cross-category doc pairs, e.g. (C3, C4)
    else:
        test_doc_pairs[pair_id] = [(doc, doc2) for doc in list(cat_dict_test_clean[cat_id[pair_id[0]]]) for doc2 in list(cat_dict_test_clean[cat_id[pair_id[1]]])]
similarities_test = {pair_id:[] for pair_id in test_doc_pairs}
for id in cat_id:
    for id2 in cat_id:
        similarities_test[tuple(sorted([id, id2]))] = [model.dv.similarity_unseen_docs(model, pair[0], pair[1]) for pair in random.sample(test_doc_pairs[tuple(sorted([id,id2]))],len(test_doc_pairs[tuple(sorted([id,id2]))]))[:500]] # Create a similarity list of selected pairs


NotImplementedError: Call similarity_unseen_docs on a Doc2Vec model instead.