# Main speed-up: we compare the topics of the texts, not the whole texts.
# Result: 30 sec (0.67) instead of 30 min (0.88).

In [26]:
# nltk.download('popular') # run it once
# nltk.download('punkt')   # run it once (try without it)
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
import pandas as pd

# Short sample inputs, kept for quick experiments instead of the files below:
# doc1 = 'This is a function to test document_path_similarity.'
# doc2 = 'Use this function to see if your code in doc_to_synsets \
# and similarity_score is correct!'

# Load the three texts to compare; each file is read fully into one string.
with open('../input/plato.txt', encoding='utf8') as f:
    doc1 = f.read()
with open('../input/socrates.txt', encoding='utf8') as f:
    doc2 = f.read()
with open('../input/airplane.txt', encoding='utf8') as f:
    doc3 = f.read()

def convert_tag(tag):
    """Convert a tag given by nltk.pos_tag to the tag used by wordnet.synsets.

    Only the first character of ``tag`` is inspected:
    'N' -> 'n', 'J' -> 'a', 'R' -> 'r', 'V' -> 'v'.

    Returns:
        The one-letter WordNet tag, or None when the tag starts with any
        other character or is empty.
    """
    tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
    # Slice instead of indexing so an empty tag maps to None rather than
    # raising IndexError (which the previous KeyError handler did not catch).
    return tag_dict.get(tag[:1])
    
def text_to_synsets_list(doc):
    """Convert a text into a list holding one WordNet synset per known word.

    The text is tokenized and POS-tagged with nltk; every token is looked up
    in WordNet using its converted tag, and only the first synset found is
    kept. Tokens without any synset are dropped.

    Example (from the original notes):
        input:  'This is a function to test document_path_similarity.'
        output: [Synset('be.v.01'), Synset('angstrom.n.01'),
                 Synset('function.n.01'), Synset('test.v.01')]
    """
    # e.g. [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('function', 'NN'), ...]
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(doc))

    first_synsets = []
    for token, pos in tagged_tokens:
        # convert_tag yields None for tags WordNet has no category for; the
        # lookup is then done without a part-of-speech restriction.
        candidates = wn.synsets(token, convert_tag(pos))
        if candidates:
            # Keep only the first synset of each non-empty group.
            first_synsets.append(candidates[0])
    return first_synsets

def compare_2_sysnet_lists(s1, s2):
    """Score how similar the synsets in ``s1`` are to those in ``s2``.

    For every synset in ``s1`` the highest ``path_similarity`` against any
    synset in ``s2`` is taken; pairs for which ``path_similarity`` returns
    None are ignored. The result is the mean of these best scores, e.g.
    best scores [1.0, 0.25, 1.0, 0.2] give 0.6125.

    Note that the score is directional: only synsets of ``s1`` are iterated
    in the outer loop, so swapping the arguments may change the result.

    Args:
        s1: iterable of objects exposing ``path_similarity(other)``.
        s2: iterable of objects passed as ``other``; iterated once per
            element of ``s1``, so it must be re-iterable (e.g. a list).

    Returns:
        The mean best score as a float, or 0.0 when no synset of ``s1`` has
        a comparable partner in ``s2`` (this includes empty inputs, which
        previously raised ZeroDivisionError).
    """
    scores_best = []
    for i in s1:
        similarities = (i.path_similarity(j) for j in s2)
        # default=None covers the case where every comparison returned None
        # (or s2 is empty); such synsets do not contribute to the mean.
        best = max((sim for sim in similarities if sim is not None), default=None)
        if best is not None:
            scores_best.append(best)

    if not scores_best:
        return 0.0
    return sum(scores_best) / len(scores_best)
    


In [27]:
import pickle
import gensim #anaconda prompt -> pip install -U gensim
from sklearn.feature_extraction.text import CountVectorizer

def find_topics_of_string(s, topic_count):
    """Fit a single-topic LDA model on one text and return its top words.

    The text is treated as a one-document corpus. Words are counted with
    CountVectorizer (English stop words removed, tokens of at least three
    word characters), then a gensim LdaModel with one topic is trained on
    those counts.

    Args:
        s: the text to analyse.
        topic_count: how many words of the topic to include in the result.

    Returns:
        The value of ``print_topics`` for the single topic, e.g.
        [(0, '0.031*"plato" + 0.011*"socrates"')].
    """
    vectorizer = CountVectorizer(
        min_df=0,
        max_df=1,
        stop_words='english',
        token_pattern='(?u)\\b\\w\\w\\w+\\b',
    )
    term_counts = vectorizer.fit_transform([s])

    # Rows of the count matrix are documents, hence documents_columns=False.
    bag_of_words = gensim.matutils.Sparse2Corpus(term_counts, documents_columns=False)

    # vocabulary_ maps term -> column index; the model needs index -> term.
    index_to_term = {index: term for term, index in vectorizer.vocabulary_.items()}

    # Training is the slow step of this function.
    lda = gensim.models.ldamodel.LdaModel(
        bag_of_words,
        id2word=index_to_term,
        num_topics=1,
        passes=25,
        random_state=34,
    )
    return lda.print_topics(num_topics=1, num_words=topic_count)
    
def clean_topics(top):
    """Extract the bare words from the first topic of a print_topics result.

    Example:
        [(0, '0.031*"plato" + 0.011*"socrates"')] -> ['plato', 'socrates']
    """
    topic_expression = top[0][1]
    words = []
    for term in topic_expression.split('+'):
        # Each term looks like '0.031*"plato" '; keep what follows the weight.
        quoted_word = term.split('*')[1]
        words.append(quoted_word.replace('"', '').rstrip())
    return words
    
# Reduce every document to (up to) its 1000 most important topic words,
# e.g. ['plato', 'socrates', 'dialogues', 'forms', 'republic'].
topic_words = [
    clean_topics(find_topics_of_string(text, 1000))
    for text in (doc1, doc2, doc3)
]

# Rebuild each document as a space-separated string of its topic words,
# e.g. 'plato socrates dialogues forms republic'.
doc1, doc2, doc3 = (' '.join(words).rstrip() for words in topic_words)

# Convert the shortened documents into synset lists.
s1, s2, s3 = (text_to_synsets_list(text) for text in (doc1, doc2, doc3))

# Recorded results:
#compare_2_sysnet_lists(s1, s2) #0.6789118246687038  plato vs socrates
compare_2_sysnet_lists(s1, s3) #0.5099787370815394   plato vs airplane

0.5099787370815394