In [1]:
from os import path
from json import dump, load
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the topic data from disk; a later cell iterates over it key by key.
topics_path = 'topic_output.json'
with open(topics_path) as f:
    topics_dist = load(f)

In [3]:
def get_tf_idf_weights(topics):
    """Compute the inverse-document-frequency weight of every token in ``topics``.

    Each topic string is treated as one document. The ``', '`` separators are
    replaced with single spaces before the text is passed to scikit-learn's
    ``TfidfVectorizer``, which is used with its default settings.

    NOTE(review): the default analyzer lowercases text and emits individual
    word tokens, so multi-word or capitalised terms will not appear verbatim
    as keys here -- confirm that this matches the keys produced by
    ``get_counts``.

    Args:
        topics: Iterable of topic strings.

    Returns:
        dict mapping each vocabulary token to its IDF weight rounded to two
        decimals. An empty dict when ``topics`` yields no documents.
    """
    documents = [topic.replace(', ', ' ') for topic in topics]
    if not documents:
        # Fitting on zero documents raises ValueError (empty vocabulary).
        return {}

    # The default min_df (1) keeps every term, exactly as the former integer
    # min_df=0 did, but recent scikit-learn releases reject an integer 0.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(documents)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # idf_ is the public accessor for the values formerly read through the
    # private _tfidf attribute.
    return {
        str(word): round(weight, 2)
        for word, weight in zip(feature_names, vectorizer.idf_)
    }

In [4]:
def get_counts(topics):
    """Count how often each comma-separated term occurs across ``topics``.

    Args:
        topics: Iterable of strings, each holding one or more terms
            separated by ``', '``.

    Returns:
        collections.Counter mapping each term to its number of occurrences.
        Empty input yields an empty Counter.
    """
    # Split every topic on its own. Joining everything first and splitting
    # afterwards turned an empty ``topics`` into [''] and therefore counted a
    # phantom empty-string term.
    return Counter(term for topic in topics for term in topic.split(', '))

In [5]:
def get_topics(item):
    """Flatten a two-level collection of topic records into a list.

    ``item`` is iterated group by group, and each group entry by entry; the
    value stored under the ``'topic'`` key of every entry is collected in
    iteration order.
    """
    return [entry['topic'] for group in item for entry in group]

In [6]:
def get_words_dict(tf_idf_weights, counts):
    """Combine TF-IDF weights and raw counts into one record per word.

    Args:
        tf_idf_weights: Mapping of word -> weight.
        counts: Mapping of word -> number of occurrences.

    Returns:
        dict keyed by ``str(word)`` whose values have the form
        ``{'tf_idf': float, 'freq': float}``. Words present in ``counts`` but
        missing from ``tf_idf_weights`` are left out.
    """
    combined = {}
    for word, occurrences in counts.items():
        try:
            weight = tf_idf_weights[word]
        except KeyError:
            # No weight known for this word: skip it.
            continue
        combined[str(word)] = {
            'tf_idf': float(weight),
            'freq': float(occurrences),
        }
    return combined

In [7]:
# For every key of the loaded data, collect its topics and store the per-word
# TF-IDF weight and frequency table under that key.
words_data = defaultdict(dict)

for key, item in topics_dist.items():
    topics = get_topics(item)
    tf_idf_weights = get_tf_idf_weights(topics)
    counts = get_counts(topics)
    words_data[key] = get_words_dict(tf_idf_weights, counts)

In [8]:
# Write the collected tables out as JSON; the defaultdict is converted to a
# plain dict before it is serialised.
output_path = path.join('..', 'data', 'words_freqs.json')
with open(output_path, 'w') as f:
    dump(dict(words_data), f)