In [1]:
import sys
import string
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from wordcloud import WordCloud
from collections import defaultdict
import os



In [2]:
def get_common_surface_form(original_corpus, stemmer):
    """Pick, for every stem, the un-stemmed token that occurs most often.

    Parameters
    ----------
    original_corpus : iterable of iterables of str
        Tokenized documents holding the original (non-stemmed) tokens.
    stemmer : object
        Anything exposing ``stem(token)``; it is called once per token.

    Returns
    -------
    dict
        Maps each stem to its most frequent surface form.  When several
        surface forms are equally frequent, the one encountered first wins.
    """
    # stem -> {surface form -> number of occurrences}
    tally = defaultdict(lambda: defaultdict(int))
    for doc in original_corpus:
        for word in doc:
            tally[stemmer.stem(word)][word] += 1
    return {
        stem: max(variants, key=variants.get)
        for stem, variants in tally.items()
    }

In [4]:
def generateKeywordCloud(category, output_path="word_cloud.png"):
    """Render a TF-IDF weighted keyword cloud for one category of text files.

    Every regular file in ``./textForms/<category, lower-cased>`` is
    lower-cased, stripped of English stopwords and punctuation, tokenized and
    stemmed.  A TF-IDF model is trained on all of those documents, and the
    weights of the first document (in sorted file-name order) are drawn as a
    word cloud, labelling each stem with its most frequent surface form.

    Parameters
    ----------
    category : str
        Name of the sub-directory of ``./textForms``; it is lower-cased
        before being used.
    output_path : str, optional
        File the rendered image is written to.  Defaults to
        ``"word_cloud.png"``.

    Raises
    ------
    ValueError
        If the category directory contains no regular files.
    """
    # Stemmer for reducing terms to root form
    stemmer = PorterStemmer()
    # Build the stopword set once: membership tests are O(1), and the corpus
    # list is no longer re-requested for every single word of every file.
    stop_words = set(stopwords.words("english"))
    # For storing the stemmed tokens
    stemmed_corpus = []
    # For storing the non-stemmed tokens
    original_corpus = []

    path = os.path.join("./textForms", category.lower())
    # os.listdir() order is arbitrary; sorting makes "the first document"
    # used for the weights below reproducible between runs.
    for file_name in sorted(os.listdir(path)):
        file_path = os.path.join(path, file_name)
        # Skip sub-directories (e.g. .ipynb_checkpoints), which cannot be read
        if not os.path.isfile(file_path):
            continue
        # Load file contents; the context manager closes the handle promptly
        with open(file_path) as handle:
            contents = handle.read().lower()
        # First pass removes stopwords while apostrophes are still present
        contents = " ".join(
            word for word in contents.split() if word not in stop_words
        )
        contents = "".join(ch for ch in contents if ch not in string.punctuation)
        # Extract tokens; the second stopword pass catches words such as
        # "the," that only match once their punctuation has been stripped.
        tokens = [
            token for token in word_tokenize(contents) if token not in stop_words
        ]
        # Store stemmed document
        stemmed_corpus.append([stemmer.stem(token) for token in tokens])
        # Store original document
        original_corpus.append(tokens)

    if not stemmed_corpus:
        raise ValueError("no documents found in " + path)

    dictionary = Dictionary(stemmed_corpus)
    # Get the surface form for each stemmed word
    surface_forms = get_common_surface_form(original_corpus, stemmer)
    # Convert to vector corpus
    vectors = [dictionary.doc2bow(text) for text in stemmed_corpus]
    # Train TF-IDF model
    tfidf = TfidfModel(vectors, normalize=True)
    # Get TF-IDF weights of the first document
    weights = tfidf[vectors[0]]
    # Replace term IDs with human consumable strings.  The wordcloud API
    # documents the argument of generate_from_frequencies() as a dict of
    # word -> weight, so build a mapping rather than a list of pairs.
    frequencies = {
        surface_forms[dictionary[term_id]]: weight for term_id, weight in weights
    }
    # Initialize the cloud
    wc = WordCloud(
        background_color="white",
        max_words=2000,
        width=1024,
        height=720,
        # NOTE: per the wordcloud docs this option is ignored by
        # generate_from_frequencies(); stopwords are filtered above instead.
        stopwords=stop_words,
    )
    # Generate the cloud
    wc.generate_from_frequencies(frequencies)
    # Save the cloud to a file
    wc.to_file(output_path)

In [6]:
# Build the keyword cloud for category 'B' (documents are read from ./textForms/b).
generateKeywordCloud('B')