In [73]:
import sys
import string
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from wordcloud import WordCloud
from collections import defaultdict
import os

In [74]:
def get_common_surface_form(original_corpus, stemmer):
    """Map every stem to the surface form that produced it most often.

    Args:
        original_corpus: iterable of documents, each an iterable of tokens.
        stemmer: object exposing ``stem(token)``; it is called once per
            token occurrence.

    Returns:
        dict keyed by stem. Each value is the original token with the
        highest occurrence count among the tokens that reduced to that
        stem. On a tie, the first such token in the inner dict's
        iteration order is kept.
    """
    tally = defaultdict(lambda: defaultdict(int))
    for doc in original_corpus:
        for word in doc:
            tally[stemmer.stem(word)][word] += 1

    # max() returns the first maximal entry it encounters, so ties are
    # resolved by the iteration order of each per-stem counter.
    return {
        stem: max(forms.items(), key=lambda entry: entry[1])[0]
        for stem, forms in tally.items()
    }

In [77]:
# Build a TF-IDF weighted word cloud for the first document found under `path`.
stemmer = PorterStemmer()
stemmed_corpus = []
original_corpus = []

path = "./textForms/a"

# Materialise the stop-word list once as a set: the membership test below runs
# for every word of every file, and rebuilding the list each time is wasted work.
english_stopwords = set(stopwords.words("english"))
# Translation table that deletes every character in string.punctuation.
strip_punctuation = str.maketrans("", "", string.punctuation)

# os.listdir() returns entries in arbitrary order; sorting pins which file
# becomes document 0 (the one visualised below) across runs and machines.
for file in sorted(os.listdir(path)):
    # The context manager closes the handle even if reading fails.
    with open(os.path.join(path, file)) as handle:
        contents = handle.read().lower()
    # Stop words are removed before punctuation is stripped, so only
    # whitespace-delimited words that match a stop word exactly are dropped.
    contents = ' '.join(word for word in contents.split() if word not in english_stopwords)
    contents = contents.translate(strip_punctuation)
    tokens = word_tokenize(contents)
    stemmed = [stemmer.stem(token) for token in tokens]
    stemmed_corpus.append(stemmed)
    original_corpus.append(tokens)

dictionary = Dictionary(stemmed_corpus)
# `counts` maps each stem to its most frequent surface form in the corpus.
counts = get_common_surface_form(original_corpus, stemmer)
vectors = [dictionary.doc2bow(text) for text in stemmed_corpus]
tfidf = TfidfModel(vectors, normalize=True)

# TF-IDF weights of the first document only, as (token_id, weight) pairs ...
weights = tfidf[vectors[0]]
# ... relabelled from token id -> stem -> readable surface form.
weights = [(counts[dictionary[pair[0]]], pair[1]) for pair in weights]
print(weights)

wc = WordCloud(
    background_color="white",
    max_words=2000,
    width=1024,
    height=720,
    stopwords=english_stopwords,
)
# NOTE(review): `weights` is a list of (word, weight) tuples. Newer wordcloud
# releases document `frequencies` as a dict; if this call fails after an
# upgrade, pass dict(weights) instead -- confirm against the installed version.
wc.generate_from_frequencies(weights)
wc.to_file("word_cloud.png")

[('p', 0.2719975911770874), ('labour', 0.2549146970341732), ('subject', 0.215672730086235), ('resolution', 0.22938947859615944), ('tomorrow', 0.22938947859615944), ('back', 0.1730646175053071), ('meeting', 0.13693306984253517), ('made', 0.1569369516485224), ('manchester', 0.2719975911770874), ('ps', 0.18678136601523151), ('peers', 0.20446533050981125), ('griffiths', 0.2719975911770874), ('exchanges', 0.22938947859615944), ('stop', 0.22938947859615944), ('put', 0.1730646175053071), ('foot', 0.19498960918436273), ('michael', 0.2470734430907392), ('moves', 0.22938947859615944), ('nomination', 0.215672730086235), ('life', 0.215672730086235), ('mr', 0.07203225211605804), ('gaitskell', 0.1618572179288833)]


<wordcloud.wordcloud.WordCloud at 0x1111eb518>

In [78]:
# Check which container type the word-cloud weights are stored in.
type(weights)

list