Cleaned it up a little bit.
paul-nechifor committed Dec 11, 2013
1 parent 3d6b3b3 commit 0269230
Showing 11 changed files with 150 additions and 94 deletions.
6 changes: 4 additions & 2 deletions .gitignore
@@ -1,6 +1,8 @@
build
wordcloud/query_integral_image.c
wordcloud/query_integral_image.so
/wordcloud/query_integral_image.c
/wordcloud/query_integral_image.so
*.pyc
*~
*.png
!/examples/constitution.png
!/examples/alice.png
19 changes: 8 additions & 11 deletions LICENSE
@@ -1,21 +1,18 @@

Copyright (c) 2012 Andreas Christian Mueller

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
61 changes: 49 additions & 12 deletions README.md
@@ -1,21 +1,58 @@
word_cloud
==========

A fork of [Andreas Mueller](https://github.com/amueller)'s
[word_cloud](https://github.com/amueller/word_cloud) to make it a little more
extendable and installable as a package.
A little word cloud generator in Python. Read more about it on the [blog
post][blog-post].

Install it by running:

pip install Cython
pip install numpy
pip install PIL
## Installation

Get this package:

wget https://github.com/paul-nechifor/word_cloud/archive/master.zip
unzip master.zip
rm master.zip
cd word_cloud-master
sudo python setup.py install
cd ..
sudo rm -r word_cloud-master master.zip

Run the files in `examples/` to for a short intro.
Install it:

sudo pip install -r requirements.txt
sudo python setup.py install

## Examples

Note that if you are not on Ubuntu, you need to adjust `FONT_PATH` to point to
an existing font.

Check out [examples/simple.py][simple] for a short intro. A sample output is:

![Constitution](examples/constitution.png)

Or run [examples/more.py][more] to see more options.
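
In short, the pipeline reads a text, turns it into `(word, frequency)` pairs,
computes a layout, and renders it to an image. A minimal sketch along the lines
of [examples/simple.py][simple], assuming you run it from the `examples/`
directory next to `constitution.txt`:

    #!/usr/bin/env python2
    import wordcloud

    # Read the whole text.
    text = open('constitution.txt').read()
    # Split it into a list of (word, frequency) tuples.
    words = wordcloud.process_text(text)
    # Compute a position, size and orientation for each word.
    elements = wordcloud.fit_words(words)
    # Render the layout to a PNG file.
    wordcloud.draw(elements, 'constitution.png')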

## Used in

### Reddit Cloud

[Reddit Cloud][reddit-cloud] is a Reddit bot which generates word clouds for
comments in submissions and user histories. You can see it in operation at
[/u/WordCloudBot2][wc2] ([top post][wc2top]).

![A Reddit Cloud sample](http://i.imgur.com/tcbZnKW.png)

### Other

*Send a pull request to add yours here.*

## Issues

Using Pillow instead of PIL might get you the [`TypeError: 'int' object is
not iterable` problem][intprob] also showcased on the blog.

[blog-post]: http://peekaboo-vision.blogspot.de/2012/11/a-wordcloud-in-python.html
[simple]: examples/simple.py
[more]: examples/more.py
[reddit-cloud]: https://github.com/paul-nechifor/reddit-cloud
[wc2]: http://www.reddit.com/user/WordCloudBot2
[wc2top]: http://www.reddit.com/user/WordCloudBot2/?sort=top
[intprob]: http://peekaboo-vision.blogspot.de/2012/11/a-wordcloud-in-python.html#bc_0_28B

Binary file added examples/alice.png
Binary file added examples/constitution.png
17 changes: 12 additions & 5 deletions examples/more.py
@@ -1,10 +1,17 @@
#!/usr/bin/env python2

from os import path
import sys
import os
import wordcloud

text = open('alice.txt').read()
words, counts = wordcloud.process_text(text, max_features=2000)
elements = wordcloud.fit_words(words, counts, width=500, height=500)
wordcloud.draw(elements, 'alice.png', width=500, height=500, scale=2)
d = path.dirname(__file__)

# Read the whole text.
text = open(path.join(d, 'alice.txt')).read()
# Separate into a list of (word, frequency).
words = wordcloud.process_text(text, max_features=2000)
# Compute the position of the words.
elements = wordcloud.fit_words(words, width=500, height=500)
# Draw the positioned words to a PNG file.
wordcloud.draw(elements, path.join(d, 'alice.png'), width=500, height=500,
scale=2)
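
For reference, a further sketch using the extra keyword arguments accepted by
the `fit_words` and `draw` signatures in `wordcloud/__init__.py`; the font path
and output file name below are only placeholders, so adjust them to your system:

    #!/usr/bin/env python2
    from os import path
    import wordcloud

    d = path.dirname(__file__)

    text = open(path.join(d, 'alice.txt')).read()
    words = wordcloud.process_text(text, max_features=300)
    # Allow more vertical words and name an explicit font (placeholder path).
    font = '/usr/share/fonts/truetype/droid/DroidSansMono.ttf'
    elements = wordcloud.fit_words(words, width=500, height=500,
                                   prefer_horiz=0.7, font_path=font)
    wordcloud.draw(elements, path.join(d, 'alice_vertical.png'), width=500,
                   height=500, font_path=font, scale=2)
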
16 changes: 11 additions & 5 deletions examples/simple.py
@@ -1,10 +1,16 @@
#!/usr/bin/env python2

from os import path
import sys
import os
import wordcloud

text = open('constitution.txt').read()
words, counts = wordcloud.process_text(text)
elements = wordcloud.fit_words(words, counts)
wordcloud.draw(elements, 'constitution.png')
d = path.dirname(__file__)

# Read the whole text.
text = open(path.join(d, 'constitution.txt')).read()
# Separate into a list of (word, frequency).
words = wordcloud.process_text(text)
# Compute the position of the words.
elements = wordcloud.fit_words(words)
# Draw the positioned words to a PNG file.
wordcloud.draw(elements, path.join(d, 'constitution.png'))
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
Cython>=0.19.1
PIL>=1.1.7
numpy>=1.7.1
5 changes: 4 additions & 1 deletion setup.py
@@ -1,10 +1,13 @@
import os
from distutils.core import setup
from Cython.Build import cythonize

setup(
name='wordcloud',
ext_modules=cythonize('wordcloud/query_integral_image.pyx'),
version='1.0.0',
url='https://github.com/paul-nechifor/word_cloud',
license='MIT',
ext_modules=cythonize('wordcloud/query_integral_image.pyx'),
packages=['wordcloud'],
package_data={'wordcloud': ['stopwords']}
)
115 changes: 57 additions & 58 deletions wordcloud/__init__.py
@@ -1,6 +1,6 @@
# Author: Andreas Christian Mueller <amueller@ais.uni-bonn.de>
# (c) 2012
# Author: Paul Nechifor <paul@nechifor.net>
# Modified by: Paul Nechifor <paul@nechifor.net>
#
# License: MIT

@@ -19,29 +19,24 @@
STOPWORDS = set([x.strip() for x in open(os.path.join(os.path.dirname(__file__),
'stopwords')).read().split('\n')])

def fit_words(words, counts, font_path=None, width=400, height=200,
def fit_words(words, font_path=None, width=400, height=200,
margin=5, ranks_only=False, prefer_horiz=0.90):
"""Build word cloud using word counts.
"""Generate the positions for words.
Parameters
----------
words : numpy array of strings
Words that will be drawn in the image.
counts : numpy array of word counts
Word counts or weighting of words. Determines the size of the word in
the final image.
Will be normalized to lie between zero and one.
words : array of tuples
Each tuple contains a word and its frequency.
font_path : string
Font path to the font that will be used.
Defaults to DroidSansMono path.
Font path to the font that will be used (OTF or TTF).
Defaults to DroidSansMono path, but you might not have it.
width : int (default=400)
Width of the word cloud image.
Width of the canvas.
height : int (default=200)
Height of the word cloud image.
Height of the canvas.
ranks_only : boolean (default=False)
Only use the rank of the words, not the actual counts.
@@ -51,44 +46,37 @@ def fit_words(words, counts, font_path=None, width=400, height=200,
Notes
-----
Larger Images with make the code significantly slower.
If you need a large image, you can try running the algorithm at a lower
resolution and then drawing the result at the desired resolution.
In the current form it actually just uses the rank of the counts,
i.e. the relative differences don't matter.
Play with setting the font_size in the main loop vor differnt styles.
Colors are used completely at random. Currently the colors are sampled
from HSV space with a fixed S and V.
Adjusting the percentages at the very end gives differnt color ranges.
Obviously you can also set all at random - haven't tried that.
Larger canvases will make the code significantly slower. If you need a large
word cloud, run this function with a smaller canvas size, and draw the result
at a larger scale.
In the current form it actually just uses the rank of the counts, i.e. the
relative differences don't matter. Play with setting the font_size in the
main loop for different styles.
"""
if len(counts) <= 0:

if len(words) <= 0:
print("We need at least 1 word to plot a word cloud, got %d."
% len(counts))
% len(words))

if font_path is None:
font_path = FONT_PATH

# normalize counts
#counts = counts / float(max(counts))
# sort words by counts
#inds = np.argsort(counts)[::-1]
#counts = counts[inds]
#words = words[inds]
if not os.path.exists(font_path):
raise ValueError("The font %s does not exist." % font_path)

# create image
img_grey = Image.new("L", (width, height))
draw = ImageDraw.Draw(img_grey)
integral = np.zeros((height, width), dtype=np.uint32)
img_array = np.asarray(img_grey)
font_sizes, positions, orientations = [], [], []

# initialize font size "large enough"
font_size = 1000
font_size = height

# start drawing grey image
for word, count in zip(words, counts):
for word, count in words:
# alternative way to set the font size
if not ranks_only:
font_size = min(font_size, int(100 * np.log(count + 100)))
@@ -155,7 +143,7 @@ def draw(elements, file_name, font_path=None, width=400, height=200, scale=1,

img = Image.new("RGB", (width * scale, height * scale))
draw = ImageDraw.Draw(img)
for word, font_size, position, orientation in elements:
for (word, count), font_size, position, orientation in elements:
font = ImageFont.truetype(font_path, font_size * scale)
transposed_font = ImageFont.TransposedFont(font,
orientation=orientation)
@@ -165,20 +153,37 @@ def draw(elements, file_name, font_path=None, width=400, height=200, scale=1,
draw.text(pos, word, fill=color)
img.save(file_name)

def process_text(text, max_features=200, stopwords=STOPWORDS):
def process_text(text, max_features=200, stopwords=None):
"""Splits a long text into words, eliminates the stopwords and returns
(words, counts) which is necessary for make_wordcloud().
Parameters
----------
text : string
The text to be processed.
max_features : number (default=200)
The maximum number of words.
stopwords : set of strings
The words that will be eliminated.
Notes
-----
There are better ways to do word tokenization, but I don't want to include
all those things.
"""
# there are better ways to do this, but I don't want to include all those
# things

d = {}
if stopwords is None:
stopwords = STOPWORDS

d = {}
for word in re.findall(r"\w[\w']*", text):
word_lower = word.lower()
if word_lower in stopwords:
continue
# Look in all lowercase dict.

# Look in lowercase dict.
if d.has_key(word_lower):
d2 = d[word_lower]
else:
@@ -192,21 +197,15 @@ def process_text(text, max_features=200, stopwords=STOPWORDS):
d2[word] = 1

d3 = {}
for dv in d.values():
for d2 in d.values():
# Get the most popular case.
first = sorted(dv.iteritems(), key=lambda x: x[1], reverse=True)[0][0]
d3[first] = sum(dv.values())
first = sorted(d2.iteritems(), key=lambda x: x[1], reverse=True)[0][0]
d3[first] = sum(d2.values())

sd = sorted(d3.iteritems(), key=lambda x: x[1], reverse=True)
sd = sd[:max_features]

words = sorted(d3.iteritems(), key=lambda x: x[1], reverse=True)
words = words[:max_features]
maximum = float(max(d3.values()))
for i, (word, count) in enumerate(words):
words[i] = word, count/maximum

words = []
counts = []

for word, count in sd:
words.append(word)
counts.append(count / maximum)

return words, counts
return words
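
To summarize the API change in this commit: `process_text` now returns a single
list of `(word, relative frequency)` tuples, which `fit_words` consumes directly
and `draw` then renders. A toy sketch under that reading (the input sentence and
the output values are only illustrative, and the default `FONT_PATH` must exist
on your system or be overridden via `font_path`):

    import wordcloud

    words = wordcloud.process_text("The cat sat on the mat. The cat slept.")
    # Something like [('cat', 1.0), ('sat', 0.5), ('mat', 0.5), ('slept', 0.5)];
    # stopwords are removed and the order of equal counts may vary.
    elements = wordcloud.fit_words(words, width=300, height=200)
    wordcloud.draw(elements, 'cats.png', width=300, height=200, scale=2)
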
2 changes: 2 additions & 0 deletions wordcloud/stopwords
@@ -43,6 +43,7 @@ few
for
from
further
get
had
hadn't
has
@@ -105,6 +106,7 @@ ourselves
out
over
own
r
same
shan't
she
