Permalink
Browse files

lots of changes

  • Loading branch information...
anatolyg committed Feb 28, 2013
1 parent 3328794 commit 3d4658f25b9bcf743939fc64e87243da8afe26b3
Showing with 172 additions and 16 deletions.
  1. +172 −16 src/test/WordCounter.java
View
@@ -1,32 +1,60 @@
package test;
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
+import java.util.*;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicLong;
/**
* Calculates word frequency in the input text collection. Note: the Util class is out of
* the scope of this task.
*
* Tasks:
* 1. Compile and execute the program.
+ * DONE
+ *
* 2. Assuming that the program generates the correct output, is there anything that can be improved?
+ * - add switch to ignore case
+ * - improve data structure for storing the words:counts (two lists vs. one map)
+ * - make output more generic
+ * - use StringBuilder instead of string concatenation (Object creation)
+ *
* 3. If so, how would you change the program to make it better?
+ * - above
+ *
* 3. How would you sort the output in the alphabetical order? In the word frequency order?
+ * - DONE, see below: outputCountByWords and outputCountByFrequency
+ *
* 4. Design the program that would calculate word frequency in multiple threads.
+ * - Using ThreadPoolExecutor
*
*/
public class WordCounter {
+ private List<String> text;
+ private ConcurrentHashMap<String, Integer> wordCounter = new ConcurrentHashMap<String, Integer>();
+ private AtomicLong globalCounter = new AtomicLong(0L);
+ private boolean isProcessed = false;
+ private boolean ignoreCase = false;
+ private int threads = 1;
+
+ /**
+  * Creates a counter over the given words.
+  *
+  * @param text       the words to count, one word per element
+  * @param ignoreCase when true, the case-insensitive flag is switched on; false leaves it as is
+  * @param threads    requested worker count; only values above 1 replace the current setting
+  */
+ WordCounter(List<String> text, boolean ignoreCase, int threads) {
+     this.text = text;
+     // Each option only ever overrides the field's existing value in one direction.
+     this.ignoreCase = ignoreCase || this.ignoreCase;
+     this.threads = (threads > 1) ? threads : this.threads;
+ }
/**
* Counts the words in the input collection and prints the results
* if the printResults parameter is set.
* Every element in the collection is a word from the text.
*/
- public static void countWords(List<String> text, boolean printResults) {
+ public static void countWordsSlow(List<String> text, boolean printResults) {
List<String> words = new ArrayList<String>();
List<Integer> counters = new ArrayList<Integer>();
-
for (String word : text) {
int pos = words.indexOf(word);
if (pos != -1) {
@@ -37,29 +65,157 @@ public static void countWords(List<String> text, boolean printResults) {
counters.add(1);
}
}
-
-
- // printing results
+
+ // printing results
int count = 0;
if (printResults) {
for (int i = 0; i < words.size(); i++) {
- System.out.print(words.get(i) + "=" + counters.get(i) + " ");
+ System.out.print(words.get(i) + "=" + counters.get(i) + " ");
count += counters.get(i);
}
System.out.println("\nTotal: " + count);
+ System.out.println("\nTotal Words: " + words.size());
+ }
+ }
+
+ static class WordFrequencyCounter extends Thread {
+ List<String> text;
+ private boolean ignoreCase = false;
+ ConcurrentHashMap<String, Integer> wordCounter;
+ AtomicLong globalCounter;
+
+ WordFrequencyCounter(List<String> words, boolean ignoreCase, ConcurrentHashMap<String, Integer> wordCounter, AtomicLong globalCounter) {
+ this.text = words;
+ if (ignoreCase) {
+ this.ignoreCase = ignoreCase;
+ }
+ this.wordCounter = wordCounter;
+ this.globalCounter = globalCounter;
+ }
+
+ public void run() {
+ System.out.println("Counting " + this.text.size() + " words [" + this.hashCode() + "]");
+ for (String word : this.text) {
+ if (this.ignoreCase) {
+ word = word.toLowerCase();
+ }
+ if (this.wordCounter.containsKey(word)) {
+ this.wordCounter.put(word, this.wordCounter.get(word)+1);
+ }
+ else {
+ this.wordCounter.put(word, 1);
+ }
+ this.globalCounter.incrementAndGet();
+ }
+ System.out.println("Finished counting [" + this.hashCode() + "]");
+ }
+ }
+
+ /**
+  * Returns the word-to-count map, computing it on the first call.
+  *
+  * The text is cut into at most {@code threads} contiguous slices that together cover the list
+  * exactly once; each non-empty slice is counted by its own WordFrequencyCounter task. Later
+  * calls return the already computed map.
+  *
+  * @return the shared map of word counts
+  */
+ private ConcurrentHashMap<String, Integer> wordCounts() {
+     if (!this.isProcessed) {
+         // Core and maximum size are both the requested thread count: a core size above the
+         // maximum is rejected by the constructor, and with an unbounded queue the pool never
+         // grows beyond its core size anyway.
+         ThreadPoolExecutor executor = new ThreadPoolExecutor(this.threads, this.threads, 0, TimeUnit.MILLISECONDS,
+                 new LinkedBlockingQueue<Runnable>());
+
+         int size = this.text.size();
+         int baseSize = size / this.threads;
+         int remainder = size % this.threads;
+         int fromIndex = 0;
+         int total = 0;
+
+         for (int threadId = 0; threadId < this.threads; threadId++) {
+             // The first `remainder` slices take one extra word so no word is dropped or
+             // indexed past the end of the list.
+             int toIndex = fromIndex + baseSize + (threadId < remainder ? 1 : 0);
+             if (toIndex == fromIndex) {
+                 continue; // fewer words than threads: nothing left to hand out
+             }
+             System.out.println("Queuing From: " + fromIndex + ", To: " + toIndex);
+             List<String> words = this.text.subList(fromIndex, toIndex);
+             executor.execute(new WordFrequencyCounter(words, this.ignoreCase, this.wordCounter, this.globalCounter));
+             total += words.size();
+             fromIndex = toIndex;
+         }
+
+         assert (total == size);
+
+         try {
+             executor.shutdown();
+             executor.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
+             this.isProcessed = true;
+         }
+         catch (InterruptedException ex) {
+             // NOTE(review): terminating the JVM from here is kept as in the original; consider
+             // propagating the interruption to the caller instead.
+             System.err.println("Processing interrupted");
+             System.exit(-1);
+         }
+     }
+     return this.wordCounter;
+ }
+
+ /**
+  * Prints the word counts ordered by word.
+  *
+  * @param reverse false for ascending natural order of the words, true for descending
+  */
+ public void outputCountByWords(boolean reverse) {
+     NavigableMap<String, Integer> byWord = new TreeMap<String, Integer>(wordCounts());
+     if (reverse) {
+         byWord = byWord.descendingMap();
+     }
+     outputCollection(byWord.entrySet());
+ }
+
+ /**
+  * Prints the word counts ordered by count, with ties ordered by word.
+  *
+  * @param reverse false for lowest count first, true for highest count first
+  */
+ public void outputCountByFrequency(boolean reverse) {
+     Comparator<Map.Entry<String, Integer>> order = new FrequencyComparator();
+     if (reverse) {
+         order = Collections.reverseOrder(order);
+     }
+     SortedSet<Map.Entry<String, Integer>> byFrequency = new TreeSet<Map.Entry<String, Integer>>(order);
+     byFrequency.addAll(wordCounts().entrySet());
+     outputCollection(byFrequency);
+ }
+
+ /**
+  * Prints the entries as space-separated {@code word=count} pairs on a single line, followed by
+  * the value of the global counter and the number of entries.
+  *
+  * @param col the entries to print, in the order they should appear
+  */
+ private void outputCollection(Collection<Map.Entry<String, Integer>> col) {
+     System.out.println("\n");
+
+     StringBuilder line = new StringBuilder();
+     for (Map.Entry<String, Integer> pair : col) {
+         line.append(pair.getKey()).append("=").append(pair.getValue()).append(" ");
      }
-
+
+     System.out.println(line.toString());
+     System.out.println("Total Iterations: " + globalCounter);
+     System.out.println("Total Unique Words: " + col.size());
  }
-
-
+
+ /**
+  * Orders map entries by ascending value; entries with equal values are ordered by key.
+  */
+ static class FrequencyComparator implements Comparator<Map.Entry<String, Integer>> {
+     @Override
+     public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
+         int byCount = left.getValue().compareTo(right.getValue());
+         // Fall back to the key only when the counts tie.
+         return (byCount != 0) ? byCount : left.getKey().compareTo(right.getKey());
+     }
+ }
+
+ /**
+  * Reads the words from "monte_cristo.txt", prints the counts in four orderings using the
+  * multi-threaded counter, then runs the list-based countWordsSlow, printing a timing after
+  * each step.
+  */
public static void main(String[] args) throws IOException {
long time = System.currentTimeMillis();
// The utility method - is the implementation details
List<String> text = Util.readTextFromFile("monte_cristo.txt");
+ WordCounter counter = new WordCounter(new ArrayList<String>(text), false, 4);
+
System.out.println("Initial reading: " + (System.currentTimeMillis() - time) + "ms");
- countWords(text, true);
- System.out.println("Time to process: " + (System.currentTimeMillis() - time) + "ms");
-
+
+ // This first output call is what triggers the actual counting. Its timing is measured from
+ // `time`, which was taken before the file was read, so it includes the reading step.
+ counter.outputCountByWords(false);
+ System.out.println("Time to process fast (natural order): " + (System.currentTimeMillis() - time) + "ms");
+
+ // The remaining timings each use their own start stamp, taken just before the call.
+ long timeReverseWord = System.currentTimeMillis();
+ counter.outputCountByWords(true);
+ System.out.println("Time to process fast (reverse natural order): " + (System.currentTimeMillis() - timeReverseWord) + "ms");
+
+ long timeFrequency = System.currentTimeMillis();
+ counter.outputCountByFrequency(false);
+ System.out.println("Time to process fast frequency (low-first): " + (System.currentTimeMillis() - timeFrequency) + "ms");
+
+ long timeFrequencyReverse = System.currentTimeMillis();
+ counter.outputCountByFrequency(true);
+ System.out.println("Time to process fast frequency (high-first): " + (System.currentTimeMillis() - timeFrequencyReverse) + "ms");
+
+ System.out.println("\n\n");
+ // Baseline: the list-based implementation run on the same words.
+ long timeSlow = System.currentTimeMillis();
+ countWordsSlow(text, true);
+ System.out.println("Time to process slow: " + (System.currentTimeMillis() - timeSlow) + "ms");
}
-
-}
+}

0 comments on commit 3d4658f

Please sign in to comment.