forked from harvard-lil/cap-examples-old
-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_wordclouds.py
105 lines (85 loc) · 4.07 KB
/
make_wordclouds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import csv
import json
import os
import re
from collections import defaultdict
from glob import glob
from tqdm import tqdm
from wordcloud import WordCloud
from helpers import makedirs
# Root of the input tree; jurisdiction directories beneath it are globbed for
# their JSON word-frequency files.
source_dir = "/ftldata/ngrams_summarized"
# Root of the output tree that receives the rendered word cloud PNGs.
dest_dir = "/ftldata/wordclouds"
# Root of the output tree that receives the ranking CSVs.
top_words_dir = "/ftldata/top_words"
# Accept only tokens that begin with a lowercase ASCII letter and otherwise
# contain nothing but lowercase letters, hyphens and apostrophes.
lowercase_re = re.compile(r'^[a-z][a-z\-\']*$')

# Words excluded from every ranking, however frequent they are.
stop_words = set("""
subd affd finds pursuant subdivision supra available noted your supreme thing code concurred chap post ante brief
merit alia allegedly alleging asserted hereinafter complainant complainants claimant claimants paragraph insofar
herein aforesaid
""".strip().split())


def process_word_dict(word_dict):
    """Return the most frequent acceptable words found in ``word_dict``.

    A word is kept when it is longer than three characters, matches
    ``lowercase_re`` and is not listed in ``stop_words``.

    Args:
        word_dict: mapping of word -> frequency.

    Returns:
        A list of at most 1000 ``(word, freq)`` tuples ordered by descending
        frequency.  Words with equal frequency keep the order in which the
        mapping yielded them, because the sort is stable.
    """
    # ``items()`` works on both Python 2 and Python 3 dicts, whereas
    # ``iteritems()`` only exists on Python 2.
    out = [
        (word, freq)
        for word, freq in word_dict.items()
        if len(word) > 3 and lowercase_re.match(word) and word not in stop_words
    ]
    out.sort(key=lambda pair: pair[1], reverse=True)
    return out[:1000]
def save_wordcloud(freqs, path):
    """Render ``freqs`` as an 800x800 word cloud on white and save it to ``path``.

    Args:
        freqs: word frequencies, handed unchanged to
            ``WordCloud.generate_from_frequencies``.
        path: destination handed to the rendered image's ``save`` method.
    """
    # NOTE(review): random_state is pinned to 0, presumably so that repeated
    # runs lay the words out identically -- confirm against the wordcloud docs.
    cloud = WordCloud(
        width=800,
        height=800,
        background_color="white",
        random_state=0,
    )
    cloud.generate_from_frequencies(freqs)
    cloud.to_image().save(path)
def save_top_list(headers, data, path):
    """Write ``headers`` followed by every row of ``data`` to a CSV at ``path``.

    Args:
        headers: sequence of column names, written as the first row.
        data: iterable of rows (each a sequence of cell values).
        path: file to create or overwrite.
    """
    # Local import so this function does not depend on a module-level import.
    import sys

    # The csv module wants a binary handle on Python 2 but a text handle opened
    # with newline='' on Python 3; either way the writer's own "\r\n" line
    # endings reach the file untranslated.
    if sys.version_info[0] >= 3:
        csvfile = open(path, 'w', newline='', encoding='utf-8')
    else:
        csvfile = open(path, 'wb')

    with csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(headers)
        csvwriter.writerows(data)
def _load_word_freqs(json_path):
    """Load a word -> count JSON file and return its filtered top words."""
    # The context manager closes the handle as soon as the JSON is parsed
    # instead of leaving it open until garbage collection.
    with open(json_path) as json_file:
        return process_word_dict(json.load(json_file))


def wordcloud_all_jurisdictions(jurisdiction_pattern="Illinois"):
    """Write word clouds and ranking CSVs for each matching jurisdiction.

    For every directory under ``source_dir`` matching ``jurisdiction_pattern``:

    * ``totals/1.json`` is rendered to ``totals.png``;
    * every other entry is treated as a year whose ``1.json`` is loaded, and
      years whose retained word counts sum to fewer than 10000 are skipped;
    * each word gets a per-year score (1000 for the most frequent word, one
      less per position) and the average of that score over all retained
      years, which is written to ``average_ranks.csv``;
    * for each year, words are ordered by how far their score for that year
      exceeds their average; the top 200 are rendered to ``<year>.png`` and
      the full ordering is written to ``<year>_ranks.csv``.

    Args:
        jurisdiction_pattern: glob pattern, relative to ``source_dir``,
            selecting the jurisdiction directories to process.  Defaults to
            ``"Illinois"``.
    """
    max_score = 1000        # score given to the top word of a year
    cloud_size = 200        # number of words drawn in a per-year cloud
    min_year_total = 10000  # years with a smaller total count are ignored

    for jurisdiction_path in glob(os.path.join(source_dir, jurisdiction_pattern)):
        # %-formatting prints the same text on Python 2 and Python 3.
        print("Processing %s" % jurisdiction_path)

        out_dir = jurisdiction_path.replace(source_dir, dest_dir, 1)
        top_words_out_dir = jurisdiction_path.replace(source_dir, top_words_dir, 1)
        makedirs(out_dir)
        makedirs(top_words_out_dir)

        # write global
        global_freqs = _load_word_freqs(os.path.join(jurisdiction_path, 'totals/1.json'))
        save_wordcloud(global_freqs, os.path.join(out_dir, 'totals.png'))

        # load year data
        bare_top_words_by_year = {}
        for year_path in tqdm(glob(os.path.join(jurisdiction_path, "*"))):
            year = os.path.basename(year_path)
            if year == 'totals':
                continue
            year_freqs = _load_word_freqs(os.path.join(year_path, '1.json'))

            # skip years with few cases (probably typos)
            if sum(freq for _, freq in year_freqs) < min_year_total:
                continue

            bare_top_words_by_year[year] = [word for word, _ in year_freqs]

        # calculate global rankings
        word_to_ranking = defaultdict(int)
        for words in tqdm(bare_top_words_by_year.values()):
            for pos, word in enumerate(words):
                word_to_ranking[word] += max_score - pos

        # Divide by the number of retained years, so a year in which a word is
        # absent counts as a score of 0.  Floor division keeps the averages
        # integral on Python 3, matching Python 2's int / int result.
        year_count = len(bare_top_words_by_year)
        average_rank = {
            word: total // year_count
            for word, total in word_to_ranking.items()
        }

        # write average ranks CSV
        save_top_list(['word', 'average_rank'],
                      [[word, max_score - rank] for word, rank in average_rank.items()],
                      os.path.join(top_words_out_dir, 'average_ranks.csv'))

        # calculate scores by year
        year_to_volatile_words = {}
        for year, words in tqdm(bare_top_words_by_year.items()):
            word_to_ranking_delta = {}
            for pos, word in enumerate(words):
                rank_for_year = max_score - pos
                # (score this year, average score, how far above average)
                word_to_ranking_delta[word] = (
                    rank_for_year,
                    average_rank[word],
                    rank_for_year - average_rank[word],
                )
            year_to_volatile_words[year] = sorted(
                word_to_ranking_delta.items(),
                key=lambda item: item[1][2],
                reverse=True,
            )

        # wordclouds
        for year, words in tqdm(year_to_volatile_words.items()):
            freqs = [(w[0], w[1][2]) for w in words[:cloud_size]]
            save_wordcloud(freqs, os.path.join(out_dir, '%s.png' % year))

            # write year ranks CSV
            save_top_list(['word', 'absolute_rank', 'relative_score'],
                          [(w[0], max_score - w[1][0], w[1][2]) for w in words],
                          os.path.join(top_words_out_dir, '%s_ranks.csv' % year))
# Run the word cloud / ranking export only when executed as a script.
if __name__ == "__main__":
    wordcloud_all_jurisdictions()