/
get.top.posneg.py
58 lines (52 loc) · 2.55 KB
/
get.top.posneg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from __future__ import print_function
import json
import operator
import math
#from scipy.stats import entropy
# Categories to analyse. Every category must have input files named
# posneg-<part>/data-<category>-<star>.jsonl for each part and star value below.
categories = ['books', 'restaurants', 'attractions', 'clothing_shoes_jewelry', 'home_kitchen', 'hotels', 'nightlife', 'event_planning_services', 'casinos', 'hairsalons', 'resorts', 'dentists']

NUM_PARTS = 10               # input directories posneg-0 ... posneg-9
STAR_VALUES = range(1, 6)    # input file suffixes 1 ... 5
POLARITIES = ('neg', 'pos')  # keys read from every JSON record, in this order
MIN_COUNT = 1000             # a pair must occur MORE than this often in-category
TOP_K = 5                    # number of pairs reported per category
OUTPUT_PATH = 'toptuples.txt'


def pairs_from_lines(lines):
    """Yield an ``(item[0], item[1])`` tuple for every 'neg'/'pos' item.

    ``lines`` is an iterable of JSON-encoded records (one per line), each an
    object whose 'neg' and 'pos' values are sequences of items with at least
    two elements.  Blank lines are skipped instead of crashing the JSON
    parser.

    Pairs are kept as tuples rather than joined with '--': a joined key is
    split incorrectly when a token itself contains '--', and two different
    pairs could collapse into the same key.
    """
    for line in lines:
        if not line.strip():
            continue
        record = json.loads(line)
        for polarity in POLARITIES:
            for item in record[polarity]:
                yield (item[0], item[1])


def iter_category_pairs(category):
    """Yield every pair found in all input files belonging to ``category``."""
    for part in range(NUM_PARTS):
        for star in STAR_VALUES:
            path = 'posneg-' + str(part) + '/data-' + category + '-' + str(star) + '.jsonl'
            with open(path, 'r') as fh:
                for pair in pairs_from_lines(fh):
                    yield pair


def count_inside(category):
    """Return ``{pair: occurrences}`` for the files of ``category``."""
    counts = {}
    for pair in iter_category_pairs(category):
        counts[pair] = counts.get(pair, 0) + 1
    return counts


def count_outside(category, in_counts):
    """Return ``{pair: occurrences in all OTHER categories}``.

    Only pairs present in ``in_counts`` are tracked; each of them is present
    in the result, with 0 if it never occurs outside ``category``.
    """
    # NOTE: the other categories' files are re-read for every category, as in
    # the original script.  Caching per-category counts would avoid the
    # repeated I/O but would keep all categories in memory at once.
    out_counts = dict.fromkeys(in_counts, 0)
    for other in categories:
        if other == category:
            continue
        for pair in iter_category_pairs(other):
            if pair in out_counts:
                out_counts[pair] += 1
    return out_counts


def kl_scores(in_counts, out_counts, min_count=MIN_COUNT):
    """Return ``{pair: p * log(p / q)}`` for sufficiently frequent pairs.

    ``p`` is the pair's share of all in-category occurrences and ``q`` its
    share of all out-of-category occurrences (both totals are taken over the
    given dicts).  A pair is scored only if it occurs more than ``min_count``
    times in-category and at least once out-of-category.
    """
    in_total = sum(in_counts.values())
    out_total = sum(out_counts.values())
    scores = {}
    for pair, count in in_counts.items():
        out_count = out_counts.get(pair, 0)
        if count > min_count and out_count > 0:
            # Ratios are computed only after the guard: out_count > 0 implies
            # out_total > 0, so neither division can hit zero.
            pk = float(count) / in_total
            qk = float(out_count) / out_total
            scores[pair] = pk * math.log(pk / qk)
    return scores


def format_line(category, scores, top=TOP_K):
    """Return the report line for ``category`` (without trailing newline).

    The format is ``'<category>: (a, b, score) (a, b, score) '`` with the
    ``top`` highest-scoring pairs in descending score order; every entry,
    including the last, is followed by a single space.
    """
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    entries = []
    for pair, score in ranked[:top]:
        entries.append('(' + pair[0] + ', ' + pair[1] + ', ' + str(score) + ') ')
    return category + ': ' + ''.join(entries)


def main():
    """Write the top pairs of every category to stdout and OUTPUT_PATH."""
    # Text mode ('w', not 'wb') so that str can be written on Python 3 too.
    with open(OUTPUT_PATH, 'w') as fw:
        for category in categories:
            in_counts = count_inside(category)
            out_counts = count_outside(category, in_counts)
            line = format_line(category, kl_scores(in_counts, out_counts))
            print(line)
            fw.write(line + '\n')


if __name__ == '__main__':
    main()