In [15]:
#How to do aggregates using groupby, defaultdict and Counter on flat files
#This example groups the rows by pcd_sector_id and computes, per group:
#pcd_sector_id, count(rows), sum(cells), avg(cells), max(cells), min(cells)
#
#By Nestor Nissen

from csv import DictReader
from operator import itemgetter
from itertools import groupby
from collections import defaultdict, Counter

# Inline CSV sample: a header row followed by one record per
# (pcd_sector_id, technology) pair. The literal starts and ends with a
# newline for readability; strip() removes both so that splitlines()
# yields exactly the header line and the four data lines.
rawdata = '''
pcd_sector_id,technology,cells
1,LTE,5
2,LTE-A,3
2,LTE,7
3,LTE,3
'''.strip().splitlines()
print(rawdata)

['pcd_sector_id,technology,cells', '1,LTE,5', '2,LTE-A,3', '2,LTE,7', '3,LTE,3']


In [16]:
# Parse the CSV lines into one mapping per data row. DictReader takes the
# column names from the first line, and every value stays a string.
reader = DictReader(rawdata)
indata = list(reader)
print(indata)

[{'cells': '5', 'pcd_sector_id': '1', 'technology': 'LTE'}, {'cells': '3', 'pcd_sector_id': '2', 'technology': 'LTE-A'}, {'cells': '7', 'pcd_sector_id': '2', 'technology': 'LTE'}, {'cells': '3', 'pcd_sector_id': '3', 'technology': 'LTE'}]


In [18]:
print('Using sort and groupby:')

# Each result is a list of (pcd_sector_id, value) tuples, one per sector.
counts = []
sums = []
avgs = []
maxs = []
mins = []
# groupby() only merges *adjacent* rows with equal keys, so the rows must be
# sorted by the grouping key first. The keys are strings, so the sectors come
# out in lexicographic order.
ordered_data = sorted(indata, key=itemgetter('pcd_sector_id'))
for pcd_sector_id, group in groupby(ordered_data, key=itemgetter('pcd_sector_id')):
    # `group` is a one-shot iterator: materialise it once, converting the
    # 'cells' field to int a single time, then compute every aggregate from
    # the parsed list.
    cell_values = [int(row['cells']) for row in group]
    count = len(cell_values)
    total = sum(cell_values)
    counts.append((pcd_sector_id, count))
    sums.append((pcd_sector_id, total))
    # groupby never yields an empty group, so count is always >= 1 here.
    avgs.append((pcd_sector_id, total/count))
    maxs.append((pcd_sector_id, max(cell_values)))
    mins.append((pcd_sector_id, min(cell_values)))
print('count:',counts, '\nsum:',sums, '\navg:',avgs,
      '\nmax:',maxs, '\nmin:',mins, '\n')


Using sort and groupby:
count: [('1', 1), ('2', 2), ('3', 1)] 
sum: [('1', 5), ('2', 10), ('3', 3)] 
avg: [('1', 5.0), ('2', 5.0), ('3', 3.0)] 
max: [('1', 5), ('2', 7), ('3', 3)] 
min: [('1', 5), ('2', 3), ('3', 3)] 



In [20]:
print('Using defaultdict:')
# One accumulator per aggregate, keyed by pcd_sector_id. A single pass over
# the rows is enough, and no sorting is needed.
dd_counts = defaultdict(int)
dd_cells = defaultdict(int)
# Seed the running max/min with -inf/+inf so that the first real value always
# replaces the default. A default of 0 would report a max of 0 for a group
# whose values are all negative, and a large finite sentinel would be wrong
# for any value above it.
dd_maxs = defaultdict(lambda: float('-inf'))
dd_mins = defaultdict(lambda: float('inf'))
for row in indata:
    pcd_sector_id = row['pcd_sector_id']
    cells = int(row['cells'])
    dd_counts[pcd_sector_id] += 1
    dd_cells[pcd_sector_id] += cells
    # max()/min() return one of their arguments, so the stored values stay
    # ints once a real value has been seen.
    dd_maxs[pcd_sector_id] = max(dd_maxs[pcd_sector_id], cells)
    dd_mins[pcd_sector_id] = min(dd_mins[pcd_sector_id], cells)
# Flatten each accumulator into a list of (pcd_sector_id, value) tuples.
counts = list(dd_counts.items())
sums = list(dd_cells.items())
# Every key in dd_counts was incremented at least once, so count >= 1.
avgs = [(key, dd_cells[key]/count) for key, count in dd_counts.items()]
maxs = list(dd_maxs.items())
mins = list(dd_mins.items())
print('count:',counts, '\nsum:',sums, '\navg:',avgs,
      '\nmax:',maxs, '\nmin:',mins, '\n')


Using defaultdict:
count: [('2', 2), ('3', 1), ('1', 1)] 
sum: [('2', 10), ('3', 3), ('1', 5)] 
avg: [('2', 5.0), ('3', 3.0), ('1', 5.0)] 
max: [('2', 7), ('3', 3), ('1', 5)] 
min: [('2', 3), ('3', 3), ('1', 5)] 



In [21]:
print('Using counter:')
# Counter tallies how many rows carry each pcd_sector_id. It only covers the
# count aggregate; sums, averages, max and min need one of the other
# approaches.
sector_tally = Counter(row['pcd_sector_id'] for row in indata)
counts = list(sector_tally.items())
print('count:',counts)

Using counter:
count: [('2', 2), ('3', 1), ('1', 1)]
