### itertools.groupby(iterable, key=None)

<blockquote>The `itertools.groupby()` function makes an iterator that returns consecutive keys and groups from the iterable. The key is a function computing a key value for each element. If not specified or is None, key defaults to an identity function and returns the element unchanged. Generally, the iterable needs to already be sorted on the same key function.</blockquote>

In [None]:
import itertools

chars = 'AAABBBBCCCDDDD'
print(chars)

# groupby() is lazy: calling it only builds an iterator of (key, group)
# pairs and nothing is computed until that iterator is consumed --
# wrap the call in list(...) to peek at the pairs.
itertools.groupby(chars)

# Each group is itself an iterator over one run of identical characters;
# counting its items gives the length of that run.
for key, group in itertools.groupby(chars):
    run_length = sum(1 for _ in group)
    print(key, run_length)


AAABBBBCCCDDDD
A 3
B 4
C 3
D 4


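Grouping a list of dictionaries by city. Because `groupby()` only groups consecutive items, the list is first sorted by the same key.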

In [None]:
d = [
    {'name': 'Martin', 'age': 25, 'city': 'Glasgow'},
    {'name': 'John', 'age': 32, 'city': 'New York'},
    {'name': 'Melissa', 'age': 55, 'city': 'New York'},
    {'name': 'James', 'age': 35, 'city': 'Chicago'},
    {'name': 'Michael', 'age': 45, 'city': 'Scranton'},
    {'name': 'Pam', 'age': 29, 'city': 'Scranton'},
    {'name': 'Dwight', 'age': 35, 'city': 'Scranton'},
]

# groupby() only merges neighbouring records that share a key, so place
# people from the same city next to each other before grouping.
d.sort(key=lambda person: person['city'])

# Variations worth trying in the loop below:
#   itertools.groupby(d)      -> no key function, records are compared as a whole
#   print(key, list(group))   -> show the raw records that make up each group

for key, group in itertools.groupby(d, key=lambda person: person['city']):
    # A group is a one-shot iterator; materialise it so the ages can be
    # read once and reused for both statistics.
    group_data = list(group)
    ages = [member['age'] for member in group_data]
    group_avg = sum(ages) / len(ages)
    group_max = max(ages)
    print(key, group_avg, group_max)

Chicago 35.0 35
Glasgow 25.0 25
New York 43.5 55
Scranton 36.333333333333336 45


## Pandas - groupby()

Pandas offers excellent tools for grouping data.

In [None]:
import pandas as pd

# One row per dict in `d`, one column per dict key.
df = pd.DataFrame(data=d)
# `d` holds only seven records, so asking for ten rows shows the whole frame.
df.head(n=10)

Unnamed: 0,name,age,city
0,James,35,Chicago
1,Martin,25,Glasgow
2,John,32,New York
3,Melissa,55,New York
4,Michael,45,Scranton
5,Pam,29,Scranton
6,Dwight,35,Scranton


In [None]:
# Aggregate only the numeric 'age' column. Since pandas 2.0, mean()/std()
# no longer silently drop non-numeric columns such as 'name' and raise a
# TypeError instead, so the column is selected explicitly. The double
# brackets keep the result a DataFrame.
df.groupby('city')[['age']].mean()  # swap in .sum() for per-city totals
df.groupby('city')[['age']].std()
df.groupby('city').size()  # returns a series with counts

import numpy as np
def sum_sqrt_ages(series):
    """Return the sum of the square roots of the values in ``series``.

    Intended as a custom aggregation function: it receives a collection
    of numbers and reduces it to a single number.
    """
    roots = np.sqrt(series)
    return np.sum(roots)

# Pick the 'age' column of every city group, then collapse each group to a
# single number with the custom aggregation function.
ages_by_city = df.groupby('city')['age']
ages_by_city.agg(sum_sqrt_ages)

city
Chicago      5.916080
Glasgow      5.000000
New York    13.073053
Scranton    18.009449
Name: age, dtype: float64

In [None]:
# Reproduce the Scranton value of the grouped aggregation by hand:
# keep only the Scranton rows, take the square root of each age, add them up.
in_scranton = df['city'] == 'Scranton'
scranton = df[in_scranton]
np.sum(np.sqrt(scranton['age']))

Let's analyze some movies!

In [None]:
import os

# os.listdir()

# index_col=0 turns the first CSV column into the row index; without it
# that column would be loaded as an ordinary data column.
df = pd.read_csv('IMDB top 1000.csv', index_col=0)


def _director_from_cast(cast):
    """Extract the director from a Cast string.

    Example: 'Director: Frank Darabont | Stars: Tim Robbins,...'
    gives 'Frank Darabont'.
    """
    leading_segment = cast.split("|")[0]  # text before the first '|'
    return leading_segment.split(":")[1].strip()  # text after the label's ':'


df['director'] = df['Cast'].apply(_director_from_cast)
df.head()

Unnamed: 0,Title,Certificate,Duration,Genre,Rate,Metascore,Description,Cast,Info,director
0,1. The Shawshank Redemption (1994),R,142 min,Drama,9.3,80.0,Two imprisoned men bond over a number of years...,"Director: Frank Darabont | Stars: Tim Robbins,...","Votes: 2,295,987 | Gross: $28.34M",Frank Darabont
1,2. The Godfather (1972),R,175 min,"Crime, Drama",9.2,100.0,The aging patriarch of an organized crime dyna...,Director: Francis Ford Coppola | Stars: Marlon...,"Votes: 1,584,782 | Gross: $134.97M",Francis Ford Coppola
2,3. The Dark Knight (2008),PG-13,152 min,"Action, Crime, Drama",9.0,84.0,When the menace known as the Joker wreaks havo...,Director: Christopher Nolan | Stars: Christian...,"Votes: 2,260,649 | Gross: $534.86M",Christopher Nolan
3,4. The Godfather: Part II (1974),R,202 min,"Crime, Drama",9.0,90.0,The early life and career of Vito Corleone in ...,Director: Francis Ford Coppola | Stars: Al Pac...,"Votes: 1,107,253 | Gross: $57.30M",Francis Ford Coppola
4,5. The Lord of the Rings: The Return of the Ki...,PG-13,201 min,"Action, Adventure, Drama",8.9,94.0,Gandalf and Aragorn lead the World of Men agai...,"Director: Peter Jackson | Stars: Elijah Wood, ...","Votes: 1,614,369 | Gross: $377.85M",Peter Jackson


In [None]:
# Films per director: size() counts the rows in each group.
films_per_director = df.groupby('director').size()

# sort_values() sorts ascending, so reverse the result to put the most
# prolific directors first.
ranked = films_per_director.sort_values()[::-1]

# top 10 directors
ranked[:10]

director
Akira Kurosawa       16
Frank Capra          15
David Lynch          15
Neeraj Pandey        15
Lars von Trier       14
Ki-duk Kim           14
Brad Bird            14
Stephen Chbosky      14
Billy Wilder         12
Quentin Tarantino    12
dtype: int64

To use `itertools.groupby()`, we can convert the DataFrame to a list of dictionaries (one per row) with `df.to_dict(orient='records')`.

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_dict.html
# orient='records' produces one dict per row, collected in a list
movie_dict = df.to_dict(orient='records')

# groupby() needs equal keys to sit next to each other, so order the rows
# by director before grouping
movie_dict.sort(key=lambda movie: movie['director'])

# Map every director to the number of rows in their run of consecutive films
counts = {
    director: sum(1 for _ in films)
    for director, films in itertools.groupby(movie_dict, key=lambda movie: movie['director'])
}

# Ten directors with the most films (evaluated but not displayed, since only
# the final expression of a cell is shown)
sorted(counts.items(), key=lambda pair: pair[1], reverse=True)[:10]

counts['Martin Scorsese']

7

In [None]:
# Boolean masks selecting the rows of one director each.
by_scorsese = df['director'] == 'Martin Scorsese'
by_spielberg = df['director'] == 'Steven Spielberg'

# Only the last expression of a cell is displayed, so the Spielberg rows
# are what appears below; run the Scorsese line on its own to see his films.
df[by_scorsese]
df[by_spielberg]

Unnamed: 0,Title,Certificate,Duration,Genre,Rate,Metascore,Description,Cast,Info,director
6,7. Schindler's List (1993),R,195 min,"Biography, Drama, History",8.9,94.0,"In German-occupied Poland during World War II,...",Director: Steven Spielberg | Stars: Liam Neeso...,"Votes: 1,191,793 | Gross: $96.90M",Steven Spielberg
23,24. Saving Private Ryan (1998),R,169 min,"Drama, War",8.6,91.0,"Following the Normandy Landings, a group of U....","Director: Steven Spielberg | Stars: Tom Hanks,...","Votes: 1,212,774 | Gross: $216.54M",Steven Spielberg
71,72. Raiders of the Lost Ark (1981),PG,115 min,"Action, Adventure",8.4,85.0,"In 1936, archaeologist and adventurer Indiana ...",Director: Steven Spielberg | Stars: Harrison F...,"Votes: 870,596 | Gross: $248.16M",Steven Spielberg
166,167. Indiana Jones and the Last Crusade (1989),PG-13,127 min,"Action, Adventure",8.2,65.0,"In 1938, after his father Professor Henry Jone...",Director: Steven Spielberg | Stars: Harrison F...,"Votes: 681,051 | Gross: $197.17M",Steven Spielberg
239,242. Catch Me If You Can (2002),PG-13,141 min,"Biography, Crime, Drama",8.1,75.0,A seasoned FBI agent pursues Frank Abagnale Jr...,Director: Steven Spielberg | Stars: Leonardo D...,"Votes: 808,593 | Gross: $164.62M",Steven Spielberg
257,260. Jurassic Park (1993),PG-13,127 min,"Action, Adventure, Sci-Fi",8.1,68.0,A pragmatic paleontologist visiting an almost ...,"Director: Steven Spielberg | Stars: Sam Neill,...","Votes: 852,358 | Gross: $402.45M",Steven Spielberg
