In [1]:
import pandas as pd

In [2]:
def parse_millisecond_timestamp(ts):
    """Interpret ``ts`` as milliseconds elapsed since the Unix epoch.

    The argument is handed unchanged to ``pd.to_datetime`` with
    ``unit="ms"`` and the converted value is returned as-is.

    Note: ``utc=True`` is not passed, so for numeric input the result is
    timezone-naive; the wall-clock values are to be read as UTC.
    """
    converted = pd.to_datetime(ts, unit="ms")
    return converted

# Load the tab-separated, header-less news file and label its columns.
# Four low-cardinality text columns are stored as pandas categoricals.
#
# The timestamp column is deliberately NOT parsed inside read_csv: the
# `date_parser` argument is deprecated since pandas 2.0, and the documented
# replacement is to read the raw values and convert them afterwards. Here
# "tstamp" is read as plain numbers (milliseconds since the Unix epoch) and
# converted in a single vectorized call once the frame is loaded.
df = pd.read_csv(
    "groupby-data/news.csv",
    sep="\t",
    header=None,
    index_col=0,
    names=["title", "url", "outlet", "category", "cluster", "host", "tstamp"],
    dtype={
        "outlet": "category",
        "category": "category",
        "cluster": "category",
        "host": "category",
    },
).assign(
    # .assign returns a new frame, so re-running the cell is idempotent.
    tstamp=lambda frame: parse_millisecond_timestamp(frame["tstamp"])
)

  df = pd.read_csv(


In [4]:
# Show the first row (selected by position) as a Series: one entry per column.
df.iloc[0]

title       Fed official says weak data caused by weather,...
url         http://www.latimes.com/business/money/la-fi-mo...
outlet                                      Los Angeles Times
category                                                    b
cluster                         ddUyU0VZz0BRneMioxUPQVP6sIxvM
host                                          www.latimes.com
tstamp                             2014-03-10 16:52:50.698000
Name: 1, dtype: object

In [6]:
# Baseline approach: hand every outlet's titles to a Python lambda that counts
# how many of them contain "Fed", then keep the ten outlets with the most hits.
titles_by_outlet = df.groupby("outlet", sort=False, observed=False)["title"]
fed_counts = titles_by_outlet.apply(
    lambda titles: titles.str.contains("Fed").sum()
)
fed_counts.nlargest(10)

outlet
Reuters                         161
NASDAQ                          103
Businessweek                     93
Investing.com                    66
Wall Street Journal \(blog\)     61
MarketWatch                      56
Moneynews                        55
Bloomberg                        53
GlobalPost                       51
Economic Times                   44
Name: title, dtype: int64

In [9]:
## Better approach: vectorize the string test first, then group

In [10]:
import numpy as np

In [12]:
# Vectorized approach: run the substring test once over the whole "title"
# column, then sum the resulting flags per outlet and keep the top ten.
mentions_fed = df["title"].str.contains("Fed")
fed_per_outlet = mentions_fed.groupby(
    df["outlet"], sort=False, observed=False
).sum()
# Cast the ten largest counts to the platform's C unsigned int.
fed_per_outlet.nlargest(10).astype(np.uintc)

outlet
Reuters                         161
NASDAQ                          103
Businessweek                     93
Investing.com                    66
Wall Street Journal \(blog\)     61
MarketWatch                      56
Moneynews                        55
Bloomberg                        53
GlobalPost                       51
Economic Times                   44
Name: title, dtype: uint32

In [14]:
import timeit

In [16]:
def test_apply():
    """Version 1: using `.apply()`

    Benchmark body: groups the module-level ``df`` by outlet, counts the
    titles containing "Fed" in each group through a Python lambda, and
    takes the ten largest counts. The result is discarded; nothing is
    returned.
    """
    grouped_titles = df.groupby("outlet", sort=False, observed=False)["title"]
    per_outlet = grouped_titles.apply(
        lambda titles: titles.str.contains("Fed").sum()
    )
    per_outlet.nlargest(10)

def test_vectorization():
    """Version 2: using vectorization

    Benchmark body: tests every title of the module-level ``df`` for "Fed"
    in one vectorized call, sums the flags per outlet, takes the ten
    largest counts and casts them to ``np.uintc``. The result is
    discarded; nothing is returned.
    """
    has_fed = df["title"].str.contains("Fed")
    per_outlet = has_fed.groupby(df["outlet"], sort=False, observed=False).sum()
    per_outlet.nlargest(10).astype(np.uintc)

# Time each version; with number=3, timeit.timeit reports the total seconds
# taken by three consecutive calls (not a per-call average).
apply_seconds = timeit.timeit(test_apply, number=3)
print(f"Version 1: {apply_seconds}")
vectorized_seconds = timeit.timeit(test_vectorization, number=3)
print(f"Version 2: {vectorized_seconds}")

Version 1: 7.137802042001567
Version 2: 0.2348733329999959
