# Work with Grouping and Aggregate Functions

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

<dl class="method">
<dt id="pandas.DataFrame.groupby">
<code class="descclassname">DataFrame.</code><code class="descname">groupby</code><span class="sig-paren">(</span><em>by=None</em>, <em>axis=0</em>, <em>level=None</em>, <em>as_index=True</em>, <em>sort=True</em>, <em>group_keys=True</em>, <em>squeeze=False</em>, <em>observed=False</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference external" href="http://github.com/pandas-dev/pandas/blob/v0.23.4/pandas/core/generic.py#L6592-L6665"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pandas.DataFrame.groupby" title="Permalink to this definition">¶</a></dt>
<dd><p>Group series using mapper (dict or key function, apply given function
to group, return result as series) or by a series of columns.</p>
<table class="docutils field-list" frame="void" rules="none">
<colgroup><col class="field-name">
<col class="field-body">
</colgroup><tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><p class="first"><strong>by</strong> : mapping, function, label, or list of labels</p>
<blockquote>
<div><p>Used to determine the groups for the groupby.
If <code class="docutils literal notranslate"><span class="pre">by</span></code> is a function, it’s called on each value of the object’s
index. If a dict or Series is passed, the Series or dict VALUES
will be used to determine the groups (the Series’ values are first
aligned; see <code class="docutils literal notranslate"><span class="pre">.align()</span></code> method). If an ndarray is passed, the
values are used as-is determine the groups. A label or list of
labels may be passed to group by the columns in <code class="docutils literal notranslate"><span class="pre">self</span></code>. Notice
that a tuple is interpreted a (single) key.</p>
</div></blockquote>
<dl class="docutils">
<dt><strong>axis</strong> <span class="classifier-delimiter">:</span> <span class="classifier">int, default 0</span></dt>
<dd></dd>
</dl>
<p><strong>level</strong> : int, level name, or sequence of such, default None</p>
<blockquote>
<div><p>If the axis is a MultiIndex (hierarchical), group by a particular
level or levels</p>
</div></blockquote>
<p><strong>as_index</strong> : boolean, default True</p>
<blockquote>
<div><p>For aggregated output, return object with group labels as the
index. Only relevant for DataFrame input. as_index=False is
effectively “SQL-style” grouped output</p>
</div></blockquote>
<p><strong>sort</strong> : boolean, default True</p>
<blockquote>
<div><p>Sort group keys. Get better performance by turning this off.
Note this does not influence the order of observations within each
group.  groupby preserves the order of rows within each group.</p>
</div></blockquote>
<p><strong>group_keys</strong> : boolean, default True</p>
<blockquote>
<div><p>When calling apply, add group keys to index to identify pieces</p>
</div></blockquote>
<p><strong>squeeze</strong> : boolean, default False</p>
<blockquote>
<div><p>reduce the dimensionality of the return type if possible,
otherwise return a consistent type</p>
</div></blockquote>
<p><strong>observed</strong> : boolean, default False</p>
<blockquote>
<div><p>This only applies if any of the groupers are Categoricals
If True: only show observed values for categorical groupers.
If False: show all values for categorical groupers.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.23.0.</span></p>
</div>
</div></blockquote>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><dl class="first last docutils">
<dt><strong>GroupBy object</strong></dt>
<dd></dd>
</dl>
</td>
</tr>
</tbody>
</table>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<dl class="last docutils">
<dt><a class="reference internal" href="pandas.DataFrame.resample.html#pandas.DataFrame.resample" title="pandas.DataFrame.resample"><code class="xref py py-obj docutils literal notranslate"><span class="pre">resample</span></code></a></dt>
<dd>Convenience method for frequency conversion and resampling of time series.</dd>
</dl>
</div>
<p class="rubric">Notes</p>
<p>See the <a class="reference external" href="http://pandas.pydata.org/pandas-docs/stable/groupby.html">user guide</a> for more.</p>
<p class="rubric">Examples</p>
<p>DataFrame results</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span><span class="o">.</span><span class="n">groupby</span><span class="p">([</span><span class="s1">'col1'</span><span class="p">,</span> <span class="s1">'col2'</span><span class="p">])[</span><span class="s1">'col3'</span><span class="p">]</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
</pre></div>
</div>
<p>DataFrame with hierarchical index</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span><span class="o">.</span><span class="n">groupby</span><span class="p">([</span><span class="s1">'col1'</span><span class="p">,</span> <span class="s1">'col2'</span><span class="p">])</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
</pre></div>
</div>
</dd></dl>

In [70]:
tips = sns.load_dataset('tips')
print(tips.head(), '\n')
print(tips.describe())

   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4 

       total_bill         tip        size
count  244.000000  244.000000  244.000000
mean    19.785943    2.998279    2.569672
std      8.902412    1.383638    0.951100
min      3.070000    1.000000    1.000000
25%     13.347500    2.000000    2.000000
50%     17.795000    2.900000    2.000000
75%     24.127500    3.562500    3.000000
max     50.810000   10.000000    6.000000


In [17]:
#Find average tip for men and women
tips.groupby('sex').mean()['tip']

sex
Male      3.089618
Female    2.833448
Name: tip, dtype: float64

### Who are better tippers? i.e. Who leaves a bigger percentage tip? Men or Women?

In [66]:
tips = sns.load_dataset('tips')

# Calculate means
tips_sex = tips.groupby('sex').mean()
tips_sex['tip_pct'] = tips_sex.apply(lambda x: 100 * (x['tip'] / x['total_bill']), axis=1)

# Sort and compare the tip_pcts
tips_sex.sort_values(by='tip_pct', ascending=False, inplace=True)
print(tips_sex)

better_tippers = tips_sex.iloc[0].name
print('{}s tend to be better tippers.'.format(better_tippers))

        total_bill       tip      size    tip_pct
sex                                              
Female   18.056897  2.833448  2.459770  15.691779
Male     20.744076  3.089618  2.630573  14.893976
Females tend to be better tippers.


### Is the result above statistically significant?

In [67]:
################################################################################
# We can use a Z-test here since our population is large enough that we can
# safely approximate population mean and standard deviation by the sample values
################################################################################

from statsmodels.stats.weightstats import ztest


tips = sns.load_dataset('tips')

def tip_pct(row):
    return 100.0 * (row['tip'] / row['total_bill'])
tips['tip_pct'] = tips.apply(tip_pct, axis=1)

tips_male = tips[tips['sex'] == 'Male']['tip_pct']
tips_female = tips[tips['sex'] == 'Female']['tip_pct']
t_stat, p_value = ztest(tips_male, tips_female)

print('We can reject the hypothesis that Men and Women tip the same with {:.2f}% confidence'.format(100*(1-p_value)))

We can reject the hypothesis that Men and Women tip the same with 72.14% confidence


### What is the optimal party type for tip percentages?

In [79]:
tips = sns.load_dataset('tips')

def tip_pct(row):
    return 100.0 * (row['tip'] / row['total_bill'])

tips['tip_pct'] = tips.apply(tip_pct, axis=1)

full_tips = tips.groupby(by=['sex', 'smoker', 'day', 'time', 'size']).mean()
full_tips = full_tips.dropna()

# We see that the highest percentage tippers are women who smoke and are eating
# dinner alone on saturday night. Bleak.
full_tips.loc[full_tips['tip_pct'].idxmax()]

total_bill     3.07000
tip            1.00000
tip_pct       32.57329
Name: (Female, Yes, Sat, Dinner, 1), dtype: float64

In [84]:
# The largest magnitude tip comes from a non-smoking group of 6 on Thursday
# at Lunch. Possibly a business lunch.
full_tips.loc[full_tips['tip'].idxmax()]

total_bill    34.300000
tip            6.700000
tip_pct       19.533528
Name: (Male, No, Thur, Lunch, 6), dtype: float64