In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [4]:
teams_df = DataFrame(np.arange(16).reshape(4, 4),
                   columns=['Wins', 'Loses', 'Ties', 'Runs Scored'],
                   index=['NYM', 'NYY', 'ATL', 'WSH'])
teams_df

Unnamed: 0,Wins,Loses,Ties,Runs Scored
NYM,0,1,2,3
NYY,4,5,6,7
ATL,8,9,10,11
WSH,12,13,14,15


In [13]:
# Adding nan values to the df
# Reminder, a quick way of using the row number and column name is 
# focusing on the iloc for row number and adding the [col name ]
teams_df.iloc[1:2]['Ties']

NYY    6
Name: Ties, dtype: int64

In [34]:
teams_df.loc["NYM"]

Wins           0
Loses          1
Ties           2
Runs Scored    3
Name: NYM, dtype: int64

In [39]:
teams_df.loc["NYM",["Wins","Loses"]] = np.nan

In [40]:
teams_df

Unnamed: 0,Wins,Loses,Ties,Runs Scored
NYM,,,2,3
NYY,4.0,5.0,6,7
ATL,8.0,9.0,10,11
WSH,12.0,13.0,14,15


In [43]:
# Dict. with values
team_map = {'Wins': 'good', 'Loses': 'bad', 'Ties': 'bad','Runs Scored': 'good'}


In [47]:
# groupby and mapping 
# by the column
team_col = teams_df.groupby(team_map, axis=1)
team_col.sum()

Unnamed: 0,bad,good
NYM,2.0,3.0
NYY,11.0,11.0
ATL,19.0,19.0
WSH,27.0,27.0


In [48]:
# making the dict into a series
team_series = Series(team_map)

team_series

Loses           bad
Runs Scored    good
Ties            bad
Wins           good
dtype: object

In [50]:
# using the groupby with the series
# works the same as a dict

groupby_series = teams_df.groupby(team_series,axis=1)
groupby_series.sum()

Unnamed: 0,bad,good
NYM,2.0,3.0
NYY,11.0,11.0
ATL,19.0,19.0
WSH,27.0,27.0


In [51]:
# groupby with functions

In [52]:
teams_df

Unnamed: 0,Wins,Loses,Ties,Runs Scored
NYM,,,2,3
NYY,4.0,5.0,6,7
ATL,8.0,9.0,10,11
WSH,12.0,13.0,14,15


In [54]:
# if we want to laberl the function based on the length of each one
# we can do the following

teams_df.groupby(len).sum()

# noticed all the lengths of the index were 3 thus the were grouped in one columns 
# would change if there had diff. lengths

Unnamed: 0,Wins,Loses,Ties,Runs Scored
3,24.0,27.0,32,36


In [59]:
# combining a list 
keys = ['A', 'B', 'C', 'B']

# Now groupby length of name and the keys to show max values
teams_df.groupby([len, keys]).max()

# it looks like the key must be the same nunber as the index


Unnamed: 0,Unnamed: 1,Wins,Loses,Ties,Runs Scored
3,A,,,2,3
3,B,12.0,13.0,14,15
3,C,8.0,9.0,10,11


In [61]:
# We can also use groupby with hierarchaly index levels

# Create a hierarchal column index
# The first index is the first way we will set up the columns
# Thus, the second is the second
# and the names is assigned to each col. 
# First index to cities, and so on

hier_col = pd.MultiIndex.from_arrays([['NY','NY','NY','SF','SF'],[1,2,3,1,2]],names=['City','sub_value'])
hier_col

MultiIndex(levels=[['NY', 'SF'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]],
           names=['City', 'sub_value'])

In [62]:
# Create a dframe with hierarchal index
dframe_hr = DataFrame(np.arange(25).reshape(5,5),columns=hier_col)
dframe_hr

City,NY,NY,NY,SF,SF
sub_value,1,2,3,1,2
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [64]:
# Multiply values by 100 for clarity
dframe_hr = dframe_hr*100
dframe_hr

City,NY,NY,NY,SF,SF
sub_value,1,2,3,1,2
0,0,10000,20000,30000,40000
1,50000,60000,70000,80000,90000
2,100000,110000,120000,130000,140000
3,150000,160000,170000,180000,190000
4,200000,210000,220000,230000,240000
