# Setup

This is following [10 minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html) for Pandas (v 2.x.x, as of this writing) but may take a little more than 10 minutes to finish. Lot's of small changes, references etc. The original notebook was downloaded from 
[this repo on github](https://github.com/shauryashaurya/learn-data-munging/blob/main/02-Pandas/01.01-10%2B-minutes-to-Pandas.ipynb).

You can also open this notebook in Google Colab

<a href="https://colab.research.google.com/github/shauryashaurya/learn-data-munging/blob/main/02-Pandas/01.01-10%2B-minutes-to-Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
np.__version__

'1.26.0'

In [3]:
pd.__version__

'2.1.2'

# Object creation

In [4]:
# Series - pass a list to pd.Series
s1 = pd.Series([1, 2, 3, np.nan, 5])

In [5]:
s1

0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
dtype: float64

In [6]:
# create an index to use in the dataframe
# default frequency is D (day), so creates a DatetimeIndex of 10 days
dates = pd.date_range("20220619", periods=10)
dates

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

just in case you are wondering, here's [all the "offset aliases"](https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases) for Datetime

In [7]:
# Dataframe - pass a Numpy array
# create a 10 row, 4 col random number array, index by dates, give some column names
df = pd.DataFrame(np.random.randn(10, 4), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2022-06-19,-1.549685,-0.448156,-0.967573,-1.556852
2022-06-20,1.140042,0.594597,1.234421,-1.050373
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132
2022-06-23,-0.511189,-0.610947,-0.234703,-0.108703
2022-06-24,-0.72829,1.231819,0.430074,0.94721
2022-06-25,-0.444771,-0.832265,1.604595,0.279618
2022-06-26,0.378996,-0.507757,0.439415,1.480208
2022-06-27,0.000805,-0.164564,-0.033391,0.851357
2022-06-28,-0.679106,1.201701,0.100458,-0.301494


In [8]:
# Create a dataframe by passing a dictionary of objects
# where each object can be converted into a series-like structure
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20220619"),
        "C": pd.Series(1, index=list(range(6)), dtype="float32"),
        "D": np.array([3] * 6, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train", "test", "train"]),
        "F": list("foofoo"),
        "G": "foo",
    }
)

df2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2022-06-19,1.0,3,test,f,foo
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo


In [9]:
# the datatype of each of the columns would be different
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
G           object
dtype: object

# Viewing Data

In [10]:
# top of the dataframe
df.head()

Unnamed: 0,A,B,C,D
2022-06-19,-1.549685,-0.448156,-0.967573,-1.556852
2022-06-20,1.140042,0.594597,1.234421,-1.050373
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132
2022-06-23,-0.511189,-0.610947,-0.234703,-0.108703


In [11]:
# bottom 3 records of the dataframe
df.tail(3)

Unnamed: 0,A,B,C,D
2022-06-26,0.378996,-0.507757,0.439415,1.480208
2022-06-27,0.000805,-0.164564,-0.033391,0.851357
2022-06-28,-0.679106,1.201701,0.100458,-0.301494


In [12]:
# index of the df
df.index

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

In [13]:
# columns of the df
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [14]:
# .to_numpy gives a NumPy representation of the dataframe
# this is expensive if all columns are of different data type
# NumPy arrays support a single data type
# so with mixed data types, the data type of
# each element is
df.to_numpy()

array([[-1.54968476e+00, -4.48156065e-01, -9.67573102e-01,
        -1.55685222e+00],
       [ 1.14004175e+00,  5.94596965e-01,  1.23442104e+00,
        -1.05037271e+00],
       [-1.33403417e+00, -2.42770872e+00,  6.48506786e-01,
        -9.95819184e-01],
       [-3.93233317e-01,  1.34498437e+00, -1.10085862e-01,
        -3.41320347e-01],
       [-5.11189455e-01, -6.10946766e-01, -2.34703303e-01,
        -1.08703223e-01],
       [-7.28289786e-01,  1.23181875e+00,  4.30073844e-01,
         9.47210111e-01],
       [-4.44770819e-01, -8.32264943e-01,  1.60459520e+00,
         2.79617547e-01],
       [ 3.78996170e-01, -5.07756663e-01,  4.39415213e-01,
         1.48020785e+00],
       [ 8.05009872e-04, -1.64564085e-01, -3.33911790e-02,
         8.51357037e-01],
       [-6.79105932e-01,  1.20170122e+00,  1.00458012e-01,
        -3.01493735e-01]])

In [15]:
df2.to_numpy()

array([[1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo']], dtype=object)

In [16]:
# quick summary stats
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,-0.412047,-0.06183,0.311172,-0.079617
std,0.78401,1.179981,0.74113,0.975578
min,-1.549685,-2.427709,-0.967573,-1.556852
25%,-0.715994,-0.585149,-0.090912,-0.832194
50%,-0.47798,-0.30636,0.265266,-0.205098
75%,-0.097705,1.049925,0.596234,0.708422
max,1.140042,1.344984,1.604595,1.480208


1. **count** = Count number of non-NA/null observations
1. **max** = Maximum of the values in the object
1. **min** = Minimum of the values in the object
1. **mean** = Mean of the values
1. **std** = Standard deviation of the observations
1. **25%** = Default lower percentile
1. **50%** = 50 percentile - same as the median
1. **75%** = Default upper percentile

In [17]:
# change the percentiles
df.describe(percentiles=[0.1, 0.5, 0.9])

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,-0.412047,-0.06183,0.311172,-0.079617
std,0.78401,1.179981,0.74113,0.975578
min,-1.549685,-2.427709,-0.967573,-1.556852
10%,-1.355599,-0.991809,-0.30799,-1.101021
50%,-0.47798,-0.30636,0.265266,-0.205098
90%,0.455101,1.243135,1.271438,1.00051
max,1.140042,1.344984,1.604595,1.480208


In [18]:
# E, F, G in d2 are not numeric, so do not come up in describe()
df2.describe()

Unnamed: 0,A,B,C,D
count,6.0,6,6.0,6.0
mean,1.0,2022-06-19 00:00:00,1.0,3.0
min,1.0,2022-06-19 00:00:00,1.0,3.0
25%,1.0,2022-06-19 00:00:00,1.0,3.0
50%,1.0,2022-06-19 00:00:00,1.0,3.0
75%,1.0,2022-06-19 00:00:00,1.0,3.0
max,1.0,2022-06-19 00:00:00,1.0,3.0
std,0.0,,0.0,0.0


In [19]:
# transpose the data
# turn rows to columns and vice versa
df.T

Unnamed: 0,2022-06-19,2022-06-20,2022-06-21,2022-06-22,2022-06-23,2022-06-24,2022-06-25,2022-06-26,2022-06-27,2022-06-28
A,-1.549685,1.140042,-1.334034,-0.393233,-0.511189,-0.72829,-0.444771,0.378996,0.000805,-0.679106
B,-0.448156,0.594597,-2.427709,1.344984,-0.610947,1.231819,-0.832265,-0.507757,-0.164564,1.201701
C,-0.967573,1.234421,0.648507,-0.110086,-0.234703,0.430074,1.604595,0.439415,-0.033391,0.100458
D,-1.556852,-1.050373,-0.995819,-0.34132,-0.108703,0.94721,0.279618,1.480208,0.851357,-0.301494


### [Axis in pandas](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.set_axis.html?highlight=set_ax#pandas.DataFrame.set_axis)

```axis{0 or ‘index’, 1 or ‘columns’}```

Always remember that when you specify:
* ```axis=0``` or ```axis='index'``` you indicate that the operation should be along the _index_ (aka across the rows). More often than not indexes will go from top to bottom (vertically), but sometimes they may not (for e.g. if you use columns as index or when you have hierarchical or multi-index data).
* ```axis=1``` or ```axis='columns'``` indicates the operation is along the columns

When in doubt use the explicit version ```'index'``` or ```'columns'``` instead of ```0``` or ```1```

In [20]:
# sort along the axis - 1 = horizontal
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2022-06-19,-1.556852,-0.967573,-0.448156,-1.549685
2022-06-20,-1.050373,1.234421,0.594597,1.140042
2022-06-21,-0.995819,0.648507,-2.427709,-1.334034
2022-06-22,-0.34132,-0.110086,1.344984,-0.393233
2022-06-23,-0.108703,-0.234703,-0.610947,-0.511189
2022-06-24,0.94721,0.430074,1.231819,-0.72829
2022-06-25,0.279618,1.604595,-0.832265,-0.444771
2022-06-26,1.480208,0.439415,-0.507757,0.378996
2022-06-27,0.851357,-0.033391,-0.164564,0.000805
2022-06-28,-0.301494,0.100458,1.201701,-0.679106


In [21]:
# sort along the axis - 0 = vertical
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2022-06-28,-0.679106,1.201701,0.100458,-0.301494
2022-06-27,0.000805,-0.164564,-0.033391,0.851357
2022-06-26,0.378996,-0.507757,0.439415,1.480208
2022-06-25,-0.444771,-0.832265,1.604595,0.279618
2022-06-24,-0.72829,1.231819,0.430074,0.94721
2022-06-23,-0.511189,-0.610947,-0.234703,-0.108703
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819
2022-06-20,1.140042,0.594597,1.234421,-1.050373
2022-06-19,-1.549685,-0.448156,-0.967573,-1.556852


In [22]:
# sort ascending by values in a column
df.sort_values(by="A")

Unnamed: 0,A,B,C,D
2022-06-19,-1.549685,-0.448156,-0.967573,-1.556852
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819
2022-06-24,-0.72829,1.231819,0.430074,0.94721
2022-06-28,-0.679106,1.201701,0.100458,-0.301494
2022-06-23,-0.511189,-0.610947,-0.234703,-0.108703
2022-06-25,-0.444771,-0.832265,1.604595,0.279618
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132
2022-06-27,0.000805,-0.164564,-0.033391,0.851357
2022-06-26,0.378996,-0.507757,0.439415,1.480208
2022-06-20,1.140042,0.594597,1.234421,-1.050373


In [23]:
# sort by non-numerical values
df2.sort_values(by="F", ascending=False)

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo
0,1.0,2022-06-19,1.0,3,test,f,foo
3,1.0,2022-06-19,1.0,3,train,f,foo


In [24]:
# sort by two or more columns
df2.sort_values(by=["F", "E"])

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2022-06-19,1.0,3,test,f,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
1,1.0,2022-06-19,1.0,3,train,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo


# Selection

For production prefer the following instead of other data access methods (typical python methods like ["col"] or [a:b] slices etc.):

```.at, .iat, .loc and .iloc.```

## Getting

In [25]:
# selecting a single column returns a Series object
df["A"]

2022-06-19   -1.549685
2022-06-20    1.140042
2022-06-21   -1.334034
2022-06-22   -0.393233
2022-06-23   -0.511189
2022-06-24   -0.728290
2022-06-25   -0.444771
2022-06-26    0.378996
2022-06-27    0.000805
2022-06-28   -0.679106
Freq: D, Name: A, dtype: float64

In [26]:
# selecting a slice
df[1:5]

Unnamed: 0,A,B,C,D
2022-06-20,1.140042,0.594597,1.234421,-1.050373
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132
2022-06-23,-0.511189,-0.610947,-0.234703,-0.108703


## Selection by label

In [27]:
# selecting based on a label
df.loc[dates[0]]

A   -1.549685
B   -0.448156
C   -0.967573
D   -1.556852
Name: 2022-06-19 00:00:00, dtype: float64

In [28]:
# select on a multi-axis by lable
# I honestly do not know what the comment above means...help!
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2022-06-19,-1.549685,-0.448156
2022-06-20,1.140042,0.594597
2022-06-21,-1.334034,-2.427709
2022-06-22,-0.393233,1.344984
2022-06-23,-0.511189,-0.610947
2022-06-24,-0.72829,1.231819
2022-06-25,-0.444771,-0.832265
2022-06-26,0.378996,-0.507757
2022-06-27,0.000805,-0.164564
2022-06-28,-0.679106,1.201701


In [29]:
# specific index value results in reduction of dimensions
res = df.loc["2022-06-20"]
print(res)
print("res.shape = ", res.shape, " vs. df.shape = ", df.shape)

A    1.140042
B    0.594597
C    1.234421
D   -1.050373
Name: 2022-06-20 00:00:00, dtype: float64
res.shape =  (4,)  vs. df.shape =  (10, 4)


In [30]:
# get to a specific scalar:
#
# method one
df.loc[dates[0], "A"]

-1.5496847600166925

In [31]:
#
# method two (slightly faster than method one)
df.at[dates[0], "A"]

-1.5496847600166925

## Selection by position

In [32]:
df.iloc[2]

A   -1.334034
B   -2.427709
C    0.648507
D   -0.995819
Name: 2022-06-21 00:00:00, dtype: float64

In [33]:
# slices - similar to NumPy / Python - [row:slice, col:slice]
df.iloc[1:5, 0:2]

Unnamed: 0,A,B
2022-06-20,1.140042,0.594597
2022-06-21,-1.334034,-2.427709
2022-06-22,-0.393233,1.344984
2022-06-23,-0.511189,-0.610947


In [34]:
# by list of locations - similar to NumPy / Python - [[list of rows], [list of cols]]
df.iloc[[0, 1, 2, 6], [0, 2]]

Unnamed: 0,A,C
2022-06-19,-1.549685,-0.967573
2022-06-20,1.140042,1.234421
2022-06-21,-1.334034,0.648507
2022-06-25,-0.444771,1.604595


In [35]:
# by list of locations - similar to NumPy / Python - [[list of rows], [list of cols]]
# change the order of columns, repeact a column
df.iloc[[0, 1, 2, 6], [2, 1, 0, 2]]

Unnamed: 0,C,B,A,C.1
2022-06-19,-0.967573,-0.448156,-1.549685,-0.967573
2022-06-20,1.234421,0.594597,1.140042,1.234421
2022-06-21,0.648507,-2.427709,-1.334034,0.648507
2022-06-25,1.604595,-0.832265,-0.444771,1.604595


In [36]:
# slice rows explicitly, keep all columns
df.iloc[[1, 2], :]

Unnamed: 0,A,B,C,D
2022-06-20,1.140042,0.594597,1.234421,-1.050373
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819


In [37]:
# slice columns, keep all rows
df.iloc[:, [2, 3]]

Unnamed: 0,C,D
2022-06-19,-0.967573,-1.556852
2022-06-20,1.234421,-1.050373
2022-06-21,0.648507,-0.995819
2022-06-22,-0.110086,-0.34132
2022-06-23,-0.234703,-0.108703
2022-06-24,0.430074,0.94721
2022-06-25,1.604595,0.279618
2022-06-26,0.439415,1.480208
2022-06-27,-0.033391,0.851357
2022-06-28,0.100458,-0.301494


In [38]:
# everything, because you can
df.iloc[:, :]

Unnamed: 0,A,B,C,D
2022-06-19,-1.549685,-0.448156,-0.967573,-1.556852
2022-06-20,1.140042,0.594597,1.234421,-1.050373
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132
2022-06-23,-0.511189,-0.610947,-0.234703,-0.108703
2022-06-24,-0.72829,1.231819,0.430074,0.94721
2022-06-25,-0.444771,-0.832265,1.604595,0.279618
2022-06-26,0.378996,-0.507757,0.439415,1.480208
2022-06-27,0.000805,-0.164564,-0.033391,0.851357
2022-06-28,-0.679106,1.201701,0.100458,-0.301494


In [39]:
# get to a scalar (2 methods, just like before)
#
# method one: use iloc
df.iloc[1, 2]

1.2344210416860366

In [40]:
#
# method two: use iat
df.iat[1, 2]

1.2344210416860366

## Boolean Indexing

In [41]:
# use a value found in a single col to get data
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2022-06-20,1.140042,0.594597,1.234421,-1.050373
2022-06-26,0.378996,-0.507757,0.439415,1.480208
2022-06-27,0.000805,-0.164564,-0.033391,0.851357


In [42]:
# boolean across the entire DF - vals that don't match go NaN
df[df > 0]

Unnamed: 0,A,B,C,D
2022-06-19,,,,
2022-06-20,1.140042,0.594597,1.234421,
2022-06-21,,,0.648507,
2022-06-22,,1.344984,,
2022-06-23,,,,
2022-06-24,,1.231819,0.430074,0.94721
2022-06-25,,,1.604595,0.279618
2022-06-26,0.378996,,0.439415,1.480208
2022-06-27,0.000805,,,0.851357
2022-06-28,,1.201701,0.100458,


In [43]:
# add another column
df11 = df.copy()
df11["E"] = [
    "one",
    "two",
    "three",
    "four",
    "two",
    "five",
    "one",
    "two",
    "three",
    "four",
]
df11

Unnamed: 0,A,B,C,D,E
2022-06-19,-1.549685,-0.448156,-0.967573,-1.556852,one
2022-06-20,1.140042,0.594597,1.234421,-1.050373,two
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819,three
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132,four
2022-06-23,-0.511189,-0.610947,-0.234703,-0.108703,two
2022-06-24,-0.72829,1.231819,0.430074,0.94721,five
2022-06-25,-0.444771,-0.832265,1.604595,0.279618,one
2022-06-26,0.378996,-0.507757,0.439415,1.480208,two
2022-06-27,0.000805,-0.164564,-0.033391,0.851357,three
2022-06-28,-0.679106,1.201701,0.100458,-0.301494,four


In [44]:
# the isin() query - basically the in clause
df11[df11["E"].isin(["two", "five"])]

Unnamed: 0,A,B,C,D,E
2022-06-20,1.140042,0.594597,1.234421,-1.050373,two
2022-06-23,-0.511189,-0.610947,-0.234703,-0.108703,two
2022-06-24,-0.72829,1.231819,0.430074,0.94721,five
2022-06-26,0.378996,-0.507757,0.439415,1.480208,two


## Setting values

In [45]:
# matching indexes auto-aligns values
s1 = pd.Series(range(11, 21), index=pd.date_range("20220619", periods=10))
s1

2022-06-19    11
2022-06-20    12
2022-06-21    13
2022-06-22    14
2022-06-23    15
2022-06-24    16
2022-06-25    17
2022-06-26    18
2022-06-27    19
2022-06-28    20
Freq: D, dtype: int64

In [46]:
df["F"] = s1
df

Unnamed: 0,A,B,C,D,F
2022-06-19,-1.549685,-0.448156,-0.967573,-1.556852,11
2022-06-20,1.140042,0.594597,1.234421,-1.050373,12
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819,13
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132,14
2022-06-23,-0.511189,-0.610947,-0.234703,-0.108703,15
2022-06-24,-0.72829,1.231819,0.430074,0.94721,16
2022-06-25,-0.444771,-0.832265,1.604595,0.279618,17
2022-06-26,0.378996,-0.507757,0.439415,1.480208,18
2022-06-27,0.000805,-0.164564,-0.033391,0.851357,19
2022-06-28,-0.679106,1.201701,0.100458,-0.301494,20


In [47]:
# setting values by label and position
# first let's make a quick copy
df12 = df.copy()
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,-1.549685,-0.448156,-0.967573,-1.556852,11
2022-06-20,1.140042,0.594597,1.234421,-1.050373,12
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819,13
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132,14
2022-06-23,-0.511189,-0.610947,-0.234703,-0.108703,15
2022-06-24,-0.72829,1.231819,0.430074,0.94721,16
2022-06-25,-0.444771,-0.832265,1.604595,0.279618,17
2022-06-26,0.378996,-0.507757,0.439415,1.480208,18
2022-06-27,0.000805,-0.164564,-0.033391,0.851357,19
2022-06-28,-0.679106,1.201701,0.100458,-0.301494,20


In [48]:
# set by label
df12.at[dates[0], "A"] = 0
# set by position
df12.iat[0, 1] = 0
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-0.967573,-1.556852,11
2022-06-20,1.140042,0.594597,1.234421,-1.050373,12
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819,13
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132,14
2022-06-23,-0.511189,-0.610947,-0.234703,-0.108703,15
2022-06-24,-0.72829,1.231819,0.430074,0.94721,16
2022-06-25,-0.444771,-0.832265,1.604595,0.279618,17
2022-06-26,0.378996,-0.507757,0.439415,1.480208,18
2022-06-27,0.000805,-0.164564,-0.033391,0.851357,19
2022-06-28,-0.679106,1.201701,0.100458,-0.301494,20


In [49]:
# kinda bigger replacement
df12.loc[:, "D"] = np.array([5] * len(df))
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-0.967573,5.0,11
2022-06-20,1.140042,0.594597,1.234421,5.0,12
2022-06-21,-1.334034,-2.427709,0.648507,5.0,13
2022-06-22,-0.393233,1.344984,-0.110086,5.0,14
2022-06-23,-0.511189,-0.610947,-0.234703,5.0,15
2022-06-24,-0.72829,1.231819,0.430074,5.0,16
2022-06-25,-0.444771,-0.832265,1.604595,5.0,17
2022-06-26,0.378996,-0.507757,0.439415,5.0,18
2022-06-27,0.000805,-0.164564,-0.033391,5.0,19
2022-06-28,-0.679106,1.201701,0.100458,5.0,20


In [50]:
# setting values using a boolean selection (aka where clause)
df12[df12 > 0] = -df12
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-0.967573,-5.0,-11
2022-06-20,-1.140042,-0.594597,-1.234421,-5.0,-12
2022-06-21,-1.334034,-2.427709,-0.648507,-5.0,-13
2022-06-22,-0.393233,-1.344984,-0.110086,-5.0,-14
2022-06-23,-0.511189,-0.610947,-0.234703,-5.0,-15
2022-06-24,-0.72829,-1.231819,-0.430074,-5.0,-16
2022-06-25,-0.444771,-0.832265,-1.604595,-5.0,-17
2022-06-26,-0.378996,-0.507757,-0.439415,-5.0,-18
2022-06-27,-0.000805,-0.164564,-0.033391,-5.0,-19
2022-06-28,-0.679106,-1.201701,-0.100458,-5.0,-20


# Missing Data

### _reindex_
change/add/delete index on a specified axis, returns a new dataframe

In [51]:
df13 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["G"])
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-1.549685,-0.448156,-0.967573,-1.556852,11,
2022-06-20,1.140042,0.594597,1.234421,-1.050373,12,
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819,13,
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132,14,


### handling missing data
1. _```np.nan```_
1. _```pandas.isna()```_
1. ```df.dropna()```
1. ```df.fillna()```

In [52]:
# missing data in pandas is np.nan
df13.iat[1, 0] = np.nan
df13.iloc[1:, 5] = np.random.randint(1)
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-1.549685,-0.448156,-0.967573,-1.556852,11,
2022-06-20,,0.594597,1.234421,-1.050373,12,0.0
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819,13,0.0
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132,14,0.0


In [53]:
# get a boolean mask where values are NaN
df131 = pd.isna(df13)
df131

Unnamed: 0,A,B,C,D,F,G
2022-06-19,False,False,False,False,False,True
2022-06-20,True,False,False,False,False,False
2022-06-21,False,False,False,False,False,False
2022-06-22,False,False,False,False,False,False


In [54]:
# or just
pd.isna(df13)

Unnamed: 0,A,B,C,D,F,G
2022-06-19,False,False,False,False,False,True
2022-06-20,True,False,False,False,False,False
2022-06-21,False,False,False,False,False,False
2022-06-22,False,False,False,False,False,False


In [55]:
# the original is still there
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-1.549685,-0.448156,-0.967573,-1.556852,11,
2022-06-20,,0.594597,1.234421,-1.050373,12,0.0
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819,13,0.0
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132,14,0.0


In [56]:
# we are going to drop / replace values now, let's make a couple of copies of the dataframe
df132 = df13.copy()
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-1.549685,-0.448156,-0.967573,-1.556852,11,
2022-06-20,,0.594597,1.234421,-1.050373,12,0.0
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819,13,0.0
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132,14,0.0


In [57]:
# drop rows/columns that have missing data
# by default it returns a new dataframe, you may want to specify inplace=True for modifying current dataframe:
df_no_na = df132.dropna(how="any")
# how=‘any’ : If any NA values are present, drop that row or column.
# how=‘all’ : If all values are NA, drop that row or column.

In [58]:
# all rows/cols with missing data stripped
df_no_na

Unnamed: 0,A,B,C,D,F,G
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819,13,0.0
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132,14,0.0


In [59]:
# original still intact
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-1.549685,-0.448156,-0.967573,-1.556852,11,
2022-06-20,,0.594597,1.234421,-1.050373,12,0.0
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819,13,0.0
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132,14,0.0


In [60]:
# drop missing data from original
df132.dropna(how="any", inplace=True)
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819,13,0.0
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132,14,0.0


In [61]:
# fill missing data
df133 = df13.copy()
df133.fillna(np.pi * 1000)

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-1.549685,-0.448156,-0.967573,-1.556852,11,3141.592654
2022-06-20,3141.592654,0.594597,1.234421,-1.050373,12,0.0
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819,13,0.0
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132,14,0.0


# Align and Join

There needs to be a bigger notebook for this topic.
You need to know there's "joins" in Pandas just like in the SQL world,
like join and left join and right join and inner and outer and all that...
Here we'll explore these in the context of the align function - 'aligning' the indexes of two dataframes.
Later we'll see these again in the context of _merging_ two data frames.

## Create two datasets with _mismatched_ indexes

In [62]:
# date range indexes
idx1 = pd.date_range("2022-01-01", periods=10)
# 2022-01-01', '2022-01-02' don't exist in idx2
# '2022-01-11', '2022-01-12' don't exist in idx1
idx2 = pd.date_range("2022-01-03", periods=10)

In [63]:
# dataframes from indexes
d1 = pd.DataFrame(
    index=idx1,
    data={"A": np.random.rand(10), "B": np.random.randint(1, high=25, size=10)},
)

d2 = pd.DataFrame(
    index=idx2,
    data={"A": np.random.rand(10), "B": np.random.randint(1, high=25, size=10)},
)

## Quick Aside: Rendering two dataframes side-by-side

The trick of rendering 2 dataframes side-by-side was from [this](https://stackoverflow.com/questions/38783027/jupyter-notebook-display-two-pandas-tables-side-by-side) stackoverflow question.

In [64]:
# we gon need to display both data frames side by side, so...
from IPython.display import display_html


def render_df_side_by_side(a, b, a_title="", b_title=""):
    a_styler = a.style.set_table_attributes("style='display:inline'").set_caption(
        a_title
    )
    b_styler = b.style.set_table_attributes("style='display:inline'").set_caption(
        b_title
    )
    display_html(a_styler._repr_html_() + b_styler._repr_html_(), raw=True)

In [65]:
render_df_side_by_side(d1, d2)

Unnamed: 0,A,B
2022-01-01 00:00:00,0.351908,7
2022-01-02 00:00:00,0.224626,11
2022-01-03 00:00:00,0.797751,13
2022-01-04 00:00:00,0.178406,3
2022-01-05 00:00:00,0.991701,1
2022-01-06 00:00:00,0.325965,17
2022-01-07 00:00:00,0.966269,7
2022-01-08 00:00:00,0.220385,17
2022-01-09 00:00:00,0.796824,18
2022-01-10 00:00:00,0.711788,5

Unnamed: 0,A,B
2022-01-03 00:00:00,0.561641,15
2022-01-04 00:00:00,0.49023,11
2022-01-05 00:00:00,0.558596,3
2022-01-06 00:00:00,0.075284,4
2022-01-07 00:00:00,0.003883,12
2022-01-08 00:00:00,0.953586,6
2022-01-09 00:00:00,0.571732,7
2022-01-10 00:00:00,0.800342,5
2022-01-11 00:00:00,0.506769,21
2022-01-12 00:00:00,0.630302,6


Table rendering/styling options is [a bigger discussion](https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html#) to be tackled later.

## .add, .sub, .mul, .div etc.

flexible wrappers (```add, sub, mul, div, mod, pow```) to arithmetic operators: ```+, -, *, /, //, %, **```

In [66]:
# the +, -, *, /, //, %, ** operations align indexes to create a UNION of indexes
d3 = d1 + d2
d4 = d1.sub(d2)  # equivalent to d1-d2
d5 = d1 * d2
d6 = d1.div(d2)

In [67]:
# indexs not present in the other dataframe get a NaN

# pandas v1.x
# d3.style.highlight_null(null_color='red')

# pandas v2.x+
d3.style.highlight_null(color="red")

Unnamed: 0,A,B
2022-01-01 00:00:00,,
2022-01-02 00:00:00,,
2022-01-03 00:00:00,1.359392,28.0
2022-01-04 00:00:00,0.668637,14.0
2022-01-05 00:00:00,1.550296,4.0
2022-01-06 00:00:00,0.401249,21.0
2022-01-07 00:00:00,0.970152,19.0
2022-01-08 00:00:00,1.173972,23.0
2022-01-09 00:00:00,1.368556,25.0
2022-01-10 00:00:00,1.51213,10.0


In [68]:
# sub
d4.style.highlight_null(color="red")

Unnamed: 0,A,B
2022-01-01 00:00:00,,
2022-01-02 00:00:00,,
2022-01-03 00:00:00,0.23611,-2.0
2022-01-04 00:00:00,-0.311824,-8.0
2022-01-05 00:00:00,0.433105,-2.0
2022-01-06 00:00:00,0.250681,13.0
2022-01-07 00:00:00,0.962385,-5.0
2022-01-08 00:00:00,-0.733201,11.0
2022-01-09 00:00:00,0.225092,11.0
2022-01-10 00:00:00,-0.088554,0.0


In [69]:
# mul
d5.style.highlight_null(color="red")

Unnamed: 0,A,B
2022-01-01 00:00:00,,
2022-01-02 00:00:00,,
2022-01-03 00:00:00,0.448049,195.0
2022-01-04 00:00:00,0.08746,33.0
2022-01-05 00:00:00,0.55396,3.0
2022-01-06 00:00:00,0.02454,68.0
2022-01-07 00:00:00,0.003752,84.0
2022-01-08 00:00:00,0.210157,102.0
2022-01-09 00:00:00,0.45557,126.0
2022-01-10 00:00:00,0.569674,25.0


In [70]:
# div
d6.style.highlight_null(color="red")

Unnamed: 0,A,B
2022-01-01 00:00:00,,
2022-01-02 00:00:00,,
2022-01-03 00:00:00,1.420394,0.866667
2022-01-04 00:00:00,0.363924,0.272727
2022-01-05 00:00:00,1.775346,0.333333
2022-01-06 00:00:00,4.329794,4.25
2022-01-07 00:00:00,248.827544,0.583333
2022-01-08 00:00:00,0.231112,2.833333
2022-01-09 00:00:00,1.393701,2.571429
2022-01-10 00:00:00,0.889354,1.0


### removing the NaN values from the resultant dataframe

In [71]:
# the + operation is UNION of indexes
d31 = d1 + d2
# indexs not present in the other dataframe get a NaN
# do you can obvs remove nan values like before
d3.dropna().style.highlight_null(color="red")

Unnamed: 0,A,B
2022-01-03 00:00:00,1.359392,28.0
2022-01-04 00:00:00,0.668637,14.0
2022-01-05 00:00:00,1.550296,4.0
2022-01-06 00:00:00,0.401249,21.0
2022-01-07 00:00:00,0.970152,19.0
2022-01-08 00:00:00,1.173972,23.0
2022-01-09 00:00:00,1.368556,25.0
2022-01-10 00:00:00,1.51213,10.0


## df.align()

Pandas [align](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.align.html) works to join the dataframes in interesting ways.

In [72]:
# align returns two dataframes - left and right as a result of a join
# default join is outer - the union of indices will be used as index for left and right
left, right = d1.align(d2, join="outer")
render_df_side_by_side(left, right)

Unnamed: 0,A,B
2022-01-01 00:00:00,0.351908,7.0
2022-01-02 00:00:00,0.224626,11.0
2022-01-03 00:00:00,0.797751,13.0
2022-01-04 00:00:00,0.178406,3.0
2022-01-05 00:00:00,0.991701,1.0
2022-01-06 00:00:00,0.325965,17.0
2022-01-07 00:00:00,0.966269,7.0
2022-01-08 00:00:00,0.220385,17.0
2022-01-09 00:00:00,0.796824,18.0
2022-01-10 00:00:00,0.711788,5.0

Unnamed: 0,A,B
2022-01-01 00:00:00,,
2022-01-02 00:00:00,,
2022-01-03 00:00:00,0.561641,15.0
2022-01-04 00:00:00,0.49023,11.0
2022-01-05 00:00:00,0.558596,3.0
2022-01-06 00:00:00,0.075284,4.0
2022-01-07 00:00:00,0.003883,12.0
2022-01-08 00:00:00,0.953586,6.0
2022-01-09 00:00:00,0.571732,7.0
2022-01-10 00:00:00,0.800342,5.0


In [73]:
# inner join
left, right = d1.align(d2, join="inner")
render_df_side_by_side(d1, d2)

Unnamed: 0,A,B
2022-01-01 00:00:00,0.351908,7
2022-01-02 00:00:00,0.224626,11
2022-01-03 00:00:00,0.797751,13
2022-01-04 00:00:00,0.178406,3
2022-01-05 00:00:00,0.991701,1
2022-01-06 00:00:00,0.325965,17
2022-01-07 00:00:00,0.966269,7
2022-01-08 00:00:00,0.220385,17
2022-01-09 00:00:00,0.796824,18
2022-01-10 00:00:00,0.711788,5

Unnamed: 0,A,B
2022-01-03 00:00:00,0.561641,15
2022-01-04 00:00:00,0.49023,11
2022-01-05 00:00:00,0.558596,3
2022-01-06 00:00:00,0.075284,4
2022-01-07 00:00:00,0.003883,12
2022-01-08 00:00:00,0.953586,6
2022-01-09 00:00:00,0.571732,7
2022-01-10 00:00:00,0.800342,5
2022-01-11 00:00:00,0.506769,21
2022-01-12 00:00:00,0.630302,6


In [74]:
# we can fill the NaNs with a certain value if needed
# let's fill with -100 so we can see it clearly
left, right = d1.align(d2, join="outer", fill_value=-100)
render_df_side_by_side(left, right)

Unnamed: 0,A,B
2022-01-01 00:00:00,0.351908,7
2022-01-02 00:00:00,0.224626,11
2022-01-03 00:00:00,0.797751,13
2022-01-04 00:00:00,0.178406,3
2022-01-05 00:00:00,0.991701,1
2022-01-06 00:00:00,0.325965,17
2022-01-07 00:00:00,0.966269,7
2022-01-08 00:00:00,0.220385,17
2022-01-09 00:00:00,0.796824,18
2022-01-10 00:00:00,0.711788,5

Unnamed: 0,A,B
2022-01-01 00:00:00,-100.0,-100
2022-01-02 00:00:00,-100.0,-100
2022-01-03 00:00:00,0.561641,15
2022-01-04 00:00:00,0.49023,11
2022-01-05 00:00:00,0.558596,3
2022-01-06 00:00:00,0.075284,4
2022-01-07 00:00:00,0.003883,12
2022-01-08 00:00:00,0.953586,6
2022-01-09 00:00:00,0.571732,7
2022-01-10 00:00:00,0.800342,5


In [75]:
# left join: left index is added to right, but not the other way
left, right = d1.align(d2, join="left")
render_df_side_by_side(left, right)

Unnamed: 0,A,B
2022-01-01 00:00:00,0.351908,7
2022-01-02 00:00:00,0.224626,11
2022-01-03 00:00:00,0.797751,13
2022-01-04 00:00:00,0.178406,3
2022-01-05 00:00:00,0.991701,1
2022-01-06 00:00:00,0.325965,17
2022-01-07 00:00:00,0.966269,7
2022-01-08 00:00:00,0.220385,17
2022-01-09 00:00:00,0.796824,18
2022-01-10 00:00:00,0.711788,5

Unnamed: 0,A,B
2022-01-01 00:00:00,,
2022-01-02 00:00:00,,
2022-01-03 00:00:00,0.561641,15.0
2022-01-04 00:00:00,0.49023,11.0
2022-01-05 00:00:00,0.558596,3.0
2022-01-06 00:00:00,0.075284,4.0
2022-01-07 00:00:00,0.003883,12.0
2022-01-08 00:00:00,0.953586,6.0
2022-01-09 00:00:00,0.571732,7.0
2022-01-10 00:00:00,0.800342,5.0


In [76]:
# right join: right index is added to left, but not the other way
left, right = d1.align(d2, join="right")
render_df_side_by_side(left, right)

Unnamed: 0,A,B
2022-01-03 00:00:00,0.797751,13.0
2022-01-04 00:00:00,0.178406,3.0
2022-01-05 00:00:00,0.991701,1.0
2022-01-06 00:00:00,0.325965,17.0
2022-01-07 00:00:00,0.966269,7.0
2022-01-08 00:00:00,0.220385,17.0
2022-01-09 00:00:00,0.796824,18.0
2022-01-10 00:00:00,0.711788,5.0
2022-01-11 00:00:00,,
2022-01-12 00:00:00,,

Unnamed: 0,A,B
2022-01-03 00:00:00,0.561641,15
2022-01-04 00:00:00,0.49023,11
2022-01-05 00:00:00,0.558596,3
2022-01-06 00:00:00,0.075284,4
2022-01-07 00:00:00,0.003883,12
2022-01-08 00:00:00,0.953586,6
2022-01-09 00:00:00,0.571732,7
2022-01-10 00:00:00,0.800342,5
2022-01-11 00:00:00,0.506769,21
2022-01-12 00:00:00,0.630302,6


There's more to explore in align, and it can get confusing, so beginners be careful, take time and try to work out the result before you execute to build intuition.

# Operations on data

## Stats

Operations in general *exclude* missing data

In [77]:
# arithmetic mean, for each column (axis = 0)
df.mean()

A    -0.412047
B    -0.061830
C     0.311172
D    -0.079617
F    15.500000
dtype: float64

In [78]:
# mean across a row (axis = 1)
df.mean(1)

2022-06-19    1.295547
2022-06-20    2.783737
2022-06-21    1.778189
2022-06-22    2.900069
2022-06-23    2.706891
2022-06-24    3.576163
2022-06-25    3.521435
2022-06-26    3.958173
2022-06-27    3.930841
2022-06-28    4.064312
Freq: D, dtype: float64

## Apply

In [79]:
# apply custom lambdas
df.apply(lambda x: x.max() - x.min())

A    2.689727
B    3.772693
C    2.572168
D    3.037060
F    9.000000
dtype: float64

In [80]:
df

Unnamed: 0,A,B,C,D,F
2022-06-19,-1.549685,-0.448156,-0.967573,-1.556852,11
2022-06-20,1.140042,0.594597,1.234421,-1.050373,12
2022-06-21,-1.334034,-2.427709,0.648507,-0.995819,13
2022-06-22,-0.393233,1.344984,-0.110086,-0.34132,14
2022-06-23,-0.511189,-0.610947,-0.234703,-0.108703,15
2022-06-24,-0.72829,1.231819,0.430074,0.94721,16
2022-06-25,-0.444771,-0.832265,1.604595,0.279618,17
2022-06-26,0.378996,-0.507757,0.439415,1.480208,18
2022-06-27,0.000805,-0.164564,-0.033391,0.851357,19
2022-06-28,-0.679106,1.201701,0.100458,-0.301494,20


In [81]:
# each subsequent value is a sum of all values before it in the respective column
cumsum_df = df.apply(np.cumsum)
render_df_side_by_side(df, cumsum_df, "OG Dataframe", "Cumulative Sum")

Unnamed: 0,A,B,C,D,F
2022-06-19 00:00:00,-1.549685,-0.448156,-0.967573,-1.556852,11
2022-06-20 00:00:00,1.140042,0.594597,1.234421,-1.050373,12
2022-06-21 00:00:00,-1.334034,-2.427709,0.648507,-0.995819,13
2022-06-22 00:00:00,-0.393233,1.344984,-0.110086,-0.34132,14
2022-06-23 00:00:00,-0.511189,-0.610947,-0.234703,-0.108703,15
2022-06-24 00:00:00,-0.72829,1.231819,0.430074,0.94721,16
2022-06-25 00:00:00,-0.444771,-0.832265,1.604595,0.279618,17
2022-06-26 00:00:00,0.378996,-0.507757,0.439415,1.480208,18
2022-06-27 00:00:00,0.000805,-0.164564,-0.033391,0.851357,19
2022-06-28 00:00:00,-0.679106,1.201701,0.100458,-0.301494,20

Unnamed: 0,A,B,C,D,F
2022-06-19 00:00:00,-1.549685,-0.448156,-0.967573,-1.556852,11
2022-06-20 00:00:00,-0.409643,0.146441,0.266848,-2.607225,23
2022-06-21 00:00:00,-1.743677,-2.281268,0.915355,-3.603044,36
2022-06-22 00:00:00,-2.136911,-0.936283,0.805269,-3.944364,50
2022-06-23 00:00:00,-2.6481,-1.54723,0.570566,-4.053068,65
2022-06-24 00:00:00,-3.37639,-0.315411,1.000639,-3.105858,81
2022-06-25 00:00:00,-3.821161,-1.147676,2.605235,-2.82624,98
2022-06-26 00:00:00,-3.442164,-1.655433,3.04465,-1.346032,116
2022-06-27 00:00:00,-3.441359,-1.819997,3.011259,-0.494675,135
2022-06-28 00:00:00,-4.120465,-0.618296,3.111717,-0.796169,155


## Histogramming

Frequencies, 'nuff said

```value_counts()```

In [82]:
series1 = pd.Series(np.random.randint(0, 5, size=25))
series1.value_counts()

1    7
4    7
0    5
3    4
2    2
Name: count, dtype: int64

## String Methods

In [83]:
# build a series
some_string = "SERIes and Index are EqUIppeD WITh A seT Of stRINg PROCESSING METHoDs In tHe ```sTR``` attrIBute THAT MakE IT eASy To oPerATE ON EACh ELEmeNt OF The aRrAY. NOtE thAt PAtTERN-maTching in ```sTR``` gENeralLY UsES reGUlAr eXpreSsiONs bY DEfault (aND In Some cases AlwayS uSEs tHeM)."
str_series = pd.Series(some_string.split())  # split splits on whitespace by default

In [84]:
# manipulating the case - lower, upper etc.
# also the length of each string, split, replace, yada yada...
# very important when cleaning data (column names may have stupid whitespace, bad case, spacing etc.)
low = str_series.str.lower()
up = str_series.str.upper()

low_up_df = pd.DataFrame(
    {
        "og": str_series,
        "length": str_series.str.len(),
        "low": low,
        "up": up,
        "split_low_on_i": low.str.split(
            "i", expand=False
        ),  # expand = True will break this usecase, try elsewhere
        "replace_a_with_star_in_low": low.str.replace(
            "a", "*", case=False, regex=True
        ),  # yeah, you can mess with regular expns
    }
)

low_up_df.tail(10)

Unnamed: 0,og,length,low,up,split_low_on_i,replace_a_with_star_in_low
36,eXpreSsiONs,11,expressions,EXPRESSIONS,"[express, ons]",expressions
37,bY,2,by,BY,[by],by
38,DEfault,7,default,DEFAULT,[default],def*ult
39,(aND,4,(and,(AND,[(and],(*nd
40,In,2,in,IN,"[, n]",in
41,Some,4,some,SOME,[some],some
42,cases,5,cases,CASES,[cases],c*ses
43,AlwayS,6,always,ALWAYS,[always],*lw*ys
44,uSEs,4,uses,USES,[uses],uses
45,tHeM).,6,them).,THEM).,[them).],them).


See [here](https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html#text-string-methods) for more string operations.

# Merge

Combine series and dataframes, use set logic for indexes and relational algebra for joins/merges

## Concat

Chill, we've done this before. The same join = inner/outer/left/right drill.

In [85]:
# let's make some more dataframes
concatDF = pd.DataFrame(
    np.random.randn(10, 4)
)  # 10 rows and 4 columns of random numbers
pieces = [concatDF[:3], concatDF[3:7], concatDF[7:]]
reconstructedDF = pd.concat(pieces)
display("broken DFs: ", pieces)
render_df_side_by_side(concatDF, reconstructedDF, "OG", "Concatenated")

'broken DFs: '

[          0         1         2         3
 0  0.516235 -1.081323 -0.588541  0.156023
 1  0.058886 -0.415804  1.168519 -2.389338
 2 -1.340235  0.166527 -0.604575 -0.245738,
           0         1         2         3
 3 -1.378420 -1.072956 -1.084109  0.252270
 4  0.187984  0.685665  0.644224 -0.029137
 5 -0.156130 -0.059620 -0.486168  0.313524
 6 -0.592934 -0.538696 -1.327662 -0.651596,
           0         1         2         3
 7  0.909241  0.052438 -0.178248  0.050835
 8 -0.142048 -0.965511  1.688367 -0.601909
 9  1.207313  0.360771 -1.359410  0.319240]

Unnamed: 0,0,1,2,3
0,0.516235,-1.081323,-0.588541,0.156023
1,0.058886,-0.415804,1.168519,-2.389338
2,-1.340235,0.166527,-0.604575,-0.245738
3,-1.37842,-1.072956,-1.084109,0.25227
4,0.187984,0.685665,0.644224,-0.029137
5,-0.15613,-0.05962,-0.486168,0.313524
6,-0.592934,-0.538696,-1.327662,-0.651596
7,0.909241,0.052438,-0.178248,0.050835
8,-0.142048,-0.965511,1.688367,-0.601909
9,1.207313,0.360771,-1.35941,0.31924

Unnamed: 0,0,1,2,3
0,0.516235,-1.081323,-0.588541,0.156023
1,0.058886,-0.415804,1.168519,-2.389338
2,-1.340235,0.166527,-0.604575,-0.245738
3,-1.37842,-1.072956,-1.084109,0.25227
4,0.187984,0.685665,0.644224,-0.029137
5,-0.15613,-0.05962,-0.486168,0.313524
6,-0.592934,-0.538696,-1.327662,-0.651596
7,0.909241,0.052438,-0.178248,0.050835
8,-0.142048,-0.965511,1.688367,-0.601909
9,1.207313,0.360771,-1.35941,0.31924


_From the [page](https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#concat)_:
Adding a column to a DataFrame is relatively fast. However, adding a row requires a copy, and may be expensive. We recommend passing a pre-built list of records to the DataFrame constructor instead of building a DataFrame by iteratively appending records to it.

## Join
SQL style merges.

In [86]:
# two data frames
sl = pd.Series(np.random.randint(10, size=6))
sr = sl.shift(2).fillna(0).astype(int)
left = pd.DataFrame({"key": list("ABCDEF"), "lval": sl})

right = pd.DataFrame({"key": list("ABBCDE"), "rval": sr})

In [87]:
render_df_side_by_side(left, right, "left", "right")

Unnamed: 0,key,lval
0,A,7
1,B,6
2,C,6
3,D,7
4,E,4
5,F,6

Unnamed: 0,key,rval
0,A,0
1,B,0
2,B,7
3,C,6
4,D,6
5,E,7


In [88]:
result = pd.merge(left, right, on="key")
result

Unnamed: 0,key,lval,rval
0,A,7,0
1,B,6,0
2,B,6,7
3,C,6,6
4,D,7,6
5,E,4,7


# Grouping

"Group By" really means one or more of the following happening:
1. **Split**: data is broken into groups based on some criteria
2. **Apply**: a data operation is executed on each of those groups
3. **Combine**: the results are combined back into a data structure

In [89]:
grp_df = pd.DataFrame(
    {
        "colA": list("AABCAABDAA"),
        "colB": list("1234567890"),
        "colC": np.random.randn(10),
        "colD": np.random.randn(10),
    }
)
grp_df

Unnamed: 0,colA,colB,colC,colD
0,A,1,-0.917516,0.371238
1,A,2,0.728666,0.823692
2,B,3,-1.223632,0.563674
3,C,4,-0.042648,-0.109007
4,A,5,1.274853,-0.222768
5,A,6,-0.701205,0.059788
6,B,7,-0.487083,0.194251
7,D,8,-1.254922,0.239682
8,A,9,0.336358,2.08744
9,A,0,1.525196,-0.970353


In [90]:
# a DataFrameGroupBy object object
grpby = grp_df.groupby("colA")

# list all the groups found
g = grpby.groups
g

{'A': [0, 1, 4, 5, 8, 9], 'B': [2, 6], 'C': [3], 'D': [7]}

In [91]:
# to create a dataframe from the group use something like
grpby.get_group("A")

Unnamed: 0,colA,colB,colC,colD
0,A,1,-0.917516,0.371238
1,A,2,0.728666,0.823692
4,A,5,1.274853,-0.222768
5,A,6,-0.701205,0.059788
8,A,9,0.336358,2.08744
9,A,0,1.525196,-0.970353


### Find all the groups

In [92]:
# but we may not know all the groups in the dataframe.
# so got to find a way to list all groups as dataframes.
#
# some basic python helps create a list of groupby names
grpby_labels_list = [*g.keys()]  # using the unpacking operator *
# alternatively
# grpby_labels_list = list(g.keys())
grpby_labels_list

['A', 'B', 'C', 'D']

In [93]:
# create a dict where the group label is the key and the group dataframe is the value
grps = {lbl: grpby.get_group(lbl) for lbl in grpby_labels_list}

# now use the dict to access individual dataframes created out of the groupby operation
grps["B"]

# question: can we create a dataframe of dataframes?

Unnamed: 0,colA,colB,colC,colD
2,B,3,-1.223632,0.563674
6,B,7,-0.487083,0.194251


In [94]:
# You can see the datatype of the value is a pandas dataframe
type(grps["A"])

pandas.core.frame.DataFrame

In [95]:
# make a function of it all...
def get_all_groups(df_grpby):
    return {lbl: df_grpby.get_group(lbl) for lbl in list(df_grpby.groups.keys())}

In [96]:
grps = get_all_groups(grp_df.groupby("colA"))
grps

{'A':   colA colB      colC      colD
 0    A    1 -0.917516  0.371238
 1    A    2  0.728666  0.823692
 4    A    5  1.274853 -0.222768
 5    A    6 -0.701205  0.059788
 8    A    9  0.336358  2.087440
 9    A    0  1.525196 -0.970353,
 'B':   colA colB      colC      colD
 2    B    3 -1.223632  0.563674
 6    B    7 -0.487083  0.194251,
 'C':   colA colB      colC      colD
 3    C    4 -0.042648 -0.109007,
 'D':   colA colB      colC      colD
 7    D    8 -1.254922  0.239682}

In [97]:
# get a sum of values from each group
grp_df.groupby("colA").sum()

Unnamed: 0_level_0,colB,colC,colD
colA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,125690,2.246353,2.149036
B,37,-1.710715,0.757926
C,4,-0.042648,-0.109007
D,8,-1.254922,0.239682


In [98]:
# let's add one more column to the dataframe
# we can try grouping on two columns
grp_df["colE"] = list("1231231231")
grp_df

Unnamed: 0,colA,colB,colC,colD,colE
0,A,1,-0.917516,0.371238,1
1,A,2,0.728666,0.823692,2
2,B,3,-1.223632,0.563674,3
3,C,4,-0.042648,-0.109007,1
4,A,5,1.274853,-0.222768,2
5,A,6,-0.701205,0.059788,3
6,B,7,-0.487083,0.194251,1
7,D,8,-1.254922,0.239682,2
8,A,9,0.336358,2.08744,3
9,A,0,1.525196,-0.970353,1


In [99]:
g2 = grp_df.groupby(["colA", "colE"])
grps2 = get_all_groups(g2)
grps2

{('A',
  '1'):   colA colB      colC      colD colE
 0    A    1 -0.917516  0.371238    1
 9    A    0  1.525196 -0.970353    1,
 ('A',
  '2'):   colA colB      colC      colD colE
 1    A    2  0.728666  0.823692    2
 4    A    5  1.274853 -0.222768    2,
 ('A',
  '3'):   colA colB      colC      colD colE
 5    A    6 -0.701205  0.059788    3
 8    A    9  0.336358  2.087440    3,
 ('B',
  '1'):   colA colB      colC      colD colE
 6    B    7 -0.487083  0.194251    1,
 ('B',
  '3'):   colA colB      colC      colD colE
 2    B    3 -1.223632  0.563674    3,
 ('C',
  '1'):   colA colB      colC      colD colE
 3    C    4 -0.042648 -0.109007    1,
 ('D',
  '2'):   colA colB      colC      colD colE
 7    D    8 -1.254922  0.239682    2}

In [100]:
render_df_side_by_side(grps2[("A", "1")], grps2[("A", "2")], "A,1", "A,2")

Unnamed: 0,colA,colB,colC,colD,colE
0,A,1,-0.917516,0.371238,1
9,A,0,1.525196,-0.970353,1

Unnamed: 0,colA,colB,colC,colD,colE
1,A,2,0.728666,0.823692,2
4,A,5,1.274853,-0.222768,2


# Reshaping

Stacking and Pivot Tables.
This really warrants more practice, at least go through [the page](https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html#)

## Stacking

In [101]:
tuples = list(
    zip(
        *[
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
    )
)

If the syntax [*expression](https://docs.python.org/dev/reference/expressions.html#calls) appears in the function call, expression must evaluate to an iterable. Elements from these iterables are treated as if they were additional positional arguments.

...although the *expression syntax may appear after explicit keyword arguments, it is processed before the keyword arguments (and any **expression arguments)

In [102]:
# starred expressions...
def f(a, b):
    print(a, " ", b)


f(
    b=1, *(2,)
)  # works, the tuple (iterable) is expanded, 2 is assigned to a, then b fills the next argument slot
# f (a=1, *(2,)) # FAILS, multiple values for argument 'a'

2   1


In [103]:
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [104]:
stack_idx = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])

In [105]:
stack_df = pd.DataFrame(np.random.randn(8, 2), index=stack_idx, columns=["a", "b"])
stack_df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.569116,0.417175
bar,two,0.205407,0.373984
baz,one,0.931429,-1.780329
baz,two,0.213578,0.399523
foo,one,1.019377,0.252729
foo,two,-0.038853,0.501629
qux,one,-2.224998,-0.01924
qux,two,-1.789701,0.408075


In [106]:
# stack really just combines the columns
stacked = stack_df.stack()
stacked

first  second   
bar    one     a   -1.569116
               b    0.417175
       two     a    0.205407
               b    0.373984
baz    one     a    0.931429
               b   -1.780329
       two     a    0.213578
               b    0.399523
foo    one     a    1.019377
               b    0.252729
       two     a   -0.038853
               b    0.501629
qux    one     a   -2.224998
               b   -0.019240
       two     a   -1.789701
               b    0.408075
dtype: float64

In [107]:
# unstacked brings back the two columns
# by default unstacks the last level
unstacked = stacked.unstack()
unstacked

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.569116,0.417175
bar,two,0.205407,0.373984
baz,one,0.931429,-1.780329
baz,two,0.213578,0.399523
foo,one,1.019377,0.252729
foo,two,-0.038853,0.501629
qux,one,-2.224998,-0.01924
qux,two,-1.789701,0.408075


In [108]:
stacked.unstack(1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,a,-1.569116,0.205407
bar,b,0.417175,0.373984
baz,a,0.931429,0.213578
baz,b,-1.780329,0.399523
foo,a,1.019377,-0.038853
foo,b,0.252729,0.501629
qux,a,-2.224998,-1.789701
qux,b,-0.01924,0.408075


In [109]:
stacked.unstack(0)

Unnamed: 0_level_0,first,bar,baz,foo,qux
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,a,-1.569116,0.931429,1.019377,-2.224998
one,b,0.417175,-1.780329,0.252729,-0.01924
two,a,0.205407,0.213578,-0.038853,-1.789701
two,b,0.373984,0.399523,0.501629,0.408075


In [110]:
# you can basically pivot the columns and rows using a combination of stack() and unstack()
unstacked.unstack()

Unnamed: 0_level_0,a,a,b,b
second,one,two,one,two
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,-1.569116,0.205407,0.417175,0.373984
baz,0.931429,0.213578,-1.780329,0.399523
foo,1.019377,-0.038853,0.252729,0.501629
qux,-2.224998,-1.789701,-0.01924,0.408075


## Pivot Tables

In [111]:
# we're now just using data created in the pandas documentation - just cause, time.
pivot_tables_df = pd.DataFrame(
    {
        "A": ["one", "one", "two", "three"] * 3,
        "B": ["A", "B", "C"] * 4,
        "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2,
        "D": np.random.randn(12),
        "E": np.random.randn(12),
    }
)
pivot_tables_df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,-0.075034,0.951973
1,one,B,foo,-1.36565,-1.32674
2,two,C,foo,-1.000317,0.126081
3,three,A,bar,-1.557018,-0.823072
4,one,B,bar,-0.717401,-0.980407
5,one,C,bar,0.372764,0.991359
6,two,A,foo,-1.225971,-2.494427
7,three,B,foo,-1.383177,0.889015
8,one,C,foo,-0.793209,0.234141
9,one,A,bar,0.535059,0.668283


In [112]:
# simple pivot tables - data, values to aggregate, index, columns to pivot along
pd.pivot_table(pivot_tables_df, values="D", index=["A", "B"], columns=["C"])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.535059,-0.075034
one,B,-0.717401,-1.36565
one,C,0.372764,-0.793209
three,A,-1.557018,
three,B,,-1.383177
three,C,-0.880131,
two,A,,-1.225971
two,B,-1.666599,
two,C,,-1.000317


In [113]:
# more'n one columns of values
# mark all NaN as zero (0)
pd.pivot_table(
    pivot_tables_df, values=["D", "E"], index=["A", "B"], columns=["C"]
).fillna(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,D,D,E,E
Unnamed: 0_level_1,C,bar,foo,bar,foo
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
one,A,0.535059,-0.075034,0.668283,0.951973
one,B,-0.717401,-1.36565,-0.980407,-1.32674
one,C,0.372764,-0.793209,0.991359,0.234141
three,A,-1.557018,0.0,-0.823072,0.0
three,B,0.0,-1.383177,0.0,0.889015
three,C,-0.880131,0.0,-0.244029,0.0
two,A,0.0,-1.225971,0.0,-2.494427
two,B,-1.666599,0.0,0.163312,0.0
two,C,0.0,-1.000317,0.0,0.126081


# Time Series

## Resample
We may need to resample data to a different time-frequency.
Like get data every second and resample to data every 5 minutes and so on...

In [114]:
rng = pd.date_range("01/01/2022", periods=1000, freq="S")

In [115]:
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

In [116]:
ts.resample("5min").sum()

2022-01-01 00:00:00    74272
2022-01-01 00:05:00    71742
2022-01-01 00:10:00    73105
2022-01-01 00:15:00    25507
Freq: 5T, dtype: int64

read up on [date offset strings](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects) like '5min' etc.

## Timezones

In [117]:
# timezone representation
rng2 = pd.date_range("01/01/2022", periods=500, freq="D")
ts2 = pd.Series(np.random.randint(0, 365, len(rng2)), index=rng2)
ts2_utc = ts2.tz_localize("UTC")

In [118]:
# convert to US/Eastern
ts2_utc.tz_convert("US/Eastern")

2021-12-31 19:00:00-05:00     55
2022-01-01 19:00:00-05:00    193
2022-01-02 19:00:00-05:00    181
2022-01-03 19:00:00-05:00    354
2022-01-04 19:00:00-05:00     95
                            ... 
2023-05-10 20:00:00-04:00     72
2023-05-11 20:00:00-04:00    155
2023-05-12 20:00:00-04:00     39
2023-05-13 20:00:00-04:00    236
2023-05-14 20:00:00-04:00    212
Freq: D, Length: 500, dtype: int64

## Time span representations

Converting between periods and timestamps.

Period is a _timespan_ - span of time between two timestamps.

In [119]:
# month - end of each
rng3 = pd.date_range("01/01/2022", periods=12, freq="M")
rng3

DatetimeIndex(['2022-01-31', '2022-02-28', '2022-03-31', '2022-04-30',
               '2022-05-31', '2022-06-30', '2022-07-31', '2022-08-31',
               '2022-09-30', '2022-10-31', '2022-11-30', '2022-12-31'],
              dtype='datetime64[ns]', freq='M')

In [120]:
ts3 = pd.Series(np.random.randn(len(rng3)), index=rng3)
ts3

2022-01-31    0.098827
2022-02-28    0.083582
2022-03-31   -1.007415
2022-04-30    0.786591
2022-05-31   -0.163765
2022-06-30    0.605526
2022-07-31   -0.802065
2022-08-31    0.103782
2022-09-30    1.338962
2022-10-31   -0.090781
2022-11-30    0.048555
2022-12-31    0.136162
Freq: M, dtype: float64

In [121]:
ps = ts3.to_period()
ps

2022-01    0.098827
2022-02    0.083582
2022-03   -1.007415
2022-04    0.786591
2022-05   -0.163765
2022-06    0.605526
2022-07   -0.802065
2022-08    0.103782
2022-09    1.338962
2022-10   -0.090781
2022-11    0.048555
2022-12    0.136162
Freq: M, dtype: float64

In [122]:
ps.to_timestamp()

2022-01-01    0.098827
2022-02-01    0.083582
2022-03-01   -1.007415
2022-04-01    0.786591
2022-05-01   -0.163765
2022-06-01    0.605526
2022-07-01   -0.802065
2022-08-01    0.103782
2022-09-01    1.338962
2022-10-01   -0.090781
2022-11-01    0.048555
2022-12-01    0.136162
Freq: MS, dtype: float64

A more complex use-case:
convert a _quarterly frequency_ with _year ending in November_ to _9am_ of _the end of the month_ _following_ the _quarter end_

In [123]:
prng = pd.period_range("1990Q1", "2022Q4", freq="Q-NOV")
type(prng)

pandas.core.indexes.period.PeriodIndex

In [124]:
ts4 = pd.Series(np.random.randn(len(prng)), index=prng)
ts4.head()

1990Q1    0.217424
1990Q2    0.104070
1990Q3   -0.328954
1990Q4    0.441217
1991Q1   -0.554199
Freq: Q-NOV, dtype: float64

In [125]:
ts4.index = (prng.asfreq("M", "e") + 1).asfreq("H", "s") + 9
ts4.head()

1990-03-01 09:00    0.217424
1990-06-01 09:00    0.104070
1990-09-01 09:00   -0.328954
1990-12-01 09:00    0.441217
1991-03-01 09:00   -0.554199
Freq: H, dtype: float64

# Categoricals

[This page](https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#categorical-data).

##### Note: With v2.0+ some of these are breaking - so this sections needs an update.  

See [this issue on GitHub](https://github.com/pandas-dev/pandas/issues/52593)

For e.g., gender, social class, blood type, country affiliation, observation time or rating via Likert scales.  

Categorical data. Categories allow (usually) fixed number of possible values. Categorical data may have some order, but not support numerical operations (for e.g., 'strongly agree', 'agree', 'disagree', 'strongly disagree').

In [126]:
cat_df = pd.DataFrame({"id": list("123456"), "raw_grade": list("abbaae")})
cat_df

Unnamed: 0,id,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [127]:
# convert the raw_grade col to categories
cat_df["grade"] = cat_df["raw_grade"].astype("category")
cat_df

Unnamed: 0,id,raw_grade,grade
0,1,a,a
1,2,b,b
2,3,b,b
3,4,a,a
4,5,a,a
5,6,e,e


In [128]:
# list of categories
cat_df["grade"]

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']

In [129]:
# this breaks with v2.0+
# [TODO] Find a fix

# can give more meaningful names here
# direct assignment to Series.cat.categories()
# cat_df["grade"].cat.categories = ["very good", "meh", "very bad"]
# cat_df["grade"]

In [130]:
cat_df

Unnamed: 0,id,raw_grade,grade
0,1,a,a
1,2,b,b
2,3,b,b
3,4,a,a
4,5,a,a
5,6,e,e


In [131]:
# list of categories
list(cat_df["grade"].cat.categories)

# categorical.CategoricalAccessor encapsulates all changes to categoricals
# for e.g., set_categories or remove_categories
# dir(cat_df['grade'].cat)

['a', 'b', 'e']

In [132]:
# add more categories
cat_df["grade"] = cat_df["grade"].cat.set_categories(
    ["very bad", "bad", "meh", "good", "very good"]
)
# new list of categories
list(cat_df["grade"].cat.categories)

['very bad', 'bad', 'meh', 'good', 'very good']

In [133]:
# now when we can sort the dataframe in order of categories (not lexical)
sorted_cat_df = cat_df.sort_values(by="grade")
render_df_side_by_side(cat_df, sorted_cat_df, "OG", "Sorted")

Unnamed: 0,id,raw_grade,grade
0,1,a,
1,2,b,
2,3,b,
3,4,a,
4,5,a,
5,6,e,

Unnamed: 0,id,raw_grade,grade
0,1,a,
1,2,b,
2,3,b,
3,4,a,
4,5,a,
5,6,e,


In [134]:
# grouping
# cat_grp = cat_df.groupby("grade")
# cat_grp.get_group("very good")

In [135]:
# count elements
# cat_grp.size()