## Configure Options and Settings at Interpreter Startup

In [1]:
import pandas

In [None]:
def start():
    options = {
        'display':{
            'max_columns':None,
            'max_colwidth':25,
            'expand_frame_repr':False,
            'max_rows':14,
            'max_seq_items':50,
            'precision':4,
            'show_dimensions':False
        },
        'mode':{
            'chained_assignment':None
        }
    }
    for category, option in options.items():
        for op, value in options.items():
            pd.set_option(f'{category}.{op}', value)

## Make Toy Data Structures With Pandas' Testing Module

In [3]:
import pandas.util.testing as tm
import numpy as np

tm.N, tm.K = 15,3
np.random.seed(444)
tm.makeTimeDataFrame(freq='M').head()

Unnamed: 0,A,B,C,D
2000-01-31,0.35744,0.266873,0.353728,-0.536561
2000-02-29,0.377538,-0.480331,-0.433926,-0.886787
2000-03-31,1.382338,0.300781,-0.498028,0.107101
2000-04-30,1.175549,-0.179054,0.228771,-0.74089
2000-05-31,-0.939276,1.183669,-0.650078,-0.075697


In [4]:
tm.makeDataFrame()

Unnamed: 0,A,B,C,D
b8jgVbQbug,-0.748504,-0.099509,-0.060078,0.03531
OKCyyhkEvY,0.498427,0.798287,-0.169375,-1.487501
RtcTWq0AMT,-0.148212,0.507709,-0.089451,-0.716834
vtdamOujY0,-0.348742,0.273927,1.551892,-0.054453
tW49Zqe3lC,0.161808,0.839752,0.690683,1.536011
YS55K1HAQ9,-0.741327,-0.832826,-0.004938,-1.347949
VqoFBJHR6Y,1.260449,-0.971597,0.393538,0.079307
26geH1wGRS,-0.145185,-0.474486,0.876715,1.376789
e9rN6H4BgQ,0.624359,-2.646801,-0.276459,-0.655241
j0qdIMVG9P,-0.314556,0.988053,-0.626824,1.045932


In [6]:
[i for i in dir(tm) if i.startswith('make')]

['makeBoolIndex',
 'makeCategoricalIndex',
 'makeCustomDataframe',
 'makeCustomIndex',
 'makeDataFrame',
 'makeDateIndex',
 'makeFloatIndex',
 'makeFloatSeries',
 'makeIntIndex',
 'makeIntervalIndex',
 'makeMissingDataframe',
 'makeMixedDataFrame',
 'makeMultiIndex',
 'makeObjectSeries',
 'makePeriodFrame',
 'makePeriodIndex',
 'makePeriodSeries',
 'makeRangeIndex',
 'makeStringIndex',
 'makeStringSeries',
 'makeTimeDataFrame',
 'makeTimeSeries',
 'makeTimedeltaIndex',
 'makeUIntIndex',
 'makeUnicodeIndex']

## Take Advantage of Accessor Methods

In [7]:
import pandas as pd

In [8]:
# these are the Accessor methods
# str => string
# dt  => date time
# cat => categorical
pd.Series._accessors

{'cat', 'dt', 'sparse', 'str'}

### str

In [9]:
addr = pd.Series([
    'Washington, D.C. 20003',
    'Brooklyn, NY 11211-1755',
    'Omaha, NE 68154',
    'Pittsburgh, PA 15211'
])

In [10]:
addr.str.upper()

0     WASHINGTON, D.C. 20003
1    BROOKLYN, NY 11211-1755
2            OMAHA, NE 68154
3       PITTSBURGH, PA 15211
dtype: object

In [11]:
addr.str.count(r'\d')

0    5
1    9
2    5
3    5
dtype: int64

### dt

In [13]:
# create dataframe with dates
daterng = pd.Series(pd.date_range('2019', periods=9, freq='Q'))
daterng

0   2019-03-31
1   2019-06-30
2   2019-09-30
3   2019-12-31
4   2020-03-31
5   2020-06-30
6   2020-09-30
7   2020-12-31
8   2021-03-31
dtype: datetime64[ns]

In [14]:
# view day name
daterng.dt.day_name()

0       Sunday
1       Sunday
2       Monday
3      Tuesday
4      Tuesday
5      Tuesday
6    Wednesday
7     Thursday
8    Wednesday
dtype: object

In [16]:
# view dates that are in the 3rd and 4th quarter
daterng[daterng.dt.quarter > 2]

2   2019-09-30
3   2019-12-31
6   2020-09-30
7   2020-12-31
dtype: datetime64[ns]

In [18]:
# view end of year dates
daterng[daterng.dt.is_year_end]

3   2019-12-31
7   2020-12-31
dtype: datetime64[ns]

## Create a DatetimeIndex From Component Columns

In [20]:
import pandas as pd
import numpy as np
from itertools import product

datecols = ['year', 'month', 'day']
df = pd.DataFrame(list(product([2019, 2020], [1,2], [1,2,3])), columns=datecols)
df

Unnamed: 0,year,month,day
0,2019,1,1
1,2019,1,2
2,2019,1,3
3,2019,2,1
4,2019,2,2
5,2019,2,3
6,2020,1,1
7,2020,1,2
8,2020,1,3
9,2020,2,1


In [21]:
df['data'] = np.random.randn(len(df))
df

Unnamed: 0,year,month,day,data
0,2019,1,1,-0.26111
1,2019,1,2,0.028835
2,2019,1,3,0.122392
3,2019,2,1,-0.438345
4,2019,2,2,0.612122
5,2019,2,3,-2.50608
6,2020,1,1,-1.040233
7,2020,1,2,-0.967498
8,2020,1,3,0.595033
9,2020,2,1,0.873375


In [23]:
# create date column from other columns
df.index = pd.to_datetime(df[datecols])
df

Unnamed: 0,year,month,day,data
2019-01-01,2019,1,1,-0.26111
2019-01-02,2019,1,2,0.028835
2019-01-03,2019,1,3,0.122392
2019-02-01,2019,2,1,-0.438345
2019-02-02,2019,2,2,0.612122
2019-02-03,2019,2,3,-2.50608
2020-01-01,2020,1,1,-1.040233
2020-01-02,2020,1,2,-0.967498
2020-01-03,2020,1,3,0.595033
2020-02-01,2020,2,1,0.873375


In [24]:
df = df.drop(datecols, axis=1).squeeze()
df

2019-01-01   -0.261110
2019-01-02    0.028835
2019-01-03    0.122392
2019-02-01   -0.438345
2019-02-02    0.612122
2019-02-03   -2.506080
2020-01-01   -1.040233
2020-01-02   -0.967498
2020-01-03    0.595033
2020-02-01    0.873375
2020-02-02   -0.892723
2020-02-03    2.196084
Name: data, dtype: float64

## Use Categorical Data to Save Time and Space

In [25]:
colors = pd.Series([
    'periwinkle',
    'mint green',
    'burnt orange',
    'periwinkle',
    'burnt orange',
    'rose',
    'rose',
    'mint green',
    'rose',
    'navy'    
])

In [26]:
colors

0      periwinkle
1      mint green
2    burnt orange
3      periwinkle
4    burnt orange
5            rose
6            rose
7      mint green
8            rose
9            navy
dtype: object

In [27]:
# create a mapper funciton 
mapper = {v: k for k,v in enumerate(colors.unique())}
mapper

{'periwinkle': 0, 'mint green': 1, 'burnt orange': 2, 'rose': 3, 'navy': 4}

In [28]:
# map the colors column
as_int = colors.map(mapper)
as_int

0    0
1    1
2    2
3    0
4    2
5    3
6    3
7    1
8    3
9    4
dtype: int64

In [29]:
ccolors = colors.astype('category')
ccolors

0      periwinkle
1      mint green
2    burnt orange
3      periwinkle
4    burnt orange
5            rose
6            rose
7      mint green
8            rose
9            navy
dtype: category
Categories (5, object): ['burnt orange', 'mint green', 'navy', 'periwinkle', 'rose']

In [31]:
ccolors.cat.codes

0    3
1    1
2    0
3    3
4    0
5    4
6    4
7    1
8    4
9    2
dtype: int8

In [32]:
# cannot just add new cat to df
ccolors.iloc[5] = 'olive green'

ValueError: Cannot setitem on a Categorical with a new category, set the categories first

In [33]:
ccolors = ccolors.cat.add_categories(['olive green'])

In [34]:
ccolors.iloc[5] = 'olive green'

In [35]:
ccolors

0      periwinkle
1      mint green
2    burnt orange
3      periwinkle
4    burnt orange
5     olive green
6            rose
7      mint green
8            rose
9            navy
dtype: category
Categories (6, object): ['burnt orange', 'mint green', 'navy', 'periwinkle', 'rose', 'olive green']

## Introspect Groupby Objects via Iteration

In [36]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
cols = ['sex', 'length', 'diam', 'height', 'weight', 'rings']
abalone = pd.read_csv(url, usecols=[0,1,2,3,4,8], names=cols)
abalone

Unnamed: 0,sex,length,diam,height,weight,rings
0,M,0.455,0.365,0.095,0.5140,15
1,M,0.350,0.265,0.090,0.2255,7
2,F,0.530,0.420,0.135,0.6770,9
3,M,0.440,0.365,0.125,0.5160,10
4,I,0.330,0.255,0.080,0.2050,7
...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,11
4173,M,0.590,0.440,0.135,0.9660,10
4174,M,0.600,0.475,0.205,1.1760,9
4175,F,0.625,0.485,0.150,1.0945,10


In [40]:
# create column by splitting rings into 4 buckets
abalone['ring_quartile'] = pd.qcut(abalone['rings'], q=4, labels=range(1,5))
abalone

Unnamed: 0,sex,length,diam,height,weight,rings,ring_quartile
0,M,0.455,0.365,0.095,0.5140,15,4
1,M,0.350,0.265,0.090,0.2255,7,1
2,F,0.530,0.420,0.135,0.6770,9,2
3,M,0.440,0.365,0.125,0.5160,10,3
4,I,0.330,0.255,0.080,0.2050,7,1
...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,11,3
4173,M,0.590,0.440,0.135,0.9660,10,3
4174,M,0.600,0.475,0.205,1.1760,9,2
4175,F,0.625,0.485,0.150,1.0945,10,3


In [42]:
grouped = abalone.groupby('ring_quartile')
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x133b6f220>

In [43]:
for index, frame in grouped:
    print(f'Ring Quartile: {index}')
    print('-'*16)
    # show the largest 3 by weight
    print(frame.nlargest(3, 'weight'), end="\n\n")

Ring Quartile: 1
----------------
     sex  length   diam  height  weight  rings ring_quartile
2619   M   0.690  0.540   0.185  1.7100      8             1
1044   M   0.690  0.525   0.175  1.7005      8             1
1026   M   0.645  0.520   0.175  1.5610      8             1

Ring Quartile: 2
----------------
     sex  length  diam  height  weight  rings ring_quartile
2811   M   0.725  0.57   0.190  2.3305      9             2
1426   F   0.745  0.57   0.215  2.2500      9             2
1821   F   0.720  0.55   0.195  2.0730      9             2

Ring Quartile: 3
----------------
     sex  length  diam  height  weight  rings ring_quartile
1209   F   0.780  0.63   0.215   2.657     11             3
1051   F   0.735  0.60   0.220   2.555     11             3
3715   M   0.780  0.60   0.210   2.548     11             3

Ring Quartile: 4
----------------
     sex  length   diam  height  weight  rings ring_quartile
891    M   0.730  0.595    0.23  2.8255     17             4
1763   M   0.77

In [46]:
# view the group keys
grouped.groups.keys()

dict_keys([1, 2, 3, 4])

In [48]:
# get group
grouped.get_group(2)

Unnamed: 0,sex,length,diam,height,weight,rings,ring_quartile
2,F,0.530,0.420,0.135,0.6770,9,2
8,M,0.475,0.370,0.125,0.5095,9,2
19,M,0.450,0.320,0.100,0.3810,9,2
23,F,0.550,0.415,0.135,0.7635,9,2
39,M,0.355,0.290,0.090,0.3275,9,2
...,...,...,...,...,...,...,...
4134,F,0.595,0.455,0.140,0.9140,9,2
4136,F,0.615,0.495,0.155,1.0805,9,2
4159,F,0.560,0.440,0.135,0.8025,9,2
4167,M,0.500,0.380,0.125,0.5770,9,2


In [52]:
# aggregate function on groups
grouped['height', 'weight'].agg(['mean', 'median'])

  grouped['height', 'weight'].agg(['mean', 'median'])


Unnamed: 0_level_0,height,height,weight,weight
Unnamed: 0_level_1,mean,median,mean,median
ring_quartile,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,0.106596,0.105,0.432374,0.3685
2,0.142721,0.145,0.852013,0.844
3,0.157203,0.155,1.06688,1.0645
4,0.164813,0.165,1.114892,1.0655


## Mapping Trick for Membership Binning

In [53]:
countries = pd.Series([
    'United States',
    'Canada',
    'Mexico',
    'Belgium',
    'United Kingdom',
    'Thailand'
])

groups = {
    'North America': ('United States', 'Canada', 'Mexico', 'Greenland'),
    'Europe': ('France', 'Germany', 'United Kingdom', 'Belgium')
}

In [54]:
from typing import Any

In [56]:
def membership_map(s: pd.Series, groups: dict, fillvalue: Any=-1) -> pd.Series:
    groups = {x: k for k,v in groups.items() for x in v}
    return s.map(groups).fillna(fillvalue)

In [57]:
membership_map(countries, groups, fillvalue='other')

0    North America
1    North America
2    North America
3           Europe
4           Europe
5            other
dtype: object

## How Pandas Uses Boolean Operators

In [58]:
and, not, or  # boolean operators

<, <=, >, >=, !=, ==  # arithmetic operators

&, |, ~  # bitwise operators

SyntaxError: invalid syntax (<ipython-input-58-b536fa7f6a53>, line 1)

In [59]:
4 < 3 and 5 > 4

False

In [60]:
4 < (3 and 5) > 4

True

In [61]:
pd.Series([True, True, False]) & pd.Series([True, False, False])

0     True
1    False
2    False
dtype: bool

In [62]:
pd.Series([True, True, False]) and pd.Series([True, False, False])

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [64]:
s = pd.Series(range(10))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [68]:
s % 2 == 0 & s > 3

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [69]:
(s % 2 == 0) & (s > 3)

0    False
1    False
2    False
3    False
4     True
5    False
6     True
7    False
8     True
9    False
dtype: bool

## Load Data From the Clipboard

In [72]:
# just need to go to excel or google sheet and do ctrl+c
df = pd.read_clipboard(
    na_values=[None],
)
df

Unnamed: 0,Lynch,GPS,some
0,94.92,132.274,37.354
1,101.15,138.626,37.476


## Write Pandas Objects Directly to Compressed Formats

In [73]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
cols = ['sex', 'length', 'diam', 'height', 'weight', 'rings']
abalone = pd.read_csv(url, usecols=[0,1,2,3,4,8], names=cols)

In [74]:
abalone.to_json('df.json.gz', orient='records', lines=True, compression='gzip')

In [75]:
abalone.to_json('df.json', orient='records', lines=True)

In [81]:
import os

# check size difference of compression
print(f"Uncompressed: {os.path.getsize('df.json')}")
print(f"Compressed: {os.path.getsize('df.json.gz')}")
print(f"Ratio: {(os.path.getsize('df.json.gz')/os.path.getsize('df.json'))*100}% the size of uncompressed")

Uncompressed: 330723
Compressed: 33388
Ratio: 10.095457527900994% the size of uncompressed
