#### Importing the pandas and Numpy

In [21]:
import numpy as np
import pandas as pd

#### Creating a Series Object

In [22]:
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

#### Creating a DataFrame 

#### By passing a Numpy array, with a datetime index and labeled columns

In [23]:
dates = pd.date_range("20130101",periods=6)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list("ABCD"))

In [24]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.261348,0.747041,0.921136,0.307684
2013-01-02,0.77809,0.067704,-1.786591,-1.131291
2013-01-03,0.055628,-0.53433,-1.503576,-0.534436
2013-01-04,-1.246415,0.267463,-0.140986,0.221474
2013-01-05,-0.939582,1.251244,0.870032,1.130564
2013-01-06,0.559351,0.051935,-0.800447,1.490041


##### Creating a DataFrame by passing a dict of objects 

In [25]:
dic = {
    "A": 1.0,
    "B":pd.Timestamp(20130102),
    "C":pd.Series(1,index=list(range(4)),dtype='float32'),
    "D":np.array([3]*4,dtype='int32'),
    "E":pd.Categorical(["test","train","test","train"]),
    "F":"foo"
      }
df2 = pd.DataFrame(dic)
df2.head()

Unnamed: 0,A,B,C,D,E,F
0,1.0,1970-01-01 00:00:00.020130102,1.0,3,test,foo
1,1.0,1970-01-01 00:00:00.020130102,1.0,3,train,foo
2,1.0,1970-01-01 00:00:00.020130102,1.0,3,test,foo
3,1.0,1970-01-01 00:00:00.020130102,1.0,3,train,foo


In [26]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

### Viewing Data

In [27]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.261348,0.747041,0.921136,0.307684
2013-01-02,0.77809,0.067704,-1.786591,-1.131291
2013-01-03,0.055628,-0.53433,-1.503576,-0.534436
2013-01-04,-1.246415,0.267463,-0.140986,0.221474
2013-01-05,-0.939582,1.251244,0.870032,1.130564


In [28]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-1.246415,0.267463,-0.140986,0.221474
2013-01-05,-0.939582,1.251244,0.870032,1.130564
2013-01-06,0.559351,0.051935,-0.800447,1.490041


In [29]:
print(df.index)
print(df.columns)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')


**Numpy arrays have one dtype for the entire array, while pandas DataFrame have one dtype per column**

**Showing quick statistic summary of data**

In [30]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.175713,0.30851,-0.406739,0.247339
std,0.805085,0.619183,1.159834,0.983921
min,-1.246415,-0.53433,-1.786591,-1.131291
25%,-0.770024,0.055877,-1.327793,-0.345459
50%,-0.10286,0.167584,-0.470716,0.264579
75%,0.43342,0.627147,0.617277,0.924844
max,0.77809,1.251244,0.921136,1.490041


**Transposing data**

In [31]:
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.261348,0.77809,0.055628,-1.246415,-0.939582,0.559351
B,0.747041,0.067704,-0.53433,0.267463,1.251244,0.051935
C,0.921136,-1.786591,-1.503576,-0.140986,0.870032,-0.800447
D,0.307684,-1.131291,-0.534436,0.221474,1.130564,1.490041


**Sorting by axes**

In [32]:
df.sort_index(axis=1,ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.307684,0.921136,0.747041,-0.261348
2013-01-02,-1.131291,-1.786591,0.067704,0.77809
2013-01-03,-0.534436,-1.503576,-0.53433,0.055628
2013-01-04,0.221474,-0.140986,0.267463,-1.246415
2013-01-05,1.130564,0.870032,1.251244,-0.939582
2013-01-06,1.490041,-0.800447,0.051935,0.559351


**Sorting by values**

In [33]:
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-03,0.055628,-0.53433,-1.503576,-0.534436
2013-01-06,0.559351,0.051935,-0.800447,1.490041
2013-01-02,0.77809,0.067704,-1.786591,-1.131291
2013-01-04,-1.246415,0.267463,-0.140986,0.221474
2013-01-01,-0.261348,0.747041,0.921136,0.307684
2013-01-05,-0.939582,1.251244,0.870032,1.130564


### Selection

Pandas Data selections methods are : .at, .iat, .loc, .iloc

**Selecting a single column, which yields a Series**

In [34]:
df["A"]

2013-01-01   -0.261348
2013-01-02    0.778090
2013-01-03    0.055628
2013-01-04   -1.246415
2013-01-05   -0.939582
2013-01-06    0.559351
Freq: D, Name: A, dtype: float64

**Selecting via [ ]**

In [35]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.261348,0.747041,0.921136,0.307684
2013-01-02,0.77809,0.067704,-1.786591,-1.131291
2013-01-03,0.055628,-0.53433,-1.503576,-0.534436


In [36]:
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,0.77809,0.067704,-1.786591,-1.131291
2013-01-03,0.055628,-0.53433,-1.503576,-0.534436
2013-01-04,-1.246415,0.267463,-0.140986,0.221474


#### Selecting by Label

**For getting a cross section using a label**

In [37]:
df.loc[dates[0]] #selecting first data row values

A   -0.261348
B    0.747041
C    0.921136
D    0.307684
Name: 2013-01-01 00:00:00, dtype: float64

**Selecting on a multi-axis label**

In [38]:
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2013-01-01,-0.261348,0.747041
2013-01-02,0.77809,0.067704
2013-01-03,0.055628,-0.53433
2013-01-04,-1.246415,0.267463
2013-01-05,-0.939582,1.251244
2013-01-06,0.559351,0.051935


**Showing label slicing, both endpoints are included**

In [39]:
df.loc["20130102":"20130104",["A","B"]]

Unnamed: 0,A,B
2013-01-02,0.77809,0.067704
2013-01-03,0.055628,-0.53433
2013-01-04,-1.246415,0.267463


**Reduction in the dimensions of the returned object**

In [40]:
df.loc["20130102",["A","B"]]

A    0.778090
B    0.067704
Name: 2013-01-02 00:00:00, dtype: float64

In [41]:
#Getting scalar value
df.loc[dates[0],"A"]

-0.2613476746451604

**For getting fast access to a scalar**

In [42]:
df.at[dates[0],"A"]

-0.2613476746451604

### Selection by Position

**Select via the position of the passed integers**

In [43]:
df.iloc[3]

A   -1.246415
B    0.267463
C   -0.140986
D    0.221474
Name: 2013-01-04 00:00:00, dtype: float64

**By integer slices**

In [44]:
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,-1.246415,0.267463
2013-01-05,-0.939582,1.251244


**By list of integer position locations**

In [45]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,0.77809,-1.786591
2013-01-03,0.055628,-1.503576
2013-01-05,-0.939582,0.870032


**For getting a value explicitly**

In [46]:
df.iloc[1,2]

-1.7865913537441922

In [47]:
# for getting fast access to a scalar 
df.iat[1,2]

-1.7865913537441922

### Boolean Indexing

**Using a single column's values to select data**

In [48]:
df[df["A"]>0]

Unnamed: 0,A,B,C,D
2013-01-02,0.77809,0.067704,-1.786591,-1.131291
2013-01-03,0.055628,-0.53433,-1.503576,-0.534436
2013-01-06,0.559351,0.051935,-0.800447,1.490041


**Selecting values from a DataFrame where a boolean condition is met**

In [49]:
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,,0.747041,0.921136,0.307684
2013-01-02,0.77809,0.067704,,
2013-01-03,0.055628,,,
2013-01-04,,0.267463,,0.221474
2013-01-05,,1.251244,0.870032,1.130564
2013-01-06,0.559351,0.051935,,1.490041


In [50]:
df2 = df.copy()
df2["E"] = ["one","one","two","three","four","three"]
df2 

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.261348,0.747041,0.921136,0.307684,one
2013-01-02,0.77809,0.067704,-1.786591,-1.131291,one
2013-01-03,0.055628,-0.53433,-1.503576,-0.534436,two
2013-01-04,-1.246415,0.267463,-0.140986,0.221474,three
2013-01-05,-0.939582,1.251244,0.870032,1.130564,four
2013-01-06,0.559351,0.051935,-0.800447,1.490041,three


In [51]:
df2[df2["E"].isin(["two","four"])] #.isin() method is best suitable for selecting column values 

Unnamed: 0,A,B,C,D,E
2013-01-03,0.055628,-0.53433,-1.503576,-0.534436,two
2013-01-05,-0.939582,1.251244,0.870032,1.130564,four


### Setting

**Setting a new column automatically aligns the data by the indexes**

In [52]:
s1 = pd.Series([1,2,3,4,5,6],index=pd.date_range("20130102",periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

**Setting values by label**

In [53]:
df["F"]=s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.261348,0.747041,0.921136,0.307684,
2013-01-02,0.77809,0.067704,-1.786591,-1.131291,1.0
2013-01-03,0.055628,-0.53433,-1.503576,-0.534436,2.0
2013-01-04,-1.246415,0.267463,-0.140986,0.221474,3.0
2013-01-05,-0.939582,1.251244,0.870032,1.130564,4.0
2013-01-06,0.559351,0.051935,-0.800447,1.490041,5.0


**Setting Values by label**

In [54]:
df.at[dates[0],"A"]=0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.747041,0.921136,0.307684,
2013-01-02,0.77809,0.067704,-1.786591,-1.131291,1.0
2013-01-03,0.055628,-0.53433,-1.503576,-0.534436,2.0
2013-01-04,-1.246415,0.267463,-0.140986,0.221474,3.0
2013-01-05,-0.939582,1.251244,0.870032,1.130564,4.0
2013-01-06,0.559351,0.051935,-0.800447,1.490041,5.0


**Setting values by position**

In [55]:
df.iat[0,4]=0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.747041,0.921136,0.307684,0.0
2013-01-02,0.77809,0.067704,-1.786591,-1.131291,1.0
2013-01-03,0.055628,-0.53433,-1.503576,-0.534436,2.0
2013-01-04,-1.246415,0.267463,-0.140986,0.221474,3.0
2013-01-05,-0.939582,1.251244,0.870032,1.130564,4.0
2013-01-06,0.559351,0.051935,-0.800447,1.490041,5.0


### Handling Missing Data

**Pandas primarily uses *np.nan* to represent missing data.**

In [56]:
df = pd.DataFrame(
    np.random.rand(5,3),
    index = ['a','c','e','f','h'],
    columns= ["One","Two","Three"]
)
df

Unnamed: 0,One,Two,Three
a,0.418462,0.446874,0.717475
c,0.799525,0.519534,0.143103
e,0.202188,0.319742,0.672458
f,0.950466,0.333609,0.805361
h,0.239111,0.964203,0.65411


In [57]:
df['four'] = "bar"
df["five"]= df["One"]>0
df

Unnamed: 0,One,Two,Three,four,five
a,0.418462,0.446874,0.717475,bar,True
c,0.799525,0.519534,0.143103,bar,True
e,0.202188,0.319742,0.672458,bar,True
f,0.950466,0.333609,0.805361,bar,True
h,0.239111,0.964203,0.65411,bar,True


In [58]:
df2 = df.reindex(["a","b","c","d","e","f","g","h"])
df2

Unnamed: 0,One,Two,Three,four,five
a,0.418462,0.446874,0.717475,bar,True
b,,,,,
c,0.799525,0.519534,0.143103,bar,True
d,,,,,
e,0.202188,0.319742,0.672458,bar,True
f,0.950466,0.333609,0.805361,bar,True
g,,,,,
h,0.239111,0.964203,0.65411,bar,True


**To make detecting missing values, pandas provide *isna()* and *notna()* methods**

In [59]:
df2["One"]

a    0.418462
b         NaN
c    0.799525
d         NaN
e    0.202188
f    0.950466
g         NaN
h    0.239111
Name: One, dtype: float64

In [60]:
pd.isna(df2['One'])

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: One, dtype: bool

In [61]:
df2["four"].notna()

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: four, dtype: bool

In [62]:
df2.isna()

Unnamed: 0,One,Two,Three,four,five
a,False,False,False,False,False
b,True,True,True,True,True
c,False,False,False,False,False
d,True,True,True,True,True
e,False,False,False,False,False
f,False,False,False,False,False
g,True,True,True,True,True
h,False,False,False,False,False


### Datetimes

**For datetime types, NaT represents missing values. Pandas objects provide compatibiliy**

In [63]:
df2 = df.copy()

In [64]:
df2["timestamp"]=pd.Timestamp("20120101")
df2

Unnamed: 0,One,Two,Three,four,five,timestamp
a,0.418462,0.446874,0.717475,bar,True,2012-01-01
c,0.799525,0.519534,0.143103,bar,True,2012-01-01
e,0.202188,0.319742,0.672458,bar,True,2012-01-01
f,0.950466,0.333609,0.805361,bar,True,2012-01-01
h,0.239111,0.964203,0.65411,bar,True,2012-01-01


In [65]:
df2.loc[["a","c","h"],["One","timestamp"]]=np.nan
df2

Unnamed: 0,One,Two,Three,four,five,timestamp
a,,0.446874,0.717475,bar,True,NaT
c,,0.519534,0.143103,bar,True,NaT
e,0.202188,0.319742,0.672458,bar,True,2012-01-01
f,0.950466,0.333609,0.805361,bar,True,2012-01-01
h,,0.964203,0.65411,bar,True,NaT


In [66]:
df2.dtypes.value_counts()

float64           3
bool              1
datetime64[ns]    1
object            1
dtype: int64

### Inserting Missing Data

**We can insert missing values by simply assigning to containers. The actual missing value used will be chosen based on dtype**

In [67]:
s = pd.Series([1,2,3])
s.loc[0]=None
s

0    NaN
1    2.0
2    3.0
dtype: float64

In [68]:
s = pd.Series(["a","b","c"])
s.loc[0]=None
s.loc[1]=np.nan
s

0    None
1     NaN
2       c
dtype: object

### Calculations With Missing Data 

In [69]:
a= df2.loc[:,["One","Two"]]
b = df2.loc[:,["One","Two","Three"]]
a

Unnamed: 0,One,Two
a,,0.446874
c,,0.519534
e,0.202188,0.319742
f,0.950466,0.333609
h,,0.964203


In [70]:
b

Unnamed: 0,One,Two,Three
a,,0.446874,0.717475
c,,0.519534,0.143103
e,0.202188,0.319742,0.672458
f,0.950466,0.333609,0.805361
h,,0.964203,0.65411


In [71]:
a+b

Unnamed: 0,One,Three,Two
a,,,0.893748
c,,,1.039068
e,0.404377,,0.639485
f,1.900931,,0.667218
h,,,1.928405


**The descriptive statistics and computation methods are all written to account for missing data.
For Ex:** 
* When summing data, NA(missing) values will be treated as zero.
* If the data are all NA, the result will be 0
* Cumulative methods like cumsum() and cumprod() ignore NA values by default, but preserve them in the resulting arrays.

In [72]:
df2

Unnamed: 0,One,Two,Three,four,five,timestamp
a,,0.446874,0.717475,bar,True,NaT
c,,0.519534,0.143103,bar,True,NaT
e,0.202188,0.319742,0.672458,bar,True,2012-01-01
f,0.950466,0.333609,0.805361,bar,True,2012-01-01
h,,0.964203,0.65411,bar,True,NaT


In [73]:
df2["One"].sum()

1.1526539520642598

In [74]:
df2.mean(1)

  df2.mean(1)


a    0.721450
c    0.554212
e    0.548597
f    0.772359
h    0.872771
dtype: float64

### NA values in GroupBy
NA groups in GroupBy are automatically excluded.This behaviour is consistent with R.

In [75]:
df

Unnamed: 0,One,Two,Three,four,five
a,0.418462,0.446874,0.717475,bar,True
c,0.799525,0.519534,0.143103,bar,True
e,0.202188,0.319742,0.672458,bar,True
f,0.950466,0.333609,0.805361,bar,True
h,0.239111,0.964203,0.65411,bar,True


In [76]:
df.groupby("One").mean()

Unnamed: 0_level_0,Two,Three,five
One,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.202188,0.319742,0.672458,True
0.239111,0.964203,0.65411,True
0.418462,0.446874,0.717475,True
0.799525,0.519534,0.143103,True
0.950466,0.333609,0.805361,True


### GroupBy: Split-apply-combine

By **group by** we are referring to a process involving one or more of the following steps:-
* **splitting** the data into groups based on some criteria
* **Applying** a function to each group independently 
* **Combining** the results into a data structure

In the apply step, we might wish to do one of the following:-
* **Aggregation:** Compute a summary statistics for each group.
 1. Compute group sums or means
 2. compute group sizes or counts
* **Transformation:** Perform some group-specific computations and return a like-indexed object.
 1. Standardized data(zscore) within a group.
 2. Filling NAs with a value derived from each group.
* **Filtration:** Discard some groups, according to a group-wise computation that evaluates.
 1. Discard data that belongs to groups with only a few members.
 2. Filter out data based on the group sum  or mean
 


In [77]:
df = pd.DataFrame(
    [("bird","Falconiformers",389.0),
     ("bird","Psittaciformers",24.0),
     ("mammal","Carnivora",80.2),
     ("mammal","Primates",np.nan),
     ("mammal","Carnivora",58)
    ],
    index = ["falcon","parrot","lion","monkey","leopard"],
    columns= ("class","order","max_speed")
)
df

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformers,389.0
parrot,bird,Psittaciformers,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [78]:
grouped = df.groupby("class")
grouped.all()

Unnamed: 0_level_0,order,max_speed
class,Unnamed: 1_level_1,Unnamed: 2_level_1
bird,True,True
mammal,True,True


In [79]:
df = pd.DataFrame(
        {
            "A":["foo","bar","foo","bar","foo","bar","foo","foo"],
            "B":["one","one","two","three","two","two","one","three"],
            "C":np.random.randn(8),
            "D":np.random.randn(8),
        }
)
df.head()

Unnamed: 0,A,B,C,D
0,foo,one,-0.529192,1.116354
1,bar,one,0.877447,0.28665
2,foo,two,1.825796,1.21362
3,bar,three,1.13277,-0.943839
4,foo,two,2.00455,-0.220117


In [80]:
grouped = df.groupby("A")
grouped.first()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.877447,0.28665
foo,one,-0.529192,1.116354


### GroupBy sorting 

In [81]:
df2 = pd.DataFrame(
    {
        "X":["B","B","A","A"],
        "Y":[1,2,3,4]
    }
)
df2.groupby(["X"]).sum()

Unnamed: 0_level_0,Y
X,Unnamed: 1_level_1
A,7
B,3


In [82]:
df2.groupby(["X"],sort=False).sum()

Unnamed: 0_level_0,Y
X,Unnamed: 1_level_1
B,3
A,7


In [83]:
df3 = pd.DataFrame(
    {
        "X":["A","B","A","B"],
                   "Y":[1,4,3,2]
    }
  )
print(df3.groupby(["X"]).get_group("A"))
print(df3.groupby(["X"]).get_group("B"))

   X  Y
0  A  1
2  A  3
   X  Y
1  B  4
3  B  2


### GroupBy dropna
By default NA values are excluded from group keys during the **groupby** operation.In case we want to include NA values in group keys, we could pass **dropna = False** to achive it.

In [84]:
df_list = [[1,2,3],[1,None,4],[2,1,3],[1,2,3]]
df_dropna = pd.DataFrame(df_list,columns = ["a","b","c"])
df_dropna

Unnamed: 0,a,b,c
0,1,2.0,3
1,1,,4
2,2,1.0,3
3,1,2.0,3


In [85]:
# Default 'dropna' is set to True, which will exclude nan in keys
df_dropna.groupby(by = ["b"],dropna = True).sum()

Unnamed: 0_level_0,a,c
b,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,2,3
2.0,2,6


In [86]:
#In order to allow NaN in keys, set dropna to False
df_dropna.groupby(by=["b"],dropna=False).sum()

Unnamed: 0_level_0,a,c
b,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,2,3
2.0,2,6
,1,4


### GroupBy object attributes


In [87]:
df.groupby("A").groups

{'bar': [1, 3, 5], 'foo': [0, 2, 4, 6, 7]}

In [88]:
grouped = df.groupby(["A","B"])
print(grouped.groups)
print(len(grouped))

{('bar', 'one'): [1], ('bar', 'three'): [3], ('bar', 'two'): [5], ('foo', 'one'): [0, 6], ('foo', 'three'): [7], ('foo', 'two'): [2, 4]}
6


In [89]:
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.529192,1.116354
1,bar,one,0.877447,0.28665
2,foo,two,1.825796,1.21362
3,bar,three,1.13277,-0.943839
4,foo,two,2.00455,-0.220117
5,bar,two,0.867813,-0.658172
6,foo,one,0.894102,-1.658989
7,foo,three,-0.794052,-0.452923


### GroupBy with MutliIndex

In [90]:
arrays = [
        ["bar","bar","baz","baz","foo","foo","qux","qux"],
        ["one","two","one","two","one","two","one","two"],
]

In [91]:
index = pd.MultiIndex.from_arrays(arrays,names = ["first","second"])

In [92]:
s=pd.Series(np.random.randn(8),index = index)
s

first  second
bar    one      -1.601582
       two      -0.316487
baz    one      -0.580029
       two      -0.383310
foo    one      -0.820790
       two       0.621915
qux    one       0.313200
       two       0.243742
dtype: float64

We can group by one of the levels in S

In [93]:
grouped = s.groupby(level =0)
grouped.sum()

first
bar   -1.918069
baz   -0.963339
foo   -0.198875
qux    0.556942
dtype: float64

In [94]:
grouped = s.groupby(level =1)
grouped.sum()

second
one   -2.689201
two    0.165860
dtype: float64

In [95]:
s

first  second
bar    one      -1.601582
       two      -0.316487
baz    one      -0.580029
       two      -0.383310
foo    one      -0.820790
       two       0.621915
qux    one       0.313200
       two       0.243742
dtype: float64

In [96]:
s.groupby(["first"]).sum()

first
bar   -1.918069
baz   -0.963339
foo   -0.198875
qux    0.556942
dtype: float64

 ### Selecting a group

In [106]:
df.groupby(["A","B"]).get_group(("bar","one"))

Unnamed: 0,A,B,C,D
1,bar,one,0.877447,0.28665


In [111]:
grouped.get_group("one")

first  second
bar    one      -1.601582
baz    one      -0.580029
foo    one      -0.820790
qux    one       0.313200
dtype: float64

### Aggregation

In [114]:
grouped = df.groupby("A")
grouped.first()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.877447,0.28665
foo,one,-0.529192,1.116354


In [115]:
grouped.aggregate(np.sum)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,2.878029,-1.315361
foo,3.401204,-0.002056


In [118]:
dir(grouped)

['A',
 'B',
 'C',
 'D',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_accessors',
 '_agg_examples_doc',
 '_agg_general',
 '_aggregate',
 '_aggregate_frame',
 '_aggregate_item_by_item',
 '_aggregate_multiple_funcs',
 '_apply_allowlist',
 '_apply_filter',
 '_apply_to_column_groupbys',
 '_assure_grouper',
 '_bool_agg',
 '_builtin_table',
 '_choose_path',
 '_concat_objects',
 '_constructor',
 '_cumcount_array',
 '_cython_agg_blocks',
 '_cython_agg_general',
 '_cython_table',
 '_cython_transform',
 '_define_paths',
 '_dep

In [132]:
grouped = df.groupby(["A","B"])
grouped.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.877447,0.28665
bar,three,1.13277,-0.943839
bar,two,0.867813,-0.658172
foo,one,-0.529192,1.116354
foo,three,-0.794052,-0.452923
foo,two,1.825796,1.21362


In [133]:
grouped.aggregate(np.sum)

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.877447,0.28665
bar,three,1.13277,-0.943839
bar,two,0.867813,-0.658172
foo,one,0.36491,-0.542635
foo,three,-0.794052,-0.452923
foo,two,3.830346,0.993503


![Capture.JPG](attachment:Capture.JPG)

The aggregating functions above will exclude NA values. Any function which reduces a **Series** to a scalar value is an aggregating function.

### Applying multiple functions at once

In [135]:
grouped = df.groupby("A")
grouped.first()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.877447,0.28665
foo,one,-0.529192,1.116354


In [141]:
grouped["C"].agg([np.sum,np.mean,np.std])

Unnamed: 0_level_0,sum,mean,std
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,2.878029,0.959343,0.150269
foo,3.401204,0.680241,1.298855


In [142]:
grouped.count()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,3,3,3
foo,5,5,5


On a grouped **DataFrame**, we can pass a list of functions to apply to each column, which produces an aggregating result with a hierarichal index:-

In [143]:
grouped.agg([np.sum,np.mean,np.std])

Unnamed: 0_level_0,C,C,C,D,D,D
Unnamed: 0_level_1,sum,mean,std,sum,mean,std
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
bar,2.878029,0.959343,0.150269,-1.315361,-0.438454,0.643997
foo,3.401204,0.680241,1.298855,-0.002056,-0.000411,1.196344


#### Renaming column

In [144]:
(
    grouped.agg([np.sum,np.mean,np.std]).rename(
        columns = {"sum":"foo","mean":"bar","std":"baz"}
    )
)

Unnamed: 0_level_0,C,C,C,D,D,D
Unnamed: 0_level_1,foo,bar,baz,foo,bar,baz
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
bar,2.878029,0.959343,0.150269,-1.315361,-0.438454,0.643997
foo,3.401204,0.680241,1.298855,-0.002056,-0.000411,1.196344


In [149]:
animals = pd.DataFrame(
    {
        "kind":["cat","dog","cat","dog"],
        "height":[9.1,6.0,9.5,34.0],
        "weight" : [7.9,7.5,9.9,198.0],
    }
)

In [150]:
animals

Unnamed: 0,kind,height,weight
0,cat,9.1,7.9
1,dog,6.0,7.5
2,cat,9.5,9.9
3,dog,34.0,198.0


In [152]:
animals.groupby("kind").agg(
    min_height=pd.NamedAgg(column = "height",aggfunc = "min"),
    max_height = pd.NamedAgg(column="height",aggfunc="max"),
    avg_height = pd.NamedAgg(column="weight",aggfunc=np.mean)
)

Unnamed: 0_level_0,min_height,max_height,avg_height
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cat,9.1,9.5,8.9
dog,6.0,34.0,102.75


**we can also use plain tuples**

In [153]:
animals.groupby("kind").agg(
    min_height = ("height","min"),
    max_height = ("height","max"),
    agg_weight = ("weight",np.mean)
)

Unnamed: 0_level_0,min_height,max_height,agg_weight
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cat,9.1,9.5,8.9
dog,6.0,34.0,102.75


### Applying different functions to DataFrame columns