In [1]:
import pandas as pd
import numpy as np

In [10]:
# Five consecutive month-end timestamps starting from January 2020.
# NOTE(review): pandas 2.2+ reportedly deprecates the 'M' alias in favour of 'ME' — confirm against the pandas version in use.
index = pd.date_range(start='1/1/2020', periods=5, freq='M')
index

DatetimeIndex(['2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30',
               '2020-05-31'],
              dtype='datetime64[ns]', freq='M')

In [11]:
# 5x5 grid of random integers; randint's upper bound is exclusive, so values fall in 0..8.
# NOTE(review): no seed is set in the visible cells, so these values presumably change on every run — confirm.
data = np.random.randint(0, 9, size=(5, 5))
data

array([[6, 7, 5, 4, 1],
       [5, 5, 0, 7, 7],
       [7, 0, 5, 2, 5],
       [5, 4, 3, 5, 6],
       [8, 8, 8, 0, 0]])

In [12]:
df = pd.DataFrame(data, index=index, columns=list('ABCDE'))
df

Unnamed: 0,A,B,C,D,E
2020-01-31,6,7,5,4,1
2020-02-29,5,5,0,7,7
2020-03-31,7,0,5,2,5
2020-04-30,5,4,3,5,6
2020-05-31,8,8,8,0,0


In [29]:
row = df.loc['2020-01-31']
row

A    6
B    7
C    5
D    4
E    1
Name: 2020-01-31 00:00:00, dtype: int64

In [30]:
column = df['A']
column

2020-01-31    6
2020-02-29    5
2020-03-31    7
2020-04-30    5
2020-05-31    8
Freq: M, Name: A, dtype: int64

In [31]:
df - row

Unnamed: 0,A,B,C,D,E
2020-01-31,0,0,0,0,0
2020-02-29,-1,-2,-5,3,6
2020-03-31,1,-7,0,-2,4
2020-04-30,-1,-3,-2,1,5
2020-05-31,2,1,3,-4,-1


In [25]:
df.sub(row, axis='columns')

Unnamed: 0,A,B,C,D,E
2020-01-31,0,0,0,0,0
2020-02-29,-1,-2,-5,3,6
2020-03-31,1,-7,0,-2,4
2020-04-30,-1,-3,-2,1,5
2020-05-31,2,1,3,-4,-1


In [32]:
df - column

Unnamed: 0,2020-01-31 00:00:00,2020-02-29 00:00:00,2020-03-31 00:00:00,2020-04-30 00:00:00,2020-05-31 00:00:00,A,B,C,D,E
2020-01-31,,,,,,,,,,
2020-02-29,,,,,,,,,,
2020-03-31,,,,,,,,,,
2020-04-30,,,,,,,,,,
2020-05-31,,,,,,,,,,


In [33]:
df.sub(column, axis='index')

Unnamed: 0,A,B,C,D,E
2020-01-31,0,1,-1,-2,-5
2020-02-29,0,0,-5,2,2
2020-03-31,0,-7,-2,-5,-2
2020-04-30,0,-1,-2,0,1
2020-05-31,0,0,0,-8,-8


In [21]:
df.sub(row, axis='index') 

Unnamed: 0,A,B,C,D,E
2020-01-31 00:00:00,,,,,
2020-02-29 00:00:00,,,,,
2020-03-31 00:00:00,,,,,
2020-04-30 00:00:00,,,,,
2020-05-31 00:00:00,,,,,
A,,,,,
B,,,,,
C,,,,,
D,,,,,
E,,,,,


By using **sub** instead of the `-` operator, you can specify the **axis** along which the Series is matched.

### Missing data / operations with fill values


In [37]:
df = pd.DataFrame(
    np.random.randint(0, 10, size=(3, 4)), 
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
0,2,0,2,6
1,5,0,7,7
2,4,1,0,8


In [40]:
df2 = pd.DataFrame(
    np.random.randint(0, 10, size=(4, 3)), 
    columns=list('ABC'),
)
df2

Unnamed: 0,A,B,C
0,3,8,3
1,6,2,9
2,8,3,6
3,1,0,0


In [41]:
df + df2

Unnamed: 0,A,B,C,D
0,5.0,8.0,5.0,
1,11.0,2.0,16.0,
2,12.0,4.0,6.0,
3,,,,


Here we have some NaN values: every label that exists in only one of the two frames (column D, row 3) produces NaN.

In [51]:
df.add(df2, fill_value=0) # fill_value=0 stands in for a value missing on one side only; a cell missing in both frames (row 3, column D) stays NaN

Unnamed: 0,A,B,C,D
0,5.0,8.0,5.0,6.0
1,11.0,2.0,16.0,7.0
2,12.0,4.0,6.0,8.0
3,1.0,0.0,0.0,


If a cell is missing in only one of df or df2 and you set fill_value to 0, the missing side is treated as 0, so the output will be the value of the data frame<br>
that <ins>doesn't</ins> have the NaN, <br>
but if the cell is missing in both of them, the result will still be NaN, and here you should use fillna():

In [52]:
df.add(df2, fill_value=0).fillna(0)

Unnamed: 0,A,B,C,D
0,5.0,8.0,5.0,6.0
1,11.0,2.0,16.0,7.0
2,12.0,4.0,6.0,8.0
3,1.0,0.0,0.0,0.0


### Boolean reductions


You can apply the reductions: **empty**, **any**(), **all**(), and **bool**() to provide a way to summarize a boolean result.



In [53]:
df

Unnamed: 0,A,B,C,D
0,2,0,2,6
1,5,0,7,7
2,4,1,0,8


In [54]:
df>2

Unnamed: 0,A,B,C,D
0,False,False,False,True
1,True,False,True,True
2,True,False,False,True


In [55]:
(df>2).all()

A    False
B    False
C    False
D     True
dtype: bool

In [56]:
(df>2).all(axis=1)

0    False
1    False
2    False
dtype: bool

In [57]:
(df>2).any()

A     True
B    False
C     True
D     True
dtype: bool

In [59]:
(df>2).any(axis=1)

0    True
1    True
2    True
dtype: bool

In [62]:
(df > 2).all().all() # checks if all the values in data frame are greater than 2

False

In [63]:
(df > 2).any().any() # checks whether there is any value in the data frame that is greater than 2

True

In [66]:
df.empty

False

In [69]:
pd.DataFrame(
    {}
).empty

True

### Comparing if objects are equivalent


In [None]:
dic = {
    'A': pd.Series([1, 2], index=['a', 'b']),
    'B': pd.Series([3, 4, 5], index=['a', 'b', 'c']),
}
df = pd.DataFrame(dic)
df

Unnamed: 0,A,B
a,1.0,3
b,2.0,4
c,,5


In [None]:
df + df

Unnamed: 0,A,B
a,2.0,6
b,4.0,8
c,,10


In [102]:
df * 2

Unnamed: 0,A,B
a,2.0,6
b,4.0,8
c,,10


You might say that these two expressions are equal; let's check that:

In [105]:
((df + df) == (df * 2)).all().all()

False

They are not equal!! <br>But why? <br>Let's look a little bit closer...

In [106]:
((df + df) == (df * 2))

Unnamed: 0,A,B
a,True,True
b,True,True
c,False,True


In [109]:
df.loc['c', 'A']

nan

The cell where we have **NaN** compares as False. Why?<br>
This is because **NaNs do not compare as equals**:

In [110]:
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
np.nan == np.nan

False

In [115]:
(df + df).equals(df * 2)

True

##### Note that the Series or DataFrame index needs to be in the same order for equality to be True:


In [151]:
# Three rows, two integer columns, default 0..2 index
df1 = pd.DataFrame({'a': [1, 3, 5], 'b': [2, 4, 6]})
df1

Unnamed: 0,a,b
0,1,2
1,3,4
2,5,6


In [152]:
df2 = df1[::-1]
df2

Unnamed: 0,a,b
2,5,6
1,3,4
0,1,2


In [153]:
df1.equals(df2) # returns False: both frames hold the same labelled rows, but df2's index is in reverse order

False

In [154]:
df1.equals(df2.sort_index())

True

### Comparing array-like objects


In [160]:
pd.Series(["foo", "bar", "baz"]) == "foo"

0     True
1    False
2    False
dtype: bool

pandas also handles element-wise comparisons between different array-like objects of the same length:

In [162]:
pd.Series(["foo", "bar", "baz"]) == pd.Index(["foo", "bar", "qux"])

0     True
1     True
2    False
dtype: bool

### Combining overlapping data sets


A problem occasionally arising is the combination of two similar data sets where **values in one are preferred over the other**. An example would be two data series representing a particular economic indicator where one is considered to be of “higher quality”. However, the lower quality series might extend further back in history or have more complete data coverage. As such, we would like to combine two DataFrame objects where missing values in one DataFrame are conditionally filled with like-labeled values from the other DataFrame. The function implementing this operation is **combine_first()**, which we illustrate:

In [2]:
df1 = pd.DataFrame(
    {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}
)

In [None]:
df2 = pd.DataFrame(
    {
        "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
        "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
    }
)

In [167]:
df1

Unnamed: 0,A,B
0,1.0,
1,,2.0
2,3.0,3.0
3,5.0,
4,,6.0


In [169]:
df2

Unnamed: 0,A,B
0,5.0,
1,2.0,
2,4.0,3.0
3,,4.0
4,3.0,6.0
5,7.0,8.0


In [170]:
df1.combine_first(df2)

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,3.0,3.0
3,5.0,4.0
4,3.0,6.0
5,7.0,8.0


### General DataFrame combine


The combine_first() method above calls the more general DataFrame.combine(). This method takes another DataFrame and a combiner function, aligns the input DataFrame and then passes the combiner function pairs of Series (i.e., columns whose names are the same).

In [177]:
df1

Unnamed: 0,A,B
0,1.0,
1,,2.0
2,3.0,3.0
3,5.0,
4,,6.0


In [178]:
df2

Unnamed: 0,A,B
0,5.0,
1,2.0,
2,4.0,3.0
3,,4.0
4,3.0,6.0
5,7.0,8.0


In [198]:
# The combiner is handed two Series at a time (one column from each frame)
def combiner(s1: pd.Series, s2: pd.Series):
    """Pick each value from s1, falling back to s2 wherever s1 is missing."""
    s1_missing = pd.isna(s1)
    return np.where(s1_missing, s2, s1)

In [197]:
df1.combine(df2, combiner)

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,3.0,3.0
3,5.0,4.0
4,3.0,6.0
5,7.0,8.0


In [199]:
### Attention !!!
# Don't use this expression instead of pd.isna():
# as we saw earlier, nan != nan, so every cell compares False,
# even the ones that actually hold NaN.

df1 == np.nan

Unnamed: 0,A,B
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
