In [84]:
import pandas as pd
import numpy as np

# Creation

In [190]:
# Left-hand demo frame: ids 1-4 with one missing float (val2_left), one missing
# string (str_left) and ISO-8601 timestamp strings. The strings are parsed into a
# real datetime column; errors="coerce" would turn unparseable values into NaT.
left = pd.DataFrame(
    dict(
        id=[1, 2, 3, 4],
        val_left=[20, 10, 30, 40],
        val2_left=[-3, np.nan, -2.1, -1],
        str_left=["hello", "there", "dude", np.nan],
        date_string=[
            "2025-08-01T13:45:00",
            "2025-08-02T11:47:00",
            "2025-08-05T16:42:00",
            "2025-08-09T09:45:00",
        ],
    )
).assign(
    date_type=lambda frame: pd.to_datetime(frame["date_string"], errors="coerce")
)

# Right-hand demo frame: ids 3-5, so ids 3 and 4 overlap with `left`.
right = pd.DataFrame(
    dict(
        id=[3, 4, 5],
        val_right=[300, 300, 500],
        val2_right=[-7, np.nan, -3],
    )
)

print(left.dtypes)
left

id                      int64
val_left                int64
val2_left             float64
str_left               object
date_string            object
date_type      datetime64[ns]
dtype: object


Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00
1,2,10,,there,2025-08-02T11:47:00,2025-08-02 11:47:00
2,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00
3,4,40,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00


In [191]:
# Display the right-hand frame for reference.
right

Unnamed: 0,id,val_right,val2_right
0,3,300,-7.0
1,4,300,
2,5,500,-3.0


# Filtering

In [192]:
# Boolean-mask filter. The result is empty: every non-null val2_right is <= -3,
# and NaN > -2 evaluates to False, so no row passes.
right[right['val2_right'] > -2]

Unnamed: 0,id,val_right,val2_right


In [193]:
# Ascending sort on one column; the NaN row is placed last (na_position='last' is the default).
right.sort_values('val2_right')

Unnamed: 0,id,val_right,val2_right
0,3,300,-7.0
2,5,500,-3.0
1,4,300,


In [194]:
# Descending sort; the NaN row still ends up last rather than first.
right.sort_values('val2_right', ascending=False)

Unnamed: 0,id,val_right,val2_right
2,5,500,-3.0
0,3,300,-7.0
1,4,300,


In [195]:
# Multi-column sort with one ascending flag per column: val_right is the primary key,
# val2_right breaks the 300/300 tie (the NaN row again sorts last).
right.sort_values(['val_right', 'val2_right'], ascending=[False, False]) 

Unnamed: 0,id,val_right,val2_right
2,5,500,-3.0
0,3,300,-7.0
1,4,300,


In [196]:
# Single brackets with one column name return a Series.
left['val_left']

0    20
1    10
2    30
3    40
Name: val_left, dtype: int64

In [197]:
# reset_index() on a Series moves the old index into an 'index' column, yielding a DataFrame.
left['val_left'].reset_index()

Unnamed: 0,index,val_left
0,0,20
1,1,10
2,2,30
3,3,40


In [198]:
# Double brackets (a list of column names) return a one-column DataFrame instead of a Series.
left[['val_left']]

Unnamed: 0,val_left
0,20
1,10
2,30
3,40


In [199]:
# query() filters rows with an expression string; column names are referenced directly inside it.
left.query('val_left >= 20')

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00
2,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00
3,4,40,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00


In [200]:
# Row at integer position 1, returned as a Series. Its dtype is object because
# the row mixes ints, floats, strings and timestamps.
left.iloc[1]

id                               2
val_left                        10
val2_left                      NaN
str_left                     there
date_string    2025-08-02T11:47:00
date_type      2025-08-02 11:47:00
Name: 1, dtype: object

In [201]:
# Scalar lookup: pick the column first, then the value at integer position 1.
# This avoids materialising the whole row as an object-dtype Series.
left['val_left'].iloc[1]

np.int64(10)

In [202]:
# Label-based selection: all rows (:) of column 'val2_left', returned as a Series.
left.loc[:, 'val2_left']

0   -3.0
1    NaN
2   -2.1
3   -1.0
Name: val2_left, dtype: float64

In [203]:
# Label-based selection: the row whose index label is 3, all columns (:).
left.loc[3, :]

id                               4
val_left                        40
val2_left                     -1.0
str_left                       NaN
date_string    2025-08-09T09:45:00
date_type      2025-08-09 09:45:00
Name: 3, dtype: object

In [204]:
# Display the full left-hand frame for reference.
left

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00
1,2,10,,there,2025-08-02T11:47:00,2025-08-02 11:47:00
2,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00
3,4,40,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00


In [205]:
# AND of two conditions: val_left is at most 20 and val2_left is present.
# The comparison methods (.le, .notna) avoid the extra parentheses that the
# operator form needs around each side of &.
left[left['val_left'].le(20) & left['val2_left'].notna()]

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00


In [206]:
# OR of two conditions: val_left is at least 30, or val2_left is missing.
left[left['val_left'].ge(30) | left['val2_left'].isna()]

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
1,2,10,,there,2025-08-02T11:47:00,2025-08-02 11:47:00
2,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00
3,4,40,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00


In [207]:
# Drop rows with a null in any column: row 1 (null val2_left) and row 3 (null str_left) are removed.
left.dropna()

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00
2,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00


In [208]:
# Only val_left is checked for nulls; it has none, so every row is kept.
left.dropna(subset=['val_left'])

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00
1,2,10,,there,2025-08-02T11:47:00,2025-08-02 11:47:00
2,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00
3,4,40,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00


In [209]:
# Only val2_left is checked for nulls; just row 1 is dropped (row 3's null str_left is ignored).
left.dropna(subset=['val2_left'])

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00
2,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00
3,4,40,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00


In [210]:
# A scalar fill value is applied to every column, so the string column str_left
# also receives the integer 1000 in its null slot.
left.fillna(1000)

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00
1,2,10,1000.0,there,2025-08-02T11:47:00,2025-08-02 11:47:00
2,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00
3,4,40,-1.0,1000,2025-08-09T09:45:00,2025-08-09 09:45:00


In [211]:
# A dict maps column name -> fill value; columns not listed (here str_left) keep their nulls.
left.fillna({'val2_left':1000})

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00
1,2,10,1000.0,there,2025-08-02T11:47:00,2025-08-02 11:47:00
2,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00
3,4,40,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00


In [212]:
# Exact string match; the null str_left row compares unequal and is excluded.
left[left['str_left'].eq('hello')]

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00


Always pass 'na=False' to str.contains (and similar .str predicates) when the column may contain nulls. Without it, the result holds NaN for the null rows, and using it as a mask fails with a 'ValueError: Cannot mask with non-boolean array containing NA / NaN values' message. With 'na=False', nulls are treated as False in the resulting boolean array.

In [213]:
# Substring match on a column that contains a null.
left[
    # Without na=False the mask holds NaN for the null str_left row, and using it
    # to index raises "Cannot mask with non-boolean array containing NA / NaN values":
    #left['str_left'].str.contains('he') # causes exception
    # na=False makes null rows evaluate to False instead.
    left['str_left'].str.contains('he', na=False)
]

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00
1,2,10,,there,2025-08-02T11:47:00,2025-08-02 11:47:00


In [214]:
# case=False makes the match case-insensitive, so 'ER' matches 'there'.
left[
    left['str_left'].str.contains('ER', na=False, case=False)
]

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
1,2,10,,there,2025-08-02T11:47:00,2025-08-02 11:47:00


In [215]:
# Range filter; between() includes both endpoints by default (inclusive='both').
left[
    left['val_left'].between(15, 35)
]

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00
2,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00


In [216]:
# Membership filter: keep rows whose val_left equals any element of the list.
left[
    left['val_left'].isin([20, 40])
]

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00
3,4,40,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00


In [217]:
# Display the full left-hand frame for reference.
left

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00
1,2,10,,there,2025-08-02T11:47:00,2025-08-02 11:47:00
2,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00
3,4,40,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00


In [218]:
# Plain string comparison. It gives the chronologically correct answer because
# ISO-8601 timestamps (fixed format, zero-padded fields) sort lexicographically
# in the same order as the instants they represent.
left[left['date_string'] > '2025-08-04']

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
2,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00
3,4,40,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00


In [219]:
# Comparing a datetime64 column with a string: pandas parses '2025-08-04' into a
# timestamp (midnight of that day) before comparing.
left[left['date_type'] > '2025-08-04']

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
2,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00
3,4,40,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00


In [220]:
# Extract calendar components through the .dt accessor; each one is an integer Series.
(
    left['date_type'].dt.year, 
    left['date_type'].dt.month, 
    left['date_type'].dt.day
)
# There is no .dt.week (removed in pandas 2.0); use .dt.isocalendar().week for the ISO week number.

(0    2025
 1    2025
 2    2025
 3    2025
 Name: date_type, dtype: int32,
 0    8
 1    8
 2    8
 3    8
 Name: date_type, dtype: int32,
 0    1
 1    2
 2    5
 3    9
 Name: date_type, dtype: int32)

In [221]:
# One way to drop the time of day: floor('D') rounds each timestamp down to midnight.
# The dtype stays datetime64, so the all-zero time part is simply not displayed.
left['date_type'].dt.floor('D')

0   2025-08-01
1   2025-08-02
2   2025-08-05
3   2025-08-09
Name: date_type, dtype: datetime64[ns]

In [222]:
# Another way: normalize() sets the time component to midnight (same result as floor('D') here).
left['date_type'].dt.normalize()

0   2025-08-01
1   2025-08-02
2   2025-08-05
3   2025-08-09
Name: date_type, dtype: datetime64[ns]

In [223]:
# .dt.date yields plain Python datetime.date objects, so the resulting Series has
# dtype object rather than datetime64.
left['date_type'].dt.date

0    2025-08-01
1    2025-08-02
2    2025-08-05
3    2025-08-09
Name: date_type, dtype: object

In [224]:
# Truncating to the month does not work with floor(): 'MS' and 'M' are not
# fixed-width frequencies, so the call below raises an error. Use the
# to_period() approach in the next cell instead.
# left['date_type'].dt.floor('MS')

In [225]:
# Truncate to the month: convert to a monthly Period, then back to a timestamp
# (which lands on the start of the period).
left['date_type'].dt.to_period('M').dt.to_timestamp()

0   2025-08-01
1   2025-08-01
2   2025-08-01
3   2025-08-01
Name: date_type, dtype: datetime64[ns]

In [226]:
# The same approach truncates to the week; the resulting week starts here are
# Mondays (2025-07-28 and 2025-08-04).
left['date_type'].dt.to_period('W').dt.to_timestamp()

0   2025-07-28
1   2025-07-28
2   2025-08-04
3   2025-08-04
Name: date_type, dtype: datetime64[ns]

Remember to use the Decimal type when exactness is required, since floating-point values can make equality comparisons unreliable. For approximate comparison, np.isclose is an option: it returns a boolean array, which can be used directly as a row mask.

In [227]:
# Element-wise approximate equality; returns a NumPy boolean array. No value is
# within tolerance of -2.0, and the NaN entry is never "close" either.
np.isclose(left['val2_left'], -2.0) 

array([False, False, False, False])

In [229]:
# The boolean array from np.isclose can be used directly as a row mask.
left[np.isclose(left['val2_left'], -2.1)]

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
2,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00


In [231]:
# Exact == works here only because the literal -2.1 parses to the very same float
# that is stored in the column. Values produced by arithmetic can differ in the
# last bits, so prefer np.isclose for float comparisons.
left[left['val2_left'] == -2.1]

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
2,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00


# Joins

In [233]:
# Display the left-hand frame for reference.
left

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00
1,2,10,,there,2025-08-02T11:47:00,2025-08-02 11:47:00
2,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00
3,4,40,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00


In [234]:
# Display the right-hand frame for reference.
right

Unnamed: 0,id,val_right,val2_right
0,3,300,-7.0
1,4,300,
2,5,500,-3.0


In [237]:
# on='id' names the join key explicitly. how='inner' is the default, so only the
# ids present in both frames (3 and 4) survive.
pd.merge(left, right, on='id')

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type,val_right,val2_right
0,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00,300,-7.0
1,4,40,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00,300,


In [239]:
# Left join: keep every row of `left`; unmatched rows get NaN in the right-hand
# columns. The key is spelled out ('id' is the only column the frames share).
left.merge(right, on='id', how='left')

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type,val_right,val2_right
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00,,
1,2,10,,there,2025-08-02T11:47:00,2025-08-02 11:47:00,,
2,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00,300.0,-7.0
3,4,40,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00,300.0,


In [241]:
# Right join: keep every row of `right`. indicator=True adds a '_merge' column
# telling whether each row came from both frames or only from the right one.
left.merge(right, on='id', how='right', indicator=True)

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type,val_right,val2_right,_merge
0,3,30.0,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00,300,-7.0,both
1,4,40.0,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00,300,,both
2,5,,,,,NaT,500,-3.0,right_only


In [242]:
# suffixes only rename overlapping non-key columns. left and right share no such
# columns, so the output column names are unchanged here.
pd.merge(left, right, how='right', suffixes=['_left','_right'])

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type,val_right,val2_right
0,3,30.0,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00,300,-7.0
1,4,40.0,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00,300,
2,5,,,,,NaT,500,-3.0


In [243]:
# Full outer join: keep the rows of both frames. '_merge' shows left_only / both /
# right_only for each row.
left.merge(right, on='id', how='outer', indicator=True)

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type,val_right,val2_right,_merge
0,1,20.0,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00,,,left_only
1,2,10.0,,there,2025-08-02T11:47:00,2025-08-02 11:47:00,,,left_only
2,3,30.0,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00,300.0,-7.0,both
3,4,40.0,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00,300.0,,both
4,5,,,,,NaT,500.0,-3.0,right_only


Remember there's also a 'validate' param that takes 'one_to_many' (keys must be unique on the left, can repeat on the right), 'one_to_one' (keys can't repeat on either side), 'many_to_one' (keys can repeat on the left, must be unique on the right), and 'many_to_many' (keys can repeat on both sides, so nothing is checked). If the dataframes violate the chosen relationship, pandas raises a MergeError.

In [244]:
# Cartesian product: every left row is paired with every right row. No key is
# used, so both id columns are kept and get the default _x / _y suffixes.
pd.merge(left, right, how='cross')

Unnamed: 0,id_x,val_left,val2_left,str_left,date_string,date_type,id_y,val_right,val2_right
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00,3,300,-7.0
1,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00,4,300,
2,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00,5,500,-3.0
3,2,10,,there,2025-08-02T11:47:00,2025-08-02 11:47:00,3,300,-7.0
4,2,10,,there,2025-08-02T11:47:00,2025-08-02 11:47:00,4,300,
5,2,10,,there,2025-08-02T11:47:00,2025-08-02 11:47:00,5,500,-3.0
6,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00,3,300,-7.0
7,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00,4,300,
8,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00,5,500,-3.0
9,4,40,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00,3,300,-7.0


Also, you can join on the DataFrame index with `left.join(right, how='left')`.

In [245]:
# Stack the frames vertically. Columns are aligned by name and cells missing on
# one side become NaN/NaT. The original index labels are kept, so 0-2 repeat
# (pass ignore_index=True to renumber).
pd.concat([left, right])

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type,val_right,val2_right
0,1,20.0,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00,,
1,2,10.0,,there,2025-08-02T11:47:00,2025-08-02 11:47:00,,
2,3,30.0,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00,,
3,4,40.0,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00,,
0,3,,,,,NaT,300.0,-7.0
1,4,,,,,NaT,300.0,
2,5,,,,,NaT,500.0,-3.0


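`pd.concat([left, right])` on its own is the equivalent of UNION ALL (duplicate rows are kept). Add `.drop_duplicates()`, i.e. `pd.concat([left, right]).drop_duplicates()`, to get UNION.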

Also, instead of `pd.merge(left, right, ...)` you can generally use the method form `left.merge(right, ...)`.

In [251]:
# Anti-join (similar to SQL EXCEPT, but matched on the join key only): rows of
# left that have no match in right.
# A left join with indicator=True tags each row in a '_merge' column; rows tagged
# 'left_only' are the ones without a partner in right.
m = pd.merge(left, right, how='left', indicator=True)
m[m['_merge'] == 'left_only']
# Optional: chain .drop(columns='_merge') to remove the helper column afterwards.
# m[m['_merge'] == 'left_only'].drop(columns='_merge')

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type,val_right,val2_right,_merge
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00,,,left_only
1,2,10,,there,2025-08-02T11:47:00,2025-08-02 11:47:00,,,left_only


In [249]:
# For contrast with the anti-join: an inner join keeps only the matching rows,
# so every row is tagged 'both' in '_merge'.
left.merge(right, on='id', how='inner', indicator=True)

Unnamed: 0,id,val_left,val2_left,str_left,date_string,date_type,val_right,val2_right,_merge
0,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00,300,-7.0,both
1,4,40,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00,300,,both


# 

Unnamed: 0,id_x,val_left,val2_left,str_left,date_string,date_type,id_y,val_right,val2_right
0,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00,3,300,-7.0
1,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00,4,300,
2,1,20,-3.0,hello,2025-08-01T13:45:00,2025-08-01 13:45:00,5,500,-3.0
3,2,10,,there,2025-08-02T11:47:00,2025-08-02 11:47:00,3,300,-7.0
4,2,10,,there,2025-08-02T11:47:00,2025-08-02 11:47:00,4,300,
5,2,10,,there,2025-08-02T11:47:00,2025-08-02 11:47:00,5,500,-3.0
6,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00,3,300,-7.0
7,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00,4,300,
8,3,30,-2.1,dude,2025-08-05T16:42:00,2025-08-05 16:42:00,5,500,-3.0
9,4,40,-1.0,,2025-08-09T09:45:00,2025-08-09 09:45:00,3,300,-7.0


In [None]:
pd.