## Move to the next Friday (dates already on a Friday stay unchanged)

In [1]:
import pandas as pd
import numpy as np

# Sample dates falling before, on, and after a Friday.
sample_days = [
    '2024-01-01',  # Monday
    '2024-01-03',  # Wednesday
    '2024-01-05',  # Friday
    '2024-01-06',  # Saturday
    '2024-01-07',  # Sunday
]
df = pd.DataFrame({'date': pd.to_datetime(sample_days)})

# pandas numbers weekdays Monday=0 ... Sunday=6, so Friday is 4.
# Adding 7 before taking the remainder keeps the offset in 0..6:
# a date already on a Friday gets 0 days and is left where it is.
weekday_index = df['date'].dt.weekday
days_ahead = (4 - weekday_index + 7) % 7
df['next_friday'] = df['date'] + pd.to_timedelta(days_ahead, unit='D')

print(df)

        date next_friday
0 2024-01-01  2024-01-05
1 2024-01-03  2024-01-05
2 2024-01-05  2024-01-05
3 2024-01-06  2024-01-12
4 2024-01-07  2024-01-12


## Move to the Month End

In [2]:
import pandas as pd

# Sample dates, including two that already sit on a month end.
sample_days = [
    '2024-01-01',
    '2024-01-15',
    '2024-01-31',  # already a month end
    '2024-02-01',
    '2024-02-28',
    '2024-02-29',  # leap year
]
df = pd.DataFrame({'date': pd.to_datetime(sample_days)})

# Roll every date forward to the end of its own month.
# With n=0 the offset leaves dates that are already a month end unchanged.
month_end_offset = pd.tseries.offsets.MonthEnd(0)
df['month_end'] = df['date'] + month_end_offset

print(df)


        date  month_end
0 2024-01-01 2024-01-31
1 2024-01-15 2024-01-31
2 2024-01-31 2024-01-31
3 2024-02-01 2024-02-29
4 2024-02-28 2024-02-29
5 2024-02-29 2024-02-29


## Difference between two dates in weeks

In [4]:
import pandas as pd

# Four start/end date pairs.
start_dates = pd.to_datetime(['2024-01-01', '2024-01-05', '2024-02-01', '2024-03-15'])
end_dates = pd.to_datetime(['2024-01-20', '2024-02-02', '2024-02-28', '2024-04-01'])

df = pd.DataFrame({'start_date': start_dates, 'end_date': end_dates})

In [8]:
# Whole days between the two dates, computed once and reused for both columns.
elapsed_days = (df['end_date'] - df['start_date']).dt.days

# Completed weeks (floor division) versus weeks rounded to the nearest integer.
df['weeks_floor'] = (elapsed_days // 7).astype(int)
df['weeks_round'] = (elapsed_days / 7).round().astype(int)

In [9]:
# Display the frame as the cell output, now including weeks_floor and weeks_round.
df

Unnamed: 0,start_date,end_date,weeks_floor,weeks_round
0,2024-01-01,2024-01-20,2,3
1,2024-01-05,2024-02-02,4,4
2,2024-02-01,2024-02-28,3,4
3,2024-03-15,2024-04-01,2,2


## Polars and Pandas

In [19]:
import pandas as pd
import numpy as np
import polars as pl
import time

# Build 10 million rows of test data; the seed makes the input reproducible.
N = 10_000_000
np.random.seed(0)
group_data = np.random.choice(['A', 'B', 'C', 'D'], size=N)
value_data = np.random.randn(N)

# pandas DataFrame
df_pd = pd.DataFrame({'group': group_data, 'value': value_data})

# polars DataFrame built from the same arrays
df_pl = pl.DataFrame({'group': group_data, 'value': value_data})

# Time the pandas group-wise mean.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() follows the wall clock, which can be adjusted
# while the measurement is running.
start = time.perf_counter()
result_pd = df_pd.groupby('group')['value'].mean()
print("pandas 用时：", round(time.perf_counter() - start, 3), "秒")

# Time the equivalent polars aggregation.
start = time.perf_counter()
result_pl = df_pl.group_by('group').agg(pl.col('value').mean())
print("polars 用时：", round(time.perf_counter() - start, 3), "秒")


pandas 用时： 0.363 秒
polars 用时： 0.076 秒


In [1]:
import pandas as pd
import polars as pl
import numpy as np
import time

# Build the pandas DataFrame; the seed makes reruns convert identical data.
N = 10_000_000
np.random.seed(0)
df_pd = pd.DataFrame({
    'group': np.random.choice(['A', 'B', 'C', 'D'], size=N),
    'value': np.random.randn(N)
})

# pandas -> polars
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() follows the wall clock, which can be adjusted
# while the measurement is running.
start = time.perf_counter()
df_pl = pl.DataFrame(df_pd)
print("pandas → polars 耗时：", round(time.perf_counter() - start, 3), "秒")

# polars -> pandas
start = time.perf_counter()
df_pd_back = df_pl.to_pandas()
print("polars → pandas 耗时：", round(time.perf_counter() - start, 3), "秒")


pandas → polars 耗时： 0.852 秒
polars → pandas 耗时： 0.339 秒


## Agg用于汇总，返回缩减的一个结果

In [10]:
import pandas as pd

# Two groups with two integer values each.
group_labels = ['A', 'A', 'B', 'B']
group_values = [10, 20, 30, 40]
df = pd.DataFrame({'group': group_labels, 'value': group_values})

In [11]:
# Ask for several statistics on one column. Passing a list of functions
# produces two-level column labels: (column name, statistic).
summary_stats = ['mean', 'sum', 'max']
result = df.groupby('group').agg({'value': summary_stats})
print(result)


      value        
       mean sum max
group              
A      15.0  30  20
B      35.0  70  40


In [12]:
# Named aggregation: each keyword becomes an output column, built from the
# given source column and aggregation function.
result = df.groupby('group').agg(
    avg_value=pd.NamedAgg(column='value', aggfunc='mean'),
    range_value=pd.NamedAgg(column='value', aggfunc=lambda grp: grp.max() - grp.min()),
)
print(result)

       avg_value  range_value
group                        
A           15.0           10
B           35.0           10


In [13]:
def my_stats(x):
    """Summarize a collection of values as a two-entry Series.

    Parameters
    ----------
    x : values supporting ``max``, ``min``, ``** 2`` and ``mean``
        (in this notebook, the 'value' Series of one group).

    Returns
    -------
    pd.Series
        Indexed by 'range' (max minus min) and 'mean_sq' (mean of the squares).
    """
    spread = x.max() - x.min()
    mean_of_squares = (x ** 2).mean()
    return pd.Series([spread, mean_of_squares], index=['range', 'mean_sq'])

# apply() collects each group's returned Series under a (group, statistic)
# index; unstack() then pivots the statistic level into columns.
stacked_stats = df.groupby('group')['value'].apply(my_stats)
result = stacked_stats.unstack()
print(result)


       range  mean_sq
group                
A       10.0    250.0
B       10.0   1250.0


## Transform用于返回与原数据框相同长度的结果

In [17]:
import pandas as pd

# Two groups with two integer values each.
group_labels = ['A', 'A', 'B', 'B']
group_values = [10, 20, 30, 40]
df = pd.DataFrame({'group': group_labels, 'value': group_values})

# transform('mean') writes each group's mean back onto every row of that
# group, so the result has the same length as the original frame.
values_by_group = df.groupby('group')['value']
df['group_mean'] = values_by_group.transform('mean')
print(df)

  group  value  group_mean
0     A     10        15.0
1     A     20        15.0
2     B     30        35.0
3     B     40        35.0


In [15]:
def _zscore(values):
    """Center values on their mean and scale by their standard deviation."""
    return (values - values.mean()) / values.std()


# Standardize within each group rather than across the whole column.
df['zscore'] = df.groupby('group')['value'].transform(_zscore)
print(df)

  group  value  group_mean    zscore
0     A     10        15.0 -0.707107
1     A     20        15.0  0.707107
2     B     30        35.0 -0.707107
3     B     40        35.0  0.707107


In [16]:
# Rank each value among the rows of its own group (1 = smallest value).
df['group_rank'] = df.groupby('group')['value'].rank()
print(df)

  group  value  group_mean    zscore  group_rank
0     A     10        15.0 -0.707107         1.0
1     A     20        15.0  0.707107         2.0
2     B     30        35.0 -0.707107         1.0
3     B     40        35.0  0.707107         2.0


## 组内填补缺失值

In [None]:
import pandas as pd
import numpy as np

# Two groups of three rows, each containing one missing value.
missing = np.nan
df = pd.DataFrame({
    'group': list('AAABBB'),
    'value': [1, missing, 3, missing, 5, 6],
})

In [20]:
# Mean of each row's own group, aligned row by row (missing values are
# skipped when averaging).
group_means = df.groupby('group')['value'].transform('mean')

# Only the missing entries are replaced; existing values are kept as they are.
df['value_filled'] = df['value'].fillna(group_means)
print(df)

  group  value  value_filled
0     A    1.0           1.0
1     A    NaN           2.0
2     A    3.0           3.0
3     B    NaN           5.5
4     B    5.0           5.0
5     B    6.0           6.0


In [21]:
def _fill_both_ways(values):
    """Forward-fill gaps, then back-fill any that remain at the start."""
    return values.ffill().bfill()


# Fill inside each group so values are never carried across group boundaries.
df['value_fullfill'] = df.groupby('group')['value'].transform(_fill_both_ways)
print(df)

  group  value  value_filled  value_fullfill
0     A    1.0           1.0             1.0
1     A    NaN           2.0             1.0
2     A    3.0           3.0             3.0
3     B    NaN           5.5             5.0
4     B    5.0           5.0             5.0
5     B    6.0           6.0             6.0
