In [1]:
# Data manipulation
import pandas as pd
import numpy as np
import pendulum as plm

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

# Groupby & Aggs

In [2]:
# 1 groupby

df = pd.DataFrame({'col1': ['apple', 'banana', 'orange'] * 30,
                   'col2': np.random.rand(90),
                   'col3': np.random.randint(0, 15, 90)})

# groupby.agg(new_col_name = ('colname',agg_func on pd.Series))

(
    df
    .groupby('col1',as_index=False)
    .agg(
        sum_col2 = ('col2',np.sum), # perform aggregation
        distict_col2 = ('col2',lambda x: x.nunique()),
        collect_list_col3 = ('col3',lambda x : x.tolist()),
        collect_list_sampled_5_col3 = ('col3',lambda x : x.sample(5,random_state=42).tolist())
    )
)

Unnamed: 0,col1,sum_col2,distict_col2,collect_list_col3,collect_list_sampled_5_col3
0,apple,17.152439,30.0,"[4, 11, 11, 0, 9, 1, 6, 5, 5, 13, 10, 8, 1, 12...","[11, 2, 11, 3, 5]"
1,banana,15.567389,30.0,"[2, 10, 0, 12, 0, 14, 5, 7, 10, 8, 0, 6, 14, 6...","[9, 11, 13, 7, 10]"
2,orange,15.407019,30.0,"[9, 13, 4, 11, 4, 2, 12, 3, 0, 3, 4, 8, 0, 1, ...","[9, 11, 11, 4, 0]"


In [3]:
# case when pattern

df = pd.DataFrame({'col1': ['apple', 'banana', 'orange'] * 30,
                   'col2': np.random.rand(90),
                   'col3': np.random.randint(0, 15, 90)})

df = (
    df
    .assign(
        col4 = (
           np.where(df['col2'] <= 5, '0_5',
           np.where(df['col2'] <= 15,'6_15',
           np.where(df['col2'] <= 25,'16_25',
           np.where(df['col2'] <= 50,'25_50',
           np.where(df['col2'] <= 100,'50_100',
            '100_'
                    )))))))
)

df.head()

Unnamed: 0,col1,col2,col3,col4
0,apple,0.787933,7,0_5
1,banana,0.391749,6,0_5
2,orange,0.75491,10,0_5
3,apple,0.226709,2,0_5
4,banana,0.861418,13,0_5


# Where (Query)

In [4]:
# where pattern


def create_datetime(start = plm.DateTime(2023,1,1),ndays = 90):
    start = start
    ndays = 90
    end = start.add(ndays)
    return [dt for dt in plm.period(start,end).range('days')]



df = pd.DataFrame({'col1': ['apple', 'banana', 'orange'] * 30,
                   'col2': np.random.rand(90),
                   'col3': np.random.randint(0, 15, 90),
                   'time_col' : create_datetime()[:90]
                  })

df.info()

(
    df
    .where(
    pd.to_datetime(df['time_col'],infer_datetime_format=True) < '2023-01-02'
    )
    .dropna(subset=['time_col'])
)
                  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   col1      90 non-null     object        
 1   col2      90 non-null     float64       
 2   col3      90 non-null     int64         
 3   time_col  90 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 2.9+ KB


Unnamed: 0,col1,col2,col3,time_col
0,apple,0.601081,12.0,2023-01-01


# Assign new column, rename, and join
* merge
* assign - lambda df
* rename

In [5]:
# 定義 video_company_pdf DataFrame
data_video_company = {
    "video_id_c": [1, 2, 3, 4],
    "recommend_id_c": ["A", "B", "C", "D"],
    "company_id": [101, 102, 103, 104],
    "status": ["active", "inactive", "active", "inactive"]
}

video_company_pdf = pd.DataFrame(data_video_company)

# 定義 approved_video_pdf DataFrame
data_approved_video = {
    "video_id": [1, 3],
    "status": ["approved", "rejected"]
}

approved_video_pdf = pd.DataFrame(data_approved_video)


In [7]:
result_df = (
    video_company_pdf.filter(
        ["video_id_c", "recommend_id_c", "company_id", "status"]
    )
    .rename(columns={"status": "lagged_status"})
    .merge(
        (
            approved_video_pdf.filter(["video_id", "status"]).rename(
                columns={"status": "latest_status"}
            )
        ),
        left_on=["video_id_c"],
        right_on=["video_id"],
        how="left",
    )
    .assign(
        status=lambda df: df["latest_status"].combine_first(df["lagged_status"])
    )
    .filter(["video_id_c", "recommend_id_c", "company_id", "status"])
)

result_df

Unnamed: 0,video_id_c,recommend_id_c,company_id,status
0,1,A,101,approved
1,2,B,102,inactive
2,3,C,103,rejected
3,4,D,104,inactive
