### Kaggle Food Delivery Dataset
https://www.kaggle.com/datasets/gauravmalik26/food-delivery-dataset?resource=download

---
#### Q. 코호트 분석을 통해 라이더 운영형태 파악
---

In [8]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [9]:
## Load the DataFrame stored in data/df.pkl.
## NOTE(review): unpickling can execute arbitrary code, so this file must come
## from a trusted source - confirm where data/df.pkl is produced.
df = pd.read_pickle('data/df.pkl')
## Print (rows, columns), then show the first two rows as the cell output.
print(df.shape)
df.head(2)

(41368, 25)


Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Weatherconditions,...,Festival,City,Time_taken(min),order_time,pick_time,orpi_time_diff_m,distance_km,speed_km/h,dayoftheweek,city_code
0,0x4607,INDORES13DEL02,37.0,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,Sunny,...,No,Urban,24.0,2022-03-19 11:30:00,2022-03-19 11:45:00,15.0,3.0,7.5,SAT,INDO
1,0xb379,BANGRES18DEL02,34.0,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,Stormy,...,No,Metropolitian,33.0,2022-03-25 19:45:00,2022-03-25 19:50:00,5.0,20.2,36.7,FRI,BANG


In [10]:
## Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41368 entries, 0 to 41367
Data columns (total 25 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   ID                           41368 non-null  object        
 1   Delivery_person_ID           41368 non-null  object        
 2   Delivery_person_Age          41368 non-null  float64       
 3   Delivery_person_Ratings      41368 non-null  float64       
 4   Restaurant_latitude          41368 non-null  float64       
 5   Restaurant_longitude         41368 non-null  float64       
 6   Delivery_location_latitude   41368 non-null  float64       
 7   Delivery_location_longitude  41368 non-null  float64       
 8   Order_Date                   41368 non-null  datetime64[ns]
 9   Weatherconditions            41368 non-null  object        
 10  Road_traffic_density         41368 non-null  object        
 11  Vehicle_condition            41368 non-nu

---
#### 1. cohort 분석을 위한 기간 세팅 
---

In [12]:
## Overall observation period: first day, last day, and inclusive length in days
import datetime

first_day = df['Order_Date'].min()
last_day = df['Order_Date'].max()
## Add one day so that both the first and the last day are counted.
inclusive_span = last_day - first_day + datetime.timedelta(days=1)
for value in (first_day, last_day, inclusive_span):
    display(value)

Timestamp('2022-02-11 00:00:00')

Timestamp('2022-04-06 00:00:00')

Timedelta('55 days 00:00:00')

In [14]:
## Week number within the year (ISO 8601) for every order date.
## `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
## `dt.isocalendar().week` is its replacement. It returns a UInt32 column, so
## cast to int64 to keep the dtype the old accessor produced.
df['weekofyear'] = df['Order_Date'].dt.isocalendar().week.astype('int64')
df[['Order_Date', 'dayoftheweek', 'weekofyear']]

Unnamed: 0,Order_Date,dayoftheweek,weekofyear
0,2022-03-19,SAT,11
1,2022-03-25,FRI,12
2,2022-03-19,SAT,11
3,2022-04-05,TUE,14
4,2022-03-26,SAT,12
...,...,...,...
41363,2022-03-24,THU,12
41364,2022-02-16,WED,7
41365,2022-03-11,FRI,10
41366,2022-03-07,MON,10


In [16]:
## The first day (11 Feb) falls in week 6 of the year; the goal is to re-base
## that week to week 1. Show one order placed on that first day.
earliest_day = df['Order_Date'].min()
df.loc[df['Order_Date'] == earliest_day].head(1)

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Weatherconditions,...,City,Time_taken(min),order_time,pick_time,orpi_time_diff_m,distance_km,speed_km/h,dayoftheweek,city_code,weekofyear
30,0xd936,GOARES15DEL02,26.0,4.3,15.51315,73.78346,15.56315,73.83346,2022-02-11,Sandstorms,...,Urban,21.0,2022-02-11 23:25:00,2022-02-11 23:35:00,10.0,7.7,22.0,FRI,GOA,6


In [21]:
## Re-base the week number so the earliest week present in the data becomes 1.
## `Series.dt.weekofyear` was removed in pandas 2.0, so use
## `dt.isocalendar().week` (cast from UInt32 to int64). The offset is derived
## from the data instead of the hard-coded 5 (the first week here is week 6).
## NOTE(review): assumes all order dates fall within one ISO year - confirm
## before reusing on data that crosses a year boundary.
iso_week = df['Order_Date'].dt.isocalendar().week.astype('int64')
df['weekofyear'] = iso_week - iso_week.min() + 1
display(df[['Order_Date', 'dayoftheweek', 'weekofyear']])

Unnamed: 0,Order_Date,dayoftheweek,weekofyear
0,2022-03-19,SAT,6
1,2022-03-25,FRI,7
2,2022-03-19,SAT,6
3,2022-04-05,TUE,9
4,2022-03-26,SAT,7
...,...,...,...
41363,2022-03-24,THU,7
41364,2022-02-16,WED,2
41365,2022-03-11,FRI,5
41366,2022-03-07,MON,5


In [23]:
## How many weeks does the whole period span (both end weeks included)?
first_week = df['weekofyear'].min()
last_week = df['weekofyear'].max()
last_week - first_week + 1

9

---
#### 2. cohort 분석을 위한 데이터셋 생성
- Delivery_person_ID, weekofyear 변수만 추출 (index = Delivery_person_ID) 
- Delivery_person_ID별로 weekofyear 변수를 groupby (라이더별 최초 배달 주차 확인) 
- 라이더 별 최초 배달 시작 주 파악 후, 첫 배달을 시작했던 주를 기준으로 주가 지날수록 라이더가 얼마큼 유지되는지 확인
- cohort 분석을 위한 9x9 매트릭스 구성 
---

In [24]:
## 1. Keep only Delivery_person_ID and weekofyear, indexed by Delivery_person_ID.
## Chaining `set_index` returns a fresh frame rather than mutating a column
## subset of `df` in place, so later column assignments on `coh_df` do not
## raise SettingWithCopyWarning.
coh_df = df[['Delivery_person_ID', 'weekofyear']].set_index('Delivery_person_ID')
coh_df

Unnamed: 0_level_0,weekofyear
Delivery_person_ID,Unnamed: 1_level_1
INDORES13DEL02,6
BANGRES18DEL02,7
BANGRES19DEL01,6
COIMBRES13DEL02,9
CHENRES12DEL01,7
...,...
JAPRES04DEL01,7
AGRRES16DEL01,2
CHENRES08DEL03,5
COIMBRES11DEL01,5


In [25]:
## 2. Group weekofyear by rider (the frame's index) and take the minimum:
##    the first week in which each rider appears.
rider_weeks = coh_df['weekofyear']
firstdelivery = rider_weeks.groupby(level=0).min()
firstdelivery

Delivery_person_ID
AGRRES010DEL01     1
AGRRES010DEL02     1
AGRRES010DEL03     1
AGRRES01DEL01      1
AGRRES01DEL02      1
                  ..
VADRES19DEL02      4
VADRES19DEL03      4
VADRES20DEL01      4
VADRES20DEL02      4
VADRES20DEL03      4
Name: weekofyear, Length: 1320, dtype: int64

In [26]:
## There are 1320 riders; attach each rider's first delivery week as a column.
## The Series is aligned on the rider-ID index, then the index is turned back
## into a regular column.
coh_df = coh_df.assign(firstdelivery=firstdelivery).reset_index()
coh_df

Unnamed: 0,Delivery_person_ID,weekofyear,firstdelivery
0,INDORES13DEL02,6,4
1,BANGRES18DEL02,7,4
2,BANGRES19DEL01,6,4
3,COIMBRES13DEL02,9,4
4,CHENRES12DEL01,7,4
...,...,...,...
41363,JAPRES04DEL01,7,4
41364,AGRRES16DEL01,2,1
41365,CHENRES08DEL03,5,4
41366,COIMBRES11DEL01,5,4


In [27]:
## 3. For each (first delivery week, week) pair, count the distinct riders
##    seen, to show how many riders of each cohort remain as weeks pass.
##    The resulting count column is named 'totaldelivery'.
grouped = coh_df.groupby(['firstdelivery', 'weekofyear'])
cohorts = grouped['Delivery_person_ID'].nunique().reset_index(name='totaldelivery')
cohorts

Unnamed: 0,firstdelivery,weekofyear,totaldelivery
0,1,1,599
1,1,2,599
2,2,2,1
3,4,4,720
4,4,5,720
5,4,6,720
6,4,7,720
7,4,8,720
8,4,9,719


In [29]:
## 4. Pad the cohort table for the cohort matrix: every cohort week from the
##    first to the last observed week gets one row per week from its own start
##    week up to the last week. Pairs absent from the data get a count of 0.
##    The grid is derived from the data instead of hard-coded week numbers, and
##    the cell can be re-run without appending duplicate rows.
first_week = int(cohorts['firstdelivery'].min())
last_week = int(cohorts['weekofyear'].max())
full_grid = pd.MultiIndex.from_tuples(
    [
        (cohort_week, week)
        for cohort_week in range(first_week, last_week + 1)
        for week in range(cohort_week, last_week + 1)
    ],
    names=['firstdelivery', 'weekofyear'],
)
## The grid is already ordered by (firstdelivery, weekofyear), so the
## reindexed frame needs no extra sort. Counts stay integers.
cohorts = (
    cohorts.set_index(['firstdelivery', 'weekofyear'])
    .reindex(full_grid, fill_value=0)
    .reset_index()
)
## Number of rows (weeks) available for each cohort, ordered by cohort week.
each_period = cohorts['firstdelivery'].value_counts().sort_index()

In [30]:
## Cohort period = number of weeks elapsed since the cohort's first delivery
## week. It is computed from the week columns rather than from each row's
## position inside its cohort, so it stays correct even if some
## (cohort, week) rows are missing from the table.
cohortperiod = (
    (cohorts['weekofyear'] - cohorts['firstdelivery']).astype('int64').tolist()
)
cohorts['cohortperiod'] = cohortperiod
## Index by (cohort week, period) so the period level can be unstacked.
cohorts.set_index(['firstdelivery','cohortperiod'],inplace =True)

In [31]:
## Pivot the rider counts into a matrix: one row per first delivery week,
## one column per cohort period.
rider_counts = cohorts['totaldelivery']
cohorts = rider_counts.unstack(level='cohortperiod')
cohorts

cohortperiod,0,1,2,3,4,5,6,7,8
firstdelivery,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1.0,599.0,599.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
4.0,720.0,720.0,720.0,720.0,720.0,719.0,,,
5.0,0.0,0.0,0.0,0.0,0.0,,,,
6.0,0.0,0.0,0.0,0.0,,,,,
7.0,0.0,0.0,0.0,,,,,,
8.0,0.0,0.0,,,,,,,
9.0,0.0,,,,,,,,


- 1주차에 599명, 2주차에 1명, 4주차에 720명의 라이더와 계약하였음 