# 고객의 전체모습 파악
1. 데이터 읽고 확인
2. 고객 데이터 가공
3. 고객 데이터 집계
4. 최근 고객데이터 집계
5. 이용이력 집계
6. 이용이력 데이터로부터 정기이용 여부 플래그를 작성

In [235]:
import pandas as pd

## 1. 데이터 읽고 확인

In [236]:
campaign = pd.read_csv('./data/campaign_master.csv')
class_master = pd.read_csv('./data/class_master.csv')
customer = pd.read_csv('./data/customer_master.csv')
use_log = pd.read_csv('./data/use_log.csv')

In [237]:
campaign.head()

Unnamed: 0,campaign_id,campaign_name
0,CA1,2_일반
1,CA2,0_입회비반액할인
2,CA3,1_입회비무료


In [238]:
class_master.head()

Unnamed: 0,class,class_name,price
0,C01,0_종일,10500
1,C02,1_주간,7500
2,C03,2_야간,6000


In [239]:
use_log.head()

Unnamed: 0,log_id,customer_id,usedate
0,L00000049012330,AS009373,2018-04-01
1,L00000049012331,AS015315,2018-04-01
2,L00000049012332,AS040841,2018-04-01
3,L00000049012333,AS046594,2018-04-01
4,L00000049012334,AS073285,2018-04-01


In [240]:
# 회원데이터
# is_deleted : 탈퇴 회원 정보 파악

print(customer.head())
print(customer.info())

  customer_id   name class gender           start_date end_date campaign_id  \
0    OA832399   XXXX   C01      F  2015-05-01 00:00:00      NaN         CA1   
1    PL270116  XXXXX   C01      M  2015-05-01 00:00:00      NaN         CA1   
2    OA974876  XXXXX   C01      M  2015-05-01 00:00:00      NaN         CA1   
3    HD024127  XXXXX   C01      F  2015-05-01 00:00:00      NaN         CA1   
4    HD661448  XXXXX   C03      F  2015-05-01 00:00:00      NaN         CA1   

   is_deleted  
0           0  
1           0  
2           0  
3           0  
4           0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4192 entries, 0 to 4191
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_id  4192 non-null   object
 1   name         4192 non-null   object
 2   class        4192 non-null   object
 3   gender       4192 non-null   object
 4   start_date   4192 non-null   object
 5   end_date     1350 non-null   ob

In [241]:
customer['end_date'].value_counts()

end_date
2019-02-28 00:00:00    149
2019-01-31 00:00:00    129
2018-05-31 00:00:00    125
2018-04-30 00:00:00    121
2018-08-31 00:00:00    113
2018-12-31 00:00:00    111
2019-03-31 00:00:00    111
2018-06-30 00:00:00    106
2018-09-30 00:00:00    104
2018-07-31 00:00:00    102
2018-10-31 00:00:00     90
2018-11-30 00:00:00     89
Name: count, dtype: int64

## 2. 고객 데이터 가공

### customer_df = customer + class_master

In [242]:
customer_df = pd.merge(customer,class_master,how='left',on='class')
customer_df.head()

Unnamed: 0,customer_id,name,class,gender,start_date,end_date,campaign_id,is_deleted,class_name,price
0,OA832399,XXXX,C01,F,2015-05-01 00:00:00,,CA1,0,0_종일,10500
1,PL270116,XXXXX,C01,M,2015-05-01 00:00:00,,CA1,0,0_종일,10500
2,OA974876,XXXXX,C01,M,2015-05-01 00:00:00,,CA1,0,0_종일,10500
3,HD024127,XXXXX,C01,F,2015-05-01 00:00:00,,CA1,0,0_종일,10500
4,HD661448,XXXXX,C03,F,2015-05-01 00:00:00,,CA1,0,2_야간,6000


### customer_df = customer_df + capaign 

In [243]:
customer_df = pd.merge(customer_df, campaign, how='left', on='campaign_id')
customer_df.head()

Unnamed: 0,customer_id,name,class,gender,start_date,end_date,campaign_id,is_deleted,class_name,price,campaign_name
0,OA832399,XXXX,C01,F,2015-05-01 00:00:00,,CA1,0,0_종일,10500,2_일반
1,PL270116,XXXXX,C01,M,2015-05-01 00:00:00,,CA1,0,0_종일,10500,2_일반
2,OA974876,XXXXX,C01,M,2015-05-01 00:00:00,,CA1,0,0_종일,10500,2_일반
3,HD024127,XXXXX,C01,F,2015-05-01 00:00:00,,CA1,0,0_종일,10500,2_일반
4,HD661448,XXXXX,C03,F,2015-05-01 00:00:00,,CA1,0,2_야간,6000,2_일반


### merge 확인

In [244]:
print(len(customer))
print(len(customer_df))

4192
4192


In [245]:
# enddate만 결측치 존재 -> 아직 유지중인 고객은 enddate값이 없음

customer_df.isnull().sum()

customer_id         0
name                0
class               0
gender              0
start_date          0
end_date         2842
campaign_id         0
is_deleted          0
class_name          0
price               0
campaign_name       0
dtype: int64

## 3. 고객 데이터 집계

1. 많은 회원 유형(클래스)
2. 캠페인 유형
3. 남녀 비율?
4. 현재 유지 회원 / 탈퇴 회원

5. 언제 입회/탈퇴가 많았는지
6. 탈퇴할 때까지의 기간은 어느 정도인지

* 1. 많은 회원 유형(클래스)

In [246]:
customer_df.groupby(['class','gender']).count()['customer_id']

class  gender
C01    F          822
       M         1223
C02    F          835
       M          184
C03    F          326
       M          802
Name: customer_id, dtype: int64

* 2. 캠페인 유형

In [247]:
customer_df.groupby('campaign_id').count()['campaign_name']

campaign_id
CA1    3050
CA2     650
CA3     492
Name: campaign_name, dtype: int64

* 3. 남녀 비율?

In [248]:
customer_total = customer_df.groupby('gender').count()['customer_id']
f_ratio = customer_total[0]/ customer_total.sum() 
m_ratio = customer_total[0]/ customer_total.sum() 
print(f'f_ration = {f_ratio}, m_ratio = {m_ratio}')

f_ration = 0.47304389312977096, m_ratio = 0.47304389312977096


* 4. 현재 유지 회원 / 탈퇴 회원

In [249]:
mask = pd.isnull(customer_df['end_date'])       # 탈퇴 안 함
customer_df.loc[mask,:]

Unnamed: 0,customer_id,name,class,gender,start_date,end_date,campaign_id,is_deleted,class_name,price,campaign_name
0,OA832399,XXXX,C01,F,2015-05-01 00:00:00,,CA1,0,0_종일,10500,2_일반
1,PL270116,XXXXX,C01,M,2015-05-01 00:00:00,,CA1,0,0_종일,10500,2_일반
2,OA974876,XXXXX,C01,M,2015-05-01 00:00:00,,CA1,0,0_종일,10500,2_일반
3,HD024127,XXXXX,C01,F,2015-05-01 00:00:00,,CA1,0,0_종일,10500,2_일반
4,HD661448,XXXXX,C03,F,2015-05-01 00:00:00,,CA1,0,2_야간,6000,2_일반
...,...,...,...,...,...,...,...,...,...,...,...
4187,HD676663,XXXX,C01,M,2019-03-14 00:00:00,,CA1,0,0_종일,10500,2_일반
4188,HD246549,XXXXX,C01,F,2019-03-14 00:00:00,,CA1,0,0_종일,10500,2_일반
4189,GD037007,XXXXX,C03,M,2019-03-14 00:00:00,,CA1,0,2_야간,6000,2_일반
4190,OA953150,XXXXX,C01,M,2019-03-14 00:00:00,,CA1,0,0_종일,10500,2_일반


In [250]:
customer_df['is_deleted'].value_counts()

is_deleted
0    2842
1    1350
Name: count, dtype: int64

In [251]:
customer_df.groupby('is_deleted').count()['customer_id']

is_deleted
0    2842
1    1350
Name: customer_id, dtype: int64

* 5. 언제 입회/탈퇴가 많았는지

In [252]:
# 2019년 2월에 가장 많이 탈퇴

customer_df.groupby('end_date')['is_deleted'].count()

end_date
2018-04-30 00:00:00    121
2018-05-31 00:00:00    125
2018-06-30 00:00:00    106
2018-07-31 00:00:00    102
2018-08-31 00:00:00    113
2018-09-30 00:00:00    104
2018-10-31 00:00:00     90
2018-11-30 00:00:00     89
2018-12-31 00:00:00    111
2019-01-31 00:00:00    129
2019-02-28 00:00:00    149
2019-03-31 00:00:00    111
Name: is_deleted, dtype: int64

* 6. 탈퇴할 때까지의 기간은 어느 정도인지 : is_delete 열

In [253]:
customer_df['start_date'] = pd.to_datetime(customer_df['start_date'])
customer_df['end_date'] = pd.to_datetime(customer_df['end_date'])
quit_customers = customer_df.loc[customer_df['is_deleted'] == 1,:]

In [254]:
quit_customers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1350 entries, 708 to 4099
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   customer_id    1350 non-null   object        
 1   name           1350 non-null   object        
 2   class          1350 non-null   object        
 3   gender         1350 non-null   object        
 4   start_date     1350 non-null   datetime64[ns]
 5   end_date       1350 non-null   datetime64[ns]
 6   campaign_id    1350 non-null   object        
 7   is_deleted     1350 non-null   int64         
 8   class_name     1350 non-null   object        
 9   price          1350 non-null   int64         
 10  campaign_name  1350 non-null   object        
dtypes: datetime64[ns](2), int64(2), object(7)
memory usage: 126.6+ KB


In [255]:
quit_customers['customer_keep'] = quit_customers['end_date'] - quit_customers['start_date']
quit_customers['customer_keep'].mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  quit_customers['customer_keep'] = quit_customers['end_date'] - quit_customers['start_date']


Timedelta('270 days 05:38:08')

## 4. 최근 고객데이터 집계

In [256]:
# 20180401 이후 가입한 고객

customer_start = customer_df.loc[customer_df['start_date'] > pd.to_datetime('20180401')]
print(len(customer_start))

1361


* 2019년 3월의 고객 데이터 파악:
enddate가 3월 1일 이전인 고객을 제외 + enddate가 na인 고객

In [257]:
customer_march = customer_df.loc[(customer_df['end_date'] > pd.to_datetime('20190331')) | customer_df['end_date'].isna()]
customer_march

Unnamed: 0,customer_id,name,class,gender,start_date,end_date,campaign_id,is_deleted,class_name,price,campaign_name
0,OA832399,XXXX,C01,F,2015-05-01,NaT,CA1,0,0_종일,10500,2_일반
1,PL270116,XXXXX,C01,M,2015-05-01,NaT,CA1,0,0_종일,10500,2_일반
2,OA974876,XXXXX,C01,M,2015-05-01,NaT,CA1,0,0_종일,10500,2_일반
3,HD024127,XXXXX,C01,F,2015-05-01,NaT,CA1,0,0_종일,10500,2_일반
4,HD661448,XXXXX,C03,F,2015-05-01,NaT,CA1,0,2_야간,6000,2_일반
...,...,...,...,...,...,...,...,...,...,...,...
4187,HD676663,XXXX,C01,M,2019-03-14,NaT,CA1,0,0_종일,10500,2_일반
4188,HD246549,XXXXX,C01,F,2019-03-14,NaT,CA1,0,0_종일,10500,2_일반
4189,GD037007,XXXXX,C03,M,2019-03-14,NaT,CA1,0,2_야간,6000,2_일반
4190,OA953150,XXXXX,C01,M,2019-03-14,NaT,CA1,0,0_종일,10500,2_일반


In [258]:
customer_march.groupby('class_name').count()['customer_id']

class_name
0_종일    1381
1_주간     675
2_야간     786
Name: customer_id, dtype: int64

In [259]:
customer_march.groupby('campaign_name').count()['customer_id']

campaign_name
0_입회비반액할인     282
1_입회비무료       219
2_일반         2341
Name: customer_id, dtype: int64

In [260]:
customer_march.groupby('gender').count()['customer_id']

gender
F    1352
M    1490
Name: customer_id, dtype: int64

## 5. 이용이력 집계
* 1. 전일 이용 횟수의 변화
* 2. 회원이 스포츠센터에 정기적으로 오고 있는지

* datetime 변환을 통해 월별 집계기준 usemonth 생성

In [261]:
use_log.head()

Unnamed: 0,log_id,customer_id,usedate
0,L00000049012330,AS009373,2018-04-01
1,L00000049012331,AS015315,2018-04-01
2,L00000049012332,AS040841,2018-04-01
3,L00000049012333,AS046594,2018-04-01
4,L00000049012334,AS073285,2018-04-01


In [262]:
import datetime
use_log['usedate'] = pd.to_datetime(use_log['usedate'])
use_log['usemonth'] = use_log['usedate'].dt.strftime('%Y%m')
use_log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197428 entries, 0 to 197427
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   log_id       197428 non-null  object        
 1   customer_id  197428 non-null  object        
 2   usedate      197428 non-null  datetime64[ns]
 3   usemonth     197428 non-null  object        
dtypes: datetime64[ns](1), object(3)
memory usage: 6.0+ MB


In [263]:
use_log.head()

Unnamed: 0,log_id,customer_id,usedate,usemonth
0,L00000049012330,AS009373,2018-04-01,201804
1,L00000049012331,AS015315,2018-04-01,201804
2,L00000049012332,AS040841,2018-04-01,201804
3,L00000049012333,AS046594,2018-04-01,201804
4,L00000049012334,AS073285,2018-04-01,201804


* 월 이용 횟수의 평균값, 중앙값, 최댓값, 최솟값

In [264]:
use_log.groupby('usemonth').count()['usedate'].describe()

count       12.000000
mean     16452.333333
std        393.031420
min      15697.000000
25%      16311.000000
50%      16421.500000
75%      16733.750000
max      16994.000000
Name: usedate, dtype: float64

In [265]:
(use_log.groupby(['usemonth','customer_id']).count()['usedate']).groupby('customer_id').mean()

customer_id
AS002855    4.500000
AS008805    4.000000
AS009013    2.000000
AS009373    5.083333
AS015233    7.545455
              ...   
TS995853    9.500000
TS998593    8.142857
TS999079    4.916667
TS999231    4.666667
TS999855    4.583333
Name: usedate, Length: 4192, dtype: float64

* 고객마다 월 이용 횟수 집계

In [266]:
use_log_monthly = use_log.pivot_table(index='customer_id', columns = 'usemonth',aggfunc='size',fill_value=0)
use_log_monthly

usemonth,201804,201805,201806,201807,201808,201809,201810,201811,201812,201901,201902,201903
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AS002855,4,5,5,5,3,7,3,5,2,6,5,4
AS008805,0,0,8,7,5,2,2,3,5,1,6,1
AS009013,2,0,0,0,0,0,0,0,0,0,0,0
AS009373,3,4,4,7,6,6,5,4,4,7,6,5
AS015233,0,7,7,5,11,9,7,7,9,9,4,8
...,...,...,...,...,...,...,...,...,...,...,...,...
TS995853,0,0,0,0,0,0,0,0,0,0,11,8
TS998593,0,0,0,0,0,9,9,9,7,8,7,8
TS999079,7,5,6,2,3,4,6,6,9,6,2,3
TS999231,6,4,1,5,3,4,5,5,8,3,6,6


In [267]:
use_log_monthly.describe()

usemonth,201804,201805,201806,201807,201808,201809,201810,201811,201812,201901,201902,201903
count,4192.0,4192.0,4192.0,4192.0,4192.0,4192.0,4192.0,4192.0,4192.0,4192.0,4192.0,4192.0
mean,3.926527,4.03459,4.053912,4.04437,3.977576,3.929151,3.81417,3.744513,3.908158,3.903387,3.90625,3.853769
std,3.142423,3.108013,3.041815,3.00003,2.968453,2.944567,2.959531,2.946649,2.998991,2.99494,3.035075,3.059084
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
75%,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
max,13.0,12.0,14.0,14.0,13.0,12.0,13.0,13.0,13.0,13.0,14.0,12.0


##### 교수님 버전  

In [268]:
# 고객별 월별 이용 횟수 집계

uselog_months = use_log.groupby(['usemonth','customer_id'],as_index=False).count()
uselog_months.rename(columns={'log_id':'count'},inplace=True)
# del uselog_months = ['u']
uselog_months.head()

Unnamed: 0,usemonth,customer_id,count,usedate
0,201804,AS002855,4,4
1,201804,AS009013,2,2
2,201804,AS009373,3,3
3,201804,AS015315,6,6
4,201804,AS015739,7,7


In [269]:
uselog_customer = uselog_months.groupby('customer_id').agg(['mean','median','max','min'])['count']
uselog_customer = uselog_customer.reset_index(drop=False)
uselog_customer.head()

Unnamed: 0,customer_id,mean,median,max,min
0,AS002855,4.5,5.0,7,2
1,AS008805,4.0,4.0,8,1
2,AS009013,2.0,2.0,2,2
3,AS009373,5.083333,5.0,7,3
4,AS015233,7.545455,7.0,11,4


## 6. 이용이력 데이터로부터 정기이용 여부 플래그를 작성

### 월별 정기적 이용 여부: 고객마다 월/요일별 집계 최댓값이 4 이상인 요일이 있으면 플래그 1로 처리

#### 고객마다 월/요일별 집계

In [270]:
use_log.head()

Unnamed: 0,log_id,customer_id,usedate,usemonth
0,L00000049012330,AS009373,2018-04-01,201804
1,L00000049012331,AS015315,2018-04-01,201804
2,L00000049012332,AS040841,2018-04-01,201804
3,L00000049012333,AS046594,2018-04-01,201804
4,L00000049012334,AS073285,2018-04-01,201804


In [271]:
use_log_weekday = []
for day in use_log['usedate']:
    use_log_weekday.append(day.weekday())

use_log['weekday'] = use_log_weekday
use_log.head()

Unnamed: 0,log_id,customer_id,usedate,usemonth,weekday
0,L00000049012330,AS009373,2018-04-01,201804,6
1,L00000049012331,AS015315,2018-04-01,201804,6
2,L00000049012332,AS040841,2018-04-01,201804,6
3,L00000049012333,AS046594,2018-04-01,201804,6
4,L00000049012334,AS073285,2018-04-01,201804,6


#### 교수님버전

In [272]:
use_log['weekday_prof'] = use_log['usedate'].dt.weekday
use_log_weekday = use_log.groupby(['customer_id','usemonth','weekday'], as_index=False).count()[['customer_id','usemonth','weekday','log_id']]
use_log_weekday.rename(columns={'log_id':'count'},inplace=True)             # log_id를 count로 rename
use_log_weekday.head()

Unnamed: 0,customer_id,usemonth,weekday,count
0,AS002855,201804,5,4
1,AS002855,201805,2,1
2,AS002855,201805,5,4
3,AS002855,201806,5,5
4,AS002855,201807,1,1


In [273]:
use_log_weekday = use_log_weekday.groupby('customer_id', as_index=False).max()[['customer_id','count']]
use_log_weekday['routine_flg'] = 0
use_log_weekday['routine_flg'] = use_log_weekday['routine_flg'].where(use_log_weekday['count']<4, 1)
use_log_weekday

Unnamed: 0,customer_id,count,routine_flg
0,AS002855,5,1
1,AS008805,4,1
2,AS009013,2,0
3,AS009373,5,1
4,AS015233,5,1
...,...,...,...
4187,TS995853,5,1
4188,TS998593,5,1
4189,TS999079,5,1
4190,TS999231,5,1


#### 나

In [274]:
# 고객마다 월/요일별 집계 최댓값이 4 이상인 요일이 있으면 플래그 1로 처리

customer_count_cal = use_log_weekday.groupby('customer_id').max()
customer_count_cal.reset_index(inplace=True)
customer_count_cal

Unnamed: 0,customer_id,count,routine_flg
0,AS002855,5,1
1,AS008805,4,1
2,AS009013,2,0
3,AS009373,5,1
4,AS015233,5,1
...,...,...,...
4187,TS995853,5,1
4188,TS998593,5,1
4189,TS999079,5,1
4190,TS999231,5,1


In [275]:
use_log_weekday['customer_id'][0]

'AS002855'

In [276]:
regular = {}
for idx, customer in enumerate(customer_count_cal['customer_id']):
    customer_count = customer_count_cal['count'][idx]
    if customer_count > 4:
        regular[customer]  = 1
    else:
        regular[customer] = 0
        
regular = pd.Series(regular)

In [277]:
regular

AS002855    1
AS008805    0
AS009013    0
AS009373    1
AS015233    1
           ..
TS995853    1
TS998593    1
TS999079    1
TS999231    1
TS999855    1
Length: 4192, dtype: int64

In [278]:
regular = pd.DataFrame(regular)
regular.reset_index(inplace=True)
regular.rename(columns={0:'regular','index':'customer_id'},inplace=True)
customer_df = pd.merge(customer_df, regular, how='left',on='customer_id')
customer_df.head()

Unnamed: 0,customer_id,name,class,gender,start_date,end_date,campaign_id,is_deleted,class_name,price,campaign_name,regular
0,OA832399,XXXX,C01,F,2015-05-01,NaT,CA1,0,0_종일,10500,2_일반,0
1,PL270116,XXXXX,C01,M,2015-05-01,NaT,CA1,0,0_종일,10500,2_일반,1
2,OA974876,XXXXX,C01,M,2015-05-01,NaT,CA1,0,0_종일,10500,2_일반,1
3,HD024127,XXXXX,C01,F,2015-05-01,NaT,CA1,0,0_종일,10500,2_일반,1
4,HD661448,XXXXX,C03,F,2015-05-01,NaT,CA1,0,2_야간,6000,2_일반,1


In [279]:
regular = regular.reset_index(name='regular')
regular

TypeError: DataFrame.reset_index() got an unexpected keyword argument 'name'

In [None]:
customer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4192 entries, 0 to 4191
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   customer_id    4192 non-null   object        
 1   name           4192 non-null   object        
 2   class          4192 non-null   object        
 3   gender         4192 non-null   object        
 4   start_date     4192 non-null   datetime64[ns]
 5   end_date       1350 non-null   datetime64[ns]
 6   campaign_id    4192 non-null   object        
 7   is_deleted     4192 non-null   int64         
 8   class_name     4192 non-null   object        
 9   price          4192 non-null   int64         
 10  campaign_name  4192 non-null   object        
 11  regular        4192 non-null   int64         
dtypes: datetime64[ns](2), int64(3), object(7)
memory usage: 393.1+ KB


## 7. 고객 데이터와 이용이력데이터 결합

In [280]:
# customer_df랑 합치기

customer_df = customer_df.merge(use_log_weekday, how='left', on='customer_id')
customer_df.head()

Unnamed: 0,customer_id,name,class,gender,start_date,end_date,campaign_id,is_deleted,class_name,price,campaign_name,regular,count,routine_flg
0,OA832399,XXXX,C01,F,2015-05-01,NaT,CA1,0,0_종일,10500,2_일반,0,4,1
1,PL270116,XXXXX,C01,M,2015-05-01,NaT,CA1,0,0_종일,10500,2_일반,1,5,1
2,OA974876,XXXXX,C01,M,2015-05-01,NaT,CA1,0,0_종일,10500,2_일반,1,5,1
3,HD024127,XXXXX,C01,F,2015-05-01,NaT,CA1,0,0_종일,10500,2_일반,1,5,1
4,HD661448,XXXXX,C03,F,2015-05-01,NaT,CA1,0,2_야간,6000,2_일반,1,5,1


In [281]:
customer_df.isnull().sum()

customer_id         0
name                0
class               0
gender              0
start_date          0
end_date         2842
campaign_id         0
is_deleted          0
class_name          0
price               0
campaign_name       0
regular             0
count               0
routine_flg         0
dtype: int64

## 8. 회원 기간 계산

In [287]:
fillna_df = customer_df.fillna('2019-04-30')                # NaT라는 결측치 -> 가장 최근 날짜(데이터 기준)인 2019-04-30으로 채워서 계산
fillna_df['customer_keep'] = fillna_df['end_date'] - fillna_df['start_date']
fillna_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4192 entries, 0 to 4191
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype          
---  ------         --------------  -----          
 0   customer_id    4192 non-null   object         
 1   name           4192 non-null   object         
 2   class          4192 non-null   object         
 3   gender         4192 non-null   object         
 4   start_date     4192 non-null   datetime64[ns] 
 5   end_date       4192 non-null   datetime64[ns] 
 6   campaign_id    4192 non-null   object         
 7   is_deleted     4192 non-null   int64          
 8   class_name     4192 non-null   object         
 9   price          4192 non-null   int64          
 10  campaign_name  4192 non-null   object         
 11  regular        4192 non-null   int64          
 12  count          4192 non-null   int64          
 13  routine_flg    4192 non-null   int64          
 14  customer_keep  4192 non-null   timedelta64[ns]
dtypes: d

In [304]:
# 교수님 버전
from dateutil.relativedelta import relativedelta

customer_df['calc_date'] = customer_df['end_date']
customer_df['calc_date'] = customer_df['calc_date'].fillna(pd.to_datetime('20190430'))
customer_df['membership_period'] = 0

for i in range(len(customer_df)):
    delta = relativedelta(customer_df['calc_date'].iloc[i], customer_df['start_date'].iloc[i])
    print(delta, delta.months)
    customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months

customer_df.head()

relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=+3, months=+11, days=+29) 11
relativedelta(years=

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

relativedelta(years=+3, months=+10, days=+29) 10
relativedelta(years=+3, months=+10, days=+29) 10
relativedelta(years=+3, months=+10, days=+29) 10
relativedelta(years=+3, months=+10, days=+29) 10
relativedelta(years=+3, months=+10, days=+29) 10
relativedelta(years=+3, months=+10, days=+29) 10
relativedelta(years=+3, months=+10, days=+29) 10
relativedelta(years=+3, months=+10, days=+29) 10
relativedelta(years=+3, months=+10, days=+29) 10
relativedelta(years=+3, months=+10, days=+29) 10
relativedelta(years=+3, months=+10, days=+29) 10
relativedelta(years=+3, months=+10, days=+29) 10
relativedelta(years=+3, months=+10, days=+29) 10
relativedelta(years=+3, months=+10, days=+29) 10
relativedelta(years=+3, months=+10, days=+29) 10
relativedelta(years=+3, months=+10, days=+29) 10
relativedelta(years=+3, months=+10, days=+29) 10
relativedelta(years=+3, months=+9, days=+29) 9
relativedelta(years=+3, months=+9, days=+29) 9
relativedelta(years=+3, months=+9, days=+29) 9
relativedelta(years=+3, mo

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta(years=+3, months=+7, days=+29) 7
relativedelta

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta(years=+3, months=+4, days=+29) 4
relativedelta

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+2, days=+29) 2
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta(years=+3, months=+1, days=+29) 1
relativedelta

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=+2, months=+11, days=+29) 11
relativedelta(years=

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+1, months=+9, days=+30) 9
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta(years=+2, months=+8, days=+29) 8
relativedelta

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+1, months=+3, days=+30) 3
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta(years=+2, months=+2, days=+29) 2
relativedelta

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

relativedelta(years=+1, months=+11, days=+29) 11
relativedelta(years=+1, days=+30) 0
relativedelta(years=+1, days=+30) 0
relativedelta(years=+1, days=+30) 0
relativedelta(years=+1, days=+30) 0
relativedelta(years=+1, months=+11, days=+29) 11
relativedelta(years=+1, months=+11, days=+29) 11
relativedelta(years=+1, months=+11, days=+29) 11
relativedelta(years=+1, days=+30) 0
relativedelta(years=+1, months=+11, days=+29) 11
relativedelta(months=+11, days=+29) 11
relativedelta(years=+1, months=+11, days=+29) 11
relativedelta(months=+11, days=+29) 11
relativedelta(years=+1, days=+30) 0
relativedelta(months=+10, days=+29) 10
relativedelta(years=+1, months=+7, days=+30) 7
relativedelta(months=+11, days=+30) 11
relativedelta(years=+1, months=+10, days=+29) 10
relativedelta(years=+1, months=+7, days=+30) 7
relativedelta(years=+1, months=+10, days=+29) 10
relativedelta(months=+11, days=+30) 11
relativedelta(years=+1, months=+10, days=+29) 10
relativedelta(months=+11, days=+30) 11
relativedelta(y

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

relativedelta(months=+10, days=+29) 10
relativedelta(years=+1, months=+8, days=+29) 8
relativedelta(years=+1, months=+8, days=+29) 8
relativedelta(years=+1, months=+8, days=+29) 8
relativedelta(years=+1, months=+8, days=+29) 8
relativedelta(months=+10, days=+29) 10
relativedelta(years=+1, months=+2, days=+30) 2
relativedelta(months=+10, days=+29) 10
relativedelta(years=+1, months=+8, days=+29) 8
relativedelta(years=+1, months=+8, days=+29) 8
relativedelta(years=+1, months=+8, days=+29) 8
relativedelta(months=+9, days=+30) 9
relativedelta(years=+1, months=+8, days=+29) 8
relativedelta(years=+1, months=+8, days=+29) 8
relativedelta(months=+9, days=+30) 9
relativedelta(months=+9, days=+30) 9
relativedelta(months=+9, days=+30) 9
relativedelta(years=+1, months=+8, days=+29) 8
relativedelta(months=+9, days=+30) 9
relativedelta(years=+1, months=+8, days=+29) 8
relativedelta(months=+9, days=+30) 9
relativedelta(years=+1, months=+8, days=+29) 8
relativedelta(years=+1, months=+8, days=+29) 8
rel

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

relativedelta(months=+5, days=+23) 5
relativedelta(months=+5, days=+23) 5
relativedelta(months=+9, days=+24) 9
relativedelta(years=+1, days=+23) 0
relativedelta(months=+4, days=+24) 4
relativedelta(months=+10, days=+21) 10
relativedelta(months=+9, days=+23) 9
relativedelta(years=+1, days=+22) 0
relativedelta(months=+6, days=+23) 6
relativedelta(months=+6, days=+23) 6
relativedelta(years=+1, days=+22) 0
relativedelta(months=+9, days=+23) 9
relativedelta(years=+1, days=+22) 0
relativedelta(months=+2, days=+22) 2
relativedelta(months=+3, days=+23) 3
relativedelta(months=+9, days=+23) 9
relativedelta(months=+3, days=+22) 3
relativedelta(months=+3, days=+22) 3
relativedelta(years=+1, days=+21) 0
relativedelta(months=+9, days=+22) 9
relativedelta(months=+2, days=+21) 2
relativedelta(months=+1, days=+22) 1
relativedelta(months=+4, days=+22) 4
relativedelta(months=+9, days=+22) 9
relativedelta(months=+4, days=+22) 4
relativedelta(months=+6, days=+22) 6
relativedelta(months=+10, days=+19) 10
re

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df['membership_period'].iloc[i] = delta.years*12 + delta.months
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

relativedelta(months=+8, days=+17) 8
relativedelta(months=+6, days=+17) 6
relativedelta(months=+6, days=+17) 6
relativedelta(months=+5, days=+16) 5
relativedelta(years=+1, days=+16) 0
relativedelta(months=+8, days=+17) 8
relativedelta(months=+11, days=+17) 11
relativedelta(months=+6, days=+17) 6
relativedelta(months=+11, days=+17) 11
relativedelta(months=+1, days=+16) 1
relativedelta(months=+10, days=+13) 10
relativedelta(months=+7, days=+15) 7
relativedelta(months=+2, days=+15) 2
relativedelta(months=+2, days=+15) 2
relativedelta(years=+1, days=+15) 0
relativedelta(months=+7, days=+15) 7
relativedelta(months=+10, days=+13) 10
relativedelta(months=+2, days=+15) 2
relativedelta(years=+1, days=+15) 0
relativedelta(months=+11, days=+16) 11
relativedelta(months=+9, days=+16) 9
relativedelta(years=+1, days=+15) 0
relativedelta(months=+7, days=+15) 7
relativedelta(months=+11, days=+16) 11
relativedelta(months=+3, days=+16) 3
relativedelta(months=+6, days=+16) 6
relativedelta(months=+10, days

Unnamed: 0,customer_id,name,class,gender,start_date,end_date,campaign_id,is_deleted,class_name,price,campaign_name,regular,count,routine_flg,calc_date,membership_period
0,OA832399,XXXX,C01,F,2015-05-01,NaT,CA1,0,0_종일,10500,2_일반,0,4,1,2019-04-30,47
1,PL270116,XXXXX,C01,M,2015-05-01,NaT,CA1,0,0_종일,10500,2_일반,1,5,1,2019-04-30,47
2,OA974876,XXXXX,C01,M,2015-05-01,NaT,CA1,0,0_종일,10500,2_일반,1,5,1,2019-04-30,47
3,HD024127,XXXXX,C01,F,2015-05-01,NaT,CA1,0,0_종일,10500,2_일반,1,5,1,2019-04-30,47
4,HD661448,XXXXX,C03,F,2015-05-01,NaT,CA1,0,2_야간,6000,2_일반,1,5,1,2019-04-30,47


In [288]:
fillna_df['customer_keep'].mean()

Timedelta('599 days 14:23:35.267175576')

## 9. 고객행동의 각종통계량 파악

In [290]:
fillna_df['startmonth'] = fillna_df['start_date'].dt.strftime('%Y%m')
fillna_df['endmonth'] = fillna_df['end_date'].dt.strftime('%Y%m')
fillna_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4192 entries, 0 to 4191
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype          
---  ------         --------------  -----          
 0   customer_id    4192 non-null   object         
 1   name           4192 non-null   object         
 2   class          4192 non-null   object         
 3   gender         4192 non-null   object         
 4   start_date     4192 non-null   datetime64[ns] 
 5   end_date       4192 non-null   datetime64[ns] 
 6   campaign_id    4192 non-null   object         
 7   is_deleted     4192 non-null   int64          
 8   class_name     4192 non-null   object         
 9   price          4192 non-null   int64          
 10  campaign_name  4192 non-null   object         
 11  regular        4192 non-null   int64          
 12  count          4192 non-null   int64          
 13  routine_flg    4192 non-null   int64          
 14  customer_keep  4192 non-null   timedelta64[ns]
 15  star

In [306]:
customer_df[['mean','median','max','mimn']].describe()

KeyError: "None of [Index(['mean', 'median', 'max', 'mimn'], dtype='object')] are in the [columns]"

In [None]:
import matplotlib.pyplot as plt
plt.hist(customer_df('membership_period'))

# 결과: membership period가 10개월 이내인 고객이 많고, 그 이상인 고객은 일정하다
# -> 10개월 이내에 고객이 이탈한다

## 10. 탈퇴회원과 지속회원의 차이 파악

In [314]:
customers_quit = customer_df.loc[customer_df['is_deleted'] == 1 ]
customers_quit.describe()

Unnamed: 0,start_date,end_date,is_deleted,price,regular,count,routine_flg,calc_date,membership_period
count,1350,1350,1350.0,1350.0,1350.0,1350.0,1350.0,1350,1350.0
mean,2018-01-20 10:45:20,2018-10-17 16:23:28,1.0,8595.555556,0.159259,3.215556,0.456296,2018-10-17 16:23:28,8.026667
min,2016-05-01 00:00:00,2018-04-30 00:00:00,1.0,6000.0,0.0,1.0,0.0,2018-04-30 00:00:00,1.0
25%,2017-10-01 00:00:00,2018-06-30 00:00:00,1.0,6000.0,0.0,2.0,0.0,2018-06-30 00:00:00,4.0
50%,2018-03-01 00:00:00,2018-10-31 00:00:00,1.0,7500.0,0.0,3.0,0.0,2018-10-31 00:00:00,7.0
75%,2018-06-03 00:00:00,2019-01-31 00:00:00,1.0,10500.0,0.0,4.0,1.0,2019-01-31 00:00:00,11.0
max,2019-02-11 00:00:00,2019-03-31 00:00:00,1.0,10500.0,1.0,5.0,1.0,2019-03-31 00:00:00,23.0
std,,,0.0,1949.163652,0.366053,1.217087,0.498271,,5.033692


In [313]:
customers_stay = customer_df.loc[customer_df['is_deleted'] == 0]
customers_stay.describe()

Unnamed: 0,start_date,end_date,is_deleted,price,regular,count,routine_flg,calc_date,membership_period
count,2842,0,2842.0,2842.0,2842.0,2842.0,2842.0,2842,2842.0
mean,2017-04-03 22:42:28.627726848,NaT,0.0,8542.927516,0.885644,4.865588,0.984166,2019-04-30 00:00:00,23.970443
min,2015-05-01 00:00:00,NaT,0.0,6000.0,0.0,2.0,0.0,2019-04-30 00:00:00,1.0
25%,2016-05-01 00:00:00,NaT,0.0,6000.0,1.0,5.0,1.0,2019-04-30 00:00:00,12.0
50%,2017-04-01 00:00:00,NaT,0.0,7500.0,1.0,5.0,1.0,2019-04-30 00:00:00,24.0
75%,2018-04-05 00:00:00,NaT,0.0,10500.0,1.0,5.0,1.0,2019-04-30 00:00:00,35.0
max,2019-03-15 00:00:00,NaT,0.0,10500.0,1.0,5.0,1.0,2019-04-30 00:00:00,47.0
std,,,0.0,1977.189779,0.318299,0.406154,0.124855,,13.746761


In [315]:
customer_df.to_csv('./data/customer_df.csv',index=False)