In [75]:
import pandas as pd
import mysql.connector
from sqlalchemy import create_engine
import urllib.parse
from datetime import datetime
import pytz
import requests
import numpy as np

# Table of Content
* [Data Extraction](#Data-Extraction)
* [Data Filtering](#Data-Filtering)
* [Customer Conversion](#Customer-Conversion)
    * [Feature 1: trial_nums](#Feature-1:-trial_nums)
    * [Feature 2: trial_price](#Feature-2:-trial_price)
    * [Feature 3: teacher_id](#Feature-3:-teacher_id)
    * [Feature 4: teacher_type](#Feature-4:-teacher_type)
    * [Feature 5: is_weekend](#Feature-5:-is_weekend)
    * [Feature 6: num_days_reserve](#Feature-6:-num_days_reserve)
    * [Feature 7: student_evaluation](#Feature-7:-student_evaluation)
    * [Feature 8: has_comment](#Feature-8:-has_comment)
    * [Feature 9: trial_issue](#Feature-9:-trial_issue)
* [Finalization](#Finalization)

# Data Extraction

In [77]:
query1 = "SELECT classorderid as class_order_id,classid as class_id, studentid as user_id, classorderstatus as class_order_status, paytype as pay_type FROM ehailuo_class_orderlist_glv"
class_reservation = pd.read_sql(query1, engine)
class_reservation

Unnamed: 0,class_order_id,class_id,user_id,class_order_status,pay_type
0,1,39,401,3,1
1,2,52,402,1,1
2,3,50,401,0,1
3,4,44,401,2,1
4,5,54,401,1,1
...,...,...,...,...,...
1359719,1363903,2935928,16557469,1,25
1359720,1363904,2935960,141620,1,21
1359721,1363905,2884172,16576479,1,25
1359722,1363906,2933018,16568079,1,25


In [78]:
query2 = "SELECT classid as class_id, teacherid as teacher_id, teachertype as teacher_type, starttime as start_time, adddate as add_date,wday as w_day FROM ehailuo_class_schedule_glv"
class_detail = pd.read_sql(query2, engine)
class_detail

Unnamed: 0,class_id,teacher_id,teacher_type,start_time,add_date,w_day
0,39,402,1,1396663200,1395906274,6
1,40,402,1,1396711800,1395906274,6
2,41,406,2,1396146600,1396175299,0
3,42,406,2,1396175400,1396175299,0
4,43,406,2,1396670400,1396175339,6
...,...,...,...,...,...,...
2926875,2935995,2064,2,1690538400,1690475457,5
2926876,2935996,838,5,1690959600,1690476036,3
2926877,2935997,838,5,1690956000,1690476038,3
2926878,2935998,838,5,1691046000,1690476041,4


In [79]:
query3 = "SELECT ttypeid as type_id, ttypename_app as teacher_type_en FROM ehailuo_teacher_type_glv"
teacher_type_list = pd.read_sql(query3, engine)

teacher_type_list

Unnamed: 0,type_id,teacher_type_en
0,1,Philippines
1,2,American
2,3,ABC
3,4,International
4,5,Chinese
5,6,Others
6,7,TOEFL&IELTS
7,8,TOEFL
8,9,IELTS
9,10,Japanese


In [80]:
query4 = "SELECT classorderid as class_order_id, total as student_evaluation, adddate as add_date FROM ehailuo_to_student_eval_glv"
student_evaluation_raw = pd.read_sql(query4, engine)

student_evaluation_raw

Unnamed: 0,class_order_id,student_evaluation,add_date
0,151973,0.0,1.510407e+09
1,10,0.0,1.400053e+09
2,11,0.0,1.400053e+09
3,18,0.0,1.400140e+09
4,23,0.0,1.402491e+09
...,...,...,...
722815,1361294,0.0,1.690475e+09
722816,1362157,0.0,1.690475e+09
722817,1362507,0.0,1.690475e+09
722818,1361145,0.0,1.690476e+09


In [81]:
query5 = "SELECT classorderid as class_order_id, totalpoint as total_point, material FROM ehailuo_class_eval_glv"
class_comment_raw = pd.read_sql(query5, engine)
class_comment_raw

Unnamed: 0,class_order_id,total_point,material
0,10.0,5,5
1,18.0,1,4
2,47.0,5,3
3,205.0,5,5
4,216.0,5,5
...,...,...,...
36778,1337832.0,5,5
36779,1363345.0,5,4
36780,1361891.0,5,5
36781,1363018.0,4,5


In [82]:
query6 = "SELECT classorderid as class_order_id, startattr1 as issue_type_1, startattr2 as issue_type_2, content as issue_content FROM ehailuo_feedback"
trial_issue = pd.read_sql(query6, engine)
trial_issue

Unnamed: 0,class_order_id,issue_type_1,issue_type_2,issue_content
0,110384,0,0,老师缺席课程
1,110384,0,0,老师缺席课程 已经扣除老师两倍工资
2,110707,0,0,Pat 老师缺席\r\n老师9点和11点都有课 按理不会缺席10点的课程 但是因为学员是临时...
3,113216,925,0,由于老师网络出现问题，QQ和skype都无法进行正常上课，之后就临时换了平台上课，但是后期还...
4,113015,925,0,学员：马雷\r\n账号：13663002657\r\n上课时间：2.10 22:00-22:...
...,...,...,...,...
33125,1360868,0,0,未及时填写课后评价
33126,1358795,0,0,未及时填写课后评价
33127,1362043,0,0,返还学员本节课时，老师按设备问题处理
33128,1360403,0,0,老师设备问题，少上了10分钟，申请补偿。


In [83]:
query7 = "SELECT order_id as class_order_id, type_id as issue_type FROM ehailuo_feedback_auto"
trial_issue_auto = pd.read_sql(query7, engine)
trial_issue_auto

Unnamed: 0,class_order_id,issue_type
0,1329397,3175
1,1328083,3183
2,1329255,3176
3,1327062,3175
4,1327039,3175
...,...,...
1134,1363266,3179
1135,1363018,3175
1136,1363335,3179
1137,1363169,3175


In [127]:
query8 = "SELECT userid as user_id, productid as product_id, productname as product_name, producttype as buy_type, adddate as add_date, lesson, productmoney as product_money FROM ehailuo_product_buy_glv"
order = pd.read_sql(query8, engine)
order

Unnamed: 0,user_id,product_id,product_name,buy_type,add_date,lesson,product_money
0,113040.0,21.0,菲律宾外教,105.0,,16.0,0.0
1,113040.0,25.0,欧美外教次卡,105.0,,14.0,0.0
2,113040.0,17.0,海螺币,105.0,,0.0,0.0
3,113040.0,21.0,菲律宾外教,105.0,,16.0,0.0
4,113040.0,25.0,欧美外教次卡,105.0,,14.0,0.0
...,...,...,...,...,...,...,...
231204,16499394.0,17.0,海螺币,3185.0,1.690463e+09,0.0,0.0
231205,16517179.0,17.0,海螺币,3185.0,1.690466e+09,0.0,0.0
231206,16581580.0,17.0,海螺币,3185.0,1.690466e+09,0.0,0.0
231207,16562491.0,41.0,取消约课卡,1151.0,1.690467e+09,2.0,0.0


In [84]:
query9 = "SELECT attriid as attri_id, attriname as attri_name, attriname_en as attri_name_en, pid FROM ehailuo_attribute_info_glv"
attribute = pd.read_sql(query9, engine)
attribute

Unnamed: 0,attri_id,attri_name,attri_name_en,pid
0,1,小学生,Elementary Students,0.0
1,2,中学生,Junior/Senior Students,0.0
2,3,大学生,Undergraduates,0.0
3,4,销售/客服/采购,Sales/Customer Service/Purchase,0.0
4,5,IT/通信/电子,IT/Communication/E-commerce,0.0
...,...,...,...,...
2752,3187,抖音直播,,3172.0
2753,3188,CR30天停课学员回访,,
2754,3189,青少儿应试,Young Learners Test Preparation,568.0
2755,3190,小红书投放,,3172.0


In [85]:
customer_reach = pd.read_csv('customer_reach.csv')
customer_reach

Unnamed: 0,user_id,cc_id,num_cc,channel,trial_money,trial_completed
0,16410781,0.0,0.0,1662.0,0.0,0
1,16448486,100.0,4.0,97.0,0.0,1
2,16531800,650.0,2.0,2045.0,0.0,1
3,16532204,100.0,2.0,97.0,0.0,1
4,16398016,197.0,0.0,510.0,0.0,0
...,...,...,...,...,...,...
158085,16558015,647.0,3.0,1662.0,0.0,0
158086,16569340,647.0,0.0,1662.0,0.0,0
158087,16578239,647.0,0.0,1662.0,0.0,0
158088,16578345,647.0,0.0,1662.0,0.0,0


# Data Filtering

In [86]:
customer_conversion = customer_reach[customer_reach['trial_completed'] == 1].copy()
customer_conversion = customer_conversion.drop('trial_completed',axis=1)
customer_conversion

Unnamed: 0,user_id,cc_id,num_cc,channel,trial_money
1,16448486,100.0,4.0,97.0,0.0
2,16531800,650.0,2.0,2045.0,0.0
3,16532204,100.0,2.0,97.0,0.0
17,131946,0.0,0.0,242.0,0.0
109,135946,0.0,0.0,0.0,0.0
...,...,...,...,...,...
157396,16571299,490.0,0.0,1662.0,0.0
157405,16573426,490.0,0.0,1662.0,0.0
157421,16557658,647.0,2.0,1662.0,0.0
157444,16558135,647.0,3.0,1662.0,0.0


In [87]:
trial_customer_list = customer_conversion['user_id'].tolist()

# Customer Conversion

#### Feature 1: trial_nums

In [88]:
# Find the class that are paid/reserved by trial-class card in the account
trial_class_reservation = class_reservation[class_reservation['pay_type']==26]

# Find the trial class reservations that are completed successfully
trial_class_completed = trial_class_reservation[trial_class_reservation['class_order_status']==2]
trial_class_completed

Unnamed: 0,class_order_id,class_id,user_id,class_order_status,pay_type
7773,7774,46923,119379,2,26
7792,7793,47452,119443,2,26
7804,7805,47403,119445,2,26
7849,7850,47967,119449,2,26
7855,7856,47399,119447,2,26
...,...,...,...,...,...
1359220,1363404,2910335,16583078,2,26
1359222,1363406,2926073,16583073,2,26
1359238,1363422,2927291,16582687,2,26
1359245,1363429,2923661,16582825,2,26


In [89]:
# Filter the trial class history completed by trial_customer_list 
trial_class_completed = trial_class_completed[trial_class_completed['user_id'].isin(trial_customer_list)]
trial_class_completed

Unnamed: 0,class_order_id,class_id,user_id,class_order_status,pay_type
20276,20277,91114,131640,2,26
20277,20278,92907,131641,2,26
20279,20280,90632,131638,2,26
20309,20310,84149,131648,2,26
20311,20312,93459,131650,2,26
...,...,...,...,...,...
1358047,1362231,2897760,16561991,2,26
1358680,1362864,2897784,16578831,2,26
1359012,1363196,2923720,16569615,2,26
1359065,1363249,2934694,16579139,2,26


In [90]:
trial_class_completed['user_id'].value_counts()

165268      4
16577052    4
16447633    4
16452598    4
16503474    4
           ..
163542      1
163286      1
162414      1
163281      1
16579139    1
Name: user_id, Length: 59577, dtype: int64

In [91]:
# Find users who have completed multiple trial classes
trial_student_counts = trial_class_completed['user_id'].value_counts()
students_with_multiple_trials = trial_student_counts[trial_student_counts > 1].index.tolist()
print(len(students_with_multiple_trials))

1941


In [92]:
# Add a column to show if the user have completed multiple trial class
customer_conversion['trial_nums'] = customer_conversion['user_id'].apply(lambda x: trial_student_counts[x] if x in students_with_multiple_trials else 1)
customer_conversion

Unnamed: 0,user_id,cc_id,num_cc,channel,trial_money,trial_nums
1,16448486,100.0,4.0,97.0,0.0,1
2,16531800,650.0,2.0,2045.0,0.0,1
3,16532204,100.0,2.0,97.0,0.0,1
17,131946,0.0,0.0,242.0,0.0,1
109,135946,0.0,0.0,0.0,0.0,2
...,...,...,...,...,...,...
157396,16571299,490.0,0.0,1662.0,0.0,1
157405,16573426,490.0,0.0,1662.0,0.0,1
157421,16557658,647.0,2.0,1662.0,0.0,1
157444,16558135,647.0,3.0,1662.0,0.0,1


#### Feature 2:  trial_price

In [93]:
customer_conversion['trial_price'] = customer_conversion['trial_money'] / customer_conversion['trial_nums']
customer_conversion

Unnamed: 0,user_id,cc_id,num_cc,channel,trial_money,trial_nums,trial_price
1,16448486,100.0,4.0,97.0,0.0,1,0.0
2,16531800,650.0,2.0,2045.0,0.0,1,0.0
3,16532204,100.0,2.0,97.0,0.0,1,0.0
17,131946,0.0,0.0,242.0,0.0,1,0.0
109,135946,0.0,0.0,0.0,0.0,2,0.0
...,...,...,...,...,...,...,...
157396,16571299,490.0,0.0,1662.0,0.0,1,0.0
157405,16573426,490.0,0.0,1662.0,0.0,1,0.0
157421,16557658,647.0,2.0,1662.0,0.0,1,0.0
157444,16558135,647.0,3.0,1662.0,0.0,1,0.0


#### Feature 3: teacher_id

For student who have taken more than one trial class, only the teacher the student who has most class with will be kept. However, if the teachers have equal frequency for a student, then the most recent teacher will be kept as I assume that the experience of the most recent class has the most significant influence on customer's decisions. 

In [95]:
# Merger the teacher details will trail_class_completed table 
trial_class_completed = pd.merge(trial_class_completed, class_detail, on='class_id', how='left')
trial_class_completed

Unnamed: 0,class_order_id,class_id,user_id,class_order_status,pay_type,teacher_id,teacher_type,start_time,add_date,w_day
0,20277,91114,131640,2,26,205,2,1451552400,1451088191,4
1,20278,92907,131641,2,26,86,1,1451552400,1451438562,4
2,20280,90632,131638,2,26,213,2,1451703600,1450930364,6
3,20310,84149,131648,2,26,75,2,1451642400,1449389122,5
4,20312,93459,131650,2,26,86,1,1451646000,1451556691,5
...,...,...,...,...,...,...,...,...,...,...
61591,1362231,2897760,16561991,2,26,2024,2,1690268400,1687104851,2
61592,1362864,2897784,16578831,2,26,2024,2,1690455600,1687104880,4
61593,1363196,2923720,16569615,2,26,1857,2,1690434000,1689332686,4
61594,1363249,2934694,16579139,2,26,2024,2,1690441200,1690353406,4


In [96]:
# Sort the 'trial_class_completed' table in descending order based on the 'adddate' column
trial_class_completed = trial_class_completed.sort_values(by='add_date', ascending=False)
trial_class_completed

Unnamed: 0,class_order_id,class_id,user_id,class_order_status,pay_type,teacher_id,teacher_type,start_time,add_date,w_day
61594,1363249,2934694,16579139,2,26,2024,2,1690441200,1690353406,4
61595,1363375,2924756,16569641,2,26,2094,2,1690455600,1689488111,4
61583,1358789,2924014,16573832,2,26,1357,2,1689516000,1689396512,0
61593,1363196,2923720,16569615,2,26,1857,2,1690434000,1689332686,4
61584,1359487,2919542,16576604,2,26,1419,5,1689681600,1688954252,2
...,...,...,...,...,...,...,...,...,...,...
53,20751,83433,131724,2,26,203,2,1452078000,1449268421,3
85,21033,83429,131772,2,26,203,2,1452409200,1449268417,0
28,20580,83422,131701,2,26,203,2,1451912400,1449268406,1
21,20491,83416,131689,2,26,203,2,1451890800,1449268402,1


In [98]:
# Group by 'userid' and count the occurrences of each teacher
trial_teacher = trial_class_completed.groupby('user_id')['teacher_id'].agg(lambda x: x.value_counts().idxmax()).reset_index()

customer_conversion = customer_conversion.merge(trial_teacher, on='user_id', how='left')
customer_conversion

Unnamed: 0,user_id,cc_id,num_cc,channel,trial_money,trial_nums,trial_price,teacher_id
0,16448486,100.0,4.0,97.0,0.0,1,0.0,677
1,16531800,650.0,2.0,2045.0,0.0,1,0.0,1241
2,16532204,100.0,2.0,97.0,0.0,1,0.0,911
3,131946,0.0,0.0,242.0,0.0,1,0.0,298
4,135946,0.0,0.0,0.0,0.0,2,0.0,123
...,...,...,...,...,...,...,...,...
59572,16571299,490.0,0.0,1662.0,0.0,1,0.0,1904
59573,16573426,490.0,0.0,1662.0,0.0,1,0.0,1893
59574,16557658,647.0,2.0,1662.0,0.0,1,0.0,1638
59575,16558135,647.0,3.0,1662.0,0.0,1,0.0,432


#### Feature 4: teacher_type

In [99]:
# Group by 'userid' and count the occurrences of each teacher_type
trial_teacher_type = trial_class_completed.groupby('user_id')['teacher_type'].agg(lambda x: x.value_counts().idxmax()).reset_index()

customer_conversion = customer_conversion.merge(trial_teacher_type, on='user_id', how='left')
customer_conversion

Unnamed: 0,user_id,cc_id,num_cc,channel,trial_money,trial_nums,trial_price,teacher_id,teacher_type
0,16448486,100.0,4.0,97.0,0.0,1,0.0,677,2
1,16531800,650.0,2.0,2045.0,0.0,1,0.0,1241,2
2,16532204,100.0,2.0,97.0,0.0,1,0.0,911,2
3,131946,0.0,0.0,242.0,0.0,1,0.0,298,2
4,135946,0.0,0.0,0.0,0.0,2,0.0,123,5
...,...,...,...,...,...,...,...,...,...
59572,16571299,490.0,0.0,1662.0,0.0,1,0.0,1904,2
59573,16573426,490.0,0.0,1662.0,0.0,1,0.0,1893,2
59574,16557658,647.0,2.0,1662.0,0.0,1,0.0,1638,2
59575,16558135,647.0,3.0,1662.0,0.0,1,0.0,432,2


In [100]:
teacher_type_list

Unnamed: 0,type_id,teacher_type_en
0,1,Philippines
1,2,American
2,3,ABC
3,4,International
4,5,Chinese
5,6,Others
6,7,TOEFL&IELTS
7,8,TOEFL
8,9,IELTS
9,10,Japanese


#### Feature 5: is_weekend

In [101]:
trial_class_completed['w_day'].value_counts()

2    9261
6    9178
4    9153
3    9102
5    8944
0    8002
1    7956
Name: w_day, dtype: int64

In [102]:
trial_class_completed['w_day'].replace(0, 7, inplace=True)

In [103]:
# Create a mapping dictionary for the 'wday' values to 'is_weekend' values
mapping = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 1, 7: 1}

# Add the 'is_weekend' column based on the 'wday' column using the mapping dictionary
trial_class_completed['is_weekend'] = trial_class_completed['w_day'].map(mapping)
trial_class_completed

Unnamed: 0,class_order_id,class_id,user_id,class_order_status,pay_type,teacher_id,teacher_type,start_time,add_date,w_day,is_weekend
61594,1363249,2934694,16579139,2,26,2024,2,1690441200,1690353406,4,0
61595,1363375,2924756,16569641,2,26,2094,2,1690455600,1689488111,4,0
61583,1358789,2924014,16573832,2,26,1357,2,1689516000,1689396512,7,1
61593,1363196,2923720,16569615,2,26,1857,2,1690434000,1689332686,4,0
61584,1359487,2919542,16576604,2,26,1419,5,1689681600,1688954252,2,0
...,...,...,...,...,...,...,...,...,...,...,...
53,20751,83433,131724,2,26,203,2,1452078000,1449268421,3,0
85,21033,83429,131772,2,26,203,2,1452409200,1449268417,7,1
28,20580,83422,131701,2,26,203,2,1451912400,1449268406,1,0
21,20491,83416,131689,2,26,203,2,1451890800,1449268402,1,0


In [104]:
# Group by 'userid' and count the occurrences of each teacher
trial_weekend = trial_class_completed.groupby('user_id')['is_weekend'].agg(lambda x: x.value_counts().idxmax()).reset_index()

customer_conversion = customer_conversion.merge(trial_weekend, on='user_id', how='left')
customer_conversion

Unnamed: 0,user_id,cc_id,num_cc,channel,trial_money,trial_nums,trial_price,teacher_id,teacher_type,is_weekend
0,16448486,100.0,4.0,97.0,0.0,1,0.0,677,2,1
1,16531800,650.0,2.0,2045.0,0.0,1,0.0,1241,2,0
2,16532204,100.0,2.0,97.0,0.0,1,0.0,911,2,0
3,131946,0.0,0.0,242.0,0.0,1,0.0,298,2,0
4,135946,0.0,0.0,0.0,0.0,2,0.0,123,5,0
...,...,...,...,...,...,...,...,...,...,...
59572,16571299,490.0,0.0,1662.0,0.0,1,0.0,1904,2,0
59573,16573426,490.0,0.0,1662.0,0.0,1,0.0,1893,2,0
59574,16557658,647.0,2.0,1662.0,0.0,1,0.0,1638,2,0
59575,16558135,647.0,3.0,1662.0,0.0,1,0.0,432,2,1


#### Feature 6: num_days_reserve

In [105]:
# Convert UNIX timestamps to datetime objects
trial_class_completed['start_time'] = pd.to_datetime(trial_class_completed['start_time'], unit='s')
trial_class_completed['add_date'] = pd.to_datetime(trial_class_completed['add_date'], unit='s')


# Set timezone to China timezone
china_tz = pytz.timezone('Asia/Shanghai')
trial_class_completed['start_time'] = trial_class_completed['start_time'].dt.tz_localize(pytz.utc).dt.tz_convert(china_tz)
trial_class_completed['add_date'] = trial_class_completed['add_date'].dt.tz_localize(pytz.utc).dt.tz_convert(china_tz)


# Extract yyyy-mm-dd format from datetime objects
trial_class_completed['start_time'] = trial_class_completed['start_time'].dt.strftime('%Y-%m-%d %H:%M')
trial_class_completed['add_date'] = trial_class_completed['add_date'].dt.strftime('%Y-%m-%d %H:%M')

trial_class_completed

Unnamed: 0,class_order_id,class_id,user_id,class_order_status,pay_type,teacher_id,teacher_type,start_time,add_date,w_day,is_weekend
61594,1363249,2934694,16579139,2,26,2024,2,2023-07-27 15:00,2023-07-26 14:36,4,0
61595,1363375,2924756,16569641,2,26,2094,2,2023-07-27 19:00,2023-07-16 14:15,4,0
61583,1358789,2924014,16573832,2,26,1357,2,2023-07-16 22:00,2023-07-15 12:48,7,1
61593,1363196,2923720,16569615,2,26,1857,2,2023-07-27 13:00,2023-07-14 19:04,4,0
61584,1359487,2919542,16576604,2,26,1419,5,2023-07-18 20:00,2023-07-10 09:57,2,0
...,...,...,...,...,...,...,...,...,...,...,...
53,20751,83433,131724,2,26,203,2,2016-01-06 19:00,2015-12-05 06:33,3,0
85,21033,83429,131772,2,26,203,2,2016-01-10 15:00,2015-12-05 06:33,7,1
28,20580,83422,131701,2,26,203,2,2016-01-04 21:00,2015-12-05 06:33,1,0
21,20491,83416,131689,2,26,203,2,2016-01-04 15:00,2015-12-05 06:33,1,0


In [106]:
trial_class_completed['start_time'] = pd.to_datetime(trial_class_completed['start_time'])
trial_class_completed['add_date'] = pd.to_datetime(trial_class_completed['add_date'])
trial_class_completed['num_days_reserve'] = (trial_class_completed['start_time'] - trial_class_completed['add_date']).dt.days
trial_class_completed

Unnamed: 0,class_order_id,class_id,user_id,class_order_status,pay_type,teacher_id,teacher_type,start_time,add_date,w_day,is_weekend,num_days_reserve
61594,1363249,2934694,16579139,2,26,2024,2,2023-07-27 15:00:00,2023-07-26 14:36:00,4,0,1
61595,1363375,2924756,16569641,2,26,2094,2,2023-07-27 19:00:00,2023-07-16 14:15:00,4,0,11
61583,1358789,2924014,16573832,2,26,1357,2,2023-07-16 22:00:00,2023-07-15 12:48:00,7,1,1
61593,1363196,2923720,16569615,2,26,1857,2,2023-07-27 13:00:00,2023-07-14 19:04:00,4,0,12
61584,1359487,2919542,16576604,2,26,1419,5,2023-07-18 20:00:00,2023-07-10 09:57:00,2,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...
53,20751,83433,131724,2,26,203,2,2016-01-06 19:00:00,2015-12-05 06:33:00,3,0,32
85,21033,83429,131772,2,26,203,2,2016-01-10 15:00:00,2015-12-05 06:33:00,7,1,36
28,20580,83422,131701,2,26,203,2,2016-01-04 21:00:00,2015-12-05 06:33:00,1,0,30
21,20491,83416,131689,2,26,203,2,2016-01-04 15:00:00,2015-12-05 06:33:00,1,0,30


In [107]:
average_days = trial_class_completed.groupby('user_id')['num_days_reserve'].mean().reset_index()
customer_conversion = customer_conversion.merge(average_days,on='user_id', how='left')
customer_conversion

Unnamed: 0,user_id,cc_id,num_cc,channel,trial_money,trial_nums,trial_price,teacher_id,teacher_type,is_weekend,num_days_reserve
0,16448486,100.0,4.0,97.0,0.0,1,0.0,677,2,1,11.0
1,16531800,650.0,2.0,2045.0,0.0,1,0.0,1241,2,0,7.0
2,16532204,100.0,2.0,97.0,0.0,1,0.0,911,2,0,8.0
3,131946,0.0,0.0,242.0,0.0,1,0.0,298,2,0,9.0
4,135946,0.0,0.0,0.0,0.0,2,0.0,123,5,0,8.0
...,...,...,...,...,...,...,...,...,...,...,...
59572,16571299,490.0,0.0,1662.0,0.0,1,0.0,1904,2,0,10.0
59573,16573426,490.0,0.0,1662.0,0.0,1,0.0,1893,2,0,14.0
59574,16557658,647.0,2.0,1662.0,0.0,1,0.0,1638,2,0,8.0
59575,16558135,647.0,3.0,1662.0,0.0,1,0.0,432,2,1,8.0


#### Feature 7: student_evaluation

In [108]:
student_evaluation = student_evaluation_raw[['class_order_id','student_evaluation']].drop_duplicates()
student_evaluation

Unnamed: 0,class_order_id,student_evaluation
0,151973,0.0
1,10,0.0
2,11,0.0
3,18,0.0
4,23,0.0
...,...,...
722815,1361294,0.0
722816,1362157,0.0
722817,1362507,0.0
722818,1361145,0.0


In [109]:
student_evaluation['class_order_id'].value_counts()

379910     2
398406     2
1055098    2
1218786    2
1149336    2
          ..
302612     1
301459     1
296200     1
301228     1
1362113    1
Name: class_order_id, Length: 660618, dtype: int64

The reason for having two evaluation records for one class could potentially be attributed to an error in the previous record, which has been rectified in the second record. As a result, the more recent record is considered to be more accurate, and therefore, only the student level grade from the more recent record will be retained.

In [110]:
#vSort the 'student_level' table in descending order based on the 'adddate' column
student_evaluation_sorted = student_evaluation_raw.sort_values(by='add_date', ascending=False)

# Drop duplicate rows based on the 'classorderid' column, keeping only the first occurrence
# This will ensure that only the most recent student_level value is retained for each classorderid
student_evaluation_unique = student_evaluation_sorted.drop_duplicates(subset='class_order_id', keep='first')

student_evaluation_unique = student_evaluation_unique.drop('add_date', axis=1)

In [111]:
# Merge the unique student initial English level evaluation into the customer_conversion table
trial_class_completed = pd.merge(trial_class_completed,student_evaluation_unique,on = 'class_order_id', how = 'left')
trial_class_completed

Unnamed: 0,class_order_id,class_id,user_id,class_order_status,pay_type,teacher_id,teacher_type,start_time,add_date,w_day,is_weekend,num_days_reserve,student_evaluation
0,1363249,2934694,16579139,2,26,2024,2,2023-07-27 15:00:00,2023-07-26 14:36:00,4,0,1,12.00
1,1363375,2924756,16569641,2,26,2094,2,2023-07-27 19:00:00,2023-07-16 14:15:00,4,0,11,0.75
2,1358789,2924014,16573832,2,26,1357,2,2023-07-16 22:00:00,2023-07-15 12:48:00,7,1,1,0.15
3,1363196,2923720,16569615,2,26,1857,2,2023-07-27 13:00:00,2023-07-14 19:04:00,4,0,12,7.00
4,1359487,2919542,16576604,2,26,1419,5,2023-07-18 20:00:00,2023-07-10 09:57:00,2,0,8,10.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61591,20751,83433,131724,2,26,203,2,2016-01-06 19:00:00,2015-12-05 06:33:00,3,0,32,0.00
61592,21033,83429,131772,2,26,203,2,2016-01-10 15:00:00,2015-12-05 06:33:00,7,1,36,0.00
61593,20580,83422,131701,2,26,203,2,2016-01-04 21:00:00,2015-12-05 06:33:00,1,0,30,0.00
61594,20491,83416,131689,2,26,203,2,2016-01-04 15:00:00,2015-12-05 06:33:00,1,0,30,0.00


In [112]:
evaluation_mean = round(trial_class_completed['student_evaluation'].mean(), 1)
trial_class_completed['student_evaluation'].replace(0, evaluation_mean, inplace=True)
trial_class_completed['student_evaluation'].replace(np.nan, evaluation_mean, inplace=True)

In [113]:
average_evaluation = trial_class_completed.groupby('user_id')['student_evaluation'].mean().reset_index()
customer_conversion = customer_conversion.merge(average_evaluation,on='user_id', how='left')
customer_conversion

Unnamed: 0,user_id,cc_id,num_cc,channel,trial_money,trial_nums,trial_price,teacher_id,teacher_type,is_weekend,num_days_reserve,student_evaluation
0,16448486,100.0,4.0,97.0,0.0,1,0.0,677,2,1,11.0,13.0
1,16531800,650.0,2.0,2045.0,0.0,1,0.0,1241,2,0,7.0,10.0
2,16532204,100.0,2.0,97.0,0.0,1,0.0,911,2,0,8.0,7.0
3,131946,0.0,0.0,242.0,0.0,1,0.0,298,2,0,9.0,3.5
4,135946,0.0,0.0,0.0,0.0,2,0.0,123,5,0,8.0,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...
59572,16571299,490.0,0.0,1662.0,0.0,1,0.0,1904,2,0,10.0,15.0
59573,16573426,490.0,0.0,1662.0,0.0,1,0.0,1893,2,0,14.0,3.8
59574,16557658,647.0,2.0,1662.0,0.0,1,0.0,1638,2,0,8.0,9.0
59575,16558135,647.0,3.0,1662.0,0.0,1,0.0,432,2,1,8.0,9.0


#### Feature 8: has_comment

In [114]:
class_comment = class_comment_raw.drop_duplicates()
class_comment

Unnamed: 0,class_order_id,total_point,material
0,10.0,5,5
1,18.0,1,4
2,47.0,5,3
3,205.0,5,5
4,216.0,5,5
...,...,...,...
36778,1337832.0,5,5
36779,1363345.0,5,4
36780,1361891.0,5,5
36781,1363018.0,4,5


In [115]:
trial_class_completed = trial_class_completed.merge(class_comment, on='class_order_id', how='left')
trial_class_completed

Unnamed: 0,class_order_id,class_id,user_id,class_order_status,pay_type,teacher_id,teacher_type,start_time,add_date,w_day,is_weekend,num_days_reserve,student_evaluation,total_point,material
0,1363249,2934694,16579139,2,26,2024,2,2023-07-27 15:00:00,2023-07-26 14:36:00,4,0,1,12.00,,
1,1363375,2924756,16569641,2,26,2094,2,2023-07-27 19:00:00,2023-07-16 14:15:00,4,0,11,0.75,,
2,1358789,2924014,16573832,2,26,1357,2,2023-07-16 22:00:00,2023-07-15 12:48:00,7,1,1,0.15,,
3,1363196,2923720,16569615,2,26,1857,2,2023-07-27 13:00:00,2023-07-14 19:04:00,4,0,12,7.00,,
4,1359487,2919542,16576604,2,26,1419,5,2023-07-18 20:00:00,2023-07-10 09:57:00,2,0,8,10.00,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61591,20751,83433,131724,2,26,203,2,2016-01-06 19:00:00,2015-12-05 06:33:00,3,0,32,3.50,,
61592,21033,83429,131772,2,26,203,2,2016-01-10 15:00:00,2015-12-05 06:33:00,7,1,36,3.50,,
61593,20580,83422,131701,2,26,203,2,2016-01-04 21:00:00,2015-12-05 06:33:00,1,0,30,3.50,,
61594,20491,83416,131689,2,26,203,2,2016-01-04 15:00:00,2015-12-05 06:33:00,1,0,30,3.50,,


In [116]:
trial_class_completed['total_point'] = pd.to_numeric(trial_class_completed['total_point'], errors='coerce')
average_comment_total = trial_class_completed.groupby('user_id')['total_point'].mean().reset_index()
customer_conversion = customer_conversion.merge(average_comment_total,on='user_id', how='left')

trial_class_completed['material'] = pd.to_numeric(trial_class_completed['material'], errors='coerce')
average_comment_material = trial_class_completed.groupby('user_id')['material'].mean().reset_index()
customer_conversion = customer_conversion.merge(average_comment_material,on='user_id', how='left')

customer_conversion

Unnamed: 0,user_id,cc_id,num_cc,channel,trial_money,trial_nums,trial_price,teacher_id,teacher_type,is_weekend,num_days_reserve,student_evaluation,total_point,material
0,16448486,100.0,4.0,97.0,0.0,1,0.0,677,2,1,11.0,13.0,,
1,16531800,650.0,2.0,2045.0,0.0,1,0.0,1241,2,0,7.0,10.0,,
2,16532204,100.0,2.0,97.0,0.0,1,0.0,911,2,0,8.0,7.0,,
3,131946,0.0,0.0,242.0,0.0,1,0.0,298,2,0,9.0,3.5,,
4,135946,0.0,0.0,0.0,0.0,2,0.0,123,5,0,8.0,3.5,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59572,16571299,490.0,0.0,1662.0,0.0,1,0.0,1904,2,0,10.0,15.0,,
59573,16573426,490.0,0.0,1662.0,0.0,1,0.0,1893,2,0,14.0,3.8,,
59574,16557658,647.0,2.0,1662.0,0.0,1,0.0,1638,2,0,8.0,9.0,,
59575,16558135,647.0,3.0,1662.0,0.0,1,0.0,432,2,1,8.0,9.0,,


In [117]:
# Check the number of null values in 'totalpoint' and 'material'
customer_conversion['total_point'].info()
customer_conversion['material'].info()

<class 'pandas.core.series.Series'>
Int64Index: 59577 entries, 0 to 59576
Series name: total_point
Non-Null Count  Dtype  
--------------  -----  
1314 non-null   float64
dtypes: float64(1)
memory usage: 930.9 KB
<class 'pandas.core.series.Series'>
Int64Index: 59577 entries, 0 to 59576
Series name: material
Non-Null Count  Dtype  
--------------  -----  
1314 non-null   float64
dtypes: float64(1)
memory usage: 930.9 KB


In [118]:
customer_conversion['has_comment'] = np.where(customer_conversion['total_point'].notnull(), 1, 0)
customer_conversion

Unnamed: 0,user_id,cc_id,num_cc,channel,trial_money,trial_nums,trial_price,teacher_id,teacher_type,is_weekend,num_days_reserve,student_evaluation,total_point,material,has_comment
0,16448486,100.0,4.0,97.0,0.0,1,0.0,677,2,1,11.0,13.0,,,0
1,16531800,650.0,2.0,2045.0,0.0,1,0.0,1241,2,0,7.0,10.0,,,0
2,16532204,100.0,2.0,97.0,0.0,1,0.0,911,2,0,8.0,7.0,,,0
3,131946,0.0,0.0,242.0,0.0,1,0.0,298,2,0,9.0,3.5,,,0
4,135946,0.0,0.0,0.0,0.0,2,0.0,123,5,0,8.0,3.5,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59572,16571299,490.0,0.0,1662.0,0.0,1,0.0,1904,2,0,10.0,15.0,,,0
59573,16573426,490.0,0.0,1662.0,0.0,1,0.0,1893,2,0,14.0,3.8,,,0
59574,16557658,647.0,2.0,1662.0,0.0,1,0.0,1638,2,0,8.0,9.0,,,0
59575,16558135,647.0,3.0,1662.0,0.0,1,0.0,432,2,1,8.0,9.0,,,0


In [119]:
customer_conversion = customer_conversion.drop('total_point',axis=1)
customer_conversion = customer_conversion.drop('material',axis=1)

customer_conversion['has_comment'].value_counts()

0    58263
1     1314
Name: has_comment, dtype: int64

#### Feature 9: trial_issue

In [120]:
trial_issue

Unnamed: 0,class_order_id,issue_type_1,issue_type_2,issue_content
0,110384,0,0,老师缺席课程
1,110384,0,0,老师缺席课程 已经扣除老师两倍工资
2,110707,0,0,Pat 老师缺席\r\n老师9点和11点都有课 按理不会缺席10点的课程 但是因为学员是临时...
3,113216,925,0,由于老师网络出现问题，QQ和skype都无法进行正常上课，之后就临时换了平台上课，但是后期还...
4,113015,925,0,学员：马雷\r\n账号：13663002657\r\n上课时间：2.10 22:00-22:...
...,...,...,...,...
33125,1360868,0,0,未及时填写课后评价
33126,1358795,0,0,未及时填写课后评价
33127,1362043,0,0,返还学员本节课时，老师按设备问题处理
33128,1360403,0,0,老师设备问题，少上了10分钟，申请补偿。


In [121]:
trial_issue_auto

Unnamed: 0,class_order_id,issue_type
0,1329397,3175
1,1328083,3183
2,1329255,3176
3,1327062,3175
4,1327039,3175
...,...,...
1134,1363266,3179
1135,1363018,3175
1136,1363335,3179
1137,1363169,3175


Before automatic issue reporting system was inplaced, student's feedback on class issues are reported manually. Some issues are treated by the system and some issues are classified and handled by TA. For issues treated by the system, there were no tags indicating what the issues are about, which created difficulties to categorize the feedbacks. Therefore, a more general way of handling feedback, by looking at if each class has feedback from student, would be more reasonable to keep the accuracy of the model. 

In [122]:
trial_issue_auto = trial_issue_auto.drop('issue_type',axis=1)
trial_issue.drop(['issue_type_1', 'issue_type_2','issue_content'], axis=1, inplace=True)

trial_issue = pd.concat([trial_issue_auto, trial_issue], ignore_index=True)
trial_issue

Unnamed: 0,class_order_id
0,1329397
1,1328083
2,1329255
3,1327062
4,1327039
...,...
34264,1360868
34265,1358795
34266,1362043
34267,1360403


In [123]:
trial_issue = trial_issue.drop_duplicates()
trial_issue

Unnamed: 0,class_order_id
0,1329397
1,1328083
2,1329255
3,1327062
4,1327039
...,...
34262,1361600
34263,1360926
34264,1360868
34265,1358795


In [124]:
trial_class_completed['has_feedback'] = np.where(trial_class_completed['class_order_id'].isin(trial_issue['class_order_id']), 1, 0)
trial_class_completed

Unnamed: 0,class_order_id,class_id,user_id,class_order_status,pay_type,teacher_id,teacher_type,start_time,add_date,w_day,is_weekend,num_days_reserve,student_evaluation,total_point,material,has_feedback
0,1363249,2934694,16579139,2,26,2024,2,2023-07-27 15:00:00,2023-07-26 14:36:00,4,0,1,12.00,,,0
1,1363375,2924756,16569641,2,26,2094,2,2023-07-27 19:00:00,2023-07-16 14:15:00,4,0,11,0.75,,,0
2,1358789,2924014,16573832,2,26,1357,2,2023-07-16 22:00:00,2023-07-15 12:48:00,7,1,1,0.15,,,0
3,1363196,2923720,16569615,2,26,1857,2,2023-07-27 13:00:00,2023-07-14 19:04:00,4,0,12,7.00,,,0
4,1359487,2919542,16576604,2,26,1419,5,2023-07-18 20:00:00,2023-07-10 09:57:00,2,0,8,10.00,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61591,20751,83433,131724,2,26,203,2,2016-01-06 19:00:00,2015-12-05 06:33:00,3,0,32,3.50,,,0
61592,21033,83429,131772,2,26,203,2,2016-01-10 15:00:00,2015-12-05 06:33:00,7,1,36,3.50,,,0
61593,20580,83422,131701,2,26,203,2,2016-01-04 21:00:00,2015-12-05 06:33:00,1,0,30,3.50,,,0
61594,20491,83416,131689,2,26,203,2,2016-01-04 15:00:00,2015-12-05 06:33:00,1,0,30,3.50,,,0


In [125]:
customer_conversion['trial_issue'] = trial_class_completed.groupby('user_id')['has_feedback'].transform('sum')
customer_conversion

Unnamed: 0,user_id,cc_id,num_cc,channel,trial_money,trial_nums,trial_price,teacher_id,teacher_type,is_weekend,num_days_reserve,student_evaluation,has_comment,trial_issue
0,16448486,100.0,4.0,97.0,0.0,1,0.0,677,2,1,11.0,13.0,0,0
1,16531800,650.0,2.0,2045.0,0.0,1,0.0,1241,2,0,7.0,10.0,0,0
2,16532204,100.0,2.0,97.0,0.0,1,0.0,911,2,0,8.0,7.0,0,0
3,131946,0.0,0.0,242.0,0.0,1,0.0,298,2,0,9.0,3.5,0,0
4,135946,0.0,0.0,0.0,0.0,2,0.0,123,5,0,8.0,3.5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59572,16571299,490.0,0.0,1662.0,0.0,1,0.0,1904,2,0,10.0,15.0,0,0
59573,16573426,490.0,0.0,1662.0,0.0,1,0.0,1893,2,0,14.0,3.8,0,0
59574,16557658,647.0,2.0,1662.0,0.0,1,0.0,1638,2,0,8.0,9.0,0,0
59575,16558135,647.0,3.0,1662.0,0.0,1,0.0,432,2,1,8.0,9.0,0,0


In [126]:
customer_conversion['trial_issue'].value_counts()

0    59240
1      335
2        2
Name: trial_issue, dtype: int64

#### Dependent Variable: if_converted

In [128]:
# Get the product type list
product_list = order[['product_id', 'product_name']].drop_duplicates()

# Sort the values in ascending order
product_list = product_list.sort_values('product_id')

product_list

Unnamed: 0,product_id,product_name
144789,0.0,
2,17.0,海螺币
19447,17.0,体验卡
0,21.0,菲律宾外教
1834,21.0,菲律宾外教次卡
1776,23.0,专业中教
1838,23.0,专业中教次卡
1862,24.0,测评卡
1,25.0,欧美外教次卡
106075,25.0,欧美外交次卡


In [129]:
official_order = order[order['product_id'].isin([21,23,25,27,28,32,33,34,35,38,42,62,64,66])]

In [130]:
buy_type_list = official_order['buy_type'].unique().tolist()
attribute_buy_type = attribute[attribute['attri_id'].isin(buy_type_list)]
print(attribute_buy_type.to_string())

      attri_id         attri_name                    attri_name_en     pid
3            4           销售/客服/采购  Sales/Customer Service/Purchase     0.0
98         104              体验课充值                             None     0.0
99         105               升级充值                             None     0.0
100        107               续费充值                             None     0.0
104        111             推荐学员奖励                             None     0.0
111        118         被推荐奖励免费体验课                             None     0.0
112        120               其他奖励                             None     0.0
113        121                 赠送                             None     0.0
119        131                 约课                             None     0.0
188        252               课程补偿                             None     0.0
189        253              有效期延期                             None     0.0
202        267               直接付费                             None     0.0
256        337           

In [131]:
buy_type_new = attribute_buy_type[attribute_buy_type['attri_name'].isin(['升级充值','直接付费','官网自己充值','新签','新签升级',
                                                            '推荐升级','推荐直接付费','新签扩包'])]

In [132]:
order_new = official_order[official_order['buy_type'].isin(buy_type_new['attri_id'])]
order_new

Unnamed: 0,user_id,product_id,product_name,buy_type,add_date,lesson,product_money
0,113040.0,21.0,菲律宾外教,105.0,,16.0,0.0
1,113040.0,25.0,欧美外教次卡,105.0,,14.0,0.0
3,113040.0,21.0,菲律宾外教,105.0,,16.0,0.0
4,113040.0,25.0,欧美外教次卡,105.0,,14.0,0.0
8,113040.0,21.0,菲律宾外教,105.0,1.412956e+09,16.0,0.0
...,...,...,...,...,...,...,...
231036,16582872.0,25.0,欧美外教次卡,710.0,1.690356e+09,100.0,17900.0
231045,16582701.0,25.0,欧美外教次卡,710.0,1.690358e+09,48.0,8999.0
231088,16582687.0,25.0,欧美外教次卡,710.0,1.690371e+09,50.0,8999.0
231144,16582197.0,25.0,欧美外教次卡,710.0,1.690442e+09,32.0,6688.0


In [133]:
order_new = order_new[order_new['product_money']!=0]
order_new = order_new[order_new['lesson']>=5]
order_new = order_new[order_new['user_id'].isin(trial_customer_list)]
order_new

Unnamed: 0,user_id,product_id,product_name,buy_type,add_date,lesson,product_money
5749,131650.0,21.0,菲律宾外教次卡,105.0,1.451713e+09,32.0,2244.0
5770,131648.0,25.0,欧美外教次卡,105.0,1.451875e+09,8.0,1144.0
5845,131698.0,25.0,欧美外教次卡,105.0,1.451976e+09,49.0,7152.0
5855,131665.0,25.0,欧美外教次卡,105.0,1.452059e+09,48.0,7871.0
5860,131732.0,32.0,日语次卡,105.0,1.452064e+09,14.0,2366.0
...,...,...,...,...,...,...,...
230141,16577932.0,25.0,欧美外教次卡,710.0,1.689650e+09,50.0,8999.0
230144,16401660.0,25.0,欧美外教次卡,710.0,1.689651e+09,32.0,7000.0
230207,16559095.0,25.0,欧美外教次卡,710.0,1.689672e+09,68.0,13376.0
230399,16564940.0,25.0,欧美外教次卡,715.0,1.689842e+09,64.0,12736.0


In [134]:
order_new['user_id'].value_counts()

16495885.0    4
16571168.0    4
16510596.0    4
16460003.0    3
160134.0      3
             ..
160076.0      1
16400768.0    1
16400774.0    1
16400752.0    1
16561546.0    1
Name: user_id, Length: 10990, dtype: int64

In [135]:
order_new_sorted = order_new.sort_values(by='add_date', ascending=True)
order_new_first = order_new_sorted.drop_duplicates(subset='user_id', keep='first')
order_new_first

Unnamed: 0,user_id,product_id,product_name,buy_type,add_date,lesson,product_money
5749,131650.0,21.0,菲律宾外教次卡,105.0,1.451713e+09,32.0,2244.0
5770,131648.0,25.0,欧美外教次卡,105.0,1.451875e+09,8.0,1144.0
5845,131698.0,25.0,欧美外教次卡,105.0,1.451976e+09,49.0,7152.0
5855,131665.0,25.0,欧美外教次卡,105.0,1.452059e+09,48.0,7871.0
5860,131732.0,32.0,日语次卡,105.0,1.452064e+09,14.0,2366.0
...,...,...,...,...,...,...,...
230141,16577932.0,25.0,欧美外教次卡,710.0,1.689650e+09,50.0,8999.0
230144,16401660.0,25.0,欧美外教次卡,710.0,1.689651e+09,32.0,7000.0
230207,16559095.0,25.0,欧美外教次卡,710.0,1.689672e+09,68.0,13376.0
230399,16564940.0,25.0,欧美外教次卡,715.0,1.689842e+09,64.0,12736.0


In [136]:
# Convert UNIX timestamps to datetime objects
order_new_first['add_date'] = pd.to_datetime(order_new_first['add_date'], unit='s')

# Set timezone to China timezone
china_tz = pytz.timezone('Asia/Shanghai')
order_new_first['add_date'] = order_new_first['add_date'].dt.tz_localize(pytz.utc).dt.tz_convert(china_tz)

# Extract yyyy-mm-dd format from datetime objects
order_new_first['add_date'] = order_new_first['add_date'].dt.strftime('%Y-%m-%d %H:%M')

order_new_first.rename(columns={'add_date': 'order_date'}, inplace=True)

order_new_first

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  order_new_first['add_date'] = pd.to_datetime(order_new_first['add_date'], unit='s')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  order_new_first['add_date'] = order_new_first['add_date'].dt.tz_localize(pytz.utc).dt.tz_convert(china_tz)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  order_new_firs

Unnamed: 0,user_id,product_id,product_name,buy_type,order_date,lesson,product_money
5749,131650.0,21.0,菲律宾外教次卡,105.0,2016-01-02 13:29,32.0,2244.0
5770,131648.0,25.0,欧美外教次卡,105.0,2016-01-04 10:29,8.0,1144.0
5845,131698.0,25.0,欧美外教次卡,105.0,2016-01-05 14:32,49.0,7152.0
5855,131665.0,25.0,欧美外教次卡,105.0,2016-01-06 13:35,48.0,7871.0
5860,131732.0,32.0,日语次卡,105.0,2016-01-06 14:59,14.0,2366.0
...,...,...,...,...,...,...,...
230141,16577932.0,25.0,欧美外教次卡,710.0,2023-07-18 11:17,50.0,8999.0
230144,16401660.0,25.0,欧美外教次卡,710.0,2023-07-18 11:30,32.0,7000.0
230207,16559095.0,25.0,欧美外教次卡,710.0,2023-07-18 17:19,68.0,13376.0
230399,16564940.0,25.0,欧美外教次卡,715.0,2023-07-20 16:32,64.0,12736.0


In [137]:
trial_class_completed = trial_class_completed.merge(order_new_first[['user_id','order_date']],on='user_id',how='left')
trial_class_completed

Unnamed: 0,class_order_id,class_id,user_id,class_order_status,pay_type,teacher_id,teacher_type,start_time,add_date,w_day,is_weekend,num_days_reserve,student_evaluation,total_point,material,has_feedback,order_date
0,1363249,2934694,16579139,2,26,2024,2,2023-07-27 15:00:00,2023-07-26 14:36:00,4,0,1,12.00,,,0,2023-06-18 21:29
1,1363375,2924756,16569641,2,26,2094,2,2023-07-27 19:00:00,2023-07-16 14:15:00,4,0,11,0.75,,,0,
2,1358789,2924014,16573832,2,26,1357,2,2023-07-16 22:00:00,2023-07-15 12:48:00,7,1,1,0.15,,,0,
3,1363196,2923720,16569615,2,26,1857,2,2023-07-27 13:00:00,2023-07-14 19:04:00,4,0,12,7.00,,,0,
4,1359487,2919542,16576604,2,26,1419,5,2023-07-18 20:00:00,2023-07-10 09:57:00,2,0,8,10.00,,,0,2023-07-17 14:39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61591,20751,83433,131724,2,26,203,2,2016-01-06 19:00:00,2015-12-05 06:33:00,3,0,32,3.50,,,0,
61592,21033,83429,131772,2,26,203,2,2016-01-10 15:00:00,2015-12-05 06:33:00,7,1,36,3.50,,,0,
61593,20580,83422,131701,2,26,203,2,2016-01-04 21:00:00,2015-12-05 06:33:00,1,0,30,3.50,,,0,
61594,20491,83416,131689,2,26,203,2,2016-01-04 15:00:00,2015-12-05 06:33:00,1,0,30,3.50,,,0,


In [138]:
trial_class_completed['order_date'] = pd.to_datetime(trial_class_completed['order_date'])

trial_class_completed['num_days_order']=(trial_class_completed['order_date'] - trial_class_completed['start_time']).dt.days
trial_class_completed['num_days_order'].value_counts()

 0.0       3147
 1.0       1637
 2.0        996
 3.0        652
 4.0        511
           ... 
 1050.0       1
 351.0        1
-81.0         1
-274.0        1
 1380.0       1
Name: num_days_order, Length: 804, dtype: int64

In [139]:
trial_class_completed['within_15days'] = (trial_class_completed['num_days_order'].between(0, 15)).astype(int)
trial_class_completed['within_15days'].value_counts()

0    52834
1     8762
Name: within_15days, dtype: int64

In [140]:
converted_user = trial_class_completed[trial_class_completed['within_15days']==1]
converted_user_list = converted_user['user_id'].unique().tolist()
customer_conversion['if_converted'] = customer_conversion['user_id'].apply(lambda x: 1 if x in converted_user_list else 0)
customer_conversion

Unnamed: 0,user_id,cc_id,num_cc,channel,trial_money,trial_nums,trial_price,teacher_id,teacher_type,is_weekend,num_days_reserve,student_evaluation,has_comment,trial_issue,if_converted
0,16448486,100.0,4.0,97.0,0.0,1,0.0,677,2,1,11.0,13.0,0,0,0
1,16531800,650.0,2.0,2045.0,0.0,1,0.0,1241,2,0,7.0,10.0,0,0,0
2,16532204,100.0,2.0,97.0,0.0,1,0.0,911,2,0,8.0,7.0,0,0,0
3,131946,0.0,0.0,242.0,0.0,1,0.0,298,2,0,9.0,3.5,0,0,0
4,135946,0.0,0.0,0.0,0.0,2,0.0,123,5,0,8.0,3.5,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59572,16571299,490.0,0.0,1662.0,0.0,1,0.0,1904,2,0,10.0,15.0,0,0,0
59573,16573426,490.0,0.0,1662.0,0.0,1,0.0,1893,2,0,14.0,3.8,0,0,0
59574,16557658,647.0,2.0,1662.0,0.0,1,0.0,1638,2,0,8.0,9.0,0,0,0
59575,16558135,647.0,3.0,1662.0,0.0,1,0.0,432,2,1,8.0,9.0,0,0,0


# Finalization

In [141]:
customer_conversion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59577 entries, 0 to 59576
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             59577 non-null  int64  
 1   cc_id               59577 non-null  float64
 2   num_cc              59577 non-null  float64
 3   channel             59577 non-null  float64
 4   trial_money         59577 non-null  float64
 5   trial_nums          59577 non-null  int64  
 6   trial_price         59577 non-null  float64
 7   teacher_id          59577 non-null  int64  
 8   teacher_type        59577 non-null  int64  
 9   is_weekend          59577 non-null  int64  
 10  num_days_reserve    59577 non-null  float64
 11  student_evaluation  59577 non-null  float64
 12  has_comment         59577 non-null  int64  
 13  trial_issue         59577 non-null  int64  
 14  if_converted        59577 non-null  int64  
dtypes: float64(7), int64(8)
memory usage: 7.3 MB


In [142]:
# Save the DataFrame to a CSV file
customer_conversion.to_csv('customer_conversion.csv', index=False)

print("saved to CSV file successfully.")

saved to CSV file successfully.
