In [1]:
import pandas as pd
import mysql.connector
from sqlalchemy import create_engine
import urllib.parse
from datetime import datetime
import pytz
import requests
import numpy as np

# Table of Content
* [Data Extraction](#Data-Extraction)
* [Data Filtering](#Data-Filtering)
* [Member Demographics](#Member-Demographics)
    * [Feature 1: age](#Feature-1:-age)
    * [Feature 2: gender](#Feature-2:-gender)
    * [Feature 3: job](#Feature-3:-job)
    * [Feature 4: after_policy](#Feature-4:-after_policy)
* [Finalization](#Finalization)



# Data Extraction

In [3]:
# Fetch data from MySQL (attribute_info)
query1 = "SELECT attriid as 'attri_id', attriname as 'attri_name', attriname_en as 'attri_name_en' FROM ehailuo_attribute_info_glv"
attribute = pd.read_sql(query1, engine)
attribute

Unnamed: 0,attri_id,attri_name,attri_name_en
0,1,小学生,Elementary Students
1,2,中学生,Junior/Senior Students
2,3,大学生,Undergraduates
3,4,销售/客服/采购,Sales/Customer Service/Purchase
4,5,IT/通信/电子,IT/Communication/E-commerce
...,...,...,...
2752,3187,抖音直播,
2753,3188,CR30天停课学员回访,
2754,3189,青少儿应试,Young Learners Test Preparation
2755,3190,小红书投放,


In [4]:
# Fetch data from MySQL (member)
query2 = "SELECT userid as 'user_id', regdate as 'reg_date', product_want FROM ehailuo_member"
member_raw = pd.read_sql(query2, engine)
member_raw

Unnamed: 0,user_id,reg_date,product_want
0,100001,1399203423,567.0
1,100002,1399204559,567.0
2,100100,1399217597,567.0
3,113040,1397133620,568.0
4,113041,1397633140,567.0
...,...,...,...
237804,16583131,1690460914,1035.0
237805,16583132,1690460921,1035.0
237806,16583133,1690461000,1035.0
237807,16583134,1690461094,1035.0


In [5]:
# Fetch data from MySQL (member detail)
query3 = "SELECT userid as 'user_id', gender, age, job FROM ehailuo_member_detail"
member_detail_raw = pd.read_sql(query3, engine)
member_detail_raw

Unnamed: 0,user_id,gender,age,job
0,100001,1,413.0,0
1,113040,1,0.0,0
2,113043,0,418.0,0
3,113050,0,0.0,0
4,113086,0,0.0,0
...,...,...,...,...
242720,16583131,0,1504.0,0
242721,16583132,1,412.0,1
242722,16583133,0,1503.0,0
242723,16583134,0,1503.0,0


In [6]:
# Read the CSV file into a DataFrame
cleaned_member = pd.read_csv('cleaned_member.csv')
cleaned_member

Unnamed: 0,user_id
0,131638
1,131639
2,131640
3,131641
4,131642
...,...
158085,16579599
158086,16579600
158087,16579601
158088,16579602


# Data Filtering

Filter the data according to the cleaned member user_id.

In [7]:
# Get a list of user_id of online members
cleaned_member_list = cleaned_member['user_id'].tolist()

In [8]:
# Filter the table according to the online_member_list
member_demographics_raw = member_raw.merge(member_detail_raw, on='user_id',how='inner')
member_demographics = member_demographics_raw[member_demographics_raw['user_id'].isin(cleaned_member_list)]
member_demographics

Unnamed: 0,user_id,reg_date,product_want,gender,age,job
18541,131638,1451536291,569.0,1,420.0,0
18542,131639,1451536531,567.0,0,417.0,0
18543,131640,1451538181,567.0,0,413.0,0
18544,131641,1451540380,567.0,0,416.0,0
18545,131642,1451547330,567.0,0,420.0,0
...,...,...,...,...,...,...
234233,16579599,1682859465,1035.0,0,412.0,16
234234,16579600,1682859819,567.0,0,417.0,16
234235,16579601,1682861069,1035.0,0,412.0,1
234236,16579602,1682865539,567.0,0,419.0,16


In [9]:
member_demographics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158090 entries, 18541 to 234237
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   user_id       158090 non-null  int64  
 1   reg_date      158090 non-null  int64  
 2   product_want  152755 non-null  float64
 3   gender        158090 non-null  int64  
 4   age           139397 non-null  float64
 5   job           158090 non-null  int64  
dtypes: float64(2), int64(4)
memory usage: 8.4 MB


# Member Demgraphics

#### Feature 1: age

In [10]:
member_demographics['age'].value_counts()

0.0       58411
418.0     19347
416.0     16074
417.0     12900
412.0      9084
419.0      7091
420.0      6332
411.0      1974
1004.0     1934
1504.0     1726
413.0      1624
414.0      1522
1503.0      646
1505.0      309
1002.0      219
1507.0       76
1003.0       68
1506.0       58
415.0         2
Name: age, dtype: int64

In [11]:
# find the attribute name for age related attribute id
age_list = member_demographics['age'].unique().tolist()
attribute_age = attribute[attribute['attri_id'].isin(age_list)]
attribute_age

Unnamed: 0,attri_id,attri_name,attri_name_en
324,411,学前,Pre-school
325,412,小学,Primary school
326,413,初中,Middle school
327,414,高中,High school
328,416,大学,University / College
329,417,工作不足1年,Have been working for less than 1 year
330,418,工作1-3年,Have been working for 1-3 years
331,419,工作3-5年,Have been working for 3-5 years
332,420,工作5到10年,Have been working for 5-10 years
779,1002,硕士,


In [12]:
# Replace 0 values with null, as no corresponding attribute name existed
member_demographics['age'].replace(0, np.nan, inplace=True)

# Replace 415 values with null, as no corresponding attribute name existed
member_demographics['age'].replace(415,np.nan, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demographics['age'].replace(0, np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demographics['age'].replace(415,np.nan, inplace=True)


In [13]:
# As there are mutiple attribute ids for the same age group, so replace the repeated attribute id
member_demographics['age'] = member_demographics['age'].replace({1503: 411, 1504.0: 412, 1505: 413, 
                                                                           1506.0: 414})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demographics['age'] = member_demographics['age'].replace({1503: 411, 1504.0: 412, 1505: 413,


In [14]:
# get the non-repeated attribute list for age
age_list_edited = member_demographics['age'].unique().tolist()
attribute_age_edited = attribute[attribute['attri_id'].isin(age_list_edited)]
attribute_age_edited

Unnamed: 0,attri_id,attri_name,attri_name_en
324,411,学前,Pre-school
325,412,小学,Primary school
326,413,初中,Middle school
327,414,高中,High school
328,416,大学,University / College
329,417,工作不足1年,Have been working for less than 1 year
330,418,工作1-3年,Have been working for 1-3 years
331,419,工作3-5年,Have been working for 3-5 years
332,420,工作5到10年,Have been working for 5-10 years
779,1002,硕士,


In [15]:
# Determine if the customer is child based only on age

# Define the conditions
conditions = [
    (member_demographics['age'].isin([411, 412, 413, 414])),  # If age is 411, 412, 413, or 414, then the customer is under 18 years old
    (member_demographics['age'].isnull())  # If age is null
]

# Define the corresponding values
values = [1, np.nan]  # Assign 1 for the first condition, and null for the second condition

# Apply the conditions and values to create the 'age_child' column
member_demographics['age_child'] = np.select(conditions, values, default=0)

member_demographics

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demographics['age_child'] = np.select(conditions, values, default=0)


Unnamed: 0,user_id,reg_date,product_want,gender,age,job,age_child
18541,131638,1451536291,569.0,1,420.0,0,0.0
18542,131639,1451536531,567.0,0,417.0,0,0.0
18543,131640,1451538181,567.0,0,413.0,0,1.0
18544,131641,1451540380,567.0,0,416.0,0,0.0
18545,131642,1451547330,567.0,0,420.0,0,0.0
...,...,...,...,...,...,...,...
234233,16579599,1682859465,1035.0,0,412.0,16,1.0
234234,16579600,1682859819,567.0,0,417.0,16,0.0
234235,16579601,1682861069,1035.0,0,412.0,1,1.0
234236,16579602,1682865539,567.0,0,419.0,16,0.0


In [16]:
member_demographics['product_want'].value_counts()

567.0     63822
1035.0    63733
568.0      9071
569.0      6309
570.0      5406
571.0      3994
999.0       401
1000.0       12
1001.0        7
Name: product_want, dtype: int64

In [17]:
# find the attribute name for product_want
product_type_list = member_demographics['product_want'].unique().tolist()
attribute_product_type = attribute[attribute['attri_id'].isin(product_type_list)]
attribute_product_type

Unnamed: 0,attri_id,attri_name,attri_name_en
454,567,日常旅游,
455,568,青少儿,
456,569,商务面试外贸,
457,570,应试课程,
458,571,日语,
776,999,行业英语,
777,1000,西班牙语,
778,1001,德语,
806,1035,青少儿2（不固定老师）,


In [18]:
# Determine if the customer is child based only on product

# Define the conditions
conditions = [
    (member_demographics['product_want'].isin([568,1035])),  # If product_want is 568 or 1035
    (member_demographics['product_want'].isnull())  # If product_want is null
]

# Define the corresponding values
values = [1, np.nan]  # Assign 1 for the first condition, and null for the second condition

# Apply the conditions and values to create the 'age_child' column
member_demographics['product_child'] = np.select(conditions, values, default=0)

member_demographics

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demographics['product_child'] = np.select(conditions, values, default=0)


Unnamed: 0,user_id,reg_date,product_want,gender,age,job,age_child,product_child
18541,131638,1451536291,569.0,1,420.0,0,0.0,0.0
18542,131639,1451536531,567.0,0,417.0,0,0.0,0.0
18543,131640,1451538181,567.0,0,413.0,0,1.0,0.0
18544,131641,1451540380,567.0,0,416.0,0,0.0,0.0
18545,131642,1451547330,567.0,0,420.0,0,0.0,0.0
...,...,...,...,...,...,...,...,...
234233,16579599,1682859465,1035.0,0,412.0,16,1.0,1.0
234234,16579600,1682859819,567.0,0,417.0,16,0.0,0.0
234235,16579601,1682861069,1035.0,0,412.0,1,1.0,1.0
234236,16579602,1682865539,567.0,0,419.0,16,0.0,0.0


In [19]:
# Define the conditions
conditions = [
    ((member_demographics['age_child'] == 0) & (member_demographics['product_child'] == 1)),  # age_child is 0, product_child is 1
    ((member_demographics['age_child'] == 1) & (member_demographics['product_child'] == 0)),  # age_child is 1, product_child is 0
    ((member_demographics['age_child'] == 0) & (member_demographics['product_child'] == 0)),  # both age_child and product_child are 0
    ((member_demographics['age_child'] == 1) & (member_demographics['product_child'] == 1)),  # both age_child and product_child are 1
    ((member_demographics['age_child'].isnull()) & (member_demographics['product_child'].notnull())),  # age_child is null, product_child is not null
    ((member_demographics['age_child'].notnull()) & (member_demographics['product_child'].isnull())),  # age_child is not null, product_child is null
    ((member_demographics['age_child'].isnull()) & (member_demographics['product_child'].isnull()))  # both age_child and product_child are null
]

# Define the corresponding values
values = [1, 1, 0, 1, member_demographics['product_child'], member_demographics['age_child'], np.nan]

# Apply the conditions and values to create the 'is_child' column
member_demographics['is_child'] = np.select(conditions, values, default=np.nan)
member_demographics

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demographics['is_child'] = np.select(conditions, values, default=np.nan)


Unnamed: 0,user_id,reg_date,product_want,gender,age,job,age_child,product_child,is_child
18541,131638,1451536291,569.0,1,420.0,0,0.0,0.0,0.0
18542,131639,1451536531,567.0,0,417.0,0,0.0,0.0,0.0
18543,131640,1451538181,567.0,0,413.0,0,1.0,0.0,1.0
18544,131641,1451540380,567.0,0,416.0,0,0.0,0.0,0.0
18545,131642,1451547330,567.0,0,420.0,0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
234233,16579599,1682859465,1035.0,0,412.0,16,1.0,1.0,1.0
234234,16579600,1682859819,567.0,0,417.0,16,0.0,0.0,0.0
234235,16579601,1682861069,1035.0,0,412.0,1,1.0,1.0,1.0
234236,16579602,1682865539,567.0,0,419.0,16,0.0,0.0,0.0


In [20]:
member_demographics['is_child'].info()

<class 'pandas.core.series.Series'>
Int64Index: 158090 entries, 18541 to 234237
Series name: is_child
Non-Null Count   Dtype  
--------------   -----  
154375 non-null  float64
dtypes: float64(1)
memory usage: 2.4 MB


Outliers can be identified if the age is not child, yet the product is child. The probable reason for this could be the CC records the age of the parents, and the actual student is the child. For consistency reason, the age should record the student's age, so the value of parent's age should be avoided. Therefore, change its value to null.

In [22]:
# Identify the outliers and change its value to null
member_demographics.loc[(member_demographics['age_child'] == 0) & (member_demographics['product_child'] == 1), 'age'] = np.nan
member_demographics

Unnamed: 0,user_id,reg_date,product_want,gender,age,job,age_child,product_child,is_child
18541,131638,1451536291,569.0,1,420.0,0,0.0,0.0,0.0
18542,131639,1451536531,567.0,0,417.0,0,0.0,0.0,0.0
18543,131640,1451538181,567.0,0,413.0,0,1.0,0.0,1.0
18544,131641,1451540380,567.0,0,416.0,0,0.0,0.0,0.0
18545,131642,1451547330,567.0,0,420.0,0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
234233,16579599,1682859465,1035.0,0,412.0,16,1.0,1.0,1.0
234234,16579600,1682859819,567.0,0,417.0,16,0.0,0.0,0.0
234235,16579601,1682861069,1035.0,0,412.0,1,1.0,1.0,1.0
234236,16579602,1682865539,567.0,0,419.0,16,0.0,0.0,0.0


In [23]:
member_demographics['age'].replace(np.nan, 0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demographics['age'].replace(np.nan, 0, inplace=True)


#### Feature 2: gender

The gender for child student is not valuable for analysis. For child student, the gender will be changed to 2.

In [24]:
# Check the condition and update the gender column
member_demographics.loc[member_demographics['is_child'] == 1, 'gender'] = 2
member_demographics['gender'].value_counts()

2    76554
0    68266
1    13270
Name: gender, dtype: int64

#### Feature 3: job

In [25]:
member_demographics['job'].value_counts()

0      79691
16     55440
3       9021
1       7593
2       2199
4        852
13       606
5        393
11       364
8        329
14       312
7        277
15       256
9        225
12       193
6        190
10       147
255        2
Name: job, dtype: int64

In [26]:
job_list = member_demographics['job'].unique().tolist()
attribute_job = attribute[attribute['attri_id'].isin(job_list)]
attribute_job

Unnamed: 0,attri_id,attri_name,attri_name_en
0,1,小学生,Elementary Students
1,2,中学生,Junior/Senior Students
2,3,大学生,Undergraduates
3,4,销售/客服/采购,Sales/Customer Service/Purchase
4,5,IT/通信/电子,IT/Communication/E-commerce
5,6,房产/建筑建设/物业,Real Estate/Construction/Management Agency
6,7,财会/金融,Accounting/Finance
7,8,汽车/工程机械,Auto Industry/Engineering Machinery
8,9,消费品/生产/物流,Consumer Goods/Production/Physical Distribution
9,10,市场/媒介/设计,Market/Intermediary/Design


In [27]:
# Replace the non-meaningful values to 0 
member_demographics['job'].replace(np.nan, 0, inplace=True)
member_demographics['job'].replace(255, 0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demographics['job'].replace(np.nan, 0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demographics['job'].replace(255, 0, inplace=True)


#### Feature 4: after_policy

In [28]:
# Convert UNIX timestamps to datetime objects
member_demographics['reg_date'] = pd.to_datetime(member_demographics['reg_date'], unit='s')

# Set timezone to China timezone
china_tz = pytz.timezone('Asia/Shanghai')
member_demographics['reg_date'] = member_demographics['reg_date'].dt.tz_localize(pytz.utc).dt.tz_convert(china_tz)

# Extract yyyy-mm-dd format from datetime objects
member_demographics['reg_date'] = member_demographics['reg_date'].dt.strftime('%Y-%m-%d %H:%M')

member_demographics

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demographics['reg_date'] = pd.to_datetime(member_demographics['reg_date'], unit='s')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demographics['reg_date'] = member_demographics['reg_date'].dt.tz_localize(pytz.utc).dt.tz_convert(china_tz)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,user_id,reg_date,product_want,gender,age,job,age_child,product_child,is_child
18541,131638,2015-12-31 12:31,569.0,1,420.0,0,0.0,0.0,0.0
18542,131639,2015-12-31 12:35,567.0,0,417.0,0,0.0,0.0,0.0
18543,131640,2015-12-31 13:03,567.0,2,413.0,0,1.0,0.0,1.0
18544,131641,2015-12-31 13:39,567.0,0,416.0,0,0.0,0.0,0.0
18545,131642,2015-12-31 15:35,567.0,0,420.0,0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
234233,16579599,2023-04-30 20:57,1035.0,2,412.0,16,1.0,1.0,1.0
234234,16579600,2023-04-30 21:03,567.0,0,417.0,16,0.0,0.0,0.0
234235,16579601,2023-04-30 21:24,1035.0,2,412.0,1,1.0,1.0,1.0
234236,16579602,2023-04-30 22:38,567.0,0,419.0,16,0.0,0.0,0.0


In [29]:
# Create the 'after_policy' column
member_demographics['after_policy'] = (member_demographics['reg_date'] > '2021-07-24').astype(int)
member_demographics

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demographics['after_policy'] = (member_demographics['reg_date'] > '2021-07-24').astype(int)


Unnamed: 0,user_id,reg_date,product_want,gender,age,job,age_child,product_child,is_child,after_policy
18541,131638,2015-12-31 12:31,569.0,1,420.0,0,0.0,0.0,0.0,0
18542,131639,2015-12-31 12:35,567.0,0,417.0,0,0.0,0.0,0.0,0
18543,131640,2015-12-31 13:03,567.0,2,413.0,0,1.0,0.0,1.0,0
18544,131641,2015-12-31 13:39,567.0,0,416.0,0,0.0,0.0,0.0,0
18545,131642,2015-12-31 15:35,567.0,0,420.0,0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...
234233,16579599,2023-04-30 20:57,1035.0,2,412.0,16,1.0,1.0,1.0,1
234234,16579600,2023-04-30 21:03,567.0,0,417.0,16,0.0,0.0,0.0,1
234235,16579601,2023-04-30 21:24,1035.0,2,412.0,1,1.0,1.0,1.0,1
234236,16579602,2023-04-30 22:38,567.0,0,419.0,16,0.0,0.0,0.0,1


# Finalization

In [30]:
# Select the desired columns
selected_columns = ['user_id','age','gender','job','after_policy']

member_demographics = member_demographics[selected_columns]
member_demographics

Unnamed: 0,user_id,age,gender,job,after_policy
18541,131638,420.0,1,0,0
18542,131639,417.0,0,0,0
18543,131640,413.0,2,0,0
18544,131641,416.0,0,0,0
18545,131642,420.0,0,0,0
...,...,...,...,...,...
234233,16579599,412.0,2,16,1
234234,16579600,417.0,0,16,1
234235,16579601,412.0,2,1,1
234236,16579602,419.0,0,16,1


In [31]:
member_demographics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158090 entries, 18541 to 234237
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   user_id       158090 non-null  int64  
 1   age           158090 non-null  float64
 2   gender        158090 non-null  int64  
 3   job           158090 non-null  int64  
 4   after_policy  158090 non-null  int64  
dtypes: float64(1), int64(4)
memory usage: 7.2 MB


In [32]:
# Save the DataFrame to a CSV file
member_demographics.to_csv('member_demographics.csv', index=False)

print("saved to CSV file successfully.")

saved to CSV file successfully.
