In [2]:
import pandas as pd
import mysql.connector
from sqlalchemy import create_engine
import urllib.parse
from datetime import datetime
import pytz
import requests
import numpy as np

# Table of Content
* [Database Connection](#Database-Connection)
* [Data Filtering](#Data-Filtering)
    * [Registration Date](#Registration-Date)
    * [Test Accounts](#Test-Accounts)
    * [Offline Students](#Offline-Students)
    * [Abroad Students](#Abroad-Students)
* [Finalization](#Finalization)

# Database Connection

# Data Filtering

As the study's interest is confined to online customers, member records related to test accounts and offline students need to excluded. Moreover, the analysis is defined by a specific temporal range, from the company's establishment until April 30, 2023. As a result, only members registered within this period are incorporated into the analysis. In addition, students who are abroad is not the company's target customers, and are an extreme minority of the population. Abroad students may represent different demand, yet to avoid unnecessary tilt of the dataset, only domestic students are kept for analysis.

In [4]:
# Extract all the member records from table ehailuo_member
query1 = "SELECT userid as 'user_id', regdate as 'reg_date', fullname as 'full_name', agentid as 'agent_id', is_aboard as 'is_abroad' FROM ehailuo_member"
member_raw = pd.read_sql(query1, engine)
member_raw

Unnamed: 0,user_id,reg_date,full_name,agent_id,is_abroad
0,100001,1399203423,刘佳旭,0.0,0
1,100002,1399204559,,0.0,0
2,100100,1399217597,,0.0,0
3,113040,1397133620,,0.0,0
4,113041,1397633140,,0.0,0
...,...,...,...,...,...
237462,16582789,1689944157,lucy,0.0,0
237463,16582790,1689944224,尹梓安,0.0,0
237464,16582791,1689944488,18102250047,0.0,0
237465,16582792,1689944721,妞妞240,0.0,0


In [5]:
member_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237467 entries, 0 to 237466
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    237467 non-null  int64  
 1   reg_date   237467 non-null  int64  
 2   full_name  235509 non-null  object 
 3   agent_id   237461 non-null  float64
 4   is_abroad  237467 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 9.1+ MB


#### Registration Date

The registration date is stored in UNIX timestamp format, which requires conversion into a more interpretable date-time format.

In [6]:
# Convert UNIX timestamps to datetime objects
member_raw['reg_date'] = pd.to_datetime(member_raw['reg_date'], unit='s')

# Set timezone to China timezone
china_tz = pytz.timezone('Asia/Shanghai')
member_raw['reg_date'] = member_raw['reg_date'].dt.tz_localize(pytz.utc).dt.tz_convert(china_tz)

# Extract yyyy-mm-dd format from datetime objects
member_raw['reg_date'] = member_raw['reg_date'].dt.strftime('%Y-%m-%d %H:%M')

member_raw

Unnamed: 0,user_id,reg_date,full_name,agent_id,is_abroad
0,100001,2014-05-04 19:37,刘佳旭,0.0,0
1,100002,2014-05-04 19:55,,0.0,0
2,100100,2014-05-04 23:33,,0.0,0
3,113040,2014-04-10 20:40,,0.0,0
4,113041,2014-04-16 15:25,,0.0,0
...,...,...,...,...,...
237462,16582789,2023-07-21 20:55,lucy,0.0,0
237463,16582790,2023-07-21 20:57,尹梓安,0.0,0
237464,16582791,2023-07-21 21:01,18102250047,0.0,0
237465,16582792,2023-07-21 21:05,妞妞240,0.0,0


In [7]:
# Filter the members by registration date before May 1st 2023
member_raw = member_raw[(member_raw['reg_date'] < '2023-05-01')&(member_raw['reg_date'] > '2015-12-31')]
member_raw

Unnamed: 0,user_id,reg_date,full_name,agent_id,is_abroad
18593,131638,2015-12-31 12:31,陈伟达/sinokortex,0.0,0
18594,131639,2015-12-31 12:35,林丽俊/兜兜多6暮暮,0.0,0
18595,131640,2015-12-31 13:03,Amy,0.0,0
18596,131641,2015-12-31 13:39,萨格拉斯的疯狂,0.0,0
18597,131642,2015-12-31 15:35,罗艳芝,0.0,0
...,...,...,...,...,...
234285,16579599,2023-04-30 20:57,Zoe,0.0,0
234286,16579600,2023-04-30 21:03,tb5491192489,0.0,0
234287,16579601,2023-04-30 21:24,Coco,0.0,0
234288,16579602,2023-04-30 22:38,Suki,0.0,0


#### Test Accounts

Test accounts can be identified by the column 'full_name'. If the member's 'full_name' contains 'test', '测试', or '内部', then the member is a test or internal account, which need to be dropped.

In [8]:
member_less_test = member_raw[~member_raw['full_name'].str.contains('test|测试|内部', na=False)]
member_less_test

Unnamed: 0,user_id,reg_date,full_name,agent_id,is_abroad
18593,131638,2015-12-31 12:31,陈伟达/sinokortex,0.0,0
18594,131639,2015-12-31 12:35,林丽俊/兜兜多6暮暮,0.0,0
18595,131640,2015-12-31 13:03,Amy,0.0,0
18596,131641,2015-12-31 13:39,萨格拉斯的疯狂,0.0,0
18597,131642,2015-12-31 15:35,罗艳芝,0.0,0
...,...,...,...,...,...
234285,16579599,2023-04-30 20:57,Zoe,0.0,0
234286,16579600,2023-04-30 21:03,tb5491192489,0.0,0
234287,16579601,2023-04-30 21:24,Coco,0.0,0
234288,16579602,2023-04-30 22:38,Suki,0.0,0


#### Offline Students

The column 'agent_id' can be used to identify offline students. If a member's 'agent_id' is 0, it indicates that the member is an online member. On the other hand, if the 'agent_id' has a non-zero value, it signifies that the member is an offline student who has been referred by an agent.

In [9]:
# Find the members whose 'agent_id' is null
null_agent = member_less_test[member_less_test['agent_id'].isnull()]
null_agent

Unnamed: 0,user_id,reg_date,full_name,agent_id,is_abroad
22299,135346,2016-04-25 15:16,冯芊宁,,0
60532,16400076,2018-01-30 09:38,叶雅平,,0
88704,16433346,2018-09-02 12:19,Sunny,,0
214461,16559675,2021-11-23 14:12,菁菁,,0
230274,16575567,2023-01-12 18:00,,,0


Upon closer examination of each member with a null 'agent_id', it has been determined that they are legitimate online members. The presence of null values in the 'agent_id' column may be attributed to a systematic failure. Therefore, the 'agent_id' for these members will be updated to value 0.

In [10]:
member_less_test['agent_id'].replace(np.nan, 0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_less_test['agent_id'].replace(np.nan, 0, inplace=True)


In [11]:
# Extract online students by 'agent_id' equal to 0
online_member = member_less_test[member_less_test['agent_id']==0]
online_member

Unnamed: 0,user_id,reg_date,full_name,agent_id,is_abroad
18593,131638,2015-12-31 12:31,陈伟达/sinokortex,0.0,0
18594,131639,2015-12-31 12:35,林丽俊/兜兜多6暮暮,0.0,0
18595,131640,2015-12-31 13:03,Amy,0.0,0
18596,131641,2015-12-31 13:39,萨格拉斯的疯狂,0.0,0
18597,131642,2015-12-31 15:35,罗艳芝,0.0,0
...,...,...,...,...,...
234285,16579599,2023-04-30 20:57,Zoe,0.0,0
234286,16579600,2023-04-30 21:03,tb5491192489,0.0,0
234287,16579601,2023-04-30 21:24,Coco,0.0,0
234288,16579602,2023-04-30 22:38,Suki,0.0,0


#### Abroad Students

In [12]:
domestic_member = online_member[online_member['is_abroad']==0]
domestic_member

Unnamed: 0,user_id,reg_date,full_name,agent_id,is_abroad
18593,131638,2015-12-31 12:31,陈伟达/sinokortex,0.0,0
18594,131639,2015-12-31 12:35,林丽俊/兜兜多6暮暮,0.0,0
18595,131640,2015-12-31 13:03,Amy,0.0,0
18596,131641,2015-12-31 13:39,萨格拉斯的疯狂,0.0,0
18597,131642,2015-12-31 15:35,罗艳芝,0.0,0
...,...,...,...,...,...
234285,16579599,2023-04-30 20:57,Zoe,0.0,0
234286,16579600,2023-04-30 21:03,tb5491192489,0.0,0
234287,16579601,2023-04-30 21:24,Coco,0.0,0
234288,16579602,2023-04-30 22:38,Suki,0.0,0


# Finalization

Save the user_id of the cleaned members to a CSV file named 'online_member_list' for further data manipulation purposes.

In [13]:
domestic_member = domestic_member['user_id']

# Save the DataFrame to a CSV file
domestic_member.to_csv('cleaned_member.csv', index=False)

print("saved to CSV file successfully.")

saved to CSV file successfully.
