In [1]:
import pandas as pd
import mysql.connector
from sqlalchemy import create_engine
import urllib.parse
from datetime import datetime
import pytz
import requests
import numpy as np

# Table of Content
* [Data Extraction](#Data-Extraction)
* [Data Filtering](#Data-Filtering)
* [Member Demand](#Member-Demand)
    * [Feature 1: product_want](#Feature-1:-product_want)
    * [Feature 2: study_level](#Feature-2:-study_level)
    * [Feature 3: study_from](#Feature-3:-study_from)
    * [Feature 4: want_to_study](#Feature-4:-want_to_study)
    * [Feature 5: study_time](#Feature-5:-study_time)
    * [Feature 6: study_budget](#Feature-6:-study_budget)
    * [Feature 7: purpose](#Feature-7:-purpose)
* [Finalization](#Finalization)

# Data Extraction

In [3]:
# Fetch data from MySQL (attribute_info)
query1 = "SELECT attriid as 'attri_id', attriname as 'attri_name', attriname_en as 'attri_name_en' FROM ehailuo_attribute_info_glv"
attribute = pd.read_sql(query1, engine)
attribute

Unnamed: 0,attri_id,attri_name,attri_name_en
0,1,小学生,Elementary Students
1,2,中学生,Junior/Senior Students
2,3,大学生,Undergraduates
3,4,销售/客服/采购,Sales/Customer Service/Purchase
4,5,IT/通信/电子,IT/Communication/E-commerce
...,...,...,...
2752,3187,抖音直播,
2753,3188,CR30天停课学员回访,
2754,3189,青少儿应试,Young Learners Test Preparation
2755,3190,小红书投放,


In [4]:
# Fetch data from MySQL (attribute_info)
query2 = "SELECT userid as 'user_id', product_want FROM ehailuo_member"
product_want = pd.read_sql(query2, engine)
product_want

Unnamed: 0,user_id,product_want
0,100001,567.0
1,100002,567.0
2,100100,567.0
3,113040,568.0
4,113041,567.0
...,...,...
237804,16583131,1035.0
237805,16583132,1035.0
237806,16583133,1035.0
237807,16583134,1035.0


In [5]:
# Fetch data from MySQL (attribute_info)
query3 = "SELECT userid as 'user_id', studylevel as study_level, studyfrom as study_from, studyhigh as want_to_study, studyshijian as study_time, studymoney as study_budget, purpose FROM ehailuo_member_detail"
demand_detail = pd.read_sql(query3, engine)
demand_detail

Unnamed: 0,user_id,study_level,study_from,want_to_study,study_time,study_budget,purpose
0,100001,0.0,,,544.0,475.0,0.0
1,113040,0.0,,,0.0,0.0,0.0
2,113043,422.0,433434,439,0.0,0.0,21.0
3,113050,0.0,431436,,471.0,476.0,0.0
4,113086,0.0,,,0.0,0.0,0.0
...,...,...,...,...,...,...,...
242720,16583131,,,,,,0.0
242721,16583132,428.0,429,438439,544.0,545.0,23.0
242722,16583133,,,,,,0.0
242723,16583134,,,,,,0.0


In [6]:
# Read the CSV file into a DataFrame
member_demographics = pd.read_csv('member_demographics.csv')
member_demographics

Unnamed: 0,user_id,age,gender,job,after_policy
0,131638,420.0,1,0,0
1,131639,417.0,0,0,0
2,131640,413.0,2,0,0
3,131641,416.0,0,0,0
4,131642,420.0,0,0,0
...,...,...,...,...,...
158085,16579599,412.0,2,16,1
158086,16579600,417.0,0,16,1
158087,16579601,412.0,2,1,1
158088,16579602,419.0,0,16,1


# Data Filtering

Filter the data according to the user_id of members in the member_demographics table.

In [7]:
# Get a list of user_id of members for analysis
member_analysis_list = member_demographics['user_id'].tolist()

In [8]:
# Filter the table according to the member_analysis_list
member_demand_raw = product_want.merge(demand_detail, on='user_id',how='inner')
member_demand = member_demand_raw[member_demand_raw['user_id'].isin(member_analysis_list)]
member_demand

Unnamed: 0,user_id,product_want,study_level,study_from,want_to_study,study_time,study_budget,purpose
18541,131638,569.0,421.0,,438439,0.0,0.0,0.0
18542,131639,567.0,421.0,,438439,0.0,0.0,0.0
18543,131640,567.0,421.0,,,0.0,0.0,0.0
18544,131641,567.0,421.0,,,0.0,0.0,0.0
18545,131642,567.0,421.0,,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
234233,16579599,1035.0,428.0,,438439440441,544.0,545.0,31.0
234234,16579600,567.0,428.0,,438439440441,544.0,545.0,31.0
234235,16579601,1035.0,428.0,,438439,544.0,545.0,23.0
234236,16579602,567.0,428.0,,438439440441,544.0,545.0,31.0


In [9]:
member_demand.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158090 entries, 18541 to 234237
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   user_id        158090 non-null  int64  
 1   product_want   152755 non-null  float64
 2   study_level    138860 non-null  float64
 3   study_from     59794 non-null   object 
 4   want_to_study  68072 non-null   object 
 5   study_time     138860 non-null  float64
 6   study_budget   138860 non-null  float64
 7   purpose        154031 non-null  float64
dtypes: float64(5), int64(1), object(2)
memory usage: 10.9+ MB


# Member Demand

#### Feature 1: product_want

In [10]:
member_demand['product_want'].value_counts()

567.0     63822
1035.0    63733
568.0      9071
569.0      6309
570.0      5406
571.0      3994
999.0       401
1000.0       12
1001.0        7
Name: product_want, dtype: int64

In [11]:
# find the attribute name for product_want
product_type_list = member_demand['product_want'].unique().tolist()
attribute_product_type = attribute[attribute['attri_id'].isin(product_type_list)]
attribute_product_type

Unnamed: 0,attri_id,attri_name,attri_name_en
454,567,日常旅游,
455,568,青少儿,
456,569,商务面试外贸,
457,570,应试课程,
458,571,日语,
776,999,行业英语,
777,1000,西班牙语,
778,1001,德语,
806,1035,青少儿2（不固定老师）,


No inconsistency in attribute for product_want, thus only convert the null values to 0

In [12]:
# change null value to 0, (as categorical value, 0 represents missing)
member_demand['product_want'].replace(np.nan, 0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['product_want'].replace(np.nan, 0, inplace=True)


#### Feature 2: study_level

In [13]:
member_demand['study_level'].value_counts()

428.0     61609
0.0       56763
421.0      7400
423.0      4878
422.0      4740
426.0      2013
427.0       492
425.0       371
424.0       320
1533.0      114
1532.0       71
1531.0       54
1530.0       35
Name: study_level, dtype: int64

In [14]:
# find the attribute name for study_level
english_level = member_demand['study_level'].unique().tolist()
attribute_english_level = attribute[attribute['attri_id'].isin(english_level)]
attribute_english_level

Unnamed: 0,attri_id,attri_name,attri_name_en
333,421,零基础,Zero base
334,422,四级,CET 4
335,423,六级,CET 6
336,424,专四,TEM 4
337,425,专八,TEM 8
338,426,雅思,IELTS
339,427,托福,TOEFL
340,428,其他,Others
1190,1530,完全零基础,Complete zero basis
1191,1531,认识26个英文字母和少许单词,Know 26 English letters and a few words


In [15]:
# change unmeaningful value to 0, (as categorical value, 0 represents missing)
member_demand['study_level'].replace(np.nan, 0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['study_level'].replace(np.nan, 0, inplace=True)


#### Feature 3: study_from

Value for 'study_from' could have multiple entries. Therefore, each attribute for 'study_from' is changed to a dummy to avoid multiple entries.

In [16]:
# Change empty value to None
member_demand['study_from'] = member_demand['study_from'].replace('', None)
member_demand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['study_from'] = member_demand['study_from'].replace('', None)


Unnamed: 0,user_id,product_want,study_level,study_from,want_to_study,study_time,study_budget,purpose
18541,131638,569.0,421.0,,438439,0.0,0.0,0.0
18542,131639,567.0,421.0,,438439,0.0,0.0,0.0
18543,131640,567.0,421.0,,,0.0,0.0,0.0
18544,131641,567.0,421.0,,,0.0,0.0,0.0
18545,131642,567.0,421.0,,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
234233,16579599,1035.0,428.0,,438439440441,544.0,545.0,31.0
234234,16579600,567.0,428.0,,438439440441,544.0,545.0,31.0
234235,16579601,1035.0,428.0,,438439,544.0,545.0,23.0
234236,16579602,567.0,428.0,,438439440441,544.0,545.0,31.0


In [17]:
# Convert the column values to strings 
member_demand['study_from'] = member_demand['study_from'].astype(str).replace('None', '')

# Split the strings on commas and flatten the lists
background_list = member_demand['study_from'].str.split(',').explode()

# Remove any empty strings from the list
background_list = background_list[background_list != '']

# Convert the values to float
background_list = background_list.astype(float)

# Convert the list to a set to remove duplicates
background_unique = set(background_list)

# Convert the set back to a list
background_unique_list = list(background_unique)
background_unique_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['study_from'] = member_demand['study_from'].astype(str).replace('None', '')


[429.0, 430.0, 431.0, 432.0, 433.0, 434.0, 435.0, 436.0, 437.0]

In [18]:
# find the attribute name for study_from
attribute_background = attribute[attribute['attri_id'].isin(background_unique_list)]
attribute_background

Unnamed: 0,attri_id,attri_name,attri_name_en
341,429,线下培训学校,Study in language school
342,430,在线课程,Have classes online
343,431,手机App,Use app
344,432,英文电影,English movies
345,433,英文读物,Read English book
346,434,与外国人交谈,Talk with foreigners
347,435,工作中,In working
348,436,社交网站,Read Paragraphs on SNS
349,437,其他,Others


In [19]:
# Create the 'learnt_offline' column
member_demand['learnt_offline'] = member_demand['study_from'].str.contains('429').astype(int)

# Create the 'learnt_online' column
member_demand['learnt_online'] = member_demand['study_from'].str.contains('430').astype(int)

# Create the 'learnt_app' column
member_demand['learnt_app'] = member_demand['study_from'].str.contains('431').astype(int)

# Create the 'learnt_movie' column
member_demand['learnt_movie'] = member_demand['study_from'].str.contains('432').astype(int)

# Create the 'learnt_book' column
member_demand['learnt_book'] = member_demand['study_from'].str.contains('433').astype(int)

# Create the 'learnt_foreigner' column
member_demand['learnt_foreigner'] = member_demand['study_from'].str.contains('434').astype(int)

# Create the 'learnt_work' column
member_demand['learnt_work'] = member_demand['study_from'].str.contains('435').astype(int)

# Create the 'learnt_media' column
member_demand['learnt_media'] = member_demand['study_from'].str.contains('436').astype(int)


# Create the 'learnt_other' column
member_demand['learnt_other'] = member_demand['study_from'].str.contains('437').astype(int)

member_demand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['learnt_offline'] = member_demand['study_from'].str.contains('429').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['learnt_online'] = member_demand['study_from'].str.contains('430').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['lea

Unnamed: 0,user_id,product_want,study_level,study_from,want_to_study,study_time,study_budget,purpose,learnt_offline,learnt_online,learnt_app,learnt_movie,learnt_book,learnt_foreigner,learnt_work,learnt_media,learnt_other
18541,131638,569.0,421.0,,438439,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0
18542,131639,567.0,421.0,,438439,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0
18543,131640,567.0,421.0,,,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0
18544,131641,567.0,421.0,,,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0
18545,131642,567.0,421.0,,,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234233,16579599,1035.0,428.0,,438439440441,544.0,545.0,31.0,0,0,0,0,0,0,0,0,0
234234,16579600,567.0,428.0,,438439440441,544.0,545.0,31.0,0,0,0,0,0,0,0,0,0
234235,16579601,1035.0,428.0,,438439,544.0,545.0,23.0,0,0,0,0,0,0,0,0,0
234236,16579602,567.0,428.0,,438439440441,544.0,545.0,31.0,0,0,0,0,0,0,0,0,0


#### Feature 4: want_to_study

In [20]:
# Change empty value to None
member_demand['want_to_study'] = member_demand['want_to_study'].replace('', None)
member_demand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['want_to_study'] = member_demand['want_to_study'].replace('', None)


Unnamed: 0,user_id,product_want,study_level,study_from,want_to_study,study_time,study_budget,purpose,learnt_offline,learnt_online,learnt_app,learnt_movie,learnt_book,learnt_foreigner,learnt_work,learnt_media,learnt_other
18541,131638,569.0,421.0,,438439,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0
18542,131639,567.0,421.0,,438439,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0
18543,131640,567.0,421.0,,,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0
18544,131641,567.0,421.0,,,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0
18545,131642,567.0,421.0,,,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234233,16579599,1035.0,428.0,,438439440441,544.0,545.0,31.0,0,0,0,0,0,0,0,0,0
234234,16579600,567.0,428.0,,438439440441,544.0,545.0,31.0,0,0,0,0,0,0,0,0,0
234235,16579601,1035.0,428.0,,438439,544.0,545.0,23.0,0,0,0,0,0,0,0,0,0
234236,16579602,567.0,428.0,,438439440441,544.0,545.0,31.0,0,0,0,0,0,0,0,0,0


In [21]:
# Convert the column values to strings 
member_demand['want_to_study'] = member_demand['want_to_study'].astype(str).replace('None', '')

# Split the strings on commas and flatten the lists
intention_list = member_demand['want_to_study'].str.split(',').explode()

# Remove any empty strings from the list
intention_list = intention_list[intention_list != '']

# Convert the values to float
intention_list = intention_list.astype(float)

# Convert the list to a set to remove duplicates
intention_unique = set(intention_list)

# Convert the set back to a list 
intention_unique_list = list(intention_unique)
intention_unique_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['want_to_study'] = member_demand['want_to_study'].astype(str).replace('None', '')


[448.0,
 449.0,
 450.0,
 438.0,
 439.0,
 440.0,
 441.0,
 442.0,
 443.0,
 444.0,
 445.0,
 446.0,
 447.0]

In [22]:
# find the attribute name for want_to_study
attribute_background = attribute[attribute['attri_id'].isin(intention_unique_list)]
attribute_background

Unnamed: 0,attri_id,attri_name,attri_name_en
350,438,听力,Listening
351,439,口语,Speaking
352,440,阅读,Reading
353,441,写作,Writing
354,442,词汇,Vocabulary
355,443,发音,Pronunciation
356,444,语法,Gramma
357,445,流利,Fluency
358,446,口译,Interpreting
359,447,笔译,Translation


In [23]:
# Create the 'int_listen' column
member_demand['int_listen'] = member_demand['want_to_study'].str.contains('438').astype(int)

# Create the 'int_speak' column
member_demand['int_speak'] = member_demand['want_to_study'].str.contains('439').astype(int)

# Create the 'int_read' column
member_demand['int_read'] = member_demand['want_to_study'].str.contains('440').astype(int)

# Create the 'int_write' column
member_demand['int_write'] = member_demand['want_to_study'].str.contains('441').astype(int)

# Create the 'int_vocab' column
member_demand['int_vocab'] = member_demand['want_to_study'].str.contains('442').astype(int)

# Create the 'int_grammar' column
member_demand['int_grammar'] = member_demand['want_to_study'].str.contains('444').astype(int)

# Create the 'int_fluency' column
member_demand['int_fluency'] = member_demand['want_to_study'].str.contains('445').astype(int)

# Create the 'int_interpret' column
member_demand['int_interpret'] = member_demand['want_to_study'].str.contains('446').astype(int)

# Create the 'int_translation' column
member_demand['int_translation'] = member_demand['want_to_study'].str.contains('447').astype(int)

# Create the 'int_business' column
member_demand['int_business'] = member_demand['want_to_study'].str.contains('448').astype(int)

# Create the 'int_talk' column
member_demand['int_talk'] = member_demand['want_to_study'].str.contains('449').astype(int)

# Create the 'int_other' column
member_demand['int_other'] = member_demand['want_to_study'].str.contains('450').astype(int)

member_demand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['int_listen'] = member_demand['want_to_study'].str.contains('438').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['int_speak'] = member_demand['want_to_study'].str.contains('439').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['int_r

Unnamed: 0,user_id,product_want,study_level,study_from,want_to_study,study_time,study_budget,purpose,learnt_offline,learnt_online,...,int_read,int_write,int_vocab,int_grammar,int_fluency,int_interpret,int_translation,int_business,int_talk,int_other
18541,131638,569.0,421.0,,438439,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
18542,131639,567.0,421.0,,438439,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
18543,131640,567.0,421.0,,,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
18544,131641,567.0,421.0,,,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
18545,131642,567.0,421.0,,,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234233,16579599,1035.0,428.0,,438439440441,544.0,545.0,31.0,0,0,...,1,1,0,0,0,0,0,0,0,0
234234,16579600,567.0,428.0,,438439440441,544.0,545.0,31.0,0,0,...,1,1,0,0,0,0,0,0,0,0
234235,16579601,1035.0,428.0,,438439,544.0,545.0,23.0,0,0,...,0,0,0,0,0,0,0,0,0,0
234236,16579602,567.0,428.0,,438439440441,544.0,545.0,31.0,0,0,...,1,1,0,0,0,0,0,0,0,0


#### Feature 5: study_time

In [24]:
member_demand['study_time'].value_counts()

544.0    67943
0.0      57450
471.0     5850
470.0     2835
469.0     1751
466.0     1394
468.0     1085
467.0      552
Name: study_time, dtype: int64

In [25]:
# find the attribute name for study_time
time = member_demand['study_time'].unique().tolist()
attribute_time = attribute[attribute['attri_id'].isin(time)]
attribute_time

Unnamed: 0,attri_id,attri_name,attri_name_en
378,466,1个月内,Within one month
379,467,2个月内,Within two months
380,468,3个月内,Within three months
381,469,半年内,Within six months
382,470,1年内,Within one year
383,471,超过1年,More than one year
435,544,未知,Unsubmitted


In [26]:
# change unmeaningful value to 0 (as categorical value, 0 represents missing)
member_demand['study_time'].replace(544, 0, inplace=True)
member_demand['study_time'].replace(np.nan, 0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['study_time'].replace(544, 0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['study_time'].replace(np.nan, 0, inplace=True)


#### Feature 6: study_budget

In [27]:
member_demand['study_budget'].value_counts()

545.0     68174
0.0       59520
476.0      5208
475.0      1448
474.0      1147
2695.0     1123
2696.0     1051
472.0       560
473.0       460
2694.0      123
2693.0       46
Name: study_budget, dtype: int64

In [28]:
# find the attribute name for study_budget
budget = member_demand['study_budget'].unique().tolist()
attribute_budget = attribute[attribute['attri_id'].isin(budget)]
attribute_budget

Unnamed: 0,attri_id,attri_name,attri_name_en
384,472,1000元以内,
385,473,3000元以内,
386,474,5000元以内,
387,475,10000元以内,
388,476,超过10000元,
436,545,未知,
2297,2693,2000-5000,
2298,2694,5000-8000,
2299,2695,8000-15000,
2300,2696,15000以上,


In [29]:
# change unmeaningful value to 0 (as categorical value, 0 represents missing) 
member_demand['study_budget'].replace(np.nan, 0, inplace=True)
member_demand['study_budget'].replace(545, 0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['study_budget'].replace(np.nan, 0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['study_budget'].replace(545, 0, inplace=True)


#### Feature 7: purpose

In [30]:
member_demand['purpose'].value_counts()

0.0      77390
23.0     29967
31.0     23176
577.0     4673
20.0      3228
21.0      3172
25.0      2618
28.0      1622
824.0     1196
18.0       918
69.0       844
24.0       714
19.0       685
594.0      655
27.0       419
29.0       395
576.0      386
26.0       364
70.0       316
30.0       272
825.0      265
578.0      246
826.0      227
22.0       146
574.0      101
579.0       36
Name: purpose, dtype: int64

In [31]:
# find the attribute name for purpose
purpose_list = member_demand['purpose'].unique().tolist()
attribute_purpose = attribute[attribute['attri_id'].isin(purpose_list)]
attribute_purpose

Unnamed: 0,attri_id,attri_name,attri_name_en
16,18,出国留学,Examinations for Going Abroad
17,19,出国旅游,Outbound Tourism
18,20,职业发展,Career Development
19,21,兴趣爱好,Hobbies and Interests
20,22,出国移民,Emigration
21,23,提高日常口语,Improve Daily Spoken English
22,24,公司面试,Job Interview
23,25,雅思考试,IELTS
24,26,托福考试,TOEFL
25,27,其他考试,Other Exams


In [32]:
# change unmeaningful value to 0 (as categorical value, 0 represents missing) 
member_demand['purpose'].replace(np.nan, 0, inplace=True)
member_demand['purpose'].replace(100, 0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['purpose'].replace(np.nan, 0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  member_demand['purpose'].replace(100, 0, inplace=True)


# Finalization

In [33]:
member_demand

Unnamed: 0,user_id,product_want,study_level,study_from,want_to_study,study_time,study_budget,purpose,learnt_offline,learnt_online,...,int_read,int_write,int_vocab,int_grammar,int_fluency,int_interpret,int_translation,int_business,int_talk,int_other
18541,131638,569.0,421.0,,438439,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
18542,131639,567.0,421.0,,438439,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
18543,131640,567.0,421.0,,,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
18544,131641,567.0,421.0,,,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
18545,131642,567.0,421.0,,,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234233,16579599,1035.0,428.0,,438439440441,0.0,0.0,31.0,0,0,...,1,1,0,0,0,0,0,0,0,0
234234,16579600,567.0,428.0,,438439440441,0.0,0.0,31.0,0,0,...,1,1,0,0,0,0,0,0,0,0
234235,16579601,1035.0,428.0,,438439,0.0,0.0,23.0,0,0,...,0,0,0,0,0,0,0,0,0,0
234236,16579602,567.0,428.0,,438439440441,0.0,0.0,31.0,0,0,...,1,1,0,0,0,0,0,0,0,0


In [34]:
# Drop unnecessary columns from the table
member_demand = member_demand.drop('study_from',axis=1)
member_demand = member_demand.drop('want_to_study',axis=1)

In [35]:
member_demand.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158090 entries, 18541 to 234237
Data columns (total 27 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   user_id           158090 non-null  int64  
 1   product_want      158090 non-null  float64
 2   study_level       158090 non-null  float64
 3   study_time        158090 non-null  float64
 4   study_budget      158090 non-null  float64
 5   purpose           158090 non-null  float64
 6   learnt_offline    158090 non-null  int64  
 7   learnt_online     158090 non-null  int64  
 8   learnt_app        158090 non-null  int64  
 9   learnt_movie      158090 non-null  int64  
 10  learnt_book       158090 non-null  int64  
 11  learnt_foreigner  158090 non-null  int64  
 12  learnt_work       158090 non-null  int64  
 13  learnt_media      158090 non-null  int64  
 14  learnt_other      158090 non-null  int64  
 15  int_listen        158090 non-null  int64  
 16  int_speak       

In [36]:
# Save the DataFrame to a CSV file
member_demand.to_csv('member_demand.csv', index=False)

print("saved to CSV file successfully.")

saved to CSV file successfully.
