In [44]:
import os
import pandas as pd
import numpy as np

import hashlib

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from timeit import default_timer as timer

from tqdm import tqdm_notebook
from tqdm import tqdm

N_PARTITIONS = 1000

In [18]:
# 处理 meso
def id_to_hash(costomer_id):
    
    return int(hashlib.md5(costomer_id.encode('utf-8')).hexdigest(), 16)

In [19]:
# 创建新文件
base_dir = 'data/partitions/'
if not os.path.exists(base_dir + 'p999'):
    for i in range(N_PARRTITIONS):
        os.mkdir(base_dir + f'p{i}', exist_ok=False)

In [20]:
# 创建新csv、并且将csv头写入
def create_blank_partitions():
    for i in range(N_PARRTITIONS):
        directory = base_dir + f'p{i}/'
        for file in ['transactions.csv', 'train.csv', 'test.csv', 'members.csv', 'logs.csv']:
            with open(directory + file, 'w') as f:
                if file == 'transactions.csv':
                    f.write(','.join(list(transactions.columns)))
                elif file == 'train.csv':
                    f.write(','.join(list(train.columns)))
                elif file == 'test.csv':
                    f.write(','.join(list(test.columns)))
                elif file == 'members.csv':
                    f.write(','.join(list(members.columns)))
                elif file == 'logs.csv':
                    f.write(','.join(list(logs.columns)))
                    
    return directory

In [32]:
# 将大文件分组分别写入不同的文件夹内
def partition_by_hashing(df, name, progress=None):
    start = timer()
    df['partition'] = df['msno'].apply(id_to_hash) % N_PARRTITIONS
    for partition, grouped in df.groupby('partition'):
        grouped = grouped.drop(columns = 'partition')
        with open(base_dir + f'p{partition}/{name}.csv', 'a') as f:
            f.write('\n')
            grouped.to_csv(f, header=False, index=False)
        # 进度条
        if progress is not None:
            if partition % progress == 0:
                print(f'完成{100 * round(partition / N_PARRTITIONS, 2)}%。用时{round(timer() - start)}秒。', end='\r')
    end = timer()
    if progress is not None:
        print(f'\n完成{df.shape[0]}行用时{round(end - start)}秒.')

In [39]:
_ = create_blank_partitions()

In [24]:
members = pd.read_csv('/Users/dususu/Desktop/kkbox-churn-prediction-challenge/members_v3.csv', nrows = 1)
members.head()
transactions = pd.read_csv('/Users/dususu/Desktop/kkbox-churn-prediction-challenge/transactions.csv', nrows = 1)
transactions.head()
logs = pd.read_csv('/Users/dususu/Desktop/kkbox-churn-prediction-challenge/user_logs.csv', nrows = 1)
logs.head()
train = pd.read_csv('/Users/dususu/Desktop/kkbox-churn-prediction-challenge/train.csv', nrows = 1)
train.head()
test = pd.read_csv('/Users/dususu/Desktop/kkbox-churn-prediction-challenge/sample_submission_v2/churn_comp_refresh/sample_submission_v2.csv', nrows = 1)
test.head()
_ = create_blank_partitions()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,1,0,,11,20110911


Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,20150930,20151101,0


Unnamed: 0,msno,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,rxIP2f2aN0rYNp+toI0Obt/N/FYQX8hcO1fTmmy2h34=,20150513,0,0,0,0,1,1,280.335


Unnamed: 0,msno,is_churn
0,waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1


Unnamed: 0,msno,is_churn
0,4n+fXlyJvfQnTeKXTWT507Ll4JVYGrOC8LHCfwBmPE4=,0


In [33]:
members = pd.read_csv('/Users/dususu/Desktop/kkbox-churn-prediction-challenge/members_v3.csv')
partition_by_hashing(members, name = 'members', progress = 10)
members.head()

完成99.0%。用时10秒。99999%。用时8秒。。
完成6769473行用时10秒.


Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,partition
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,1,0,,11,20110911,27
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,0,,7,20110914,932
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,1,0,,11,20110915,99
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,1,0,,11,20110915,991
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,6,32,female,9,20110915,113


In [34]:
train = pd.read_csv('/Users/dususu/Desktop/kkbox-churn-prediction-challenge/train.csv')
partition_by_hashing(train, name = 'train', progress = 10)
train.head()

完成99.0%。用时1秒。999999%。用时1秒。。
完成992931行用时1秒.


Unnamed: 0,msno,is_churn,partition
0,waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1,983
1,QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=,1,444
2,fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1,383
3,mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=,1,434
4,XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=,1,835


In [36]:
test = pd.read_csv('/Users/dususu/Desktop/kkbox-churn-prediction-challenge/sample_submission_v2/churn_comp_refresh/sample_submission_v2.csv')
partition_by_hashing(test, name = 'test', progress = 10)
test.head()

完成99.0%。用时1秒。999999%。用时1秒。。
完成907471行用时1秒.


Unnamed: 0,msno,is_churn,partition
0,4n+fXlyJvfQnTeKXTWT507Ll4JVYGrOC8LHCfwBmPE4=,0,191
1,aNmbC1GvFUxQyQUidCVmfbQ0YeCuwkPzEdQ0RwWyeZM=,0,906
2,rFC9eSG/tMuzpre6cwcMLZHEYM89xY02qcz7HL4//jc=,0,220
3,WZ59dLyrQcE7ft06MZ5dj40BnlYQY7PHgg/54+HaCSE=,0,77
4,aky/Iv8hMp1/V/yQHLtaVuEmmAxkB5GuasQZePJ7NU4=,0,549


In [37]:
transactions = pd.read_csv('/Users/dususu/Desktop/kkbox-churn-prediction-challenge/transactions.csv')
partition_by_hashing(transactions, name = 'transactions', progress = 10)
transactions.head()

完成99.0%。用时42秒。99999%。用时30秒。。
完成21547746行用时43秒.


Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,partition
0,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,20150930,20151101,0,728
1,AZtu6Wl0gPojrEQYB8Q3vBSmE2wnZ3hi1FbK1rQQ0A4=,41,30,149,149,1,20150930,20151031,0,663
2,UkDFI97Qb6+s2LWcijVVv4rMAsORbVDT2wNXF0aVbns=,41,30,129,129,1,20150930,20160427,0,972
3,M1C56ijxozNaGD0t2h68PnH2xtx5iO5iR2MVYQB6nBI=,39,30,149,149,1,20150930,20151128,0,21
4,yvj6zyBUaqdbUQSrKsrZ+xNDVM62knauSZJzakS9OW4=,39,30,149,149,1,20150930,20151121,0,852


In [46]:
# 1e6 = 1,000,000 = 100w
chunksize = 1e6
start = timer()

for chunk in pd.read_csv('/Users/dususu/Desktop/kkbox-churn-prediction-challenge/user_logs_v2/churn_comp_refresh/user_logs_v2.csv', chunksize = chunksize):
    partition_by_hashing(chunk, name = 'logs', progress = None)
    
    if (i + 1) % 10 == 0:
        print(f'{i * chunksize} rows processed.', end = '\r')

end = timer()
print(f'\nOverall time: {round(end - start)} seconds.')


Overall time: 47 seconds.


In [47]:
pd.read_csv(base_dir + f'p{partition}/logs.csv').head()

Unnamed: 0,msno,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,GxHYnqRIvwFfR9wLnvo1Nm9aK7Vn1rCN0xscvQZ7Wkg=,20170308,0,1,0,1,66,52,15988.75
1,jUQvtPRIcorMD0OT4otp3QxGMpYCWOwVrydb9zGEXKU=,20170303,1,0,1,0,25,19,6434.302
2,0kDW/zb3rk4H785dfLBg3/7z4tUVNDiNE62opZ8jh8I=,20170302,11,5,2,7,10,24,4763.899
3,Bsqi4khDvaKdY7XkVp5goHat4wmyqyjCZlIA0VN9LZE=,20170308,49,4,1,0,1,14,1135.579
4,ucLYiChtrwQdUsZeexo0DWq4K/N6wckNvTeMAd4Jk2Y=,20170331,1,0,0,1,10,6,2464.959


In [48]:
chunksize = 1e7

start = timer()

for i, chunk in enumerate(pd.read_csv('/Users/dususu/Desktop/kkbox-churn-prediction-challenge/user_logs.csv', chunksize = chunksize)):
    partition_by_hashing(chunk, name = 'logs', progress = None)
    
    if (i + 1) % 10 == 0:
        print(f'{i * chunksize} rows processed.', end = '\r')
    
end = timer()
print(f'\nOverall time: {round(end - start)} seconds.')

390000000.0 rows processed.
Overall time: 952 seconds.


In [49]:
pd.read_csv(base_dir + f'p{partition}/logs.csv').tail()

Unnamed: 0,msno,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
431501,OsxMZTPONscfoViAGXqycCGWMWkOIhUEu/5iff4MkLc=,20160613,6,0,1,0,2,9,727.105
431502,OsxMZTPONscfoViAGXqycCGWMWkOIhUEu/5iff4MkLc=,20160807,1,0,0,1,8,10,2328.199
431503,66sQPlkfefTFBBqlr3GDLtbSXQn4Gq65YWwi6npKQmw=,20160206,0,0,0,0,27,20,6023.0
431504,yiiHTJHxUhAlz91+vInQiE0scrOANjxv7anmCu5vhxk=,20160723,1,1,0,0,4,5,1496.181
431505,J4GiFdNNSFcjMasfH9Mwz8zatX8+/gapN570AfOdqB4=,20170211,1,0,0,1,6,7,1793.154
