In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Column layouts of the raw files. Every file is tab-separated and has no header row,
# so the names are supplied here and shared between the train and test variants.
_UID_COLUMNS = ('uid','label')
_VOICE_COLUMNS = ('uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out')
_SMS_COLUMNS = ('uid','opp_num','opp_head','opp_len','start_time','in_out')
_WA_COLUMNS = ('uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date')

# Timestamps are read as strings because later cells slice them by character position.
_VOICE_DTYPES = {'start_time':str,'end_time':str}
_SMS_DTYPES = {'start_time':str}

# Training tables (uid_train carries the label column).
uid_train = pd.read_csv('./data/uid_train.txt', sep='\t', header=None, names=_UID_COLUMNS)
voice_train = pd.read_csv('./data/voice_train.txt', sep='\t', header=None, names=_VOICE_COLUMNS, dtype=_VOICE_DTYPES)
sms_train = pd.read_csv('./data/sms_train.txt', sep='\t', header=None, names=_SMS_COLUMNS, dtype=_SMS_DTYPES)
wa_train = pd.read_csv('./data/wa_train.txt', sep='\t', header=None, names=_WA_COLUMNS)

# Test tables ("_b" files), read with the same layouts as their training counterparts.
voice_test = pd.read_csv('./data/voice_test_b.txt', sep='\t', header=None, names=_VOICE_COLUMNS, dtype=_VOICE_DTYPES)
sms_test = pd.read_csv('./data/sms_test_b.txt', sep='\t', header=None, names=_SMS_COLUMNS, dtype=_SMS_DTYPES)
wa_test = pd.read_csv('./data/wa_test_b.txt', sep='\t', header=None, names=_WA_COLUMNS)


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Build the list of test uids from the distinct uids seen in the wa test file and persist it.
# NOTE(review): uids that appear only in the voice/sms test files would be missing here — confirm.
_test_uids = pd.unique(wa_test['uid'])
uid_test = pd.DataFrame({'uid': _test_uids})
uid_test.to_csv('./data/uid_test_b.txt', index=None)

In [4]:
# Stack train and test rows of each record type (row-wise, original indices kept)
# so the per-uid features below are computed once for both splits.
voice = pd.concat([voice_train, voice_test])
sms = pd.concat([sms_train, sms_test])
wa = pd.concat([wa_train, wa_test])

In [5]:
def _count_distinct(values):
    """Return the number of distinct entries in ``values`` (same counting as ``len(pd.unique(...))``)."""
    return len(pd.unique(values))


# Aggregation specs as (output_name, function) pairs. The dict form
# ``agg({'name': func})`` on a single grouped column is deprecated and raises
# SpecificationError in pandas >= 1.0; a list of tuples produces the same
# column names and order on both old and new pandas versions.
_UNIQUE_AND_COUNT = [('unique_count', _count_distinct), ('count', 'count')]
_UNIQUE_ONLY = [('unique_count', _count_distinct)]
_SUMMARY_STATS = ['std','max','min','median','mean','sum']

# --- voice: distinct/total counterparts, distinct number prefixes, and per-category row counts ---
voice_opp_num = voice.groupby(['uid'])['opp_num'].agg(_UNIQUE_AND_COUNT).add_prefix('voice_opp_num_').reset_index()
voice_opp_head = voice.groupby(['uid'])['opp_head'].agg(_UNIQUE_ONLY).add_prefix('voice_opp_head_').reset_index()
# The unstack() calls pivot each category value into its own column; missing combinations become 0.
voice_opp_len = voice.groupby(['uid','opp_len'])['uid'].count().unstack().add_prefix('voice_opp_len_').reset_index().fillna(0)
voice_call_type = voice.groupby(['uid','call_type'])['uid'].count().unstack().add_prefix('voice_call_type_').reset_index().fillna(0)
voice_in_out = voice.groupby(['uid','in_out'])['uid'].count().unstack().add_prefix('voice_in_out_').reset_index().fillna(0)

# --- sms: same feature families as voice (there is no call_type column for sms) ---
sms_opp_num = sms.groupby(['uid'])['opp_num'].agg(_UNIQUE_AND_COUNT).add_prefix('sms_opp_num_').reset_index()
sms_opp_head = sms.groupby(['uid'])['opp_head'].agg(_UNIQUE_ONLY).add_prefix('sms_opp_head_').reset_index()
sms_opp_len = sms.groupby(['uid','opp_len'])['uid'].count().unstack().add_prefix('sms_opp_len_').reset_index().fillna(0)
sms_in_out = sms.groupby(['uid','in_out'])['uid'].count().unstack().add_prefix('sms_in_out_').reset_index().fillna(0)

# --- wa: distinct/total names plus summary statistics of each numeric column ---
wa_name = wa.groupby(['uid'])['wa_name'].agg(_UNIQUE_AND_COUNT).add_prefix('wa_name_').reset_index()
visit_cnt = wa.groupby(['uid'])['visit_cnt'].agg(_SUMMARY_STATS).add_prefix('wa_visit_cnt_').reset_index()
visit_dura = wa.groupby(['uid'])['visit_dura'].agg(_SUMMARY_STATS).add_prefix('wa_visit_dura_').reset_index()
up_flow = wa.groupby(['uid'])['up_flow'].agg(_SUMMARY_STATS).add_prefix('wa_up_flow_').reset_index()
down_flow = wa.groupby(['uid'])['down_flow'].agg(_SUMMARY_STATS).add_prefix('wa_down_flow_').reset_index()
wa_date = wa.groupby(['uid'])['date'].agg(_SUMMARY_STATS).add_prefix('wa_date_').reset_index()

is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.
is deprecated and will be removed in a future version
  
is deprecated and will be removed in a future version
  import sys
is deprecated and will be removed in a future version
  
is deprecated and will be removed in a future version
  if sys.path[0] == '':


In [6]:
def get_callTime(start_time, end_time):
    """Compute call durations in seconds from paired start/end timestamps.

    Each timestamp is a string whose first eight characters are laid out as
    ``DDHHMMSS`` (two digits each for day, hour, minute and second).

    The duration is the difference of the two absolute second offsets. This
    replaces an earlier manual borrow scheme that added 60 instead of 24 when
    borrowing a day into the hour field, inflating every call that crossed
    midnight by 36 hours.

    Parameters
    ----------
    start_time : sequence of str
        Start timestamps; must support ``len()`` and integer indexing.
    end_time : sequence of str
        End timestamps, indexed in parallel with ``start_time``.

    Returns
    -------
    list of int
        ``end - start`` in seconds for each position of ``start_time``.
        The value is negative when the end timestamp precedes the start.
    """

    def _as_seconds(stamp):
        # Split the DDHHMMSS prefix into its four two-digit fields.
        day, hour, minute, second = (int(stamp[pos:pos + 2]) for pos in range(0, 8, 2))
        return ((day * 24 + hour) * 60 + minute) * 60 + second

    total_time = []
    for index in range(len(start_time)):
        began = _as_seconds(start_time[index])
        ended = _as_seconds(end_time[index])
        total_time.append(ended - began)
    return total_time


In [7]:
# Attach the per-call duration (seconds) to every voice row ...
voice['call_time'] = get_callTime(voice['start_time'].values, voice['end_time'].values)

# ... then summarise it per uid. fillna(0) replaces statistics that come out
# as NaN (e.g. the std of a uid with a single call).
voice_call_time = (
    voice.groupby(['uid'])['call_time']
    .agg(['std', 'max', 'min', 'median', 'mean', 'sum'])
    .add_prefix('voice_call_time_')
    .reset_index()
    .fillna(0)
)

In [8]:
def get_startDay(start_time):
    """Extract the leading two-digit field of each timestamp as an integer.

    Parameters
    ----------
    start_time : sequence of str
        Timestamps supporting ``len()`` and integer indexing; the first two
        characters of each entry are parsed (the day component, going by the
        function name).

    Returns
    -------
    list of int
        One integer per entry of ``start_time``, in the same order.
    """
    return [int(start_time[position][0:2]) for position in range(len(start_time))]

In [9]:
# Day field of each sms timestamp, then per-uid summary statistics
# (NaN statistics are replaced by 0).
sms['start_day'] = get_startDay(sms['start_time'].values)
sms_start_day = (
    sms.groupby(['uid'])['start_day']
    .agg(['std', 'max', 'min', 'median', 'mean', 'sum'])
    .add_prefix('sms_start_day_')
    .reset_index()
    .fillna(0)
)

In [10]:
# Day field of each voice timestamp, then per-uid summary statistics
# (NaN statistics are replaced by 0).
voice['start_day'] = get_startDay(voice['start_time'].values)
voice_start_day = (
    voice.groupby(['uid'])['start_day']
    .agg(['std', 'max', 'min', 'median', 'mean', 'sum'])
    .add_prefix('voice_start_day_')
    .reset_index()
    .fillna(0)
)

In [11]:
# Every per-uid feature table, in the order they are merged onto the uid lists.
feature = [
    # voice
    voice_opp_num,
    voice_opp_head,
    voice_opp_len,
    voice_call_type,
    voice_call_time,
    voice_in_out,
    voice_start_day,
    # sms
    sms_opp_num,
    sms_opp_head,
    sms_opp_len,
    sms_in_out,
    sms_start_day,
    # wa
    wa_name,
    visit_cnt,
    visit_dura,
    up_flow,
    down_flow,
    wa_date,
]

In [12]:
# Left-join every feature table onto the train and test uid lists, so each uid
# keeps exactly one row and uids absent from a table get NaN for its columns.
train_feature = uid_train
test_feature = uid_test
for feat in feature:
    train_feature = train_feature.merge(feat, how='left', on='uid')
    test_feature = test_feature.merge(feat, how='left', on='uid')

In [13]:
# Persist both feature matrices as CSV without the row index.
for _frame, _path in (
    (train_feature, './data/train_featureV2.csv'),
    (test_feature, './data/test_featureV2.csv'),
):
    _frame.to_csv(_path, index=None)