In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix, hstack
from sklearn.cluster import KMeans
import pickle
from collections import Counter
def savePickle(target, filename):
    with open(filename, "wb") as f:
        pickle.dump(target, f)
def loadPickle(filename):
    with open(filename, "rb") as f:
        return pickle.load(f)

In [2]:
train = loadPickle("./data/train.pkl")
test = loadPickle("./data/test.pkl")
# col = ['service_type', 'is_mix_service', 'online_time', '1_total_fee',
#        '2_total_fee', '3_total_fee', '4_total_fee', 'month_traffic',
#        'many_over_bill', 'contract_type', 'contract_time',
#        'is_promise_low_consume', 'net_service', 'pay_times', 'pay_num',
#        'last_month_traffic', 'local_trafffic_month', 'local_caller_time',
#        'service1_caller_time', 'service2_caller_time', 'gender', 'age',
#        'complaint_level', 'former_complaint_num', 'former_complaint_fee',
#        'current_service']
# du = train[train.duplicated(col)]
# train = train.drop_duplicates(col)
# train = train[:300000]
# test = test[:300000]
# train = train.drop(['user_id'], axis = 1).duplicated()

In [3]:
print(train.columns)

Index(['service_type', 'is_mix_service', 'online_time', '1_total_fee',
       '2_total_fee', '3_total_fee', '4_total_fee', 'month_traffic',
       'many_over_bill', 'contract_type', 'contract_time',
       'is_promise_low_consume', 'net_service', 'pay_times', 'pay_num',
       'last_month_traffic', 'local_trafffic_month', 'local_caller_time',
       'service1_caller_time', 'service2_caller_time', 'gender', 'age',
       'complaint_level', 'former_complaint_num', 'former_complaint_fee',
       'current_service', 'user_id'],
      dtype='object')


In [4]:
categoryes = sorted(train['current_service'].unique())
label_dict = {}
for i in range(len(categoryes)):
    label_dict[categoryes[i]] = i
    label_dict[i] = categoryes[i]

 - 1_total_fee array([ 0.39827125,  0.39316978,  0.39571121])
 - 2_total_fee _array([ 0.34979964,  0.35166863,  0.35014922])
 - 3_total_fee array([ 0.34979964,  0.35166863,  0.35014922])
 - 4_total_fee  array([ 0.3525123 ,  0.35381964,  0.35444017])
 - 1234 array([ 0.56280605,  0.56499034,  0.56352438])
 c =  array([ 0.29369751,  0.29296579,  0.29410951])
 ---------------------------------------------------
 ####  total fee
  - 1234 0.5604
  - 1234 + 1/2 0.5630
  - 1234 +1/2 1/3 0.5648
  - 1234 + 1/2+1/3+2/4 0.5651
  - 1234 + 1/2 1/3 1/4 2/3 2/4 0.5662
  - 1234 + 1/2 1/3 1/4 2/3 2/4 1/234 0.5665
  - 1234 + 1/2 1/3 1/4 2/3 2/4 3/4 1/234 0.5674
  
#### traffic
  - fee + local 0.5994
  - fee + local + last 0.7095
  - fee + local _ last + month 0.7472
  - fee + local _ last + month +  last/month 0.7476
  - fee + local _ last + month +  last/month local/month 0.7493
  
#### call
  -  fee + traffic + local 0.7519
  -  fee + traffic + local + service1 0.7850
  -  fee + traffic + local + service1 + service2  0.8160
  -  fee + traffic + local + service1 + service2 + s2/local 0.8162
  
#### nature
  - fee + traffic + call + age  0.8184
  - fee + traffic + call + age + gender 0.8189
  
#### statistics
  - fee + traffic + call + age + onlinetime 0.0.8280
  - fee + traffic + call + age + onlinetime + service_type 0.8352
  - fee + traffic + call + age + onlinetime + service_type + is_mix_type 0.8362
  - ... +  many_over_bill 0.8443
  - ... +  contract_type 0.8496
  - ... + contract_time 0.8683
  - ... + is_promise_low_consume 0.8689
  - ... + pay_times 0.8690
  - ... + pay_num 0.8693
#### cross feature
  - ... + traffic/fee 0.8699
  - ... + traffic/call 0.8701
  - ... 8839， 0.8843

In [5]:

weight_dict = {0:105, 1:110, 2:125, 3:103, 4:120, 5:133, 6:134, 7:125, 8:124, 9:111, 10:135}
lgbm = lgb.LGBMClassifier(objective = "multiclass", boosting_type =  'gbdt',
                          colsample_bytree = 0.625, importance_type = 'split', learning_rate = 0.2, max_depth = 9,
                         min_child_samples = 133, min_child_weight = 2, min_split_gain = 0.1, n_estimators = 2000, n_jobs = -1,
                         num_leaves = 58, random_state = 10, reg_alpha = 0.0005, reg_lambda = 0.0471,  subsample = 0.9375,
                        subsample_for_bin = 20000, subsample_freq = 1,verbosity = 0)

# lgbm = lgb.LGBMClassifier(objective = "multiclass",  boosting_type =  'gbdt', random_state = 10, n_jobs = -1, class_weight = weight_dict)
lr = LogisticRegression()
enc = OneHotEncoder()

In [6]:
def square_f1_score(y_true, y_pred):
    y_pred = np.array(y_pred)
    y_true = np.array(y_true)
    samples = len(y_true)
    classes = int(len(y_pred)/samples)
    df = pd.DataFrame()
    for i in range(0, classes):
        df['c_%d'%i] =  y_pred[samples * i: samples*(i+1)]
    pred = np.argmax(np.array(df), axis = 1)
    score = np.square(f1_score(y_true, pred, average = 'macro'))
    return 'square_f1_score', score, True

In [7]:
features = train[['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']].copy()
features['1_2_total_fee_n'] = np.array(features['1_total_fee'])/np.array(features['2_total_fee'])
features['1_3_total_fee_n'] = np.array(features['1_total_fee'])/np.array(features['3_total_fee'])
features['2_4_total_fee_n'] = np.array(features['2_total_fee'])/np.array(features['4_total_fee'])
features['2_3_total_fee_n'] = np.array(features['2_total_fee'])/np.array(features['3_total_fee'])
features['3_4_total_fee_n'] = np.array(features['3_total_fee'])/np.array(features['4_total_fee'])

# traficc
features['local_trafffic_month'] = train['local_trafffic_month']
features['last_month_traffic'] = train['last_month_traffic']
features['month_traffic'] = train['month_traffic']
features['r_last_month_traffic'] = np.array(train['last_month_traffic'])/np.array(train['month_traffic'])
features['r_local_month_traffic'] = np.array(train['local_trafffic_month'])/np.array(train['month_traffic'])
features['non_local_trafffic_month'] = np.array(train['month_traffic']) - np.array(train['local_trafffic_month'])

#call
features['local_caller_time'] = train['local_caller_time']
features['service1_caller_time'] = train['service1_caller_time']
features['service2_caller_time'] = train['service2_caller_time']
features['s2_local_caller_time'] = np.array(train['service2_caller_time'])/np.array(train['local_caller_time'])
features['s1_s2_caller_time'] = np.array(train['service1_caller_time'])/np.array(train['service2_caller_time'])

#nature
features['age'] = train['age']
features['gender0'] = np.array(train['gender'] == 0).astype(int)
features['gender1'] = np.array(train['gender'] == 1).astype(int)
features['gender2'] = np.array(train['gender'] == 2).astype(int)

#other
features['online_time'] = train['online_time']
features['service_type'] = np.array(train['service_type'] == 4).astype(int)
features['service_type'] = np.array(train['service_type'] == 1).astype(int)
features['is_mix_service'] = train['is_mix_service']
features['many_over_bill'] = train['many_over_bill']
features['contract_type'] = np.array(train['contract_type'] == 0).astype(int)
features['contract_type'] = np.array(train['contract_type'] == 1).astype(int)
features['contract_type'] = np.array(train['contract_type'] == 2).astype(int)
features['contract_type'] = np.array(train['contract_type'] == 3).astype(int)
features['contract_type'] = np.array(train['contract_type'] == 6).astype(int)
features['contract_type'] = np.array(train['contract_type'] == 7).astype(int)
features['contract_type'] = np.array(train['contract_type'] == 8).astype(int)
features['contract_type'] = np.array(train['contract_type'] == 9).astype(int)
features['contract_type'] = np.array(train['contract_type'] == 8).astype(int)
features['contract_type'] = np.array(train['contract_type'] == 12).astype(int)
for t in sorted(train['contract_time'].unique()):
    features['contract_time_%d'%t] = np.array(train['contract_time'] == t).astype(int)
features['is_promise_low_consume'] = train['is_promise_low_consume']
# for t in sorted(train['net_service'].unique()):
#     features['net_service_%d'%t] = np.array(train['net_service'] == t).astype(int) not good
features['pay_times'] = train['pay_times']
features['pay_num'] = train['pay_num']

# cross
features['traffic_fee'] = np.array(train['month_traffic'])/np.array(train['1_total_fee'])

features['1_e_2_fee'] = np.array(train['1_total_fee']==train['2_total_fee']).astype(int)
features['1_e_3_fee'] = np.array(train['1_total_fee']==train['3_total_fee']).astype(int)
features['1_e_4_fee'] = np.array(train['1_total_fee']==train['4_total_fee']).astype(int)
features['2_e_3_fee'] = np.array(train['2_total_fee']==train['3_total_fee']).astype(int)
features['2_e_4_fee'] = np.array(train['2_total_fee']==train['4_total_fee']).astype(int)
features['3_e_4_fee'] = np.array(train['3_total_fee']==train['4_total_fee']).astype(int)

features = features.fillna(-1)

  
  
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  """
  """
  
  
  if sys.path[0] == '':
  if sys.path[0] == '':
  del sys.path[0]
  del sys.path[0]


In [8]:
train_y = np.array(train['current_service'].apply(lambda x:label_dict[x]))
features['y'] = train_y
du = features[features.duplicated(keep = False)]
train_x = features.drop_duplicates(keep = False).drop(['y'], axis = 1)
train_y = np.array(features.drop_duplicates(keep = False)['y'])
du_x = du.drop(['y'], axis = 1)
du_y = np.array(du['y'])

In [9]:
train_x, eval_x, train_y, eval_y = train_test_split(train_x, train_y, test_size = 0.33, random_state = 42)

In [10]:
train_x = np.vstack([train_x, du_x])
train_y= np.hstack([train_y, du_y])

In [None]:
lgbm.fit(train_x, train_y, eval_set = (eval_x, eval_y), early_stopping_rounds = 50, eval_metric = square_f1_score)

[1]	valid_0's multi_logloss: 1.61434	valid_0's square_f1_score: 0.609568
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's multi_logloss: 1.2958	valid_0's square_f1_score: 0.653958
[3]	valid_0's multi_logloss: 1.07988	valid_0's square_f1_score: 0.678341
[4]	valid_0's multi_logloss: 0.922032	valid_0's square_f1_score: 0.691468
[5]	valid_0's multi_logloss: 0.79822	valid_0's square_f1_score: 0.702124
[6]	valid_0's multi_logloss: 0.704561	valid_0's square_f1_score: 0.708503
[7]	valid_0's multi_logloss: 0.627893	valid_0's square_f1_score: 0.712902
[8]	valid_0's multi_logloss: 0.563608	valid_0's square_f1_score: 0.715476
[9]	valid_0's multi_logloss: 0.512789	valid_0's square_f1_score: 0.718479
[10]	valid_0's multi_logloss: 0.469509	valid_0's square_f1_score: 0.722198
[11]	valid_0's multi_logloss: 0.433809	valid_0's square_f1_score: 0.723822
[12]	valid_0's multi_logloss: 0.403505	valid_0's square_f1_score: 0.726592
[13]	valid_0's multi_logloss: 0.37947	valid_0's squa

[110]	valid_0's multi_logloss: 0.191564	valid_0's square_f1_score: 0.777215
[111]	valid_0's multi_logloss: 0.191417	valid_0's square_f1_score: 0.777549
[112]	valid_0's multi_logloss: 0.191325	valid_0's square_f1_score: 0.777721
[113]	valid_0's multi_logloss: 0.191201	valid_0's square_f1_score: 0.778204
[114]	valid_0's multi_logloss: 0.191072	valid_0's square_f1_score: 0.778107
[115]	valid_0's multi_logloss: 0.190958	valid_0's square_f1_score: 0.778363
[116]	valid_0's multi_logloss: 0.190878	valid_0's square_f1_score: 0.778617
[117]	valid_0's multi_logloss: 0.190724	valid_0's square_f1_score: 0.778594
[118]	valid_0's multi_logloss: 0.190555	valid_0's square_f1_score: 0.778846
[119]	valid_0's multi_logloss: 0.19047	valid_0's square_f1_score: 0.77862
[120]	valid_0's multi_logloss: 0.19037	valid_0's square_f1_score: 0.779073
[121]	valid_0's multi_logloss: 0.190188	valid_0's square_f1_score: 0.77926
[122]	valid_0's multi_logloss: 0.190067	valid_0's square_f1_score: 0.779291
[123]	valid_0's 

[219]	valid_0's multi_logloss: 0.18483	valid_0's square_f1_score: 0.784451
[220]	valid_0's multi_logloss: 0.184792	valid_0's square_f1_score: 0.784695
[221]	valid_0's multi_logloss: 0.184732	valid_0's square_f1_score: 0.784665
[222]	valid_0's multi_logloss: 0.184677	valid_0's square_f1_score: 0.784483
[223]	valid_0's multi_logloss: 0.184659	valid_0's square_f1_score: 0.784712
[224]	valid_0's multi_logloss: 0.184641	valid_0's square_f1_score: 0.784706
[225]	valid_0's multi_logloss: 0.184605	valid_0's square_f1_score: 0.784735
[226]	valid_0's multi_logloss: 0.184583	valid_0's square_f1_score: 0.785069
[227]	valid_0's multi_logloss: 0.184515	valid_0's square_f1_score: 0.785141
[228]	valid_0's multi_logloss: 0.184473	valid_0's square_f1_score: 0.785171
[229]	valid_0's multi_logloss: 0.184463	valid_0's square_f1_score: 0.785339
[230]	valid_0's multi_logloss: 0.184455	valid_0's square_f1_score: 0.785205
[231]	valid_0's multi_logloss: 0.184446	valid_0's square_f1_score: 0.785188
[232]	valid_0

[328]	valid_0's multi_logloss: 0.182319	valid_0's square_f1_score: 0.788829
[329]	valid_0's multi_logloss: 0.182326	valid_0's square_f1_score: 0.788789
[330]	valid_0's multi_logloss: 0.182322	valid_0's square_f1_score: 0.788681
[331]	valid_0's multi_logloss: 0.182323	valid_0's square_f1_score: 0.788838
[332]	valid_0's multi_logloss: 0.182293	valid_0's square_f1_score: 0.788883
[333]	valid_0's multi_logloss: 0.182283	valid_0's square_f1_score: 0.78891
[334]	valid_0's multi_logloss: 0.182248	valid_0's square_f1_score: 0.78907
[335]	valid_0's multi_logloss: 0.18222	valid_0's square_f1_score: 0.789022
[336]	valid_0's multi_logloss: 0.182207	valid_0's square_f1_score: 0.78902
[337]	valid_0's multi_logloss: 0.182197	valid_0's square_f1_score: 0.789079
[338]	valid_0's multi_logloss: 0.1822	valid_0's square_f1_score: 0.789044
[339]	valid_0's multi_logloss: 0.182198	valid_0's square_f1_score: 0.78909
[340]	valid_0's multi_logloss: 0.182182	valid_0's square_f1_score: 0.789207
[341]	valid_0's mul

[437]	valid_0's multi_logloss: 0.181541	valid_0's square_f1_score: 0.79115
[438]	valid_0's multi_logloss: 0.181524	valid_0's square_f1_score: 0.791148
[439]	valid_0's multi_logloss: 0.181552	valid_0's square_f1_score: 0.791258
[440]	valid_0's multi_logloss: 0.181549	valid_0's square_f1_score: 0.791099
[441]	valid_0's multi_logloss: 0.181535	valid_0's square_f1_score: 0.791155
[442]	valid_0's multi_logloss: 0.181534	valid_0's square_f1_score: 0.791028
[443]	valid_0's multi_logloss: 0.18153	valid_0's square_f1_score: 0.791164
[444]	valid_0's multi_logloss: 0.181532	valid_0's square_f1_score: 0.791382
[445]	valid_0's multi_logloss: 0.181545	valid_0's square_f1_score: 0.791614
[446]	valid_0's multi_logloss: 0.181538	valid_0's square_f1_score: 0.7917
[447]	valid_0's multi_logloss: 0.181545	valid_0's square_f1_score: 0.79181
[448]	valid_0's multi_logloss: 0.18156	valid_0's square_f1_score: 0.791556
[449]	valid_0's multi_logloss: 0.181557	valid_0's square_f1_score: 0.791613
[450]	valid_0's mu

In [206]:
score = f1_score(eval_y, lgbm.predict(eval_x), average = 'macro')
print(score, np.square(score))

0.891086616108 0.794035357406


In [None]:
.7953

In [183]:
help(features.duplicated)


Help on method duplicated in module pandas.core.frame:

duplicated(subset=None, keep='first') method of pandas.core.frame.DataFrame instance
    Return boolean Series denoting duplicate rows, optionally only
    considering certain columns
    
    Parameters
    ----------
    subset : column label or sequence of labels, optional
        Only consider certain columns for identifying duplicates, by
        default use all of the columns
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the
          first occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the
          last occurrence.
        - False : Mark all duplicates as ``True``.
    
    Returns
    -------
    duplicated : Series



In [16]:
def print_stats(tdf):
    dct = Counter(tdf)
    cn = len(tdf)
    for key in dct.keys():
        print(key, round(dct[key]/cn, 3))
    print(cn, end = "\n\n")
print_stats(train_y)

0 0.179
1 0.098
2 0.045
3 0.386
4 0.051
5 0.03
6 0.028
7 0.039
8 0.044
9 0.071
10 0.028
300000



In [17]:
# kmeans1 = KMeans(n_clusters=10,  n_jobs = -1, random_state = 2018).fit(train_x)
# train_c = kmeans1.predict(train_x)
# train_c = train_c.reshape(-1,1)
# train_c = enc.fit_transform(train_c)
# train_co = hstack((train_x, train_c))

In [18]:
scores = cross_val_score(lgbm, train_x, train_y, scoring = 'f1_macro', cv  = 3, n_jobs  = -1, verbose = 2)

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  8.3min finished


In [19]:
print(np.mean(scores))
print(np.square(np.mean(scores)))

0.883655080019
0.780846300444


In [None]:
#lgbm.fit(train_x,train_y)

In [697]:
sorted(dict(zip(features.columns, lgbm.feature_importances_)).items(),key=lambda x:x[1])

NotFittedError: No feature_importances found. Need to call fit beforehand.

In [None]:
plt.plot(sorted(train[train['current_service'] == 99999825]['contract_time']))

In [None]:
plt.plot(sorted(train[train['current_service'] == 99999826]['online_time']))

In [None]:
sorted(train['online_time'].unique())

In [None]:
features.columns

In [None]:
train.user_id

In [437]:
lgbm.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'multiclass',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 1}