In [24]:
import datetime
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from scipy import sparse
# matplotlib inline
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
import lightgbm as lgb
import time

sns.set(style="whitegrid", color_codes=True)
sns.set(font_scale=1)
pd.set_option('display.width',None)

In [25]:
path = './data'
data_train = pd.read_csv(path + "/age_train.csv",header=None)
data_train.columns = ['uld','label']

data_test = pd.read_csv(path + "/age_test.csv",header=None)
data_test.columns = ['uld']
data_test['label'] = -1

data_all = pd.concat([data_train,data_test])
print('data prepared step 1!')

data_user_basic = pd.read_csv(path + "/user_basic_info.csv", header=None)
user_basic_feature = ['uld','gender','city','prodName','ramCapacity','ramLeftRation','romCapacity','romLeftRation','color','fontSize','ct','carrier','os']
data_user_basic.columns = user_basic_feature

data_user_behavior = pd.read_csv(path + "/user_behavior_info.csv", header=None)
user_behavior_feature = ['uld','bootTimes','AFuncTimes','BFuncTimes','CFuncTimes','DFuncTimes','EFuncTimes','FFuncTimes','FFuncSum']
data_user_behavior.columns = user_behavior_feature

temp  = pd.merge(data_all, data_user_basic,on='uld')
data  = pd.merge(temp,data_user_behavior,on='uld')
print('data prepared step 2!')
cate_feat_list = user_basic_feature[1:] + user_behavior_feature[1:]

data prepared step 1!
data prepared step 2!


In [26]:
#编码
for i in cate_feat_list:
    data[i] = data[i].map(dict(zip(data[i].unique(),range(0,data[i].nunique()))))

data = data.fillna(data.mean())

print('data prepared complete!')
count_feature_list = []

data prepared complete!


In [27]:
print(data.columns.values)

['uld' 'label' 'gender' 'city' 'prodName' 'ramCapacity' 'ramLeftRation'
 'romCapacity' 'romLeftRation' 'color' 'fontSize' 'ct' 'carrier' 'os'
 'bootTimes' 'AFuncTimes' 'BFuncTimes' 'CFuncTimes' 'DFuncTimes'
 'EFuncTimes' 'FFuncTimes' 'FFuncSum']


In [28]:
#特征计数
def feature_count(data, features=[], is_feature=True):
    if len(set(features)) != len(features):
        print('equal feature !!!!')
        return data
    new_feature = 'count'
    nunique = []
    for i in features:
        nunique.append(data[i].nunique())
        new_feature += '_' + i.replace('add_','')
    if len(features) > 1 and len(data[features].drop_duplicates()) <= np.max(nunique):
        print(new_feature, 'is unvalid cross feature:')
        return data
    temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature})
    data = data.merge(temp,'left',on=features)
    if is_feature:
        count_feature_list.append(new_feature)
    return data

for i in cate_feat_list:
    n = data[i].nunique()
    if n > 5:
        data = feature_count(data,[i])

print(data.columns.values)

['uld' 'label' 'gender' 'city' 'prodName' 'ramCapacity' 'ramLeftRation'
 'romCapacity' 'romLeftRation' 'color' 'fontSize' 'ct' 'carrier' 'os'
 'bootTimes' 'AFuncTimes' 'BFuncTimes' 'CFuncTimes' 'DFuncTimes'
 'EFuncTimes' 'FFuncTimes' 'FFuncSum' 'count_city' 'count_prodName'
 'count_ramCapacity' 'count_ramLeftRation' 'count_romCapacity'
 'count_romLeftRation' 'count_color' 'count_fontSize' 'count_ct'
 'count_os' 'count_bootTimes' 'count_AFuncTimes' 'count_BFuncTimes'
 'count_CFuncTimes' 'count_DFuncTimes' 'count_EFuncTimes'
 'count_FFuncTimes' 'count_FFuncSum']


In [29]:
feature = cate_feat_list + count_feature_list #采用基本特征+计数特征
print(len(feature),feature)

#print(data.head())

#低频过滤
for feature in cate_feat_list:
    if 'count_' + feature in data.keys():
        print(feature)
        data.loc[data['count_'+feature]<2, feature] = -1
        data[feature] = data[feature] + 1
        
print(data.columns.values)

38 ['gender', 'city', 'prodName', 'ramCapacity', 'ramLeftRation', 'romCapacity', 'romLeftRation', 'color', 'fontSize', 'ct', 'carrier', 'os', 'bootTimes', 'AFuncTimes', 'BFuncTimes', 'CFuncTimes', 'DFuncTimes', 'EFuncTimes', 'FFuncTimes', 'FFuncSum', 'count_city', 'count_prodName', 'count_ramCapacity', 'count_ramLeftRation', 'count_romCapacity', 'count_romLeftRation', 'count_color', 'count_fontSize', 'count_ct', 'count_os', 'count_bootTimes', 'count_AFuncTimes', 'count_BFuncTimes', 'count_CFuncTimes', 'count_DFuncTimes', 'count_EFuncTimes', 'count_FFuncTimes', 'count_FFuncSum']
city
prodName
ramCapacity
ramLeftRation
romCapacity
romLeftRation
color
fontSize
ct
os
bootTimes
AFuncTimes
BFuncTimes
CFuncTimes
DFuncTimes
EFuncTimes
FFuncTimes
FFuncSum
['uld' 'label' 'gender' 'city' 'prodName' 'ramCapacity' 'ramLeftRation'
 'romCapacity' 'romLeftRation' 'color' 'fontSize' 'ct' 'carrier' 'os'
 'bootTimes' 'AFuncTimes' 'BFuncTimes' 'CFuncTimes' 'DFuncTimes'
 'EFuncTimes' 'FFuncTimes' 'FFuncSum

In [30]:
predict = data[(data.label == -1)]
#print(predict.head())
predict_result = predict[['uld']]
pred_temp = predict[['uld']]
#print(predict_result.head())
predict_result['predicted_score'] = 0
predict_x = predict.drop('label',axis=1)
print(predict_x.columns.values)
train_x = data[data.label != -1].reset_index(drop=True)
train_y = train_x.pop('label').values
print(train_y.size)
base_train_csr = sparse.csr_matrix((len(train_x),0))
base_predict_csr = sparse.csr_matrix((len(predict_x),0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


['uld' 'gender' 'city' 'prodName' 'ramCapacity' 'ramLeftRation'
 'romCapacity' 'romLeftRation' 'color' 'fontSize' 'ct' 'carrier' 'os'
 'bootTimes' 'AFuncTimes' 'BFuncTimes' 'CFuncTimes' 'DFuncTimes'
 'EFuncTimes' 'FFuncTimes' 'FFuncSum' 'count_city' 'count_prodName'
 'count_ramCapacity' 'count_ramLeftRation' 'count_romCapacity'
 'count_romLeftRation' 'count_color' 'count_fontSize' 'count_ct'
 'count_os' 'count_bootTimes' 'count_AFuncTimes' 'count_BFuncTimes'
 'count_CFuncTimes' 'count_DFuncTimes' 'count_EFuncTimes'
 'count_FFuncTimes' 'count_FFuncSum']
2010000


In [31]:
enc = OneHotEncoder()
for feature in cate_feat_list:
    enc.fit(data[feature].values.reshape(-1,1))
    base_train_csr = sparse.hstack((base_train_csr,enc.transform(train_x[feature].values.reshape(-1,1))),'csr','bool')
    base_predict_csr = sparse.hstack((base_predict_csr,enc.transform(predict[feature].values.reshape(-1,1))),'csr','bool')
print('one-hot prepared!')

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the On

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


one-hot prepared!


In [32]:
num_feature = count_feature_list
print(train_x.columns.values)
train_csr = sparse.hstack((sparse.csr_matrix(train_x[num_feature]),base_train_csr),'csr').astype('float32')
predict_csr = sparse.hstack((sparse.csr_matrix(predict_x[num_feature]),base_predict_csr),'csr').astype('float32')
lgb_model = lgb.LGBMClassifier(boosting_type='gbdt', 
                               num_leaves=122, reg_alpha=3, reg_lambda=1,
                               max_depth = -1, n_estimators=5000,objective='multiclass',num_class=6,
                               subsample=0.8,colsample_bytree=0.8,subsample_feq=1,
                               learning_rate=0.1,random_state=2018,n_jobs=10
                              )
skf = StratifiedKFold(n_splits=5, random_state=2018,shuffle=True)
best_score = []
print(train_y)

for index, (train_index, test_index) in enumerate(skf.split(train_csr,train_y)):
    lgb_model.fit(train_csr[train_index], train_y[train_index],
                 eval_set=[(train_csr[train_index],train_y[train_index]),
                          (train_csr[test_index],train_y[test_index])],early_stopping_rounds=200, verbose=10)
    best_score.append(lgb_model.best_score_['valid_1']['multi_logloss'])
    print(best_score)
    test_pred = lgb_model.predict(predict_csr)
    print(test_pred)
    pred_temp['label'] = test_pred
    now = datetime.datetime.now()
    now = now.strftime('%m-%d-%H-%M')
    pred_temp[['uld','label']].to_csv(path + "/submission/lgb_baseline_split_%s.csv" % now, index=False)
    predict_result['predicted_score'] = predict_result['predicted_score'] + test_pred
predict_result['predicted_score'] = predict_result['predicted_score'] / 5
mean = predict_result['predicted_score'].mean()
print('mean:',mean)

now = datetime.datetime.now()
now = now.strftime('%m-%d-%H-%M')
predict_result['label'] = predict_result['predicted_score']
predict_result[['uld','label']].to_csv(path + "/submission/lgb_baseline_%s.csv" % now, index=False)

['uld' 'gender' 'city' 'prodName' 'ramCapacity' 'ramLeftRation'
 'romCapacity' 'romLeftRation' 'color' 'fontSize' 'ct' 'carrier' 'os'
 'bootTimes' 'AFuncTimes' 'BFuncTimes' 'CFuncTimes' 'DFuncTimes'
 'EFuncTimes' 'FFuncTimes' 'FFuncSum' 'count_city' 'count_prodName'
 'count_ramCapacity' 'count_ramLeftRation' 'count_romCapacity'
 'count_romLeftRation' 'count_color' 'count_fontSize' 'count_ct'
 'count_os' 'count_bootTimes' 'count_AFuncTimes' 'count_BFuncTimes'
 'count_CFuncTimes' 'count_DFuncTimes' 'count_EFuncTimes'
 'count_FFuncTimes' 'count_FFuncSum']
[4 3 5 ... 2 2 3]
Training until validation scores don't improve for 200 rounds.
[10]	valid_0's multi_logloss: 1.4775	valid_1's multi_logloss: 1.47931
[20]	valid_0's multi_logloss: 1.42631	valid_1's multi_logloss: 1.42973
[30]	valid_0's multi_logloss: 1.39967	valid_1's multi_logloss: 1.40468
[40]	valid_0's multi_logloss: 1.38279	valid_1's multi_logloss: 1.3893
[50]	valid_0's multi_logloss: 1.37081	valid_1's multi_logloss: 1.37871
[60]	va

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


KeyError: 'predicted_score'

In [13]:
from sklearn.externals import joblib
joblib.dump(lgb_model,'lgb.pkl')

['lgb.pkl']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
