In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from datetime import datetime
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the competition files; the three timestamp columns of the training
# file are parsed into datetimes while reading.
train = pd.read_csv(
    './data/round1_diac2019_train.csv',
    parse_dates=['order_pay_time', 'goods_list_time', 'goods_delist_time'],
)
test = pd.read_csv('./data/round1_diac2019_test.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# List every column of the raw training frame with its non-null count and dtype.
# NOTE(review): newer pandas releases spell this option `show_counts` --
# confirm against the installed pandas version before upgrading.
train.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400831 entries, 0 to 1400830
Data columns (total 28 columns):
order_detail_id           1400831 non-null int64
order_id                  1400831 non-null int64
order_total_num           1400831 non-null int64
order_amount              1400831 non-null float64
order_total_payment       1400831 non-null float64
order_total_discount      1400831 non-null float64
order_pay_time            1400831 non-null datetime64[ns]
order_status              1400831 non-null int64
order_count               1400831 non-null int64
is_customer_rate          1315964 non-null float64
order_detail_status       1400831 non-null int64
order_detail_goods_num    1400831 non-null int64
order_detail_amount       1315964 non-null float64
order_detail_payment      1400831 non-null float64
order_detail_discount     1400831 non-null float64
customer_province         1315315 non-null object
customer_city             1315315 non-null object
member_id                 140

In [4]:
# Keep only the rows that carry a customer_id.
train = train.loc[train.customer_id.notna()]

In [7]:
# Give every row of a customer the same province / city: take the value that
# groupby(...).last() reports per customer and join it back onto the orders.
part_train = train[['customer_id', 'customer_province', 'customer_city']]
customer_location = part_train.groupby('customer_id').last()
train = (
    train
    .drop(['customer_province', 'customer_city'], axis=1)
    .merge(customer_location, on='customer_id', how='left')
)

In [8]:
# Re-inspect column dtypes and non-null counts after the customer-level merge.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1400764 entries, 0 to 1400763
Data columns (total 28 columns):
order_detail_id           1400764 non-null int64
order_id                  1400764 non-null int64
order_total_num           1400764 non-null int64
order_amount              1400764 non-null float64
order_total_payment       1400764 non-null float64
order_total_discount      1400764 non-null float64
order_pay_time            1400764 non-null datetime64[ns]
order_status              1400764 non-null int64
order_count               1400764 non-null int64
is_customer_rate          1315897 non-null float64
order_detail_status       1400764 non-null int64
order_detail_goods_num    1400764 non-null int64
order_detail_amount       1315897 non-null float64
order_detail_payment      1400764 non-null float64
order_detail_discount     1400764 non-null float64
member_id                 1400764 non-null int64
customer_id               1400764 non-null float64
customer_gender           551

In [9]:
# Missing province / city values are replaced by the placeholder '未知' ("unknown").
train = train.fillna({'customer_province': '未知', 'customer_city': '未知'})

In [10]:
# These two columns have too many missing values to be useful, so they are removed.
train = train.drop(columns=['customer_gender', 'member_status'])

In [11]:
# Treat a missing flag as 0 for both indicator columns.
flag_columns = ['is_member_actived', 'is_customer_rate']
train[flag_columns] = train[flag_columns].fillna(0)

In [12]:
# Cast the filled flag columns and the id column to integer dtypes.
# (customer_gender and member_status were cast here in an earlier version;
# both columns are dropped before this cell, so no cast is applied to them.)
train = train.astype({
    'is_customer_rate': 'int16',
    'customer_id': 'int64',
    'is_member_actived': 'int16',
})

In [13]:
# A gross amount should equal discount + amount actually paid, so both
# amount columns are overwritten with that sum.
train = train.assign(
    order_detail_amount=train['order_detail_discount'] + train['order_detail_payment'],
    order_amount=train['order_total_discount'] + train['order_total_payment'],
)

In [14]:
# Province -> integer codes (label encoding); `le` keeps the fitted mapping.
le = LabelEncoder()
train['customer_province'] = le.fit_transform(train['customer_province'])

In [15]:
# City -> integer codes (label encoding); `le2` keeps the fitted mapping.
le2 = LabelEncoder()
train['customer_city'] = le2.fit_transform(train['customer_city'])

In [18]:
# Order the rows chronologically by payment time.
train = train.sort_values('order_pay_time')

In [70]:
# Split the orders at 2013-07-03 00:00: rows paid at or before the cutoff form
# the feature window (the original note describes it as the first 180 days),
# rows paid after it are only used to derive the labels.
split_time = datetime(2013, 7, 3)
# .copy() makes data_before an independent frame: later cells add columns to
# it, which on a plain slice of `train` raises SettingWithCopyWarning.
data_before = train.loc[train['order_pay_time'] <= split_time].copy()
data_after = train.loc[train['order_pay_time'] > split_time]

In [71]:
# Build the label: 1 when the customer also appears in data_after, else 0.
# assign() returns a new frame, so no column is written into a slice of
# `train` (the cause of the SettingWithCopyWarning this cell used to emit).
data_before = data_before.assign(
    label=data_before['customer_id'].isin(data_after['customer_id']).astype('int16')
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [72]:
# Flag goods by listing time: 1 when goods_list_time falls inside the window,
# 0 otherwise (missing timestamps compare False and therefore give 0).
# The comparisons are vectorised instead of calling a Python lambda per row.
# NOTE(review): the original note calls this "goods still listed within the
# half year", but only goods_list_time is consulted, never goods_delist_time.

# Feature-window rows: listed in [2013-07-03, 2014-01-01).
before_list_time = data_before['goods_list_time']
test_still_list = (
    (before_list_time >= datetime(2013, 7, 3)) & (before_list_time < datetime(2014, 1, 1))
).astype('int64')

# Full data set: listed in [2014-01-01, 2014-07-03].
all_list_time = train['goods_list_time']
still_list = (
    (all_list_time >= datetime(2014, 1, 1)) & (all_list_time <= datetime(2014, 7, 3))
).astype('int64')

In [73]:
# Attach the listing flags (aligned on the row index).
# data_before is rebuilt with assign() so nothing is written into a slice of
# `train`, which is what triggered SettingWithCopyWarning here.
data_before = data_before.assign(still_list=test_still_list)
train['still_list'] = still_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [74]:
# Per-row 'payment' feature: order_total_payment divided by order_total_num.
# data_before is rebuilt with assign() so nothing is written into a slice of
# `train`, which is what triggered SettingWithCopyWarning here.
data_before = data_before.assign(
    payment=data_before['order_total_payment'] / data_before['order_total_num']
)
train['payment'] = train['order_total_payment'] / train['order_total_num']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [75]:
 def handle(df):
     """Aggregate order rows into one feature row per customer.

     Parameters
     ----------
     df : pandas.DataFrame
         Must contain the columns 'customer_id', 'still_list',
         'is_customer_rate' and 'payment'.

     Returns
     -------
     pandas.DataFrame
         One row per distinct customer_id (ascending), default integer index,
         with the columns 'customer_id', 'list_goods_num' (sum of
         'still_list'), 'rate_num' (sum of 'is_customer_rate') and 'payment'
         (sum of 'payment').
     """
     # A single grouped sum keeps the three totals aligned with their group
     # key; no count over every column is needed just to recover the keys.
     totals = df.groupby('customer_id')[['still_list', 'is_customer_rate', 'payment']].sum()
     totals = totals.rename(
         columns={'still_list': 'list_goods_num', 'is_customer_rate': 'rate_num'}
     )
     return totals.reset_index()

In [76]:
# Show the columns currently present in the feature-window frame.
data_before.columns

Index(['order_detail_id', 'order_id', 'order_total_num', 'order_amount',
       'order_total_payment', 'order_total_discount', 'order_pay_time',
       'order_status', 'order_count', 'is_customer_rate',
       'order_detail_status', 'order_detail_goods_num', 'order_detail_amount',
       'order_detail_payment', 'order_detail_discount', 'member_id',
       'customer_id', 'is_member_actived', 'goods_id', 'goods_price',
       'goods_status', 'goods_has_discount', 'goods_list_time',
       'goods_delist_time', 'customer_province', 'customer_city', 'still_list',
       'payment', 'label'],
      dtype='object')

In [115]:
# Columns removed before the frames are turned into feature matrices: the
# three raw timestamp columns.
# An earlier variant of this list (kept here for reference) was:
#   goods_id, order_total_payment, order_total_num, order_pay_time,
#   goods_list_time, is_customer_rate, still_list, payment
drop_columns = ['order_pay_time', 'goods_list_time', 'goods_delist_time']

In [116]:
# One row per customer for the feature window.
# Per-customer aggregates computed by handle().
data_before_tool = handle(data_before)
# First row of each customer, selected directly with drop_duplicates(): the
# former `.loc[<deduplicated index>]` round trip returns extra rows whenever
# index labels repeat.
data_before2 = data_before.drop(drop_columns, axis=1).drop_duplicates(subset='customer_id')
# Both sides carry a 'payment' column, so the merge keeps them as
# payment_x (row level) and payment_y (customer total).
data_before3 = data_before2.merge(data_before_tool, on='customer_id')

In [117]:
# One row per customer for the full data set.
# Per-customer aggregates computed by handle().
train_tool = handle(train)
# First row of each customer, selected directly with drop_duplicates(): the
# former `.loc[<deduplicated index>]` round trip returns extra rows whenever
# index labels repeat.
train2 = train.drop(drop_columns, axis=1).drop_duplicates(subset='customer_id')
# Both sides carry a 'payment' column, so the merge keeps them as
# payment_x (row level) and payment_y (customer total).
train3 = train2.merge(train_tool, on='customer_id')

In [118]:
def model_data(data):
    """Split a labelled frame into stratified train / validation arrays.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'label' column; every other column is used as a
        feature.

    Returns
    -------
    list
        The result of train_test_split, in the order
        X_train, X_valid, y_train, y_valid. 10% of the rows are held out,
        stratified on the label, with a fixed random_state of 42.
    """
    labels = data['label'].values
    features = data.drop(['label'], axis=1).values
    return train_test_split(
        features,
        labels,
        test_size=0.1,
        random_state=42,
        stratify=labels,
    )

In [119]:
# Feature matrix for every customer of the full data set (all columns kept).
# NOTE(review): the column layout must match the features produced by
# model_data(data_before3) -- confirm both frames carry the same columns.
X_all = train3.values
# Stratified train / validation split of the labelled feature window.
X_train, X_valid, y_train, y_valid = model_data(data_before3)

### Random forest (RF)

In [148]:
# Random forest with hand-picked hyper-parameters; random_state pins the run.
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=19,
    max_features='sqrt',
    min_samples_split=35,
    min_samples_leaf=20,
    oob_score=True,
    random_state=10,
    verbose=False,
)

In [149]:
# Fit the forest on the training part of the labelled feature window.
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=19, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=20, min_samples_split=35,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=None, oob_score=True, random_state=10,
                       verbose=False, warm_start=False)

In [150]:
# Class probabilities for the held-out validation rows (one column per class).
pred3 = rf.predict_proba(X_valid)

In [151]:
# Validation log loss, scored on the second probability column (label 1).
log_loss(y_valid, pred3[:, 1])

0.3501532782579421

In [146]:
%%time
param1 = {'min_samples_split':range(30,70,5)}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(min_samples_split=100, n_estimators=400, max_depth=19,
                                  max_features='sqrt' ,random_state=10, oob_score=True), 
                       param_grid = param1, scoring='neg_log_loss',cv=3)
gsearch1.fit(X_train, y_train)

CPU times: user 1h 14min 59s, sys: 37.3 s, total: 1h 15min 37s
Wall time: 1h 15min 47s


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=19,
                                              max_features='sqrt',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=100,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=400, n_jobs=None,
                                              oob_score=True, random_state=10,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'min_

In [154]:
# Class probabilities for every row of X_all (the full per-customer matrix).
rf_pred = rf.predict_proba(X_all)

### 提交数据 (Build the submission file)

In [37]:
def get_result(all_data, pred, out_path='./data/submit.csv'):
    """Join predictions onto the submission template and write it as CSV.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Must contain a 'customer_id' column, one row per prediction.
    pred : array-like
        One score per row of `all_data`, in the same order.
    out_path : str, optional
        Destination of the CSV file. Defaults to './data/submit.csv', the
        location that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The rows of the module-level `test` frame that received a score,
        with the 'result' column set to that score. Rows containing any
        missing value (e.g. customers without a prediction) are dropped.
    """
    scores = pd.DataFrame({
        'customer_id': all_data['customer_id'].values,
        'result2': pred,
    })
    # Left join keeps the template's row order; `test` is only read, so no
    # `global` declaration is required.
    result = test.merge(scores, on='customer_id', how='left')
    # Item assignment always writes a column; attribute assignment
    # (`result.result = ...`) would silently set a plain attribute instead if
    # the template had no 'result' column.
    result['result'] = result['result2']
    result = result.drop(columns=['result2']).dropna()
    result.to_csv(out_path, index=False)
    return result

In [155]:
# Write the submission using the second probability column (label 1) as score.
rf_result = get_result(train3, rf_pred[:, 1])

In [158]:
# Inspect the customers whose predicted score exceeds 0.8.
rf_result.loc[rf_result.result>0.8]

Unnamed: 0,customer_id,result
5184,5485.0,0.817526
5256,5218.0,0.802154
6211,6055.0,0.841531
6222,5835.0,0.817040
9524,9800.0,0.802350
10090,10122.0,0.823482
13081,13448.0,0.802626
13963,14009.0,0.833354
14288,14532.0,0.813196
14720,14175.0,0.819458
