In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from datetime import datetime
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, KFold, cross_val_score

In [2]:
# Load the training orders (parsing the three timestamp columns as datetimes)
# and the test-set CSV.
train = pd.read_csv(
    './data/round1_diac2019_train.csv',
    parse_dates=['order_pay_time', 'goods_list_time', 'goods_delist_time'],
)
test = pd.read_csv('./data/round1_diac2019_test.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Print every column with its dtype and non-null count to spot missing data.
# NOTE(review): newer pandas releases expose this option as `show_counts`
# instead of `null_counts` — confirm against the installed version before upgrading.
train.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400831 entries, 0 to 1400830
Data columns (total 28 columns):
order_detail_id           1400831 non-null int64
order_id                  1400831 non-null int64
order_total_num           1400831 non-null int64
order_amount              1400831 non-null float64
order_total_payment       1400831 non-null float64
order_total_discount      1400831 non-null float64
order_pay_time            1400831 non-null datetime64[ns]
order_status              1400831 non-null int64
order_count               1400831 non-null int64
is_customer_rate          1315964 non-null float64
order_detail_status       1400831 non-null int64
order_detail_goods_num    1400831 non-null int64
order_detail_amount       1315964 non-null float64
order_detail_payment      1400831 non-null float64
order_detail_discount     1400831 non-null float64
customer_province         1315315 non-null object
customer_city             1315315 non-null object
member_id                 140

In [4]:
# Keep only the rows that have a customer_id.
train = train.loc[train.customer_id.notna()]

In [5]:
# Give every row of a customer the same province / city: the last non-null
# value recorded for that customer.
location_cols = ['customer_province', 'customer_city']
part_train = train[['customer_id'] + location_cols]
last_location = part_train.groupby('customer_id').last()
train = train.drop(location_cols, axis=1).merge(last_location, on='customer_id', how='left')

In [7]:
# Missing province / city values are replaced by the placeholder '未知' ("unknown").
for location_col in ('customer_province', 'customer_city'):
    train[location_col] = train[location_col].fillna('未知')

In [8]:
# Too many missing values: drop the column altogether.
train = train.drop(['member_status'], axis=1)

In [9]:
# Treat a missing flag as 0 for both indicator columns.
train = train.fillna({'is_member_actived': 0, 'is_customer_rate': 0})

In [10]:
# Cast the two flags and the customer id to integer dtypes.
train = train.astype({
    'is_customer_rate': 'int16',
    'customer_id': 'int64',
    'is_member_actived': 'int16',
})

In [11]:
# The total amount should equal discount + amount actually paid, so both
# amount columns are recomputed from their components.
train = train.assign(
    order_detail_amount=train['order_detail_discount'] + train['order_detail_payment'],
    order_amount=train['order_total_discount'] + train['order_total_payment'],
)

In [12]:
# Label-encode the province; `le` stays fitted so the mapping can be reused.
le = LabelEncoder()
train['customer_province'] = le.fit_transform(train['customer_province'])

In [13]:
# Label-encode the city; `le2` stays fitted so the mapping can be reused.
le2 = LabelEncoder()
train['customer_city'] = le2.fit_transform(train['customer_city'])

In [14]:
# Predict gender with a model: start from the rows whose gender is known (1 or 2).
# The explicit .copy() makes this an independent frame, so assigning to its
# columns later does not raise SettingWithCopyWarning.
notnull_gender = train.loc[(train.customer_gender==1) | (train.customer_gender==2)].copy()

In [15]:
# Cast the known gender labels to int16. `assign` returns a new frame instead of
# writing into a slice of `train`, which avoids SettingWithCopyWarning.
notnull_gender = notnull_gender.assign(
    customer_gender=notnull_gender.customer_gender.astype('int16')
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [17]:
# Timestamp columns that are removed before building any feature matrix.
drop_columns = [
    'order_pay_time',
    'goods_list_time',
    'goods_delist_time',
]

In [18]:
# Target and features for the gender classifier, followed by a stratified
# 70/30 train/validation split (timestamp columns excluded from the features).
y = notnull_gender['customer_gender']
X = notnull_gender.drop('customer_gender', axis=1)
X_gTrain, X_gValid, y_gTrain, y_gValid = train_test_split(
    X.drop(drop_columns, axis=1),
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [19]:
# Random forest used to impute the missing gender values.
gender_rf_params = dict(
    n_estimators=470,
    min_samples_split=20,
    n_jobs=4,
    min_samples_leaf=10,
    max_depth=21,
    max_features='sqrt',
    random_state=10,
    verbose=False,
    oob_score=True,
)
gender_rf = RandomForestClassifier(**gender_rf_params)

In [20]:
# Fit the gender classifier on the training part of the known-gender rows.
gender_rf.fit(X_gTrain, y_gTrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=21, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, n_estimators=470, n_jobs=4,
                       oob_score=True, random_state=10, verbose=False,
                       warm_start=False)

In [21]:
# Accuracy of the gender classifier on the held-out validation rows.
gValid_pred = gender_rf.predict(X_gValid)
accuracy_score(y_gValid, gValid_pred)

0.8400120956903503

In [22]:
# Rows whose gender is neither 1 nor 2, i.e. the ones to be imputed.
# The explicit .copy() makes this an independent frame, so writing the predicted
# gender into it later does not raise SettingWithCopyWarning.
null_fill_gender = train.loc[~((train.customer_gender==1) | (train.customer_gender==2))].copy()

In [23]:
# Predict the gender of the unknown-gender rows from the same feature columns
# used for training (timestamps and the gender column itself removed).
gender_features = null_fill_gender.drop(drop_columns + ['customer_gender'], axis=1)
null_new_gender = gender_rf.predict(gender_features)

In [25]:
# Store the predicted gender. `assign` returns a new frame instead of writing
# into a slice of `train`, which avoids SettingWithCopyWarning.
null_fill_gender = null_fill_gender.assign(customer_gender=null_new_gender)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [27]:
# Stack the imputed-gender rows and the known-gender rows back into one table
# (row-wise concatenation is the default axis).
train = pd.concat([null_fill_gender, notnull_gender])

In [28]:
# Order the rows chronologically by payment time.
train = train.sort_values('order_pay_time')

In [30]:
# Use the first 180 days as the labelled training window: orders paid up to and
# including 2013-07-03 00:00 form the feature period, later orders the label period.
# data_before receives new columns afterwards, so it is taken as an explicit
# copy to avoid SettingWithCopyWarning.
data_before = train.loc[train['order_pay_time'] <= datetime(2013, 7, 3)].copy()
data_after = train.loc[train['order_pay_time'] > datetime(2013, 7, 3)]

In [31]:
# Build the label: 1 when the customer also appears in the later period, else 0.
# `assign` returns a new frame, so no value is written into a slice of `train`
# (the original item assignment emitted SettingWithCopyWarning).
data_before = data_before.assign(
    label=np.isin(data_before['customer_id'], data_after['customer_id']).astype(np.int16)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [32]:
# Goods still listed within the half-year window.
# The flags are computed with vectorised comparisons instead of a per-row
# Python lambda; a missing listing time compares False and therefore yields 0,
# exactly as before.
# NOTE(review): the first window is half-open (< 2014-01-01) while the second is
# closed (<= 2014-07-03) — bounds kept as they were; confirm this is intended.
# For the training window: listing time in [2013-07-03, 2014-01-01).
before_list_time = data_before['goods_list_time']
test_still_list = (
    (before_list_time >= datetime(2013, 7, 3)) & (before_list_time < datetime(2014, 1, 1))
).astype('int64')
# For the full data set: listing time in [2014-01-01, 2014-07-03].
all_list_time = train['goods_list_time']
still_list = (
    (all_list_time >= datetime(2014, 1, 1)) & (all_list_time <= datetime(2014, 7, 3))
).astype('int64')

In [33]:
# Attach the still-listed flags (aligned on the row index). `assign` returns a
# new frame for data_before, so nothing is written into a slice of `train`
# (the original item assignment emitted SettingWithCopyWarning).
data_before = data_before.assign(still_list=test_still_list)
train['still_list'] = still_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [34]:
# Order payment divided by the order's total item count. `assign` returns a new
# frame for data_before, so nothing is written into a slice of `train`
# (the original item assignment emitted SettingWithCopyWarning).
data_before = data_before.assign(
    payment=data_before['order_total_payment'] / data_before['order_total_num']
)
train['payment'] = train['order_total_payment'] / train['order_total_num']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [35]:
 def handle(df):
     """Aggregate per-customer totals from a row-level order table.

     Parameters
     ----------
     df : pandas.DataFrame
         Must contain the columns ``customer_id``, ``still_list``,
         ``is_customer_rate`` and ``payment``.

     Returns
     -------
     pandas.DataFrame
         One row per distinct ``customer_id`` (in sorted key order) with columns
         ``customer_id``, ``list_goods_num`` (sum of ``still_list``, i.e. how many
         purchased goods are still listed in the half-year window),
         ``rate_num`` (sum of ``is_customer_rate``) and ``payment``
         (sum of ``payment``).
     """
     grouped = df.groupby('customer_id')
     # The group keys are taken from an aggregation that is needed anyway,
     # rather than counting every column of the frame just to read its index.
     still_listed = grouped['still_list'].sum()
     return pd.DataFrame({
         'customer_id': still_listed.index,
         'list_goods_num': still_listed.values,
         'rate_num': grouped['is_customer_rate'].sum().values,
         'payment': grouped['payment'].sum().values,
     })

In [37]:
# One row per customer for the labelled window: keep the first row seen for each
# customer (timestamps removed) and join the per-customer aggregates onto it.
# Both sides carry a `payment` column, so the merge keeps both under suffixed names.
data_before_tool = handle(data_before)
before_first_rows = data_before['customer_id'].drop_duplicates().index
data_before2 = data_before.drop(drop_columns, axis=1).loc[before_first_rows]
data_before3 = data_before2.merge(data_before_tool, on='customer_id')

In [38]:
# Same per-customer table for the full data set: first row per customer
# (timestamps removed) joined with the per-customer aggregates.
# Both sides carry a `payment` column, so the merge keeps both under suffixed names.
train_tool = handle(train)
train_first_rows = train['customer_id'].drop_duplicates().index
train2 = train.drop(drop_columns, axis=1).loc[train_first_rows]
train3 = train2.merge(train_tool, on='customer_id')

In [39]:
def model_data(data):
    """Split a labelled table into stratified train / validation arrays.

    ``data`` must contain a ``label`` column; every other column is used as a
    feature. Returns ``(X_train, X_valid, y_train, y_valid)`` as produced by
    ``train_test_split`` with a 10% validation share, ``random_state=42`` and
    stratification on the label.
    """
    labels = data['label'].values
    features = data.drop('label', axis=1).values
    return train_test_split(
        features,
        labels,
        test_size=0.1,
        random_state=42,
        stratify=labels,
    )

In [40]:
# Train / validation arrays come from the labelled window; X_all is the feature
# matrix of every customer in the full data set (train3 has no label column,
# so nothing needs to be dropped from it).
X_train, X_valid, y_train, y_valid = model_data(data_before3)
X_all = train3.values

In [30]:
# Model training: LightGBM binary classifier configuration.
lgb_params = dict(
    boosting_type='gbdt',
    num_leaves=64,
    reg_alpha=0.1,
    reg_lambda=1.0,
    max_depth=-1,
    n_estimators=10000,
    objective='binary',
    metrics='None',
    bagging_fraction=0.8,
    is_unbalance=False,
    bagging_freq=5,
    min_child_samples=80,
    feature_fraction=0.8,
    learning_rate=0.1,
    random_state=42,
    n_jobs=8,
)
lgb_model = lgb.LGBMClassifier(**lgb_params)

### RF

In [41]:
# Random forest used for the repurchase prediction.
rf_params = dict(
    n_estimators=400,
    min_samples_split=35,
    n_jobs=4,
    min_samples_leaf=20,
    max_depth=19,
    max_features='sqrt',
    random_state=10,
    verbose=False,
    oob_score=True,
)
rf = RandomForestClassifier(**rf_params)

In [42]:
# Fit the forest on the training part of the labelled window.
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=19, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=20, min_samples_split=35,
                       min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=4,
                       oob_score=True, random_state=10, verbose=False,
                       warm_start=False)

In [45]:
# Class probabilities for every customer of the full data set; column 1 is
# later passed to get_result as the score.
rf_pred = rf.predict_proba(X_all)

### 提交数据

In [46]:
def get_result(all_data, pred):
    """Fill the test table with predicted scores and save it as a CSV.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Must contain a ``customer_id`` column; its rows correspond one-to-one
        to the entries of ``pred``.
    pred : array-like
        One score per row of ``all_data``.

    Returns
    -------
    pandas.DataFrame
        The module-level ``test`` frame left-joined with the scores on
        ``customer_id``, with its ``result`` column set to the score and every
        row containing a missing value removed. The same frame is also written
        to ``./data/submit.csv`` (without the index).
    """
    scores = pd.DataFrame({'customer_id': all_data['customer_id'].values, 'result2': pred})
    result = test.merge(scores, on='customer_id', how='left')
    # Item assignment always sets the column; attribute assignment would only
    # create a plain Python attribute if `test` had no `result` column.
    result['result'] = result['result2']
    result = result.drop(['result2'], axis=1).dropna()
    result.to_csv('./data/submit.csv', index=False)
    return result

In [47]:
# Build and save the submission from the positive-class probability (column 1).
rf_result = get_result(train3, rf_pred[:, 1])

In [50]:
# Show the customers whose predicted score exceeds 0.8.
high_score_mask = rf_result['result'] > 0.8
rf_result[high_score_mask]

Unnamed: 0,customer_id,result
1208,921.0,0.832783
5184,5485.0,0.845010
5256,5218.0,0.834173
6211,6055.0,0.865779
6222,5835.0,0.833096
9073,9422.0,0.812629
9477,9754.0,0.816871
9524,9800.0,0.832430
10090,10122.0,0.854199
10251,10464.0,0.811006
