In [13]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

from datetime import date

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline

# machine learning models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

# display for this notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
os.getcwd()

'/Users/Bin/repos/competitions/O2O-Coupon-Usage-Forecast'

In [14]:
# import data
dfoff = pd.read_csv('../../ml-datasets/O2O-Coupon-Usage-Forecast/ccf_offline_stage1_train.csv')
dfon = pd.read_csv('../../ml-datasets/O2O-Coupon-Usage-Forecast/ccf_online_stage1_train.csv')
dftest = pd.read_csv('../../ml-datasets/O2O-Coupon-Usage-Forecast/ccf_offline_stage1_test_revised.csv')

## 1. 探索数据
参考单独的 EDA jupyter notebook。
## 2. 特征工程
### 2.1 折扣率 Discount_rate
```python
# print(dfoff['Discount_rate'].unique())
[nan '150:20' '20:1' '200:20' '30:5' '50:10' '10:5' '100:10' '200:30'
 '20:5' '30:10' '50:5' '150:10' '100:30' '200:50' '100:50' '300:30'
 '50:20' '0.9' '10:1' '30:1' '0.95' '100:5' '5:1' '100:20' '0.8' '50:1'
 '200:10' '300:20' '100:1' '150:30' '300:50' '20:10' '0.85' '0.6' '150:50'
 '0.75' '0.5' '200:5' '0.7' '30:20' '300:10' '0.2' '50:30' '200:100'
 '150:5']
```

根据打印的结果来看，打折率分为 3 种情况：

* ‘null’ 表示没有打折
* [0,1] 表示折扣率
* x:y 表示满 x 减 y

那我们的处理方式可以构建 4 个函数，分别提取 4 种特征，分别是：

1. 打折类型：Discount_rate_to_type()
2. 折扣率：Discount_rate_zero_to_one()
3. 满多少：getDiscountMan()
4. 减多少：getDiscountJian()

In [3]:
# 1. 通过不同类型的折扣优惠可以得到类型特征
def Discount_rate_to_type(row):
    if pd.isnull(row):
        return 0
    elif ':' in row:
        return 1
    elif '.' in row:
        return 2

# 2. 将 Discount_rate 字符串转化成 [0, 1]之间小数
def Discount_rate_zero_to_one(row):
    if pd.isnull(row):
        return 0
    if ':' in row:
        sps = row.split(':')
        return (float(sps[0]) - float(sps[1])) / float(sps[0])
    return float(row)

# 3. 统计折扣率满减时需要满多少金额
def Discount_rate_man(row):
    if pd.isnull(row):
        return 0
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

# 4. 统计折扣率满减时需要满多少金额
def Discount_rate_jian(row):
    if pd.isnull(row):
        return 0
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0
    
# 汇总 Discount_rate 特征提取
def Discount_rate_feat(df):
    df['Discount_rate_type'] = df['Discount_rate'].apply(Discount_rate_to_type)
    df['Discount_rate_ratio'] = df['Discount_rate'].apply(Discount_rate_zero_to_one)
    df['Discount_rate_man'] = df['Discount_rate'].apply(Discount_rate_man)
    df['Discount_rate_jian'] = df['Discount_rate'].apply(Discount_rate_jian)
    # 填充缺失值
    df['Discount_rate'].fillna(0, inplace=True)
    return df

### 2.2 领券时间
还有一点很重要的是领券日期，因为一般而言，周末领取优惠券去消费的可能性更大一些。因此，我们可以构建关于领券日期的一些特征：

* weekday : {null, 1, 2, 3, 4, 5, 6, 7}
* weekday_type : {1, 0}（周六和周日为1，其他为0）
* Weekday_1 : {1, 0, 0, 0, 0, 0, 0}
* Weekday_2 : {0, 1, 0, 0, 0, 0, 0}
* Weekday_3 : {0, 0, 1, 0, 0, 0, 0}
* Weekday_4 : {0, 0, 0, 1, 0, 0, 0}
* Weekday_5 : {0, 0, 0, 0, 1, 0, 0}
* Weekday_6 : {0, 0, 0, 0, 0, 1, 0}
* Weekday_7 : {0, 0, 0, 0, 0, 0, 1}

In [4]:
# 1. 获取时间是一周的第几天
def get_weekday(row):
    if pd.isnull(row):
        return np.nan
    if row == 'nan':
        return np.nan
    else:
        return date(int(row[0:4]), int(row[4:6]), int(row[6:8])).weekday() + 1

# 2. 获取是否为周末的特征
def get_weekday_type(row):
    if row in [6, 7]:
        return 1
    else:
        return 0

    
# ## 处理 Date_received，为空的返回为零，其他的按位取
# def Date_received_split_month(row):
#     if pd.isnull(row):
#         return 0
#     row_s = str(int(row))
#     return float(row_s[4:6])

# def Date_received_split_day(row):
#     if pd.isnull(row):
#         return 0
#     row_s = str(int(row))
#     return float(row_s[4:6])

# 汇总 Date_received 特征
def Date_received_feat(df):
#     df['Date_received_month'] = df['Date_received'].apply(Date_received_split_month)
#     df['Date_received_day'] = df['Date_received'].apply(Date_received_split_day)
    
    df['weekday'] = df['Date_received'].astype(str).apply(get_weekday)
    df['weekday_type'] = df['weekday'].apply(get_weekday_type)
    df = pd.concat((df, pd.get_dummies(df['weekday'], prefix='weekday_')), axis=1)
    
    # 处理缺失值
    df['weekday'].fillna(0, inplace=True)
    return df

### 2.3 距离 Distance
距离已经被处理过，那么就主要处理缺失值

In [5]:
# 3. Distance
def Distance_feat(df):
    # 缺失值
    df['Distance'].fillna(df['Distance'].median(), inplace=True)
    
    # 归一化距离
#     df['Distance'] = MinMaxScaler().fit_transform(df['Distance'].values.reshape(-1, 1))
    return df

### 2.4 类标
标注标签 Label
有了特征之后，我们还需要对训练样本进行 label 标注，即确定哪些是正样本（y = 1），哪些是负样本（y = 0）。我们要预测的是用户在领取优惠券之后 15 之内的消费情况。所以，总共有三种情况：

1. Date_received == ‘null’：

表示没有领到优惠券，无需考虑，y = -1

2. (Date_received != ‘null’) & (Date != ‘null’) & (Date – Date_received <= 15)：

表示领取优惠券且在15天内使用，即正样本，y = 1

3. (Date_received != ‘null’) & ((Date == ‘null’) | (Date – Date_received > 15))：

表示领取优惠券未在在15天内使用，即负样本，y = 0

好了，知道规则之后，我们就可以定义标签备注函数了。

In [6]:
def get_label(row):
    if row['Date_received'] == 'nan':
        return -1
    if pd.notnull(row['Date']):
        if pd.to_datetime(row['Date'], format='%Y%m%d') - pd.to_datetime(row['Date_received'], format='%Y%m%d') < pd.Timedelta(15, 'D'):
            return 1
    return 0

def label_feat(df):
    df['label'] = df.apply(get_label, axis=1)
    return df

### 特征工程处理类

In [7]:
# 线下特征处理汇总
def off_feats(df_, no_discount_rate=True):
    # 创建一个拷贝，修改不影响原数据，这样就不用因为原数据被修改从头 run 一遍，节省时间，不要老对内存扣扣搜搜的，硬件问题不会特别大
    df = df_.copy()

    df = Discount_rate_feat(df)
    df = Date_received_feat(df)
    df = Distance_feat(df)
#     df = label_feat(df)
    
    # drop featues
    if no_discount_rate:
        feats = ['Discount_rate', 'Date_received']
        if 'Date' in df.columns:
            feats.append('Date')
        df.drop(feats, axis=1, inplace=True)
    return df

# 待丰富，做特征工程的类
class FeatureConverting(BaseEstimator, TransformerMixin):
    def __init__(self, no_discount_rate=True):
        self.no_discount_rate = no_discount_rate
    def fit(self, X, y=None):
        return self   # Nothing else to do
    def transform(self, X, y=None):
        return off_feats(X, self.no_discount_rate)
#     def fit_transform(self, X, y=None):
#         return off_feats(X)

## 选一个模型

In [8]:
# 准备数据：拆分线下数据成训练集和测试集
## 找到label
dfoff = label_feat(dfoff)
train_off, test_off = train_test_split(dfoff, test_size=0.2, random_state=42)

In [9]:
# 单模型测试
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# 管道机制使得参数集在新数据集（比如测试集）上的重复使用，管道机制实现了对全部步骤的流式化封装和管理。
feat_eng = Pipeline([
    ('fc', FeatureConverting()), # transformer
])

feats = ['Discount_rate', 'Distance', 'Date_received', 'Date']
feat_eng_tr = feat_eng.fit_transform(train_off[feats])
feat_eng_ts = feat_eng.transform(test_off[feats])

In [15]:
# # 下采样
# from imblearn.under_sampling import RandomUnderSampler
# cc = RandomUnderSampler(random_state=0)

# X_train, y_train = cc.fit_sample(feat_eng_tr, train_off['label'])
X_train, y_train = feat_eng_tr, train_off['label']

clf = RandomForestClassifier().fit(X_train, y_train)
y_pred = clf.predict(feat_eng_ts)

from sklearn.metrics import roc_auc_score
roc_auc_score(test_off['label'], y_pred)

0.5001444991713621

In [None]:
# 对结果预测
submit = dftest[['User_id', 'Coupon_id', 'Date_received']]

feat_dftest = feat_eng.transform(dftest[['Discount_rate', 'Distance', 'Date_received']])

submit['Probability'] = clf.predict_proba(feat_dftest)[:,0]
submit.to_csv('submission.csv', header=None, index=False)

In [17]:
# 多个模型批量跑
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# 管道机制使得参数集在新数据集（比如测试集）上的重复使用，管道机制实现了对全部步骤的流式化封装和管理。
feat_eng = Pipeline([
    ('fc', FeatureConverting()), # transformer
])

feats = ['Discount_rate', 'Distance', 'Date_received']
feat_eng_tr = feat_eng.fit_transform(train_off[feats])
feat_eng_ts = feat_eng.transform(test_off[feats])

# 下采样
from imblearn.under_sampling import RandomUnderSampler
cc = RandomUnderSampler(random_state=0)
X_train, y_train = cc.fit_sample(feat_eng_tr, train_off['label'])

models = tqdm([LogisticRegression(), GaussianNB(), DecisionTreeClassifier(), RandomForestClassifier()])

for model in models:
    clf = model.fit(X_train, y_train)
    y_pred = clf.predict(feat_eng_ts)
    print(type(model).__name__, roc_auc_score(test_off['label'], y_pred), accuracy_score(test_off['label'], y_pred))

 50%|█████     | 2/4 [00:01<00:01,  1.60it/s]

('LogisticRegression', 0.7849732755882847, 0.6816543534191699)
('GaussianNB', 0.7205499132742075, 0.47964681446362584)


 75%|███████▌  | 3/4 [00:01<00:00,  1.94it/s]

('DecisionTreeClassifier', 0.7981439817937237, 0.7234120754351425)


100%|██████████| 4/4 [00:02<00:00,  1.65it/s]

('RandomForestClassifier', 0.7984330558818808, 0.7192494095054661)



