https://github.com/wepe/O2O-Coupon-Usage-Forecast/blob/master/code/wepon/season%20one/extract_feature.py

In [1]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

DATA_ROOT = "./data/ml100marathon-02-01/"

In [2]:
dfoff = pd.read_csv(os.path.join(DATA_ROOT,'train_offline.csv'))
dftest = pd.read_csv(os.path.join(DATA_ROOT,'test_offline.csv'))

# 欄位名稱全部用小寫
dfoff.columns = [x.lower() for x in dfoff.columns]
dftest.columns = [x.lower() for x in dftest.columns]

# 將預測檔案中，沒有 coupon_id 的資料移除，不用去猜他們是否會使用 coupon 消費
dftest = dftest[~dftest.coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)

print(dfoff.shape)
print(dftest.shape)

dfoff.head()

(1160742, 7)
(306313, 6)


Unnamed: 0,user_id,merchant_id,coupon_id,discount_rate,distance,date_received,date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,


In [3]:
#資料量太大了，所以開發時先用較少量的資料來玩，調整差不多後，再用完整資料來玩
#dfoff = dfoff[:100000]
#dftest = dftest[:10000]

In [4]:
## Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['date_received']):
        return -1
    if not np.isnan(row['date']):
        td = pd.to_datetime(row['date'], format='%Y%m%d') -  pd.to_datetime(row['date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dfoff["label"] = dfoff.apply(label, axis=1)
dfoff["label"].value_counts()

 0    710665
-1    413773
 1     36304
Name: label, dtype: int64

In [5]:
# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

dfoff['weekday'] = dfoff['date_received'].apply(getWeekday)
dftest['weekday'] = dftest['date_received'].apply(getWeekday)

# weekday_type (weekend = 1)
dfoff['weekday_type'] = dfoff['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to trainset
dftest['weekday_type'] = dftest['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to testset

In [6]:
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

tmpdf = pd.get_dummies(dfoff['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftest['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [7]:
# Generate features - coupon discount and distance
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['discount_rateNum'] = df['discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    df.loc[df.distance.isna(), "distance"] = 99
    return df

dfoff = processData(dfoff)
dftest = processData(dftest)

In [8]:
dfoff.head()

Unnamed: 0,user_id,merchant_id,coupon_id,discount_rate,distance,date_received,date,label,weekday,weekday_type,...,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,discount_rateNum,discount_man,discount_jian,discount_type
0,1439408,2632,,,0.0,,20160217.0,-1,,0,...,0,0,0,0,0,0,,0,0,0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0,...,0,1,0,0,0,0,0.95,20,1,1
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,0,...,0,0,0,0,1,0,0.95,20,1,1
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0,...,0,0,0,1,0,0,0.9,200,20,1
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0,...,0,0,0,1,0,0,0.9,200,20,1


## 增加額外的 feature

In [9]:
off_train = dfoff.copy()
off_test = dftest.copy()

In [10]:
def extractFeature(dataset3):
    t = dataset3[['user_id']]
    t['this_month_user_receive_all_coupon_count'] = 1
    t = t.groupby('user_id').agg('sum').reset_index()

    t1 = dataset3[['user_id','coupon_id']]
    t1['this_month_user_receive_same_coupon_count'] = 1
    t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index()

    t2 = dataset3[['user_id','coupon_id','date_received']]
    t2.date_received = t2.date_received.astype('str')
    t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
    t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':')))
    t2 = t2[t2.receive_number>1]
    t2['max_date_received'] = t2.date_received.apply(lambda s:max([float(d) for d in s.split(':')]))
    t2['min_date_received'] = t2.date_received.apply(lambda s:min([float(d) for d in s.split(':')]))
    t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']]

    t3 = dataset3[['user_id','coupon_id','date_received']]
    t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left')
    t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received
    t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received - t3.min_date_received
    def is_firstlastone(x):
        if x==0:
            return 1
        elif x>0:
            return 0
        else:
            return -1 #those only receive once

    t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone)
    t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone)
    t3 = t3[['user_id','coupon_id','date_received','this_month_user_receive_same_coupon_lastone','this_month_user_receive_same_coupon_firstone']]

    t4 = dataset3[['user_id','date_received']]
    t4['this_day_user_receive_all_coupon_count'] = 1
    t4 = t4.groupby(['user_id','date_received']).agg('sum').reset_index()

    t5 = dataset3[['user_id','coupon_id','date_received']]
    t5['this_day_user_receive_same_coupon_count'] = 1
    t5 = t5.groupby(['user_id','coupon_id','date_received']).agg('sum').reset_index()

    t6 = dataset3[['user_id','coupon_id','date_received']]
    t6.date_received = t6.date_received.astype('str')
    t6 = t6.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
    t6.rename(columns={'date_received':'dates'},inplace=True)

    def get_day_gap_before(s):
        if str(s)=="nan":
            return -1
        date_received,dates = s.split('-')
        dates = dates.split(':')
        gaps = []
        for d in dates:
            this_gap = (date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))-date(int(d[0:4]),int(d[4:6]),int(d[6:8]))).days
            if this_gap>0:
                gaps.append(this_gap)
        if len(gaps)==0:
            return -1
        else:
            return min(gaps)

    def get_day_gap_after(s):
        if str(s)=="nan":
            return -1
        date_received,dates = s.split('-')
        dates = dates.split(':')
        gaps = []
        for d in dates:
            this_gap = (date(int(d[0:4]),int(d[4:6]),int(d[6:8]))-date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))).days
            if this_gap>0:
                gaps.append(this_gap)
        if len(gaps)==0:
            return -1
        else:
            return min(gaps)


    t7 = dataset3[['user_id','coupon_id','date_received']]
    t7 = pd.merge(t7,t6,on=['user_id','coupon_id'],how='left')
    t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates
    t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
    t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
    t7 = t7[['user_id','coupon_id','date_received','day_gap_before','day_gap_after']]

    other_feature3 = pd.merge(t1,t,on='user_id')
    other_feature3 = pd.merge(other_feature3,t3,on=['user_id','coupon_id'])
    other_feature3 = pd.merge(other_feature3,t4,on=['user_id','date_received'])
    other_feature3 = pd.merge(other_feature3,t5,on=['user_id','coupon_id','date_received'])
    other_feature3 = pd.merge(other_feature3,t7,on=['user_id','coupon_id','date_received'])
    #other_feature3.to_csv('data/other_feature3.csv',index=None)
    
    return other_feature3


In [11]:
off_train_otherFeatures = extractFeature(off_train)
off_train_otherFeatures.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the

Unnamed: 0,user_id,coupon_id,this_month_user_receive_same_coupon_count,this_month_user_receive_all_coupon_count,date_received,this_month_user_receive_same_coupon_lastone,this_month_user_receive_same_coupon_firstone,this_day_user_receive_all_coupon_count,this_day_user_receive_same_coupon_count,day_gap_before,day_gap_after
0,4,8735.0,1,1,20160214.0,-1,-1,1,1,-1,-1
1,35,1807.0,1,4,20160130.0,-1,-1,2,1,-1,-1
2,35,11951.0,2,4,20160130.0,1,0,2,1,1,-1
3,35,9776.0,1,4,20160129.0,-1,-1,2,1,-1,-1
4,35,11951.0,2,4,20160129.0,0,1,2,1,-1,1


In [12]:
off_test_otherFeatures = extractFeature(off_test)
off_test_otherFeatures.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http:

Unnamed: 0,user_id,coupon_id,this_month_user_receive_same_coupon_count,this_month_user_receive_all_coupon_count,date_received,this_month_user_receive_same_coupon_lastone,this_month_user_receive_same_coupon_firstone,this_day_user_receive_all_coupon_count,this_day_user_receive_same_coupon_count,day_gap_before,day_gap_after
0,4,2902.0,1,1,20160607.0,-1,-1,1,1,-1,-1
1,165,7571.0,1,1,20160525.0,-1,-1,1,1,-1,-1
2,166,9261.0,1,1,20160525.0,-1,-1,1,1,-1,-1
3,215,8944.0,1,1,20160524.0,-1,-1,1,1,-1,-1
4,236,11002.0,1,1,20160528.0,-1,-1,1,1,-1,-1


## 將額外的 feature 再併回去

In [13]:
dfoff2 = pd.merge(dfoff.copy()
                  , off_train_otherFeatures
                  ,on=['user_id','coupon_id','date_received']
                  ,how='left')
dfoff2.head()

Unnamed: 0,user_id,merchant_id,coupon_id,discount_rate,distance,date_received,date,label,weekday,weekday_type,...,discount_jian,discount_type,this_month_user_receive_same_coupon_count,this_month_user_receive_all_coupon_count,this_month_user_receive_same_coupon_lastone,this_month_user_receive_same_coupon_firstone,this_day_user_receive_all_coupon_count,this_day_user_receive_same_coupon_count,day_gap_before,day_gap_after
0,1439408,2632,,,0.0,,20160217.0,-1,,0,...,0,0,,,,,,,,
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0,...,1,1,1.0,3.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,0,...,1,1,1.0,3.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0,...,20,1,1.0,1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0,...,20,1,1.0,1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0


In [14]:
dftest2 = pd.merge(dftest.copy()
                  , off_test_otherFeatures
                  ,on=['user_id','coupon_id','date_received']
                  ,how='left')
dftest2.head()

Unnamed: 0,user_id,merchant_id,coupon_id,discount_rate,distance,date_received,weekday,weekday_type,weekday_1,weekday_2,...,discount_jian,discount_type,this_month_user_receive_same_coupon_count,this_month_user_receive_all_coupon_count,this_month_user_receive_same_coupon_lastone,this_month_user_receive_same_coupon_firstone,this_day_user_receive_all_coupon_count,this_day_user_receive_same_coupon_count,day_gap_before,day_gap_after
0,1439408,4663,11002.0,150:20,1.0,20160528.0,6,0,0,0,...,20,1,1,3,-1,-1,1,1,-1,-1
1,1439408,2632,8591.0,20:1,0.0,20160613.0,1,0,1,0,...,1,1,2,3,1,0,1,1,28,-1
2,1439408,2632,8591.0,20:1,0.0,20160516.0,1,0,1,0,...,1,1,2,3,0,1,1,1,-1,28
3,2029232,450,1532.0,30:5,0.0,20160530.0,1,0,1,0,...,5,1,1,2,-1,-1,1,1,-1,-1
4,2029232,6459,12737.0,20:1,0.0,20160519.0,4,0,0,0,...,1,1,1,2,-1,-1,1,1,-1,-1


## 切資料

In [15]:
dfoff2.columns

Index(['user_id', 'merchant_id', 'coupon_id', 'discount_rate', 'distance',
       'date_received', 'date', 'label', 'weekday', 'weekday_type',
       'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5',
       'weekday_6', 'weekday_7', 'discount_rateNum', 'discount_man',
       'discount_jian', 'discount_type',
       'this_month_user_receive_same_coupon_count',
       'this_month_user_receive_all_coupon_count',
       'this_month_user_receive_same_coupon_lastone',
       'this_month_user_receive_same_coupon_firstone',
       'this_day_user_receive_all_coupon_count',
       'this_day_user_receive_same_coupon_count', 'day_gap_before',
       'day_gap_after'],
      dtype='object')

In [16]:
## Naive model
def split_train_valid(row, date_cut="20160416"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train
    
df = dfoff2[dfoff2['label'] != -1].copy()
df["is_train"] = df["date_received"].apply(split_train_valid)

train = df[df["is_train"]]
valid = df[~df["is_train"]]

train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)

print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

Train size: 952393, #positive: 84886
Valid size: 83692, #positive: 6382


In [17]:
train.head()

Unnamed: 0,user_id,merchant_id,coupon_id,discount_rate,distance,date_received,date,label,weekday,weekday_type,...,discount_type,this_month_user_receive_same_coupon_count,this_month_user_receive_all_coupon_count,this_month_user_receive_same_coupon_lastone,this_month_user_receive_same_coupon_firstone,this_day_user_receive_all_coupon_count,this_day_user_receive_same_coupon_count,day_gap_before,day_gap_after,is_train
0,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0,...,1,1.0,3.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,True
1,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,0,...,1,1.0,3.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,True
2,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0,...,1,1.0,1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,True
3,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5.0,0,...,1,1.0,1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,True
4,73611,2099,12034.0,100:10,99.0,20160207.0,,0,7.0,0,...,1,1.0,1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,True


In [18]:
original_feature = ['discount_rateNum',
                    'discount_type',
                    'discount_man', 
                    'discount_jian',
                    'distance', 
                    'weekday', 
                    'weekday_type'] + weekdaycols

original_feature = original_feature + ['date_received'
                                       , 'this_month_user_receive_same_coupon_count'
                                       , 'this_month_user_receive_all_coupon_count'
                                       , 'this_month_user_receive_same_coupon_lastone'
                                       , 'this_month_user_receive_same_coupon_firstone'
                                       , 'this_day_user_receive_all_coupon_count'
                                       , 'this_day_user_receive_same_coupon_count'
                                       , 'day_gap_before'
                                       , 'day_gap_after']

print(len(original_feature),original_feature)

23 ['discount_rateNum', 'discount_type', 'discount_man', 'discount_jian', 'distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7', 'date_received', 'this_month_user_receive_same_coupon_count', 'this_month_user_receive_all_coupon_count', 'this_month_user_receive_same_coupon_lastone', 'this_month_user_receive_same_coupon_firstone', 'this_day_user_receive_all_coupon_count', 'this_day_user_receive_same_coupon_count', 'day_gap_before', 'day_gap_after']


## 模型 SGDClassifier

In [19]:
predictors = original_feature
print(predictors)

def check_model(data, predictors):
    
    classifier = lambda: SGDClassifier(
        loss='log', 
        penalty='elasticnet', 
        fit_intercept=True, 
        max_iter=100, 
        shuffle=True, 
        n_jobs=1,
        class_weight=None)

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])

    parameters = {
        'en__alpha': [ 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1]
    }

    folder = StratifiedKFold(n_splits=3, shuffle=True)
    
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(data[predictors], 
                                  data['label'])
    
    return grid_search

['discount_rateNum', 'discount_type', 'discount_man', 'discount_jian', 'distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7', 'date_received', 'this_month_user_receive_same_coupon_count', 'this_month_user_receive_all_coupon_count', 'this_month_user_receive_same_coupon_lastone', 'this_month_user_receive_same_coupon_firstone', 'this_day_user_receive_all_coupon_count', 'this_day_user_receive_same_coupon_count', 'day_gap_before', 'day_gap_after']


In [20]:
model = check_model(train, predictors)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   36.1s finished


In [21]:
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]

In [22]:
from sklearn.metrics import roc_auc_score, accuracy_score
auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.856, Accuracy: 0.939


In [23]:
targetset = dftest2.copy()
print(targetset.shape)
targetset = targetset[~targetset.coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)
testset = targetset[predictors].copy()

y_test_pred = model.predict_proba(testset[predictors])
test1 = testset.copy()
test1['pred_prob'] = y_test_pred[:, 1]
print(test1.shape)

(325057, 27)
(325057, 24)


In [24]:
output = pd.concat((targetset[["user_id", "coupon_id", "date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "user_id"] = output["user_id"].apply(lambda x:str(int(x)))
output.loc[:, "coupon_id"] = output["coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "date_received"] = output["date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["user_id", "coupon_id", "date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

(325057, 4)


In [25]:
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("Day_051_HW_extractfeature.csv", header=["uid", "label"], index=False) # submission format
out.head()

Unnamed: 0,uid,label
0,1000020_2705_20160519,0.165017
1,1000020_8192_20160513,0.140324
2,1000065_1455_20160527,0.098129
3,1000085_8067_20160513,0.111541
4,1000086_2418_20160613,0.138009


## 輸出預測檔案

In [33]:
def exportSubmit(model, filename):

    targetset = dftest2.copy()
    print(targetset.shape)

    targetset = targetset[~targetset.coupon_id.isna()]
    targetset.reset_index(drop=True, inplace=True)
    testset = targetset[predictors].copy()

    y_test_pred = model.predict_proba(testset[predictors])
    test1 = testset.copy()
    test1['pred_prob'] = y_test_pred[:, 1]
    print(test1.shape)
    
    output = pd.concat((targetset[["user_id", "coupon_id", "date_received"]], test1["pred_prob"]), axis=1)
    print(output.shape)

    output.loc[:, "user_id"] = output["user_id"].apply(lambda x:str(int(x)))
    output.loc[:, "coupon_id"] = output["coupon_id"].apply(lambda x:str(int(x)))
    output.loc[:, "date_received"] = output["date_received"].apply(lambda x:str(int(x)))
    output["uid"] = output[["user_id", "coupon_id", "date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
    output.reset_index(drop=True, inplace=True)
    
    ### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
    out = output.groupby("uid", as_index=False).mean()
    out = out[["uid", "pred_prob"]]
    out.columns = ["uid", "label"]
    out.to_csv(filename, header=["uid", "label"], index=False) # submission format
    out.head()

## 模型 RandomForestClassifier

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

# 建立模型 (使用 20 顆樹，每棵樹的最大深度為 4)
model_rfc = RandomForestClassifier(n_estimators=20, max_depth=4)

# 訓練模型
model_rfc.fit(train[predictors], train['label'])

# 預測測試集
pred = model_rfc.predict(valid[predictors])

# 評分
auc_score = roc_auc_score(y_true=valid.label, y_score=pred)
acc = accuracy_score(y_true=valid.label, y_pred=pred)

print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))


Validation AUC: 0.673, Accuracy: 0.947


In [34]:
# 輸出
exportSubmit(model_rfc, "Day_051_HW_extractfeature_rfc.csv")

(325057, 27)
(325057, 24)
(325057, 4)
