In [1]:
import os
import numpy as np
import pandas as pd
import time
from datetime import date
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

DATA_ROOT = "./data/"

In [2]:
dfoff = pd.read_csv(os.path.join(DATA_ROOT,'train_offline.csv'))
dftest = pd.read_csv(os.path.join(DATA_ROOT,'test_offline.csv'))
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)
print(dfoff.shape)
print(dftest.shape)
dfoff.head(20)

(1160742, 7)
(306313, 6)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,
5,2223968,3381,9776.0,10:5,2.0,20160129.0,
6,73611,2099,12034.0,100:10,,20160207.0,
7,163606,1569,5054.0,200:30,10.0,20160421.0,
8,3273056,4833,7802.0,200:20,10.0,20160130.0,
9,94107,3381,7610.0,200:20,2.0,20160412.0,


In [3]:
## Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
#     if Date_received is nan, return -1
    if np.isnan(row['Date_received']):
        return -1
#     if Date is not nan, get td, which is Date - Date_received
#     and then if td <= 15 days, return 1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dfoff["label"] = dfoff.apply(label, axis=1)
dfoff["label"].value_counts()

 0    710665
-1    413773
 1     36304
Name: label, dtype: int64

In [4]:
# Generate features - weekday acquired coupon
def getWeekday(row):
#     if a colums "row" is nan or equals to -1, return this "row"
#     else change the form of Date_received to day of week 
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

dfoff['weekday'] = dfoff['Date_received'].apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].apply(getWeekday)

# weekday_type= 1, if it is weekend; 0, if it is the other days )
dfoff['weekday_type'] = dfoff['weekday'].astype('str').apply(lambda x : 1 if (x =='6.0') | (x=='7.0') else 0 ) # apply to trainset
dftest['weekday_type'] = dftest['weekday'].astype('str').apply(lambda x : 1 if (x =='6.0') | (x=='7.0') else 0 ) # apply to testset

dfoff['weekday_type'].head()

0    0
1    0
2    1
3    0
4    0
Name: weekday_type, dtype: int64

In [5]:
# produce week day columns
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

# using one-hot encoding on "weekday"
tmpdf = pd.get_dummies(dfoff['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftest['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf

dfoff.head()

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7
0,1439408,2632,,,0.0,,20160217.0,-1,,0,0,0,0,0,0,0,0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0,0,0,1,0,0,0,0
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,1,0,0,0,0,0,1,0
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0,0,0,0,0,1,0,0
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0,0,0,0,0,1,0,0


In [6]:
# Generate features - distance
dfoff.loc[dfoff.Distance.isna(), "Distance"] = 0
dftest.loc[dftest.Distance.isna(), "Distance"] = 0

In [31]:
# Generate features - coupon discount
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)
# get how much can get discount 
def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0
# get how much is the discount
def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    return df

dfoff = processData(dfoff)
dftest = processData(dftest)
dfoff['discount_rate'].value_counts()

0.900000    354273
0.833333    172602
0.750000     52140
0.800000     33864
0.950000     33359
0.700000     30234
0.850000     27497
0.500000     21844
0.966667      6456
0.933333      4958
0.866667      4411
0.980000      2331
0.666667      1889
0.600000       706
0.990000       240
0.200000       110
0.975000        38
0.400000         9
0.333333         8
Name: discount_rate, dtype: int64

In [8]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
original_feature = ['discount_rate',
                    'discount_type',
                    'discount_man', 
                    'discount_jian',
                    'Distance', 
                    'weekday', 
                    'weekday_type']
dfoff[original_feature] = StandardScaler().fit_transform(dfoff[original_feature])
dftest[original_feature] = StandardScaler().fit_transform(dftest[original_feature])

In [9]:
## Naive model

# split data into train data and valid data
# return True if the element in row < data_cut ,else return False
def split_train_valid(row, date_cut="20160416"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train

# copy the data whose label is not -1
# and then get a bolean array that produce
df = dfoff[dfoff['label'] != -1].copy()
df["is_train"] = df["Date_received"].apply(split_train_valid)

# get a set of train data and a set of valid data  
# and then reset data's index
train = df[df["is_train"]]
valid = df[~df["is_train"]]
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

Train size: 667753, #positive: 32472
Valid size: 79216, #positive: 3832


In [24]:
# set the name of features
original_feature = ['discount_rate',
                    'discount_type',
                    'discount_man', 
                    'discount_jian',
                    'Distance', 
                    'weekday', 
                    'weekday_type']
print(len(original_feature),original_feature)
print(train['discount_type'].value_counts())
train[original_feature].head(30)

7 ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type']
1    642491
0     25262
Name: discount_type, dtype: int64


Unnamed: 0,discount_rate,discount_type,discount_man,discount_jian,Distance,weekday,weekday_type
0,0.95,1,20,1,0.0,3.0,0
1,0.95,1,20,1,0.0,6.0,1
2,0.9,1,200,20,1.0,5.0,0
3,0.5,1,10,5,2.0,5.0,0
4,0.9,1,100,10,99.0,7.0,1
5,0.9,1,200,20,10.0,6.0,1
6,0.9,1,200,20,2.0,2.0,0
7,0.75,1,20,5,0.0,7.0,1
8,0.833333,1,30,5,2.0,3.0,0
9,0.95,1,20,1,10.0,1.0,0


In [20]:
from sklearn.tree import DecisionTreeClassifier

predictors = original_feature
print(predictors)

def check_model(data, predictors):
#     loss function is  logistic regression
#     penalty is L1+L2

    classifier = lambda: SGDClassifier(
        loss='log', 
        penalty='elasticnet', 
        fit_intercept=True, 
        max_iter=100, 
        shuffle=True, 
        n_jobs=1,
        class_weight=None)
#     classifier = lambda: DecisionTreeClassifier(criterion = "entropy")

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])

    parameters = {
        'en__alpha': [ 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1]
    }
    
#   provides train/test indices to split data in train/test sets.
    folder = StratifiedKFold(n_splits=3, shuffle=True)
    
#   adjust the parameters
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(data[predictors], 
                                  data['label'])
    
    return grid_search

['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [21]:
model = check_model(train, predictors)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  2.3min finished
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


In [22]:
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]

  Xt = transform.transform(Xt)


In [23]:
from sklearn.metrics import roc_auc_score, accuracy_score
auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.743, Accuracy: 0.952


In [15]:
targetset = dftest.copy()
print(targetset.shape)
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)
testset = targetset[predictors].copy()

y_test_pred = model.predict_proba(testset[predictors])
test1 = testset.copy()
test1['pred_prob'] = y_test_pred[:, 1]
print(test1.shape)

(306313, 19)
(306313, 15)


  Xt = transform.transform(Xt)


In [16]:
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

(306313, 4)


In [17]:
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv(f"{time.time()}_testtest.csv", header=["uid", "label"], index=False) # submission format