In [1]:
import numpy as np  
import pandas as pd
import datetime
import matplotlib.pyplot as plt

import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier

from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

import warnings
warnings.filterwarnings("ignore")
 
%matplotlib inline

# Columns that identify a sample; they are excluded from the model predictors.
id_col_names = ['user_id','coupon_id','date_received']
# Name of the column holding the target.
target_col_name = 'label'
# Identifier columns plus the target: everything that must NOT be used as a feature.
id_target_cols = ['user_id','coupon_id','date_received','label']
# NOTE(review): not referenced in the visible cells — presumably a scoring name for later use; confirm.
myeval = 'roc_auc'

# NOTE(review): absolute, machine-specific paths; the notebook only runs where these
# directories exist. Only `featurepath` is read by the visible cells.
datapath = 'C:/Users/Stille/Desktop/ML projects/' 
featurepath = 'C:/Users/Stille/Desktop/ML projects/feature/' 
resultpath = 'C:/Users/Stille/Desktop/ML projects/result/'
tmppath = 'C:/Users/Stille/Desktop/ML projects/tmp/'
scorepath = 'C:/Users/Stille/Desktop/ML projects/score/'


In [2]:
def get_id_df(df):
    """Return the identifier columns of ``df`` (the columns listed in ``id_col_names``)."""
    id_frame = df[id_col_names]
    return id_frame

def get_target_df(df):
    """Return the target column of ``df``, selected by ``target_col_name``."""
    target_column = df[target_col_name]
    return target_column

def get_predictors_df(df):
    """Return ``df`` restricted to its predictor columns.

    A predictor is any column whose name is not listed in ``id_target_cols``;
    the original column order is kept.
    """
    feature_names = []
    for column in df.columns:
        if column in id_target_cols:
            continue
        feature_names.append(column)
    return df[feature_names]

def read_featurefile_train(featurename):
    """Load ``train_<featurename>.csv`` from ``featurepath`` with missing values replaced by 0."""
    csv_file = featurepath + 'train_' + featurename + '.csv'
    loaded = pd.read_csv(csv_file, sep=',', encoding="utf-8")
    return loaded.fillna(0)

def read_featurefile_test(featurename):
    """Load ``test_<featurename>.csv`` from ``featurepath`` with missing values replaced by 0."""
    csv_file = featurepath + 'test_' + featurename + '.csv'
    loaded = pd.read_csv(csv_file, sep=',', encoding="utf-8")
    return loaded.fillna(0)

def standize_df(train_data,test_data):
    """Min-max scale the feature columns of a train/test pair.

    The feature columns are the columns of ``test_data`` that are not listed in
    ``id_target_cols``. The scaler is fitted on ``train_data`` only and then
    applied to both frames.

    Parameters
    ----------
    train_data : DataFrame containing the feature columns, ``'label'`` and the
        columns in ``id_col_names``.
    test_data : DataFrame containing the feature columns and the columns in
        ``id_col_names``.

    Returns
    -------
    (train_scaled, test_scaled) : two DataFrames with a fresh 0..n-1 index.
        Columns are the scaled features, followed by ``'label'`` (train only)
        and the identifier columns.
    """
    from sklearn import preprocessing

    features_columns = [f for f in test_data.columns if f not in id_target_cols]
    min_max_scaler = preprocessing.MinMaxScaler().fit(train_data[features_columns])

    train_data_scaler = pd.DataFrame(
        min_max_scaler.transform(train_data[features_columns]),
        columns=features_columns,
    )
    test_data_scaler = pd.DataFrame(
        min_max_scaler.transform(test_data[features_columns]),
        columns=features_columns,
    )

    # The scaled frames carry a new RangeIndex. Column assignment aligns on the
    # index, so the label/id columns are re-indexed positionally first; otherwise
    # an input with a non-default index would yield NaN or misaligned rows.
    train_data_scaler['label'] = train_data['label'].reset_index(drop=True)
    train_data_scaler[id_col_names] = train_data[id_col_names].reset_index(drop=True)
    test_data_scaler[id_col_names] = test_data[id_col_names].reset_index(drop=True)
    return train_data_scaler,test_data_scaler

def read_data(featurename):
    """Read the train and test feature files for ``featurename`` and return them scaled.

    Returns the ``(train, test)`` pair produced by ``standize_df``.
    """
    raw_train = read_featurefile_train(featurename)
    raw_test = read_featurefile_test(featurename)
    scaled_pair = standize_df(raw_train, raw_test)
    return scaled_pair

In [3]:
# Load the scaled train/test frames for each feature set.
# Note the file-name keys: the f2/f3 variables are read from the 'sf2'/'sf3' files,
# while f1 is read from 'f1'.
train_f1, test_f1 = read_data('f1')
train_f2, test_f2 = read_data('sf2')
train_f3, test_f3 = read_data('sf3')

In [4]:
# auc eval
from sklearn.metrics import roc_auc_score 
def myauc(test):
    """Return the average per-coupon ROC AUC.

    ``test`` is grouped by ``coupon_id``; within each group the AUC of the
    ``pred`` column against the ``label`` column is computed. Groups whose
    labels take fewer than two distinct values are skipped, because AUC is
    undefined for them. The result is ``np.average`` over the remaining groups.
    """
    per_coupon_scores = [
        metrics.roc_auc_score(group['label'], group['pred'])
        for _, group in test.groupby(['coupon_id'])
        if len(group['label'].unique()) >= 2
    ]
    return np.average(per_coupon_scores)

In [5]:
# Hold-out validation on the f1 feature set: a single 80/20 random split
# (not k-fold cross validation), scored with logistic regression.
from sklearn.model_selection import train_test_split
target = get_target_df(train_f1).copy()
traindf = train_f1.copy()
# Fixed random_state so the split is reproducible.
train_all,test_all,train_target,test_target=train_test_split(traindf,target,test_size=0.2,random_state=0)

# Drop the id and label columns before fitting.
train_data = get_predictors_df(train_all).copy()
test_data = get_predictors_df(test_all).copy()

clf = LogisticRegression()
clf.fit(train_data, train_target)
# Column 1 of predict_proba is the probability of the second entry in clf.classes_.
train_pred = clf.predict_proba(train_data)[:,1]
test_pred = clf.predict_proba(test_data)[:,1]

# AUC over all samples of each split.
score_train = roc_auc_score(train_target,train_pred )
score_test = roc_auc_score(test_target, test_pred)
print("LogisticRegression train total AUC:   ", score_train)
print("LogisticRegression test total AUC:   ", score_test)

# myauc reads the 'label' and 'pred' columns, so the predictions are attached
# to the full (id + label) frames before computing the per-coupon average AUC.
train_all['pred'] = train_pred
test_all['pred'] = test_pred
print("LogisticRegression train Coupon AUC:   ", myauc(train_all))
print("LogisticRegression test Coupon AUC:   ", myauc(test_all))

LogisticRegression train total AUC:    0.6634053625664106
LogisticRegression test total AUC:    0.6674417162568336
LogisticRegression train Coupon AUC:    0.540286974067141
LogisticRegression test Coupon AUC:    0.5410440709758992


In [6]:
# Hold-out validation on the f3 feature set: a single 80/20 random split
# (not k-fold cross validation), scored with logistic regression.
from sklearn.model_selection import train_test_split
target = get_target_df(train_f3).copy()
traindf = train_f3.copy()

# Fixed random_state so the split is reproducible.
train_all,test_all,train_target,test_target = train_test_split(traindf,target,test_size=0.2,random_state=0)

# Drop the id and label columns before fitting.
train_data = get_predictors_df(train_all).copy()
test_data = get_predictors_df(test_all).copy()

clf = LogisticRegression()
clf.fit(train_data, train_target)
# Column 1 of predict_proba is the probability of the second entry in clf.classes_.
train_pred = clf.predict_proba(train_data)[:,1]
test_pred = clf.predict_proba(test_data)[:,1]

# AUC over all samples of each split.
score_train = roc_auc_score(train_target,train_pred )
score_test = roc_auc_score(test_target, test_pred)
print("LogisticRegression train total AUC:   ", score_train)
print("LogisticRegression test total AUC:   ", score_test)

# myauc reads the 'label' and 'pred' columns, so the predictions are attached
# to the full (id + label) frames before computing the per-coupon average AUC.
train_all['pred'] = train_pred
test_all['pred'] = test_pred
print("LogisticRegression train Coupon AUC:   ", myauc(train_all))
print("LogisticRegression test Coupon AUC:   ", myauc(test_all))

LogisticRegression train total AUC:    0.8442307684126165
LogisticRegression test total AUC:    0.8451226316516732
LogisticRegression train Coupon AUC:    0.7211551475639484
LogisticRegression test Coupon AUC:    0.7441844349601485


In [7]:
# 5-fold cross validation on the f3 feature set with logistic regression.
# KFold is used without shuffling, so the folds are contiguous blocks of rows.
from sklearn.model_selection import KFold

train = train_f3.copy()
target = get_target_df(train_f3).copy()

kf = KFold(n_splits = 5)
for k, (train_index, test_index) in enumerate(kf.split(train)):
    # The splitter yields positional indices, so both the frame and the labels
    # are selected with .iloc. The folds are copied so that adding the 'pred'
    # column below writes to independent frames rather than to slices of `train`.
    train_data = train.iloc[train_index].copy()
    test_data = train.iloc[test_index].copy()
    train_target = target.iloc[train_index]
    test_target = target.iloc[test_index]

    clf = LogisticRegression()
    clf.fit(get_predictors_df(train_data), train_target)

    train_pred = clf.predict_proba(get_predictors_df(train_data))[:,1]
    test_pred = clf.predict_proba(get_predictors_df(test_data))[:,1]
    score_train = roc_auc_score(train_target, train_pred)
    score_test = roc_auc_score(test_target, test_pred)

    # myauc needs 'label' and 'pred' in the same frame.
    train_data['pred'] = train_pred
    test_data['pred'] = test_pred
    print(k+1, " fold", "LogisticRegression train total AUC:   ", score_train)
    print(k+1, " fold", "LogisticRegression test total AUC:   ", score_test)
    print(k+1, " fold", "LogisticRegression train Coupon AUC:   ", myauc(train_data))
    print(k+1, " fold", "LogisticRegression test Coupon AUC:   ", myauc(test_data), '\n')

1  fold LogisticRegression train total AUC:    0.8372907329776114
1  fold LogisticRegression test total AUC:    0.8704783569178365
1  fold LogisticRegression train Coupon AUC:    0.7208942182594116
1  fold LogisticRegression test Coupon AUC:    0.7343703480190046 

2  fold LogisticRegression train total AUC:    0.8376318185497111
2  fold LogisticRegression test total AUC:    0.8690304859508954
2  fold LogisticRegression train Coupon AUC:    0.7223504316074432
2  fold LogisticRegression test Coupon AUC:    0.7239821961020498 

3  fold LogisticRegression train total AUC:    0.8488017222568782
3  fold LogisticRegression test total AUC:    0.8248302486792577
3  fold LogisticRegression train Coupon AUC:    0.7260867445667626
3  fold LogisticRegression test Coupon AUC:    0.7308291892321238 

4  fold LogisticRegression train total AUC:    0.8498961087643455
4  fold LogisticRegression test total AUC:    0.8218922217517983
4  fold LogisticRegression train Coupon AUC:    0.7174592534477268
4  f

In [8]:
# Leave-P-Out cross validation on the f3 feature set (p = 200 held-out rows per split),
# stopped after the first 6 splits.
# NOTE(review): in the recorded output the consecutive splits score almost identically,
# which suggests they hold out nearly the same rows — confirm whether a randomised
# splitter would be more informative.
from sklearn.model_selection import LeavePOut

train = train_f3.copy()
target = get_target_df(train_f3).copy()

lpo = LeavePOut(p = 200)
# NOTE(review): `num` is not read anywhere in the visible cells; kept in case later cells use it.
num = 100
for k, (train_index, test_index) in enumerate(lpo.split(train)):
    # The splitter yields positional indices, so both the frame and the labels
    # are selected with .iloc. The folds are copied so that adding the 'pred'
    # column below writes to independent frames rather than to slices of `train`.
    train_data = train.iloc[train_index].copy()
    test_data = train.iloc[test_index].copy()
    train_target = target.iloc[train_index]
    test_target = target.iloc[test_index]

    clf = LogisticRegression()
    clf.fit(get_predictors_df(train_data), train_target)

    train_pred = clf.predict_proba(get_predictors_df(train_data))[:,1]
    test_pred = clf.predict_proba(get_predictors_df(test_data))[:,1]
    score_train = roc_auc_score(train_target, train_pred)
    score_test = roc_auc_score(test_target, test_pred)

    # myauc needs 'label' and 'pred' in the same frame.
    train_data['pred'] = train_pred
    test_data['pred'] = test_pred
    print(k+1, " fold", "LogisticRegression train total AUC:   ", score_train)
    print(k+1, " fold", "LogisticRegression test total AUC:   ", score_test)
    print(k+1, " fold", "LogisticRegression train Coupon AUC:   ", myauc(train_data))
    print(k+1, " fold", "LogisticRegression test Coupon AUC:   ", myauc(test_data), '\n')
    # k is zero-based, so this stops after the sixth split has been reported.
    if k >= 5:
        break

1  fold LogisticRegression train total AUC:    0.8446231115539284
1  fold LogisticRegression test total AUC:    0.8947251773049645
1  fold LogisticRegression train Coupon AUC:    0.7202520721406733
1  fold LogisticRegression test Coupon AUC:    0.631578947368421 

2  fold LogisticRegression train total AUC:    0.844647848737394
2  fold LogisticRegression test total AUC:    0.8956117021276596
2  fold LogisticRegression train Coupon AUC:    0.720234113518366
2  fold LogisticRegression test Coupon AUC:    0.65 

3  fold LogisticRegression train total AUC:    0.844618023653873
3  fold LogisticRegression test total AUC:    0.8964982269503545
3  fold LogisticRegression train Coupon AUC:    0.7201479451523117
3  fold LogisticRegression test Coupon AUC:    0.6578947368421053 

4  fold LogisticRegression train total AUC:    0.8445612024044723
4  fold LogisticRegression test total AUC:    0.8956117021276595
4  fold LogisticRegression train Coupon AUC:    0.7200843853112339
4  fold LogisticRegres

In [9]:
# Stratified 5-fold cross validation on the f3 feature set with logistic regression.
# StratifiedKFold is already imported in the notebook's import cell.
train = train_f3.copy()
target = get_target_df(train_f3).copy()

kf = StratifiedKFold(n_splits = 5)
for k, (train_index, test_index) in enumerate(kf.split(train,target)):
    # The splitter yields positional indices, so both the frame and the labels
    # are selected with .iloc. The folds are copied so that adding the 'pred'
    # column below writes to independent frames rather than to slices of `train`.
    train_data = train.iloc[train_index].copy()
    test_data = train.iloc[test_index].copy()
    train_target = target.iloc[train_index]
    test_target = target.iloc[test_index]

    clf = LogisticRegression()
    clf.fit(get_predictors_df(train_data), train_target)

    train_pred = clf.predict_proba(get_predictors_df(train_data))[:,1]
    test_pred = clf.predict_proba(get_predictors_df(test_data))[:,1]
    score_train = roc_auc_score(train_target, train_pred)
    score_test = roc_auc_score(test_target, test_pred)

    # myauc needs 'label' and 'pred' in the same frame.
    train_data['pred'] = train_pred
    test_data['pred'] = test_pred
    print(k+1, " fold", "LogisticRegression train total AUC:   ", score_train)
    print(k+1, " fold", "LogisticRegression test total AUC:   ", score_test)
    print(k+1, " fold", "LogisticRegression train Coupon AUC:   ", myauc(train_data))
    print(k+1, " fold", "LogisticRegression test Coupon AUC:   ", myauc(test_data), '\n')

1  fold LogisticRegression train total AUC:    0.837205336440923
1  fold LogisticRegression test total AUC:    0.8687338333340271
1  fold LogisticRegression train Coupon AUC:    0.7244255662834408
1  fold LogisticRegression test Coupon AUC:    0.7300257567559034 

2  fold LogisticRegression train total AUC:    0.8379959982678591
2  fold LogisticRegression test total AUC:    0.872499106859304
2  fold LogisticRegression train Coupon AUC:    0.7191861790169858
2  fold LogisticRegression test Coupon AUC:    0.7342188632313051 

3  fold LogisticRegression train total AUC:    0.848426080958853
3  fold LogisticRegression test total AUC:    0.8297603402569399
3  fold LogisticRegression train Coupon AUC:    0.7225185415839362
3  fold LogisticRegression test Coupon AUC:    0.7307142843758752 

4  fold LogisticRegression train total AUC:    0.8501477963031956
4  fold LogisticRegression test total AUC:    0.8210264982881539
4  fold LogisticRegression train Coupon AUC:    0.7203960315462784
4  fold

In [10]:
# sklearn models
def get_sklearn_model(model_name,param=None):
    """Build an unfitted classifier from a short model name.

    Parameters
    ----------
    model_name : one of 'NB' (multinomial naive Bayes), 'LR' (logistic
        regression), 'KNN', 'RF' (random forest), 'DT' (decision tree),
        'SVC' (RBF-kernel SVM), 'GBDT', 'XGB' (XGBoost) or 'LGB' (LightGBM).
    param : optional dict of keyword parameters applied with ``set_params``
        after construction; it can override the defaults chosen here.

    Returns
    -------
    The classifier instance, or ``None`` (after printing a message) when
    ``model_name`` is not recognised.
    """
    # Constructors are wrapped in lambdas so only the requested model is built.
    builders = {
        'NB': lambda: MultinomialNB(alpha=0.01),
        'LR': lambda: LogisticRegression(penalty='l2'),
        'KNN': lambda: KNeighborsClassifier(),
        'RF': lambda: RandomForestClassifier(),
        'DT': lambda: tree.DecisionTreeClassifier(),
        # probability=True is needed because the scoring code in this notebook
        # calls predict_proba, which SVC only provides when this flag is set.
        'SVC': lambda: SVC(kernel='rbf', probability=True),
        'GBDT': lambda: GradientBoostingClassifier(),
        'XGB': lambda: XGBClassifier(eval_metric = "auc"),
        'LGB': lambda: LGBMClassifier(),
    }
    # Non-string names can never match; the guard also keeps unhashable values
    # on the "wrong model name" path instead of failing inside the dict lookup.
    builder = builders.get(model_name) if isinstance(model_name, str) else None
    if builder is None:
        print("wrong model name!")
        return
    model = builder()
    if param is not None:
        model.set_params(**param)
    return model

In [11]:
def classifier_df_score(train_feat,classifier,cvnum,param=None):
    """Print the validation-fold AUC scores of one classifier under stratified k-fold CV.

    Parameters
    ----------
    train_feat : DataFrame holding the predictors, the label column and the id
        columns (``coupon_id`` is needed for the per-coupon score).
    classifier : short model name understood by ``get_sklearn_model``.
    cvnum : number of stratified folds.
    param : optional dict of model parameters forwarded to ``get_sklearn_model``.

    Prints two lists with one entry per fold: the AUC over all validation rows
    and the average per-coupon AUC from ``myauc``. Returns ``None``.
    """
    clf = get_sklearn_model(classifier,param)
    train = train_feat.copy()
    target = get_target_df(train_feat).copy()
    kf = StratifiedKFold(n_splits=cvnum)

    scores = []
    score_coupons = []
    for train_index, test_index in kf.split(train,target):
        # The splitter yields positional indices, so select with .iloc on both
        # the frame and the labels. The validation fold is copied because a
        # 'pred' column is added to it below.
        train_data = train.iloc[train_index]
        test_data = train.iloc[test_index].copy()
        train_target = target.iloc[train_index]
        test_target = target.iloc[test_index]

        clf.fit(get_predictors_df(train_data), train_target)
        # Only validation-fold predictions are scored, so none are computed
        # for the training fold.
        test_pred = clf.predict_proba(get_predictors_df(test_data))[:,1]

        scores.append(roc_auc_score(test_target, test_pred))
        # myauc needs 'label' and 'pred' in the same frame.
        test_data['pred'] = test_pred
        score_coupons.append(myauc(test_data))

    print (classifier+"total AUC:",scores)
    print (classifier+"Coupon AUC:",score_coupons)

In [12]:
# Select the f1 feature set: `train` is the frame scored by the next cell.
train = train_f1.copy()
# Preview the first rows.
train.head()

Unnamed: 0,discount_rate,distance,if_fd,full_value,reduction_value,label,user_id,coupon_id,date_received
0,0.812785,0.1,1.0,0.5,0.2,0,1439408,11002,20160528
1,0.939117,0.0,1.0,0.066667,0.01,0,1439408,8591,20160613
2,0.939117,0.0,1.0,0.066667,0.01,0,1439408,8591,20160516
3,0.761035,0.0,1.0,0.1,0.05,0,2029232,1532,20160530
4,0.939117,0.0,1.0,0.066667,0.01,0,2029232,12737,20160519


In [13]:
# Score four models with 5-fold stratified CV on the current `train` frame.
# `train` is whatever the most recently run cell assigned, so the f1 selection
# cell must be run immediately before this one.
# The printed header says "training AUC", but classifier_df_score reports
# scores computed on the held-out fold of each split.
print ('f1, different models 5 fold training AUC：')
classifier_df_score(train,'NB',5)
classifier_df_score(train,'LR',5)
classifier_df_score(train,'RF',5)
classifier_df_score(train,'LGB',5)

f1, different models 5 fold training AUC：
NBtotal AUC: [0.6484186809106656, 0.6538346697838108, 0.6573801264615924, 0.6503909739692905, 0.6633592474057963]
NBCoupon AUC: [0.5363631418076686, 0.5338265093921163, 0.5334927461290747, 0.5415356445668099, 0.5384562409294064]
LRtotal AUC: [0.6629972472671855, 0.6610222969646303, 0.6640819867693308, 0.6626297267050638, 0.6707402175024871]
LRCoupon AUC: [0.5363631418076686, 0.5338265093921163, 0.5334927461290747, 0.5415356445668099, 0.5384562409294064]
RFtotal AUC: [0.6839538946634527, 0.6843709320614357, 0.6845284307657375, 0.6819526828614803, 0.6866863089046271]
RFCoupon AUC: [0.5305449816694537, 0.5277579330496488, 0.5218474257647241, 0.5296521966087013, 0.5363612610003625]
LGBtotal AUC: [0.6851432344233683, 0.6850403372113775, 0.6856382718298754, 0.6828649844140205, 0.6884085118633962]
LGBCoupon AUC: [0.5331439738529373, 0.5260081552823167, 0.5265777225812563, 0.5314082282527371, 0.5406141522622117]


In [14]:
# Select the f2 feature set: `train` is the frame scored by the next cell.
train = train_f2.copy()
# Preview the first rows.
train.head()

Unnamed: 0,discount_rate,distance,if_fd,full_value,reduction_value,total_sales,sales_use_coupon,total_coupon,merchant_distance_min,merchant_distance_max,...,user_merchant_any,user_merchant_buy_common,user_merchant_coupon_transfer_rate,user_merchant_coupon_buy_rate,user_merchant_rate,user_merchant_common_buy_rate,label,user_id,coupon_id,date_received
0,0.863014,0.0,1.0,0.666667,0.2,0.722209,0.898094,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1832624,7610,20160429
1,0.78691,1.0,1.0,0.666667,0.3,0.037988,0.043496,0.214299,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,163606,5054,20160421
2,0.863014,0.2,1.0,0.666667,0.2,0.722209,0.898094,1.0,0.0,1.0,...,0.012346,0.0,0.0,0.0,0.0,0.0,0,94107,7610,20160412
3,0.863014,1.0,1.0,0.666667,0.2,0.722209,0.898094,1.0,0.0,1.0,...,0.012346,0.0,0.0,0.0,0.0,0.0,0,4061024,7610,20160426
4,0.761035,1.0,1.0,0.1,0.05,0.302988,0.392295,0.209746,0.0,1.0,...,0.012346,0.0,0.0,0.0,0.0,0.0,0,4061024,9871,20160409


In [15]:
# Score four models with 5-fold stratified CV on the current `train` frame.
# `train` is whatever the most recently run cell assigned, so the f2 selection
# cell must be run immediately before this one.
# The printed header says "training AUC", but classifier_df_score reports
# scores computed on the held-out fold of each split.
print ('f2, different models 5 fold training AUC：')
classifier_df_score(train,'NB',5)
classifier_df_score(train,'LR',5)
classifier_df_score(train,'RF',5)
classifier_df_score(train,'LGB',5)

f2, different models 5 fold training AUC：
NBtotal AUC: [0.8220931920528695, 0.8041439476875973, 0.7473969057092373, 0.736430645675718, 0.739916613831939]
NBCoupon AUC: [0.6306880226637085, 0.6121314261743251, 0.6074369206205684, 0.6123032705509794, 0.6024925659472734]
LRtotal AUC: [0.8381022885254619, 0.8331825754529265, 0.7836506247297487, 0.7730278946861123, 0.7756972613801472]
LRCoupon AUC: [0.6297653149478863, 0.6231538354261053, 0.6094752650314427, 0.6103995263399465, 0.6074198999773787]
RFtotal AUC: [0.7939715508748336, 0.7880797463039366, 0.7690829145151368, 0.7774552917084525, 0.7907180366061666]
RFCoupon AUC: [0.5856541639783769, 0.5375957322417594, 0.5240804015000405, 0.5545881363516066, 0.5593411289936363]
LGBtotal AUC: [0.8741437388849905, 0.8746607149543522, 0.8477935673976178, 0.8405834025107148, 0.843133149388622]
LGBCoupon AUC: [0.6331774859157223, 0.5934261975183744, 0.5992099808046554, 0.607595049107945, 0.602563630481328]


In [16]:
# Select the f3 feature set: `train` is the frame scored by the next cell.
train = train_f3.copy()
# Preview the first rows.
train.head()

Unnamed: 0,discount_rate,distance,if_fd,full_value,reduction_value,total_sales,sales_use_coupon,total_coupon,merchant_distance_min,merchant_distance_max,...,this_month_user_receive_same_coupon_lastone,this_month_user_receive_same_coupon_firstone,this_day_receive_all_coupon_count,this_day_user_receive_same_coupon_count,day_gap_before,day_gap_after,label,user_id,coupon_id,date_received
0,0.863014,0.0,1.0,0.666667,0.2,0.722209,0.898094,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1832624,7610,20160429
1,0.78691,1.0,1.0,0.666667,0.3,0.037988,0.043496,0.214299,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,163606,5054,20160421
2,0.863014,0.2,1.0,0.666667,0.2,0.722209,0.898094,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,94107,7610,20160412
3,0.863014,1.0,1.0,0.666667,0.2,0.722209,0.898094,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,4061024,7610,20160426
4,0.761035,1.0,1.0,0.1,0.05,0.302988,0.392295,0.209746,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,4061024,9871,20160409


In [17]:
# Score four models with 5-fold stratified CV on the current `train` frame.
# `train` is whatever the most recently run cell assigned, so the f3 selection
# cell must be run immediately before this one.
# The printed header says "training AUC", but classifier_df_score reports
# scores computed on the held-out fold of each split.
print ('f3, different models 5 fold training AUC：')
classifier_df_score(train,'NB',5)
classifier_df_score(train,'LR',5)
classifier_df_score(train,'RF',5)
classifier_df_score(train,'LGB',5)

f3, different models 5 fold training AUC：
NBtotal AUC: [0.8600232871434285, 0.8454000250719758, 0.7975100267142217, 0.7866397824196878, 0.7892484049093511]
NBCoupon AUC: [0.7175196206132957, 0.7179948792774353, 0.7164928734195067, 0.7215486680973802, 0.718877128840154]
LRtotal AUC: [0.8687338333340271, 0.872499106859304, 0.8297603402569399, 0.8210264982881539, 0.821649580239769]
LRCoupon AUC: [0.7300257567559034, 0.7342188632313051, 0.7307142843758752, 0.7440700245280267, 0.737294194671806]
RFtotal AUC: [0.8557708591156387, 0.8598136411811244, 0.846319083872292, 0.845607451104803, 0.8496798812236057]
RFCoupon AUC: [0.7196485856491749, 0.7048470027999749, 0.7008327713175349, 0.7236915343096657, 0.7203497685666368]
LGBtotal AUC: [0.9011946606519444, 0.9033263670911882, 0.885645174592282, 0.8809465357370417, 0.8821317527243523]
LGBCoupon AUC: [0.7462783580902147, 0.7386704682409418, 0.7470936700510745, 0.763371314660296, 0.7585157367739869]


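Summary: with the f1 features alone every model underfits (total AUC ≈ 0.65–0.69, per-coupon AUC ≈ 0.52–0.54). The f2 and f3 feature sets improve the scores a lot; f3 is the best, with LightGBM reaching a total AUC of ≈ 0.88–0.90 and a per-coupon AUC of ≈ 0.74–0.76.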