# Feature Engineering: Time-Based Features

In [None]:
import pandas as pd
import numpy as np
import pickle 
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Get the data: raw transaction lines (tab-separated text file) ...
data_df = pd.read_csv("private_dataset.txt", encoding='unicode_escape', sep="\t")

# ... and the previously pickled object that is later merged on CustomerID.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# The context manager guarantees the handle is closed even if loading fails.
with open('OM_D1_train_data', 'rb') as f:
    dfx = pickle.load(f)

In [None]:
# Parse the transaction timestamps with an explicit format string.
data_df["TransactionDate"] = pd.to_datetime(
    data_df["TransactionDate"],
    format="%Y-%m-%d %H:%M:%S",
)

### Filter the dataset to avoid data leakage

In [None]:
# Train data FEATURE period
train_data_feature_period_start_date = pd.Timestamp("2021-03-01 00:00:00")
train_data_feature_period_end_date = pd.Timestamp("2022-06-01 00:00:00")

# --------------------------------------------------------

# TEST data FEATURE period (defined here but not applied in this cell)
test_data_feature_period_start_date = pd.Timestamp("2021-09-01 00:00:00")
test_data_feature_period_end_date = pd.Timestamp("2022-12-01 00:00:00")

# --------------------------------------------------------

# Keep only transactions strictly before the end of the train feature period.
# Only the upper bound is enforced; the start date is not used as a filter.
is_before_train_end = data_df["TransactionDate"] < train_data_feature_period_end_date
data_df = data_df[is_before_train_end]


Inter-purchase time: the number of days between consecutive purchases of the same customer (e.g. x days between the 1st and 2nd purchase).

### Execute all the changes that we've made on the omnichannel dataset

In [None]:
# Placeholder list: it must be replaced by the real column names, one per
# column of data_df, before this cell can run.
new_cols = ["Insert", "43", "different", "column", "names", "here"]
data_df.columns = new_cols

# Columns that are not needed downstream.
# ProductID_2 contains the same information as the ProductID column.
unused_cols = ['ProductID_2', 'Phone', 'Email', 'Email2', 'Gender', 'Payment_Info', 'tbr']
data_df.drop(columns=unused_cols, inplace=True)

In [None]:
# Columns needed to build one basket (customer + timestamp) per row.
basket_cols = [
    'CustomerID',
    'TransactionDate',
    'Price',
    'Discount_Amount',
    'Net_Amount',
    'Quantity',
]
bask_data_omni = data_df[basket_cols]

In [None]:
# Sum all lines that share a customer and timestamp into a single basket row,
# then order baskets by customer and date, both descending.
basket_keys = ['CustomerID', 'TransactionDate']
bask_data_omni = bask_data_omni.groupby(basket_keys).sum().reset_index()
sorted_df_omni = (
    bask_data_omni
    .sort_values(by=basket_keys, ascending=False)
    .dropna()
    .reset_index(drop=True)
)
sorted_df_omni

In [None]:
# Keep only baskets whose summed Price is strictly positive.
has_positive_price = sorted_df_omni["Price"] > 0
sorted_df_omni = sorted_df_omni[has_positive_price]
sorted_df_omni

### Create new features

In [None]:
# Discount as a share of the basket price. The discount column is named
# "Discount_Amount" (see the basket column selection), not "DiscountAmount".
# assign() returns a new frame, so the filtered slice is not mutated in place;
# the index is rebuilt so positions and labels agree again after filtering.
sorted_df_omni = sorted_df_omni.assign(
    discount_perc=sorted_df_omni["Discount_Amount"] / sorted_df_omni["Price"]
).reset_index(drop=True)

In [None]:
# Create the new feature columns as NaN placeholders; later cells fill them.
placeholder_cols = (
    "days_diff_omni",
    "is_1st_purchase_omni",
    "delta_basket_val",
    "delta_basket_unit",
    "delta_discount_perc",
)
for placeholder_col in placeholder_cols:
    sorted_df_omni[placeholder_col] = np.nan
sorted_df_omni

In [None]:
# Calculate the time between consecutive purchases of the same customer.
#
# Rows are ordered by CustomerID and TransactionDate, both descending, so the
# row directly below a basket is that customer's previous basket, if any.
# Vectorised with shift(-1) instead of a per-row loop with chained assignment
# (df[col][i] = ...), which is slow and does not write back to the frame when
# pandas copy-on-write is enabled.
customer_ids = sorted_df_omni["CustomerID"]
purchase_dates = sorted_df_omni["TransactionDate"]

# True when the next row belongs to the same customer, i.e. an earlier
# purchase of this customer exists. The last row compares against NaN -> False.
has_previous_purchase = customer_ids.eq(customer_ids.shift(-1))

# Whole days since the previous purchase; NaN for a customer's first purchase.
days_since_previous = (purchase_dates - purchase_dates.shift(-1)).dt.days
sorted_df_omni["days_diff_omni"] = days_since_previous.where(has_previous_purchase)

# 1.0 for a customer's first (oldest) purchase, 0.0 otherwise. The very last
# row has no earlier row at all, so it is a first purchase as well.
sorted_df_omni["is_1st_purchase_omni"] = (~has_previous_purchase).astype(float)


In [None]:
# Calculate the change in basket value, basket size and discount share
# relative to the same customer's previous purchase.
#
# Rows are ordered by CustomerID and TransactionDate, both descending, so
# shift(-1) yields the previous basket. A customer's first purchase has no
# previous basket and gets a delta of 0.
same_customer_as_next_row = sorted_df_omni["CustomerID"].eq(
    sorted_df_omni["CustomerID"].shift(-1)
)

# delta column -> source column (the unit column is "Quantity", capitalised)
delta_sources = {
    "delta_basket_val": "Net_Amount",
    "delta_basket_unit": "Quantity",
    "delta_discount_perc": "discount_perc",
}

for delta_col, source_col in delta_sources.items():
    change_vs_previous = sorted_df_omni[source_col] - sorted_df_omni[source_col].shift(-1)
    sorted_df_omni[delta_col] = change_vs_previous.where(same_customer_as_next_row, 0)

In [None]:
# convert timedelta to days_diff

from datetime import datetime, timedelta


def _whole_days(gap):
    """Return ``gap.days`` for timedelta values; pass anything else through."""
    if isinstance(gap, timedelta):
        return gap.days
    return gap


sorted_df_omni.days_diff_omni = sorted_df_omni.days_diff_omni.apply(_whole_days)

In [None]:
# Per-customer mean / min / max over repeat purchases only (first purchases
# carry no inter-purchase information).
repeat_purchases = sorted_df_omni[sorted_df_omni["is_1st_purchase_omni"] == 0]
cust_df_omni = repeat_purchases.groupby("CustomerID").agg(['mean', 'min', 'max'])

# Drop every statistic of these two top-level columns.
cust_df_omni.drop(["TransactionDate", "is_1st_purchase_omni"], axis=1, inplace=True)

# agg() with a list produces two-level columns such as ('Price', 'mean').
# Flatten them to plain tuple labels before merging: merging frames with a
# different number of column levels raises MergeError on pandas >= 2.
# The tuple labels are the ones later cells refer to.
cust_df_omni.columns = cust_df_omni.columns.to_flat_index()

# Right join: keep every row of dfx, attaching the new features where present.
D8_final_df = cust_df_omni.merge(dfx, how="right", on="CustomerID")

D8_final_df.head(3)

In [None]:
# Remove columns that are not used as features.
D8_final_df.drop(['index', 'Category3', 'isContactable', 'City','last_coupon_type_used',
                         'isShippedToBilled_sum', 'Device', 'num_of_purchases_w_discount'],
                 axis=1,
                 inplace=True)

df = D8_final_df.copy()

# Ages below 18 or above 100 are treated as missing. where() keeps the column
# numeric; the previous per-element np.where produced 0-d arrays.
df["Age"] = df["Age"].where(df["Age"].between(18, 100))

# A purchase frequency of 0 is treated as missing. Assign the result back
# instead of calling replace(..., inplace=True) on the selected column, which
# does not update df when pandas copy-on-write is enabled.
df["purchase_freq"] = df["purchase_freq"].replace(0, np.nan)

In [None]:
# Outlier Removal
# For each column in turn, drop the rows whose value lies more than 6 standard
# deviations above the column mean. Columns are processed sequentially, so each
# threshold is computed on the rows that survived the previous columns.

colns = ["avg_order_value",'Category2', 'Category4', "num_returns", "offline",
         'Category1', 'Axe1','Axe2', 'Axe3', 'Axe4',"avg_discount_amount","avg_basket_size"]

colns = colns + list(df.columns[1:28])

for col in colns:
    # Compute the threshold once per column; evaluating mean() and std() inside
    # a per-element lambda made this loop quadratic in the number of rows.
    upper_bound = df[col].mean() + 6*df[col].std()
    is_outlier = df[col] > upper_bound
    n_outliers = is_outlier.sum()

    print(n_outliers,"outliers, 6 std far from the mean")
    print(n_outliers / df[col].count(),"of the column")
    print("Dropping...")
    df.drop(df.index[is_outlier], inplace=True)
    print("Dropped.")


In [None]:
# Keep the current purchase_freq values under the name purchase_period and
# replace purchase_freq by their reciprocal.
period_values = df["purchase_freq"].copy()
df["purchase_period"] = period_values
df["purchase_freq"] = period_values.rdiv(1)

In [None]:
# Apply the same upper-tail outlier rule (mean + 6 std) to the two
# frequency-related columns.
colns = ["purchase_freq", "purchase_period"]

for col in colns:
    # Compute the threshold once per column; evaluating mean() and std() inside
    # a per-element lambda made this loop quadratic in the number of rows.
    upper_bound = df[col].mean() + 6*df[col].std()
    is_outlier = df[col] > upper_bound
    n_outliers = is_outlier.sum()

    print(n_outliers,"outliers, 6 std far from the mean")
    print(n_outliers / df[col].count(),"of the column")
    print("Dropping...")
    df.drop(df.index[is_outlier], inplace=True)
    print("Dropped.")


In [None]:
# Class balance of the target after outlier removal.
df["label"].value_counts()

# Draw histograms with all the 27 new features created

In [None]:
# Small tick labels so the grid of histograms stays readable.
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=7)
df.hist(figsize=(30, 30), bins=30);
# In this data the customers' first purchases are not part of the newly added features.

### Check correlation matrix

In [None]:
import seaborn as sns

# Annotated heatmap of the pairwise correlations, rounded to two decimals.
sns.set()
plt.figure(figsize=(25, 25))
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=20)
plt.rcParams.update({'font.size': 10})
corr_rounded = df.corr().round(2)
sns.heatmap(corr_rounded, annot=True);

In [None]:
# Drop the three Price aggregates and the ZPL1 column.
dropped_after_corr = [('Price', stat) for stat in ('mean', 'min', 'max')] + ['ZPL1']
df.drop(columns=dropped_after_corr, inplace=True)

In [None]:
# Display the column labels that remain in df.
df.columns

# Pipeline

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

In [None]:
# Feature matrix: everything except the customer identifier and the target.
non_feature_cols = ["CustomerID", "label"]
X = df.drop(columns=non_feature_cols)
# Target vector.
y = df["label"]

In [None]:
# Print a summary of the feature matrix.
X.info()

In [None]:
# Number of rows in the target vector.
len(y)

In [None]:
# Number of rows per target value.
y.value_counts()

In [None]:
# Give every feature a plain string name. The aggregated features are labelled
# by (column, statistic) tuples, e.g. ('Net_Amount', 'mean'); str() turns these
# into "('Net_Amount', 'mean')" and leaves names that are already strings
# unchanged.
# Deriving the names from X itself replaces a hand-maintained list that had to
# match the number and order of the columns exactly and that contained stray
# leading spaces in three of its entries.
new_cols = [str(col).strip() for col in X.columns]

X.columns = new_cols

In [None]:
# Hold out 33% of the rows for testing (fixed seed; the split is not stratified).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
# Every feature goes through the numeric preprocessing branch.
num_vars = X.columns.tolist()

In [None]:
# Disabled categorical branch, kept for reference:
# pipe_cat = Pipeline([
#     ("imputer", SimpleImputer(strategy="most_frequent")),
#     ("one_hot", OneHotEncoder(handle_unknown="ignore"))
# ])

# Numeric branch: median imputation followed by standardisation.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
pipe_num = Pipeline(steps=numeric_steps)

# Only the numeric branch is active; any other column is passed through as-is.
column_branches = [
    # ("categorical_vars", pipe_cat, cat_vars),
    ("numeric_vars", pipe_num, num_vars),
]
ct = ColumnTransformer(transformers=column_branches, remainder="passthrough")

In [None]:
# Print a summary of the feature matrix again.
X.info()

In [None]:
# Persist a copy of the cleaned customer-level frame.
clean_data = df.copy()
# The context manager closes the file even if pickling fails.
with open("OM_D8_Train_Data_Clean", 'wb') as fx:
    pickle.dump(clean_data, fx)

# Results Table

In [None]:
from sklearn.metrics import roc_curve, auc, confusion_matrix, precision_recall_curve
from sklearn.metrics import classification_report, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import accuracy_score, average_precision_score

In [None]:
# Create a data frame to store the results
def print_results(headline, true_value, pred, probs):
    """Collect the evaluation metrics of one model as a flat list.

    Despite its name, this function prints nothing; it returns the values.

    Parameters
    ----------
    headline : label stored as the first entry of the returned list.
    true_value : true class labels.
    pred : predicted class labels.
    probs : scores passed to the ROC and precision-recall computations.

    Returns
    -------
    list
        [headline, accuracy, TP, FP, TN, FN, precision, recall, roc_auc,
        pr_auc, f1]. The four counts are read from the confusion matrix at
        [1, 1], [0, 1], [0, 0] and [1, 0]; f1 is macro-averaged, while
        precision and recall use the scorer defaults.
    """
    cm = confusion_matrix(true_value, pred)
    accuracy = accuracy_score(true_value, pred)
    true_pos = int(cm[1, 1])
    false_pos = int(cm[0, 1])
    true_neg = int(cm[0, 0])
    false_neg = int(cm[1, 0])
    precision = precision_score(true_value, pred)
    recall = recall_score(true_value, pred)
    roc_auc = roc_auc_score(true_value, probs)
    # Area under the precision-recall curve (recall on the x axis).
    pr_precisions, pr_recalls, _ = precision_recall_curve(true_value, probs)
    pr_auc = auc(pr_recalls, pr_precisions)
    f1_macro_score = f1_score(true_value, pred, average="macro")
    return [
        headline,
        accuracy,
        true_pos,
        false_pos,
        true_neg,
        false_neg,
        precision,
        recall,
        roc_auc,
        pr_auc,
        f1_macro_score,
    ]

# Column layout of the results table; one row is appended per evaluated model.
score_names = [
    'method', 'accuracy',
    'TP', 'FP', 'TN', 'FN',
    'precision', 'recall', 'roc_auc', 'pr_auc', 'f1',
]
# Start from an empty (0-row) table with one column per score.
dfAcc = pd.DataFrame(data=np.zeros(shape=(0, len(score_names))), columns=score_names)

In [None]:
# Display the results table.
dfAcc

In [None]:
# (rows, columns) of the feature matrix.
X.shape

In [None]:
from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler
from imblearn.pipeline import Pipeline as pipe_imb
from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import make_pipeline as make_pipeline_imb

# LoR 

In [None]:
# Logistic regression: preprocessing -> random oversampling -> classifier.
lor_sampler = RandomOverSampler(random_state=42, sampling_strategy=0.6)
lor_classifier = LogisticRegression(random_state=42, max_iter=100000)
ovsmp_pipe = pipe_imb(steps=[
    ('ct', ct),
    ('sampler', lor_sampler),
    ('classifier', lor_classifier),
])

# Values shared by both penalty sub-grids.
lor_c_values = [0.05, 0.1, 0.2]
lor_sampling_ratios = [0.3, 0.4, 0.5]

params = [
    # L1 penalty with the two solvers listed for it
    {
        'classifier__C': lor_c_values,
        'classifier__penalty': ['l1'],
        'classifier__solver': ['liblinear', 'saga'],
        'sampler__sampling_strategy': lor_sampling_ratios,
    },
    # L2 penalty with five solvers
    {
        'classifier__C': lor_c_values,
        'classifier__penalty': ['l2'],
        'classifier__solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
        'sampler__sampling_strategy': lor_sampling_ratios,
    },
]

# train/validation with the same ratio of classes
kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=ovsmp_pipe,
    param_grid=params,
    cv=kfold,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
)

In [None]:
%%time
# Run the grid search, then report the winning configuration and its scores.
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
best_settings = grid.best_params_
print(best_model, '\n')
print('Best parameters  :', best_settings)

train_f1_macro = grid.score(X_train, y_train)
print('\nTraining F1_macro:', train_f1_macro)
test_f1_macro = grid.score(X_test, y_test)
print('Test F1_macro :', test_f1_macro)

In [None]:
# Score the tuned model on the hold-out set and append one row to the results table.
pred_probs = grid.predict_proba(X_test)[:,1]
scores = print_results('D3_LoR_new-feats', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# The confusion-matrix counts should display as integers.
dfAcc[['TP','FP', 'TN', 'FN']] = dfAcc[['TP','FP', 'TN', 'FN']].astype(int)

# Show the table without the index column. Styler.hide_index() was removed in
# pandas 2.0; Styler.hide(axis="index") is its replacement (pandas >= 1.4).
results_styler = dfAcc.style
results_styler.hide(axis="index") if hasattr(results_styler, "hide") else results_styler.hide_index()

In [None]:
def f1_macro(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``."""
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

# XGB

In [None]:
# XGB
# Pipeline: column preprocessing -> random oversampling -> XGBoost classifier,
# tuned with a 4-fold stratified grid search scored by macro F1.

from xgboost import XGBClassifier

# NOTE(review): f1_macro (defined in an earlier cell) wraps sklearn's f1_score;
# confirm that XGBoost hands class labels, not probabilities, to a callable
# eval_metric before relying on it with an eval_set.
ovsmp_pipe = pipe_imb([('ct'        , ct),
                       ('sampler'   , RandomOverSampler(random_state=42,
                                                       sampling_strategy=0.6)),
                       ('classifier', XGBClassifier(objective='binary:logistic', 
                                                    eval_metric = f1_macro,
                                                    n_estimators=1000, 
                                                    eta=0.01, # default 0.3
                                                    max_depth=3, 
                                                    subsample=0.9, 
                                                    min_child_weight=25, 
                                                    gamma=5, 
                                                    reg_lambda=1, 
                                                    alpha=3, 
                                                    colsample_bytree=0.5, 
                                                    #colsample_bylevel=0.5,
                                                   # scale_pos_weight = 0.35
                                                   )
                                                   )
                            ]
                   )



# Only one combination is searched: max_depth=2 and min_child_weight=100.
# These grid values replace the constructor values (3 and 25) during the search.
# The commented-out entries are alternative search dimensions that are switched off.
params = [{
          #  'sampler__sampling_strategy': [0.3,0.5,0.6,0.8],
          #  'classifier__n_estimators':[750,1000,1500],
          # 'classifier__eta': [0.01,0.001,0.0001]
            'classifier__max_depth':[2],
           'classifier__min_child_weight': [100],
         #  'classifier__colsample_bytree':[0.5],
          # 'classifier__subsample' : [0.5,0.7,0.9],
          # 'classifier__alpha':[1,3,5], #  default 0. Increasing this value will make model more conservative.
          #  'classifier__gamma':[1,3,5], #  default 0. Increasing this value will make model more conservative.
           # 'classifier__reg_lambda':[5], #  def=1 .Increasing this value will make model more conservative.
         #   'classifier__scale_pos_weight' : [1]
}
         ]



# train/validation with the same ratio of classes
kfold = StratifiedKFold(n_splits = 4, random_state = 42, shuffle=True) 

grid = GridSearchCV(ovsmp_pipe, 
                    param_grid = params, 
                    cv = kfold, 
                    scoring = 'f1_macro', 
                    verbose = 1,
                    n_jobs = -1)

In [None]:
%%time
# Run the grid search, then report the winning configuration and its scores.
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
best_settings = grid.best_params_
print(best_model, '\n')
print('Best parameters  :', best_settings)

train_f1 = grid.score(X_train, y_train)
print('\nTraining F1 score:', train_f1)
test_f1 = grid.score(X_test, y_test)
print("Test F1 score:", test_f1)

In [None]:
# Score the tuned model on the hold-out set and append one row to the results table.
pred_probs = grid.predict_proba(X_test)[:,1]
scores = print_results('D3_XGB_new-feats', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# The confusion-matrix counts should display as integers.
dfAcc[['TP','FP', 'TN', 'FN']] = dfAcc[['TP','FP', 'TN', 'FN']].astype(int)

# Show the table without the index column. Styler.hide_index() was removed in
# pandas 2.0; Styler.hide(axis="index") is its replacement (pandas >= 1.4).
results_styler = dfAcc.style
results_styler.hide(axis="index") if hasattr(results_styler, "hide") else results_styler.hide_index()

# LGBM

In [None]:
from sklearn.decomposition import PCA
# LIGHTGBM
# Pipeline: column preprocessing -> random oversampling -> LightGBM classifier,
# tuned with a 4-fold stratified grid search scored by macro F1.

import lightgbm as lgb

# min_data_in_leaf is intentionally NOT passed here: it is LightGBM's primary
# name for min_child_samples. When both are given, LightGBM keeps
# min_data_in_leaf and ignores min_child_samples, which would make the grid
# over LGBM__min_child_samples below a no-op. The constructor default stays 25.
# NOTE(review): subsample only takes effect when subsample_freq is non-zero;
# confirm whether bagging is meant to be active here.
ovsmp_pipe = pipe_imb([('ct'        , ct),
                       ('sampler'   , RandomOverSampler(random_state=42,
                                                       sampling_strategy=0.5)),
                       ('LGBM',   lgb.LGBMClassifier(objective="binary",
                                                   #  class_weight="balanced",
                                                     n_estimators=1000,
                                                     learning_rate=0.01,
                                                     min_child_weight=25,
                                                     max_depth=6,
                                                     num_leaves=25,
                                                     min_child_samples=25,
                                                     reg_alpha=1,
                                                     reg_lambda=1,
                                                 #    subsample_freq=0,
                                                     colsample_bytree=0.8,
                                                     subsample=0.7,
                                                     min_split_gain = 25,
                                                     random_state=42))
                          ]
                   )


# Active search dimensions: tree depth, minimum samples per leaf and the two
# regularisation strengths. Commented-out entries are disabled alternatives.
param_grid = [{
                #  'sampler__sampling_strategy': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
             #        'LGBM__n_estimators': [750,1000,1300,1500], 
              #   'LGBM__learning_rate' : [0.0001,0.001,0.01],
              #    'LGBM__min_child_weight': [60], #
                  'LGBM__max_depth' : [2,3,4],          #
         #        'LGBM__num_leaves': [500], 
                 'LGBM__min_child_samples': [25,50,75],
                 'LGBM__reg_alpha' : [1,3], # default 0 
              #   'LGBM__min_data_in_leaf' : [50],
                 'LGBM__reg_lambda' : [1,3], # default 0 
              #   'LGBM__subsample_freq' : [1,5,10,100,500], 
              # 'LGBM__subsample' : [0.5,0.6,0.7,0.8,0.9], 
              # 'LGBM__colsample_bytree' : [0.1], 
            #    'LGBM__min_split_gain' : [50], #
}
         ]


# train/validation with the same ratio of classes
kfolds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(ovsmp_pipe, 
                    param_grid = param_grid, 
                    cv = kfolds, 
                    scoring = 'f1_macro', 
                    verbose = 2,
                    n_jobs = -1)
# default lgbm = 0.57

In [None]:
%%time
# Run the grid search, then report the winning configuration and its scores.
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
best_settings = grid.best_params_
print(best_model, '\n')
print('Best parameters  :', best_settings)

train_f1 = grid.score(X_train, y_train)
print('\nTraining F1 score:', train_f1)
test_f1 = grid.score(X_test, y_test)
print("Test F1 score:", test_f1)

In [None]:
# Score the tuned model on the hold-out set and append one row to the results table.
pred_probs = grid.predict_proba(X_test)[:,1]
scores = print_results('D3_LGBM_new-feats', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# The confusion-matrix counts should display as integers.
dfAcc[['TP','FP', 'TN', 'FN']] = dfAcc[['TP','FP', 'TN', 'FN']].astype(int)

# Show the table without the index column. Styler.hide_index() was removed in
# pandas 2.0; Styler.hide(axis="index") is its replacement (pandas >= 1.4).
results_styler = dfAcc.style
results_styler.hide(axis="index") if hasattr(results_styler, "hide") else results_styler.hide_index()

In [None]:
# Display the results table without the row labelled 2. The result is not
# assigned, so dfAcc itself is unchanged.
dfAcc.drop(2)

# SVM

In [None]:
from sklearn.svm import SVC
    
# Pipeline: column preprocessing -> random oversampling -> RBF-kernel SVM.
# probability=True is required because a later cell calls predict_proba.
ovsmp_pipe = pipe_imb([('ct'        , ct),
                       ('sampler'   , RandomOverSampler(random_state=42,
                                                       sampling_strategy = 0.6)),
                       ('classifier', SVC(random_state=42,
                                          C=0.05, #default 1
                                          kernel='rbf',
                                          gamma='scale',
                                          probability=True))
                          ]
                   )

# Every search dimension is commented out, so the grid holds one empty dict:
# the search evaluates only the pipeline exactly as configured above.
params = [
    {         
           # 'sampler__sampling_strategy': [0.5,0.6,0.7,0.8],
          #  'classifier__C': [0.05,0.1,0.2],
           #  'classifier__kernel': ['rbf'],
           # 'classifier__gamma' : ['scale']
      #    },
         #   {
        #    'sampler__sampling_strategy': [0.5,0.6,0.7],
         #   'classifier__C': [0.1,0.3,0.5],
        #     'classifier__kernel': ['poly'],
        #    'classifier__degree': [2,3,4],
        #    'classifier__gamma' : ['scale', 'auto']
          }
    ]

# train/validation with the same ratio of classes
kfold = StratifiedKFold(n_splits = 4, random_state = 42, shuffle=True) 

grid = GridSearchCV(ovsmp_pipe, 
                    param_grid = params, 
                    cv = kfold, 
                    scoring = 'f1_macro', 
                    verbose = 1,
                    n_jobs = -1)


In [None]:
%%time
# Run the grid search, then report the winning configuration and its scores.
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
best_settings = grid.best_params_
print(best_model, '\n')
print('Best parameters  :', best_settings)

train_f1 = grid.score(X_train, y_train)
print('\nTraining F1 score:', train_f1)
test_f1 = grid.score(X_test, y_test)
print("Test F1 score:", test_f1)

In [None]:
# Score the tuned model on the hold-out set and append one row to the results table.
pred_probs = grid.predict_proba(X_test)[:,1]
scores = print_results('D3_SVM_new-feats', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# The confusion-matrix counts should display as integers.
dfAcc[['TP','FP', 'TN', 'FN']] = dfAcc[['TP','FP', 'TN', 'FN']].astype(int)

# Show the table without the index column. Styler.hide_index() was removed in
# pandas 2.0; Styler.hide(axis="index") is its replacement (pandas >= 1.4).
results_styler = dfAcc.style
results_styler.hide(axis="index") if hasattr(results_styler, "hide") else results_styler.hide_index()

In [None]:
# Persist a copy of the results table.
dfAcc_pck = dfAcc.copy()
# The context manager closes the file even if pickling fails.
with open("OM_D3_results_table-new_feats", 'wb') as fd:
    pickle.dump(dfAcc_pck, fd)