# Customer-Product Network Generation and Recursive Feature Extractor

In [None]:
import pandas as pd
import numpy as np
import pickle 
import matplotlib.pyplot as plt
import networkx as nx
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the pickled table whose CustomerID/Ean columns are used to build the graph.
# NOTE(review): pickle.load can execute arbitrary code - only load files that
# were produced by this project's own pipeline.
with open('OM_D3_dataset-network_nx', 'rb') as f:
    data_nx = pickle.load(f)
print(data_nx.shape)

# Load the cleaned training data; the context manager closes the file even
# if unpickling fails.
with open('OM_D2_Train_Data_Cleaned', 'rb') as f:
    df = pickle.load(f)
print(df.shape)

In [None]:
# Build a control column: 1 when the row's customer also exists in the train
# dataset, 0 otherwise. The membership mask is computed once and reused.
in_train = data_nx["CustomerID"].isin(df["CustomerID"])

# Expected "1" count in the column
print("Expected 1s:", in_train.sum())

# Cast the boolean mask directly instead of routing it through np.where/apply
data_nx["flag"] = in_train.astype(int)

# Get number of 1s and 0s
print(data_nx.flag.value_counts())

In [None]:
# The control column was only needed for the check above; remove it in place
del data_nx["flag"]

# Show the shapes of both tables after the clean-up
for frame in (data_nx, df):
    print(frame.shape)

## GRAPH GENERATION

In [None]:
# Empty undirected graph; its edges are added from edge_l further down
G = nx.Graph()

In [None]:
# One (CustomerID, Ean) edge per row of data_nx. The two columns are paired
# positionally: the previous data_nx["CustomerID"][r] lookup was label-based
# and raised KeyError (or read the wrong row) whenever the index was not 0..n-1.
edge_l = list(zip(data_nx["CustomerID"], data_nx["Ean"]))

In [None]:
# Populate the graph from the edge list, then display its node count
G.add_edges_from(edge_l)
G.number_of_nodes()

In [None]:
# Cross-check: distinct Ean values plus distinct CustomerID values,
# printed as a total and then separately

n_eans = len(data_nx["Ean"].unique())
n_customers = len(data_nx["CustomerID"].unique())
print(n_eans + n_customers)
print(n_eans, n_customers)

## FEATURE EXTRACTION

### Generate RolX Features

In [None]:
from graphrole import RecursiveFeatureExtractor, RoleExtractor
from node2vec import Node2Vec

In [None]:
%time
feat_ext = RecursiveFeatureExtractor(G, max_generations=4)
rolx_feats = feat_ext.extract_features()

print(f'\nFeatures extracted from {feat_ext.generation_count} recursive generations:')
print(rolx_feats.shape, rolx_feats.columns)

df_feats_all = rolx_feats.copy()

In [None]:
# Persist a snapshot of the extracted features; the context manager closes
# (and flushes) the file even if pickling fails.
df_feats_all_pck = df_feats_all.copy()
with open("OM_nx_rolx_features", 'wb') as f:
    pickle.dump(df_feats_all_pck, f)

## Merge train_data with extracted features

In [None]:
# Preview the first two rows of the training data
df.head(2)

In [None]:
# Preview the first two rows of the extracted graph features
df_feats_all.head(2)

In [None]:
# Move the index into a regular column, in place, then display the frame.
# NOTE(review): the new column is called "index" only when the index is unnamed.
df_feats_all.reset_index(drop=False,inplace=True) 
df_feats_all

In [None]:
# Name the former index column "CustomerID" so it can serve as the merge key
df_feats_all.rename(columns={"index": "CustomerID"}, inplace=True)

In [None]:
# Inner join on CustomerID: only customers present in both frames are kept.
# Note that data_nx is rebound here and from now on holds the merged table.
data_nx = df.merge(df_feats_all, on="CustomerID", how="inner")

In [None]:
# Compare the shapes of the train data, the feature table and the merged result
for frame in (df, df_feats_all, data_nx):
    print(frame.shape)

In [None]:
# Persist a snapshot of the merged dataset; the context manager closes
# (and flushes) the file even if pickling fails.
dataset_nx_pck = data_nx.copy()
with open("OM_D3_Dataset_2_nx-features", 'wb') as f:
    pickle.dump(dataset_nx_pck, f)

# Prediction

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, confusion_matrix, precision_recall_curve
from sklearn.metrics import classification_report, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import accuracy_score, average_precision_score

In [None]:
# Collect the evaluation metrics of one model into a single result row
def print_results(headline, true_value, pred, probs):
    """Return a list of scores for one model (nothing is printed).

    Parameters
    ----------
    headline : label stored as the first element of the row.
    true_value : ground-truth labels.
    pred : predicted labels.
    probs : predicted scores, used for the ROC and precision-recall curves.

    Returns
    -------
    list
        [headline, accuracy, TP, FP, TN, FN, precision, recall, roc_auc,
        pr_auc, f1]. Precision and recall use scikit-learn's default
        averaging, whereas f1 is macro-averaged.
    """
    cm = confusion_matrix(true_value, pred)

    row = [headline, accuracy_score(true_value, pred)]

    # Confusion-matrix cells as (row, column), in the order TP, FP, TN, FN
    for i, j in ((1, 1), (0, 1), (0, 0), (1, 0)):
        row.append(int(cm[i, j]))

    row.append(precision_score(true_value, pred))
    row.append(recall_score(true_value, pred))
    row.append(roc_auc_score(true_value, probs))

    # Area under the precision-recall curve (recall on the x axis)
    precisions, recalls, _ = precision_recall_curve(true_value, probs)
    row.append(auc(recalls, precisions))

    row.append(f1_score(true_value, pred, average="macro"))
    return row

# Column names of the results table
score_names = [
    'method', 'accuracy',
    'TP', 'FP', 'TN', 'FN',
    'precision', 'recall', 'roc_auc', 'pr_auc', 'f1',
]
# Empty results table: zero rows, one float column per score name
dfAcc = pd.DataFrame(np.zeros((0, len(score_names))), columns=score_names)

In [None]:
# Target is the "label" column; features are everything except it and the identifier
y = data_nx["label"]
X = data_nx.drop(columns=["CustomerID", "label"])

In [None]:
# These columns are removed from the feature matrix; every column that
# remains is passed to the numeric pipeline.
cat_vars = ['isContactable', 'City', 'last_coupon_type_used', 'Device']
X.drop(columns=cat_vars, inplace=True)
num_vars = X.columns.tolist()

# Train test split

In [None]:
# Hold out 33% of the rows as the test set, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
num_vars = X.columns.tolist()

In [None]:
"""pipe_cat = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot", OneHotEncoder(handle_unknown="ignore"))
])
"""
# Numeric preprocessing: median imputation followed by standard scaling
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
pipe_num = Pipeline(numeric_steps)

# Only the numeric branch is active; columns not listed pass through unchanged
ct = ColumnTransformer(
    [
        #("categorical_vars", pipe_cat, cat_vars),
        ("numeric_vars", pipe_num, num_vars),
    ],
    remainder="passthrough",
)

# Logistic Regression

In [None]:
# Preprocessing followed by a logistic-regression classifier
log_reg = LogisticRegression(random_state=42, max_iter=100000)
pipe = Pipeline(steps=[('ct', ct), ('classifier', log_reg)])

# Same C values for both penalties; the solver lists differ per penalty
c_values = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
params = [
    {
        'classifier__C': c_values,
        'classifier__penalty': ['l1'],
        'classifier__solver': ['liblinear', 'saga'],
    },
    {
        'classifier__C': c_values,
        'classifier__penalty': ['l2'],
        'classifier__solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    },
]

# train/validation with the same ratio of classes
kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(
    pipe,
    param_grid=params,
    scoring='f1_macro',
    cv=kfold,
    n_jobs=-1,
    verbose=1,
)

In [None]:
%%time
# Run the grid search 
grid.fit(X_train, y_train)
# Show the best pipeline found and the parameters that produced it
print(grid.best_estimator_,'\n')
print('Best parameters  :', grid.best_params_)
# grid.score evaluates with the scorer configured on the grid ('f1_macro')
print('\nTraining F1_macro:', grid.score(X_train, y_train))
print('Test F1_macro :', grid.score(X_test, y_test))

In [None]:
# Score the fitted search on the test set and append its row to the results table
pred_probs = grid.predict_proba(X_test)[:,1]
scores = print_results('D3_LoR_sp', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts as integers after the concatenation
count_cols = ['TP','FP', 'TN', 'FN']
dfAcc[count_cols] = dfAcc[count_cols].astype(int)
# Styler.hide_index() was removed in pandas 2.0; use hide(axis="index") when
# it exists and fall back to the old method on older pandas.
styler = dfAcc.style
styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()

# D3_XGB

In [None]:
def f1_macro(y_true, y_pred):
    """Return the macro-averaged F1 score of y_pred against y_true."""
    # NOTE(review): this is passed to XGBClassifier as eval_metric; if XGBoost
    # hands it probabilities rather than hard labels, f1_score will reject
    # them - confirm before fitting with an eval_set.
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

In [None]:
# XGB: preprocessing followed by an XGBoost classifier with fixed hyper-parameters

from xgboost import XGBClassifier

# NOTE(review): eval_metric is presumably only evaluated when an eval_set is
# passed to fit, which this notebook does not do - confirm before relying on it.
pipe = Pipeline(steps=[('ct', ct),
                       ('classifier', XGBClassifier(objective='binary:logistic', 
                                                    eval_metric = f1_macro,
                                                    n_estimators=1000, 
                                                    eta=0.01, # default 0.3
                                                    max_depth=8, 
                                                    subsample=0.7, 
                                                    min_child_weight=55, 
                                                    gamma=1, 
                                                    reg_lambda=1, 
                                                    alpha=1, 
                                                    colsample_bytree=0.9, 
                                                    #colsample_bylevel=0.5,
                                                   # scale_pos_weight = 0.35
                                                   )
                                                   )
                            ]
                   )

# Every candidate list is commented out, so the grid holds a single empty
# dict and the search evaluates only the settings given above.
params = [{ 
            #'classifier__n_estimators':[750,1000,1500],
           #'classifier__eta': [0.01],
           # 'classifier__max_depth':[8,9,10],
          # 'classifier__min_child_weight': [50,55,60],
         #  'classifier__colsample_bytree':[0.7,0.8,0.9],
           # 'classifier__subsample' : [0.6,0.7,0.8],
         #  'classifier__alpha':[0.2,0.3,0.4], #  default 0. Increasing this value will make model more conservative.
        #    'classifier__gamma':[0.2,0.3,0.4], #  default 0. Increasing this value will make model more conservative.
          #  'classifier__reg_lambda':[0.5,1,1.5,2,2.5], #  default 1. Increasing this value will make model more conservative.
          #  'classifier__scale_pos_weight' : [0.25,0.3,0.35]
          }
         ]

# train/validation with the same ratio of classes
kfold = StratifiedKFold(n_splits = 4, random_state = 42, shuffle=True) 

# 4-fold cross-validated search scored by macro F1, using all CPU cores
grid = GridSearchCV(pipe, 
                    param_grid = params, 
                    cv = kfold, 
                    scoring = 'f1_macro', 
                    verbose = 1,
                    n_jobs = -1)

In [None]:
%%time
# Run the grid search 
grid.fit(X_train, y_train)
# Show the best pipeline found and the parameters that produced it
print(grid.best_estimator_,'\n')
print('Best parameters  :', grid.best_params_)
# grid.score evaluates with the scorer configured on the grid ('f1_macro')
print('\nTraining F1 score:', grid.score(X_train, y_train))
print("Test F1 score:", grid.score(X_test, y_test))

In [None]:
# Score the fitted search on the test set and append its row to the results table
pred_probs = grid.predict_proba(X_test)[:,1]
scores = print_results('D3_XGB_sp', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts as integers after the concatenation
count_cols = ['TP','FP', 'TN', 'FN']
dfAcc[count_cols] = dfAcc[count_cols].astype(int)
# Styler.hide_index() was removed in pandas 2.0; use hide(axis="index") when
# it exists and fall back to the old method on older pandas.
styler = dfAcc.style
styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()

# SGD

In [None]:
# SGD: preprocessing followed by a logistic-loss linear classifier trained with SGD

from sklearn.linear_model import SGDClassifier

pipe = Pipeline(steps=[('ct', ct), 
                       ('SGD', SGDClassifier(random_state = 42,
                                             class_weight="balanced", 
                                             warm_start=False, 
                                             average=False,
                                             loss= "log_loss",
                                           #  learning_rate="adaptive",
                                             alpha = 0.05,
                                             eta0 = 0.1,
                                             learning_rate="adaptive",
                                             penalty="l1"
                                             
                                            )
                                            )
                          ]
                   )

# Every candidate list is commented out, so the grid is empty and the search
# evaluates only the settings given above.
param_grid = {#"SGD__alpha":[0.01,0.03,0.05], # The higher the value, the stronger the regularization.
              #"SGD__penalty": ["l1","l2", "elasticnet"],
              #"SGD__class_weight": ["balanced", 0.5,0.7,1,1.5],
              #"SGD__learning_rate" : ["constant", "optimal", "invscaling", "adaptive"],
              #"SGD__eta0" : [0.0001,0.001,0.01,0.1]
             } 


# train/validation with the same ratio of classes
kfolds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# 4-fold cross-validated search scored by macro F1, using all CPU cores
grid = GridSearchCV(pipe, 
                    param_grid = param_grid, 
                    cv = kfolds, 
                    scoring = 'f1_macro', 
                    verbose = 1,
                    n_jobs = -1)

In [None]:
%%time
# Run the grid search 
grid.fit(X_train, y_train)
# Show the best pipeline found and the parameters that produced it
print(grid.best_estimator_,'\n')
print('Best parameters  :', grid.best_params_)
# grid.score evaluates with the scorer configured on the grid ('f1_macro')
print('\nTraining F1 score:', grid.score(X_train, y_train))
print("Test F1 score:", grid.score(X_test, y_test))

In [None]:
# Score the fitted search on the test set and append its row to the results table
pred_probs = grid.predict_proba(X_test)[:,1]
# Row label follows the D3_<model>_sp scheme used by every other row of this
# table (it previously read 'D2_SGD_nt', a leftover from another experiment).
scores = print_results('D3_SGD_sp', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts as integers after the concatenation
count_cols = ['TP','FP', 'TN', 'FN']
dfAcc[count_cols] = dfAcc[count_cols].astype(int)
# Styler.hide_index() was removed in pandas 2.0; use hide(axis="index") when
# it exists and fall back to the old method on older pandas.
styler = dfAcc.style
styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()

# RANDOM FOREST

In [None]:
# Random Forest: preprocessing followed by a class-weighted random forest

from sklearn.ensemble import RandomForestClassifier

pipe = Pipeline(steps=[('ct', ct),
                        ('RF', RandomForestClassifier(random_state = 42, 
                                                       n_estimators=1000,
                                                      criterion="gini", 
                                                       max_depth=7, 
                                                       min_samples_split = 30,
                                                       max_features='sqrt', 
                                                     #  min_samples_leaf=10,
                                                       class_weight = "balanced"
                                                     ))
                      ]
                   )

# Every candidate list is commented out, so the grid is empty and the search
# evaluates only the settings given above.
param_grid = { #"RF__n_estimators": [750,1000,1500],
               # "RF__max_depth": [5,6,7],
              #  "RF__min_samples_split": [45],
               # "RF__max_features": ["sqrt", "log2", 10],
             # "RF__criterion"   : ["gini","entropy","log_loss"],
                                      } 


# train/validation with the same ratio of classes
kfolds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# 4-fold cross-validated search scored by macro F1, using all CPU cores
grid = GridSearchCV(pipe, 
                    param_grid = param_grid, 
                    cv = kfolds, 
                    scoring = 'f1_macro', 
                    verbose = 1,
                    n_jobs = -1)

In [None]:
%%time
# Run the grid search 
grid.fit(X_train, y_train)
# Show the best pipeline found and the parameters that produced it
print(grid.best_estimator_,'\n')
print('Best parameters  :', grid.best_params_)
# grid.score evaluates with the scorer configured on the grid ('f1_macro')
print('\nTraining F1 score:', grid.score(X_train, y_train))
print("Test F1 score:", grid.score(X_test, y_test))

In [None]:
# Score the fitted search on the test set and append its row to the results table
pred_probs = grid.predict_proba(X_test)[:,1]
scores = print_results('D3_RF_sp', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts as integers after the concatenation
count_cols = ['TP','FP', 'TN', 'FN']
dfAcc[count_cols] = dfAcc[count_cols].astype(int)
# Styler.hide_index() was removed in pandas 2.0; use hide(axis="index") when
# it exists and fall back to the old method on older pandas.
styler = dfAcc.style
styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()

# LIGHTGBM

In [None]:
# LIGHTGBM: preprocessing followed by a LightGBM binary classifier

import lightgbm as lgb

# NOTE(review): min_child_samples (50) and min_data_in_leaf (25) look like
# aliases of the same LightGBM parameter with different values - confirm
# which one takes effect.
# NOTE(review): subsample presumably has no effect while subsample_freq is
# left commented out - verify against the LightGBM documentation.
pipe = Pipeline(steps=[('ct', ct), 
                       ('LGBM',   lgb.LGBMClassifier(objective="binary",
                                                #     class_weight="balanced",                                                   
                                                     n_estimators=1500, 
                                                     learning_rate=0.01,
                                                     min_child_weight=25, 
                                                     max_depth=7, 
                                                     num_leaves=25, 
                                                     min_child_samples=50,                                                      
                                                     reg_alpha=0.8, 
                                                     reg_lambda=0.8,
                                                 #    subsample_freq=0, 
                                                     colsample_bytree=0.9, 
                                                     subsample=0.8,
                                                     min_split_gain = 25,
                                                     min_data_in_leaf = 25,
                                                     random_state=42))
                          ]
                   )

# Every candidate list is commented out, so the grid is empty and the search
# evaluates only the settings given above.
param_grid = {
              #   'LGBM__n_estimators': [750,1000,1500], 
              #   'LGBM__learning_rate' : [0.0001,0.001,0.01],
              #    'LGBM__min_child_weight': [250], 
              #    'LGBM__max_depth' : [5], 
               #  'LGBM__num_leaves': [250], 
              #   'LGBM__min_child_samples': [100,150,200,250,300],                                                      
              #   'LGBM__reg_alpha' : [5,6,7], # default 0 
              #  'LGBM__min_data_in_leaf' : [100,150,200,250,300],
              #   'LGBM__reg_lambda' : [1,1.5,2,2.5,3], # default 0 
              #   'LGBM__subsample_freq' : [1,5,10,100,500], 
             #  'LGBM__subsample' : [0.5,0.6,0.7,0.8,0.9], 
              # 'LGBM__colsample_bytree' : [0.5,0.6,0.7,0.8,0.9], 
              #  'LGBM__min_split_gain' : [122], 
             } 


# train/validation with the same ratio of classes
kfolds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# 4-fold cross-validated search scored by macro F1, using all CPU cores
grid = GridSearchCV(pipe, 
                    param_grid = param_grid, 
                    cv = kfolds, 
                    scoring = 'f1_macro', 
                    verbose = 2,
                    n_jobs = -1)
# default lgbm = 0.57
# default lgbm = 0.57

In [None]:
%%time
# Run the grid search 
grid.fit(X_train, y_train)
# Show the best pipeline found and the parameters that produced it
print(grid.best_estimator_,'\n')
print('Best parameters  :', grid.best_params_)
# grid.score evaluates with the scorer configured on the grid ('f1_macro')
print('\nTraining F1 score:', grid.score(X_train, y_train))
print("Test F1 score:", grid.score(X_test, y_test))

In [None]:
# Score the fitted search on the test set and append its row to the results table
pred_probs = grid.predict_proba(X_test)[:,1]
scores = print_results('D3_LGBM_sp', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts as integers after the concatenation
count_cols = ['TP','FP', 'TN', 'FN']
dfAcc[count_cols] = dfAcc[count_cols].astype(int)
# Styler.hide_index() was removed in pandas 2.0; use hide(axis="index") when
# it exists and fall back to the old method on older pandas.
styler = dfAcc.style
styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()

In [None]:
# Persist a snapshot of the results table; the context manager closes
# (and flushes) the file even if pickling fails.
dfAcc_pck = dfAcc.copy()
with open("OM_D3_results_table", 'wb') as fd:
    pickle.dump(dfAcc_pck, fd)

In [None]:
# Persist a snapshot of the merged dataset under a second file name; the
# context manager closes (and flushes) the file even if pickling fails.
dataset_nx_pck = data_nx.copy()
with open("NF_D3_Dataset_2_nx-features-added", 'wb') as f:
    pickle.dump(dataset_nx_pck, f)

# PERMUTATION IMPORTANCE

In [None]:
# Permutation importance of the most recently fitted `grid`, with 30 shuffles
# per feature and a fixed seed.
# NOTE(review): this is computed on the training split - confirm that is
# intended rather than the held-out test split.
from sklearn.inspection import permutation_importance
r = permutation_importance(grid, X_train, y_train,
                               n_repeats=30,
                             random_state=42)

In [None]:
# Plot the raw importances on a wide figure; the trailing semicolon
# suppresses the notebook's echo of the return value.
plt.figure(figsize=(20,10))
plt.plot(r.importances);

In [None]:
r.importances_mean # mean importance of each feature over the repeats

In [None]:
# Names of the features whose mean permutation importance exceeds 0.01
X.columns[r.importances_mean>0.01]