In [32]:
# import the required packages
import pandas as pd
import numpy as np

# preprocessing
from sklearn.preprocessing import Imputer
from sklearn.utils import resample

# feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

# ml libraries
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier
from xgboost.sklearn import XGBRegressor

# scores
from sklearn.metrics import mean_squared_error,mean_squared_log_error,confusion_matrix,accuracy_score,roc_auc_score, balanced_accuracy_score

import pickle

In [33]:
class Preprocessing(object):
    def __init__(self, test_size=0.2, random_state=4):
        self.test_size = test_size
        self.random_state = random_state
    
    def transform(self, X):
        # get numeric and categorical columns
        categorical_columns = []
        numeric_columns = []
        for c in X.columns:
            if X[c].map(type).eq(str).any(): #check if there are any strings in column
                categorical_columns.append(c)
            else:
                numeric_columns.append(c)

        # create two DataFrames - categorical and numerical 
        data_numeric = X[numeric_columns]
        data_categorical = pd.DataFrame(X[categorical_columns])
        
        # impute missing values
        imp = Imputer(missing_values=np.nan, strategy='median', axis=0)
        data_numeric = pd.DataFrame(imp.fit_transform(data_numeric), columns = data_numeric.columns) #only apply imputer to numeric columns

        # no missing values in the categorical features as per the initial investigation 

        # join the two masked dataframes back together
        data_joined = pd.concat([data_numeric, data_categorical], axis = 1)
        
        
        data_joined.num_employees = data_joined.num_employees.replace({"None":0,"1":1,"2to5":4,"6to10":8,"11to50":32,"50plus":60})
        data_joined.num_purchases_year = data_joined.num_purchases_year.replace({'1to2':1, '25plus':32, '3to5':4, '11to25':16, 'None':0, '6to10':8})
        data_joined.cost_purchases_year = data_joined.cost_purchases_year.replace({'lessthan1':1, '25to100':64, '1to5':4, '5to25':16, 'None':0, '100plus':126})
        
        onehot_data = pd.get_dummies(data_joined, drop_first=True)
        
        X = onehot_data
        y = onehot_data.convert_30
        X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X, y, test_size=self.test_size, random_state=self.random_state)
        
        # upsampling in training set
        df_majority = X_train_class[X_train_class.convert_30==0]
        df_minority = X_train_class[X_train_class.convert_30==1]
 
        df_minority_upsampled = resample(df_minority, 
                                 replace=True,    
                                 n_samples=20179, 
                                 random_state=12) 
        df_upsampled = pd.concat([df_majority, df_minority_upsampled])
        X_train_balance = df_upsampled.drop(['convert_30'], axis=1)
        y_train_balance = df_upsampled.convert_30
        
        self.y_train_revenue = X_train_balance["revenue_30"]
        self.y_test_revenue = X_test_class["revenue_30"]
        
        X_train_balance = X_train_balance.drop(['cuid','revenue_30','Unnamed: 0'], axis=1)
        X_test_class = X_test_class.drop(['cuid','revenue_30','Unnamed: 0'], axis=1)
        
        X_test_class = X_test_class.drop("convert_30",axis=1)
        
        self.X_train = X_train_balance
        self.X_test = X_test_class
        self.y_train_conv = y_train_balance
        self.y_test_conv = y_test_class
        
        return self

In [34]:
class FeatureSelection(object):
    def __init__(self, selection_type = "chi", chi_k=30):
        self.selection_type = selection_type
        self.chi_k = chi_k
    
    def _chi(self, X, y):
        X_norm = MinMaxScaler().fit_transform(X)
        chi_selector = SelectKBest(chi2, k=self.chi_k)
        chi_selector.fit(X_norm, y)
        chi_support = chi_selector.get_support()
        chi_feature = X.loc[:,chi_support].columns.tolist()
        return chi_feature

    def fit(self, X, y):
        if self.selection_type == "chi":
            return self._chi(X, y)

In [35]:
df_training = pd.read_csv("df_training_scholarjet.csv")

In [36]:
preprocess = Preprocessing()

In [37]:
data = preprocess.transform(df_training)



In [38]:
fs = FeatureSelection()

In [39]:
chi_features = fs.fit(data.X_train, data.y_train_conv)

In [42]:
chi_features

['dayssincelastord',
 'percdirtythirty',
 'numvisitthreeone',
 'numvisitseventhree',
 'numvisitthirtyseven',
 'numvisitsixtythirty',
 'numloggedinone',
 'numloggedinthreeone',
 'numloggedinseventhree',
 'numloggedinthirtyseven',
 'numsecondsonsiteone',
 'numsecondsonsiteseventhree',
 'numsecondsonsitethirtyseven',
 'numtotalpageviewsthirtyseven',
 'numatcone',
 'numatcthreeone',
 'numatcseventhree',
 'numatcthirtyseven',
 'numideaboardseventhree',
 'dayssincelastvisit',
 'numsearchtermsthreeone',
 'numsearchtermsthirtyseven',
 'percsecondsinbound',
 'percemailopenedone',
 'percemailopenedthreeone',
 'percemailopenedseventhree',
 'percemailopenedthirtyseven',
 'dayssinceenrollment',
 'roll_up_Unmanaged',
 'currentstatus_Enrolled']

In [53]:
pd.DataFrame(np.mean(data.X_train[chi_features[:-2]], axis=0)).T.to_csv("chi_means.csv", index=False)

In [54]:
pd.DataFrame(np.std(data.X_train[chi_features[:-2]], axis=0)).T.to_csv("chi_std.csv", index=False)

In [10]:
class Model(object):
    def _classify_fit(self, X, y):
        self.classif_model = XGBClassifier(
         learning_rate =0.1,
         n_estimators=80,
         max_depth=5,
         min_child_weight=1,
         gamma=0.8,
         reg_alpha = 1.0,
         reg_lambda = 1.0,
         subsample=0.9,
         colsample_bytree=0.8,
         objective= 'binary:logistic',
         nthread=4,
         scale_pos_weight=1,
         seed=27).fit(X, y)
    
    def _regress_fit(self, X, y):
        self.regress_model = XGBRegressor(
            learning_rate =0.1,
            n_estimators=1000,
            max_depth=7,
            min_child_weight=10,
            gamma=0,
            reg_alpha = 1.0,
            reg_lambda = 0.5,
            subsample=0.9,
            colsample_bytree=0.8,
            objective='reg:squarederror',
            nthread=4,
            scale_pos_weight=1,
            seed=27).fit(X,y,eval_metric="rmse")
    
    def fit(self, X, y_class, y_reg):
        self._classify_fit(X, y_class)   
        self._regress_fit(X, y_reg)
        return self
    
    def predict(self, X):
        X_copy = X.copy()
        
        # step 1
        y_conv = self.classif_model.predict(X_copy)
        X_copy.loc[:,"conv"] = y_conv

        # separate out the ones and zeroes
        X_copy_1 = X_copy[X_copy.conv == 1.0]
        X_copy_1 = X_copy_1.drop(["conv"], axis=1)
        X_copy_0 = X_copy[X_copy.conv == 0.0]
        X_copy_0.loc[:,"revenue"] = 0.0

        # step 2
        y_rev = self.regress_model.predict(X_copy_1)
        X_copy_1.loc[:, "revenue"] = y_rev

        
        # merge data
        X_new = pd.concat([X_copy_0, X_copy_1], axis=0)

        return y_conv, X_new.revenue
    
    def score(self, y_true_conv, y_pred_conv, y_true_revenue, y_pred_revenue):
        metrics = {}
        
        metrics["conv_balanced_accuracy"] = balanced_accuracy_score(y_true_conv, y_pred_conv)
        metrics["revenue_rmse"] = mean_squared_error(y_true_revenue, y_pred_revenue) ** 0.5
        
        response = pd.DataFrame({"y_true_conv": y_true_conv, "y_pred_conv": y_pred_conv, "y_true_revenue": y_true_revenue, "y_pred_revenue": y_pred_revenue})
        
        metrics["effort"] = ((sum(response[(response.y_true_conv == 0.0) & (response.y_pred_conv == 1.0)].y_pred_revenue))/ sum(y_true_revenue)) * 100
        
        metrics["loss"] = ((sum(response[(response.y_true_conv == 1.0) & (response.y_pred_conv == 0.0)].y_true_revenue)) / sum(y_true_revenue)) * 100
        
        metrics["total_true_revenue"] = sum(y_true_revenue)
        
        return metrics

In [11]:
m = Model()

In [12]:
model = m.fit(data.X_train[chi_features], data.y_train_conv, data.y_train_revenue)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


In [23]:
pkl_filename = "pickle_model_temp.pkl"
with open(pkl_filename, 'wb') as file_model:
    pickle.dump(model, file_model)

TypeError: an integer is required (got type _io.BufferedWriter)

In [14]:
pkl_filename_classify = "pickle_model_classify.pkl"
with open(pkl_filename_classify, 'wb') as file1:
    pickle.dump(model.classif_model, file1)

In [15]:
pkl_filename_reg = "pickle_model_reg.pkl"
with open(pkl_filename_reg, 'wb') as file2:
    pickle.dump(model.regress_model, file2)

In [16]:
y_pred = model.predict(data.X_train[chi_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [17]:
train_score = model.score(data.y_train_conv.values, y_pred[0], data.y_train_revenue.values, y_pred[1])

In [18]:
y_predtest = model.predict(data.X_test[chi_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [19]:
test_score = model.score(data.y_test_conv.values, y_predtest[0], data.y_test_revenue.values, y_predtest[1])

In [20]:
train_score

{'conv_balanced_accuracy': 0.7920858318053421,
 'revenue_rmse': 1317.4646835204394,
 'effort': 0.07319139433440847,
 'loss': 16.47931753331053,
 'total_true_revenue': 14533372.877601676}

In [21]:
test_score

{'conv_balanced_accuracy': 0.7287240939126542,
 'revenue_rmse': 507.7544853428297,
 'effort': 22.9781469495759,
 'loss': 24.199138520724585,
 'total_true_revenue': 416890.98069999926}

In [27]:
model.regress_model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=7, min_child_weight=10, missing=None, n_estimators=1000,
             n_jobs=1, nthread=4, objective='reg:squarederror', random_state=0,
             reg_alpha=1.0, reg_lambda=0.5, scale_pos_weight=1, seed=27,
             silent=None, subsample=0.9, verbosity=1)