In [2]:
import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction import DictVectorizer
import xgboost as xgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import RandomizedSearchCV, train_test_split

df = pd.read_csv('C:/Users/Ben Bauer/final_project/VIC_REC.csv', sep = ';')

In [3]:
## chose only one advisor for prototyping
train = df[df['PERS_CODE_ADVISOR']==4855944]

In [4]:
train

Unnamed: 0,TARGET,PERS_CODE_ADVISOR,FIRST_PRODUCT,CUSTOMER_GENDER,CUSTOMER_SALESCHANNEL,ACTIVITY_POINTS,CUSTOMER_AGE,PERSON_COUNTRY,MEMBER_SEG,EMAIL,NEWSLETTER,AVG_PRICE,DAYS_SINCE_LAST_TRANS,ADVISOR_AGE
0,1,4855944,a,w,x,566,48,p,t,h,m,2.58,0,37
1,1,4855944,a,w,x,566,48,p,t,h,m,2.58,1,37
2,1,4855944,a,w,x,566,48,p,t,h,m,2.58,0,37
3,1,4855944,a,w,x,566,48,p,t,h,m,2.58,2,37
4,1,4855944,a,w,x,567,48,p,t,h,m,2.58,1,37
5,1,4855944,a,w,x,567,48,p,t,h,m,2.58,1,37
6,1,4855944,a,w,x,567,48,p,t,h,m,2.58,1,37
7,1,4855944,a,w,x,567,48,p,t,h,m,2.58,1,37
8,1,4855944,a,w,x,567,48,p,t,h,m,2.58,3,37
9,1,4855944,a,w,x,567,48,p,t,h,m,2.58,2,37


In [5]:
y = train.TARGET
train.drop(['PERS_CODE_ADVISOR', 'TARGET'], axis=1, inplace = True)

X_train, X_test, y_train, y_test = train_test_split(train, y, test_size = 0.4)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [6]:
# Check number of nulls in each feature column
nulls_per_column = X_train.isnull().sum()
print(nulls_per_column)

FIRST_PRODUCT            0
CUSTOMER_GENDER          0
CUSTOMER_SALESCHANNEL    0
ACTIVITY_POINTS          0
CUSTOMER_AGE             0
PERSON_COUNTRY           0
MEMBER_SEG               0
EMAIL                    0
NEWSLETTER               0
AVG_PRICE                0
DAYS_SINCE_LAST_TRANS    0
ADVISOR_AGE              0
dtype: int64


In [7]:
# Create a boolean mask for categorical columns
categorical_feature_mask = X_train.dtypes == object

# Get list of categorical column names
categorical_columns = X_train.columns[categorical_feature_mask].tolist()

# Get list of non-categorical column names
non_categorical_columns = X_train.columns[~categorical_feature_mask].tolist()

# Pipeline approach

In [None]:
# Apply numeric imputer
numeric_imputation_mapper = DataFrameMapper(
                                            [([numeric_feature],Imputer(strategy="median")) for numeric_feature in non_categorical_columns],
                                            input_df=True,
                                            df_out=True
                                           )

# Apply categorical imputer
categorical_imputation_mapper = DataFrameMapper(
                                                [(category_feature, CategoricalImputer()) for category_feature in categorical_columns],
                                                input_df=True,
                                                df_out=True
                                               )

# Combine the numeric and categorical transformations
numeric_categorical_union = FeatureUnion([
                                          ("num_mapper", numeric_imputation_mapper),
                                          ("cat_mapper", categorical_imputation_mapper)
                                         ])

pipeline = Pipeline([
                     ("featureunion", numeric_categorical_union),
                     #("dictifier", Dictifier()),
                     ("vectorizer", DictVectorizer(sort=False)),
                     ("clf", xgb.XGBClassifier(max_depth=3))
                    ])

gbm_param_grid = {
    'clf__learning_rate': np.arange(.05, 1, .05),
    'clf__max_depth': np.arange(3,10, 1),
    'clf__n_estimators': np.arange(50, 200, 50)
}

# Perform RandomizedSearchCV
randomized_roc_auc = RandomizedSearchCV(estimator=pipeline,
                                        param_distributions=gbm_param_grid,
                                        n_iter=2, scoring='roc_auc', cv=2, verbose=1)

# Fit the estimator
randomized_roc_auc.fit(X_train, y_train)

# Compute metrics
#print(randomized_roc_auc.best_score_)
#print(randomized_roc_auc.best_estimator_)

In [8]:
data = pd.DataFrame({'pet':      ['cat', 'dog', 'dog', 'fish', 'cat', 'dog', 'cat', 'fish'],
                     'diff':     ['a', 'b', 'a', 'a', 'a', 'a', 'a', 'c'],
                      'children': [4., 6, 3, 3, 2, 3, 5, 4],
                     'salary':   [90, 24, 44, 27, 32, 59, 36, 27]})

from sklearn.preprocessing import LabelBinarizerdata

Unnamed: 0,children,diff,pet,salary
0,4.0,a,cat,90
1,6.0,b,dog,24
2,3.0,a,dog,44
3,3.0,a,fish,27
4,2.0,a,cat,32
5,3.0,a,dog,59
6,5.0,a,cat,36
7,4.0,c,fish,27


# Appl multiple transformations to DF

In [10]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler

mapper = DataFrameMapper([
         ('pet', LabelBinarizer()),
          ('diff', LabelBinarizer()),
         (['children'], StandardScaler())
     ])



pd.DataFrame(mapper.fit_transform(data), columns = mapper.transformed_names_)

Unnamed: 0,pet_cat,pet_dog,pet_fish,diff_a,diff_b,diff_c,children
0,1.0,0.0,0.0,1.0,0.0,0.0,0.208514
1,0.0,1.0,0.0,0.0,1.0,0.0,1.87663
2,0.0,1.0,0.0,1.0,0.0,0.0,-0.625543
3,0.0,0.0,1.0,1.0,0.0,0.0,-0.625543
4,1.0,0.0,0.0,1.0,0.0,0.0,-1.459601
5,0.0,1.0,0.0,1.0,0.0,0.0,-0.625543
6,1.0,0.0,0.0,1.0,0.0,0.0,1.042572
7,0.0,0.0,1.0,0.0,0.0,1.0,0.208514
