# Using a PCAsearch to find ideal number of components to keep in the pipeline

We will use the same pipeline but grid search the number of components. We noticed that the entire balanced training set does not fit into memory, therefore we will need to use a random sample from the training set. for this purpose.

In [2]:
import pandas as pd
import pickle

# Read the pickled balanced training set
X_train_balanced = pd.read_pickle("/Volumes/Iomega_HDD/2016/Data science/Kaggle/User-click-detection-predictive-modeling/X_train_balanced.pkl")
y_train_balanced = pd.read_pickle("/Volumes/Iomega_HDD/2016/Data science/Kaggle/User-click-detection-predictive-modeling/y_train_balanced.pkl")

#let's try to get a small random sample (~10%) from the class-balanced set to train the pipeline
import numpy as np
import random
random.seed(112)
rows = random.sample(list(range(0,X_train_balanced.shape[0])),80000)

X_train_balanced_sample = X_train_balanced.iloc[rows,]
y_train_balanced_sample = y_train_balanced.iloc[rows]


# Label text features
Text_features = ["app","device","os","channel"]

##############################################################
# Define utility function to parse and process text features
##############################################################
# Note we avoid lambda functions since they don't pickle when we want to save the pipeline later   
def column_text_processer_nolambda(df,text_columns = Text_features):
    import pandas as pd
    import numpy as np
    """"A function that will merge/join all text in a given row to make it ready for tokenization. 
    - This function should take care of converting missing values to empty strings. 
    - It should also convert the text to lowercase.
    df= pandas dataframe
    text_columns = names of the text features in df
    """ 
    # Select only non-text columns that are in the df
    text_data = df[text_columns]
    
    # Fill the missing values in text_data using empty strings
    text_data.fillna("",inplace=True)
    
    # Concatenate feature name to each category encoding for each row
    # E.g: encoding 3 at device column will read as device3 to make each encoding unique for a given feature
    for col_index in list(text_data.columns):
        text_data[col_index] = col_index + text_data[col_index].astype(str)
    
    # Join all the strings in a given row to make a vector
    # text_vector = text_data.apply(lambda x: " ".join(x), axis = 1)
    text_vector = []
    for index,rows in text_data.iterrows():
        text_item = " ".join(rows).lower()
        text_vector.append(text_item)

    # return text_vector as pd.Series object to enter the tokenization pipeline
    return pd.Series(text_vector)

#######################################################################
# Define custom processing functions to add the log_total_clicks and 
# log_total_click_time features, and remove the unwanted base features
#######################################################################
def column_time_processer(X_train):
    import pandas as pd
    import numpy as np

    # Convert click_time to datetime64 dtype 
    X_train.click_time = pd.to_datetime(X_train.click_time)

    # Calculate the log_total_clicks for each ip and add as a new feature to temp_data
    temp_data = pd.DataFrame(np.log(X_train.groupby(["ip"]).size()),
                                    columns = ["log_total_clicks"]).reset_index()


    # Calculate the log_total_click_time for each ip and add as a new feature to temp_data
    # First define a function to process selected ip group 
    def get_log_total_click_time(group):
        diff = (max(group.click_time) - min(group.click_time)).seconds
        return np.log(diff+1)

    # Then apply this function to each ip group and extract the total click time per ip group
    log_time_frame = pd.DataFrame(X_train.groupby(["ip"]).apply(get_log_total_click_time),
                                  columns=["log_total_click_time"]).reset_index()

    # Then add this new feature to the temp_data
    temp_data = pd.merge(temp_data,log_time_frame, how = "left",on = "ip")

    # Combine temp_data with X_train to maintain X_train key order
    temp_data = pd.merge(X_train,temp_data,how = "left",on = "ip")

    # Drop features that are not needed
    temp_data = temp_data[["log_total_clicks","log_total_click_time"]]

    # Return only the numeric features as a tensor to integrate into the numeric feature branch of the pipeline
    return temp_data


#############################################################################
# We need to wrap these custom utility functions using FunctionTransformer
from sklearn.preprocessing import FunctionTransformer
# FunctionTransformer wrapper of utility functions to parse text and numeric features
# Note how we avoid putting any arguments into column_text_processer or column_time_processer
#############################################################################
get_numeric_data = FunctionTransformer(func = column_time_processer, validate=False) 
get_text_data = FunctionTransformer(func = column_text_processer_nolambda,validate=False) 

#############################################################################
# Create the token pattern: TOKENS_ALPHANUMERIC
# #Note this regex will match either a whitespace or a punctuation to tokenize 
# the string vector on these preferences, in our case we only have white spaces in our text  
#############################################################################
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'   

# Load a validation set to use in the new pca pipeline
X_val1 = pd.read_pickle("X_val1.pkl")
y_val1 = pd.read_pickle("y_val1.pkl")

Let's prepare our main search loop:

In [None]:
import datetime
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import warnings
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.feature_selection import SelectKBest, chi2 # We will use chi-squared as a scoring function to select features for classification
from sklearn.metrics import auc
from SparseInteractions import * #Load SparseInteractions (from : https://github.com/drivendataorg/box-plots-sklearn/blob/master/src/features/SparseInteractions.py) as a module since it was saved into working directory as SparseInteractions.py
from sklearn.decomposition import TruncatedSVD

warnings.filterwarnings('ignore')

disk_directory = "/Volumes/Iomega_HDD/2016/Data science/Kaggle/User-click-detection-predictive-modeling/"

component_list = [10,600,700,800]

for n_components in component_list:
    print("Starting pipeline search using: "+ str(n_components) + " components.")
    
    # Generic pipeline to cycle for each component search
    userclick_pipeline_pcasearch = Pipeline([

        ("union",FeatureUnion(
            # Note that FeatureUnion() also accepts list of tuples, the first half of each tuple 
            # is the name of the transformer within the FeatureUnion

            transformer_list = [

                ("numeric_subpipeline",Pipeline([        # Note we have subpipeline branches inside the main pipeline
                    ("parser",get_numeric_data), # Step1: parse the numeric data (note how we avoid () when using FunctionTransformer objects)
                    ("imputer",Imputer()) # Step2: impute any missing data using default (mean), note we don't expect missing values in this case. 
                ])), # End of: numeric_subpipeline

                ("text_subpipeline",Pipeline([
                    ("parser",get_text_data), # Step1: parse the text data 
                    ("tokenizer",HashingVectorizer(token_pattern= TOKENS_ALPHANUMERIC, # Step2: use HashingVectorizer for automated tokenization and feature extraction
                                                 ngram_range = (1,1),
                                                 non_negative=True, 
                                                 norm=None, binary=True )), # Note here we use binary=True since our hack is to use tokenization to generate dummy variables  
                    ('dim_red', SelectKBest(chi2,300)) # Step3: use dimension reduction to select 300 best features using chi2 as scoring function
                ]))
            ]

        )),# End of step: union, this is the fusion point to main pipeline, all features are numeric at this stage

        # Common steps:

        ("int", SparseInteractions(degree=2)), # Add polynomial interaction terms up to the second degree polynomial
        ("scaler",StandardScaler(with_mean=False)), # Standardize the features for a more gaussian distribution. 
        ("dim_red", TruncatedSVD(n_components=n_components))      
    ])# End of: userclick_pipeline_pcasearch

    # Fit and transform the X_train_balanced_sample set to get the features using new pipeline
    start = datetime.datetime.now()
    X_train_balanced_sample_trans_pl_pcasearch = userclick_pipeline_pcasearch.fit(X_train_balanced_sample,y_train_balanced_sample).transform(X_train_balanced_sample)
    process_time = datetime.datetime.now() - start
    print("Completed pipeline fit and transform using "+ str(n_components)+ " components, it took: " + str((process_time.seconds)/60) + " minutes.")
    
    # Train the classifier and get estimates
    start = datetime.datetime.now()

    clf = LogisticRegression()
    clf.fit(X_train_balanced_sample_trans_pl_pcasearch,y_train_balanced_sample)

    process_time = datetime.datetime.now() - start
    print("Completed model fit and it took: " + str((process_time.seconds)/60) + " minutes.")

    probs = clf.predict_proba(X_train_balanced_sample_trans_pl_pcasearch)[:,1]
    print("ROC score in the training set: " + str(roc_auc_score(y_train_balanced_sample,probs)))

    # Transform the validation set
    start = datetime.datetime.now()
    X_val1_trans_pl_pcasearch = userclick_pipeline_pcasearch.transform(X_val1)
    print("Completed validation set transformation, it took: " + str((process_time.seconds)/60) + " minutes.")
    probs = clf.predict_proba(X_val1_trans_pl_pcasearch)[:,1]
    print("ROC score in the validation set 1: " + str(roc_auc_score(y_val1,probs)))
    
    # Save the current pipeline
    filename = disk_directory + "userclick_pipeline_pca" + str(n_components) + ".pkl"
    with open(filename,"wb") as f:
        pickle.dump(userclick_pipeline_pcasearch,f)
    print("Saved the pipeline search using: "+ str(n_components)+ " components " + "as :" + filename)
    
    print("Completed pipeline search using: "+ str(n_components) + " components.")
    print("--" * 40)

Starting pipeline search using: 10 components.
Completed pipeline fit and transform using 10 components, it took: 2.25 minutes.
Completed model fit and it took: 0.0 minutes.
ROC score in the training set: 0.865398828872
Completed validation set transformation, it took: 0.0 minutes.
ROC score in the validation set 1: 0.880240252656
Saved the pipeline search using: 10 components as :/Volumes/Iomega_HDD/2016/Data science/Kaggle/User-click-detection-predictive-modeling/userclick_pipeline_pca10.pkl
Completed pipeline search using: 10 components.
--------------------------------------------------------------------------------
Starting pipeline search using: 600 components.
Completed pipeline fit and transform using 600 components, it took: 3.533333333333333 minutes.
Completed model fit and it took: 5.316666666666666 minutes.
ROC score in the training set: 0.960074287179
Completed validation set transformation, it took: 5.316666666666666 minutes.
ROC score in the validation set 1: 0.931542083

After the 700 components, kernel breaks down, therefore this is not feasible to search components in this way. Let's try to modify the pipeline for different strategies in feature selection and dimension reduction.

How about simple feature selection at the end of the pipeline instead of PCA, could it be faster?

In [4]:
import datetime
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import warnings
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.feature_selection import SelectKBest # We will use default scoring function to select features for classification
from sklearn.metrics import auc
from SparseInteractions import * #Load SparseInteractions (from : https://github.com/drivendataorg/box-plots-sklearn/blob/master/src/features/SparseInteractions.py) as a module since it was saved into working directory as SparseInteractions.py


warnings.filterwarnings('ignore')

disk_directory = "/Volumes/Iomega_HDD/2016/Data science/Kaggle/User-click-detection-predictive-modeling/"

component_list = [10,100,600,800,1000]

for n_components in component_list:
    print("Starting pipeline search using: "+ str(n_components) + " components.")
    
    # Generic pipeline to cycle for each component search
    userclick_pipeline_pcasearch = Pipeline([

        ("union",FeatureUnion(
            # Note that FeatureUnion() also accepts list of tuples, the first half of each tuple 
            # is the name of the transformer within the FeatureUnion

            transformer_list = [

                ("numeric_subpipeline",Pipeline([        # Note we have subpipeline branches inside the main pipeline
                    ("parser",get_numeric_data), # Step1: parse the numeric data (note how we avoid () when using FunctionTransformer objects)
                    ("imputer",Imputer()) # Step2: impute any missing data using default (mean), note we don't expect missing values in this case. 
                ])), # End of: numeric_subpipeline

                ("text_subpipeline",Pipeline([
                    ("parser",get_text_data), # Step1: parse the text data 
                    ("tokenizer",HashingVectorizer(token_pattern= TOKENS_ALPHANUMERIC, # Step2: use HashingVectorizer for automated tokenization and feature extraction
                                                 ngram_range = (1,1),
                                                 non_negative=True, 
                                                 norm=None, binary=True )), # Note here we use binary=True since our hack is to use tokenization to generate dummy variables  
                    ('dim_red', SelectKBest(k = 300)) # Step3: use dimension reduction to select 300 best features using chi2 as scoring function
                ]))
            ]

        )),# End of step: union, this is the fusion point to main pipeline, all features are numeric at this stage

        # Common steps:
        ("int", SparseInteractions(degree=2)), # Add polynomial interaction terms up to the second degree polynomial
        ("scaler",StandardScaler(with_mean=False)), # Standardize the features for a more gaussian distribution. 
        ("dim_red2", SelectKBest(k = n_components))      
    ])# End of: userclick_pipeline_pcasearch

    # Fit and transform the X_train_balanced_sample set to get the features using new pipeline
    start = datetime.datetime.now()
    userclick_pipeline_pcasearch.fit(X_train_balanced_sample,y_train_balanced_sample)
    print("Completed pipeline fit using "+ str(n_components)+ " components, it took: " + str((process_time.seconds)/60) + " minutes.")
    
    start = datetime.datetime.now()
    X_train_balanced_sample_trans_pl_pcasearch = userclick_pipeline_pcasearch.transform(X_train_balanced_sample)
    process_time = datetime.datetime.now() - start
    print("Completed pipeline transform using "+ str(n_components)+ " components, it took: " + str((process_time.seconds)/60) + " minutes.")
    
    # Train the classifier and get estimates
    start = datetime.datetime.now()

    clf = LogisticRegression()
    clf.fit(X_train_balanced_sample_trans_pl_pcasearch,y_train_balanced_sample)

    process_time = datetime.datetime.now() - start
    print("Completed model fit and it took: " + str((process_time.seconds)/60) + " minutes.")

    probs = clf.predict_proba(X_train_balanced_sample_trans_pl_pcasearch)[:,1]
    print("ROC score in the training set: " + str(roc_auc_score(y_train_balanced_sample,probs)))

    # Transform the validation set
    start = datetime.datetime.now()
    X_val1_trans_pl_pcasearch = userclick_pipeline_pcasearch.transform(X_val1)
    print("Completed validation set transformation, it took: " + str((process_time.seconds)/60) + " minutes.")
    probs = clf.predict_proba(X_val1_trans_pl_pcasearch)[:,1]
    print("ROC score in the validation set 1: " + str(roc_auc_score(y_val1,probs)))
    
    # Save the current pipeline
    filename = disk_directory + "userclick_pipeline_nfeatures" + str(n_components) + ".pkl"
    with open(filename,"wb") as f:
        pickle.dump(userclick_pipeline_pcasearch,f)
    print("Saved the pipeline search using: "+ str(n_components)+ " components " + "as :" + filename)
    
    print("Completed pipeline search using: "+ str(n_components) + " components.")
    print("--" * 40)

Starting pipeline search using: 10 components.
Completed pipeline fit using 10 components, it took: 0.0 minutes.
Completed pipeline transform using 10 components, it took: 0.8666666666666667 minutes.
Completed model fit and it took: 0.0 minutes.
ROC score in the training set: 0.811026316688
Completed validation set transformation, it took: 0.0 minutes.
ROC score in the validation set 1: 0.558237052807
Saved the pipeline search using: 10 components as :/Volumes/Iomega_HDD/2016/Data science/Kaggle/User-click-detection-predictive-modeling/userclick_pipeline_nfeatures10.pkl
Completed pipeline search using: 10 components.
--------------------------------------------------------------------------------
Starting pipeline search using: 100 components.
Completed pipeline fit using 100 components, it took: 0.0 minutes.
Completed pipeline transform using 100 components, it took: 0.7666666666666667 minutes.
Completed model fit and it took: 0.016666666666666666 minutes.
ROC score in the training se

This type of selection is indeed faster and also fits into memory. However, out-of-the-box performance of the selected features are not as good as the PCA features. How about we perform a feature union of first 20 Principal components and 1000 - 2000 best features selected before decomposition? Beside, let's try to use the entire training set. 

In [None]:
import datetime
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import warnings
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.feature_selection import SelectKBest # We will use default scoring function to select features for classification
from sklearn.metrics import auc
from sklearn.decomposition import TruncatedSVD
from SparseInteractions import * #Load SparseInteractions (from : https://github.com/drivendataorg/box-plots-sklearn/blob/master/src/features/SparseInteractions.py) as a module since it was saved into working directory as SparseInteractions.py


warnings.filterwarnings('ignore')

disk_directory = "/Volumes/Iomega_HDD/2016/Data science/Kaggle/User-click-detection-predictive-modeling/"

component_list = [20,50,500,1000,1200,1500,2000]

for n_components in component_list:
    print("Starting pipeline search using: "+ str(n_components) + " components.")
    
    # Generic pipeline to cycle for each component search
    userclick_pipeline_pcasearch = Pipeline([

        ("union",FeatureUnion(
            # Note that FeatureUnion() also accepts list of tuples, the first half of each tuple 
            # is the name of the transformer within the FeatureUnion

            transformer_list = [

                ("numeric_subpipeline",Pipeline([        # Note we have subpipeline branches inside the main pipeline
                    ("parser",get_numeric_data), # Step1: parse the numeric data (note how we avoid () when using FunctionTransformer objects)
                    ("imputer",Imputer()) # Step2: impute any missing data using default (mean), note we don't expect missing values in this case. 
                ])), # End of: numeric_subpipeline

                ("text_subpipeline",Pipeline([
                    ("parser",get_text_data), # Step1: parse the text data 
                    ("tokenizer",HashingVectorizer(token_pattern= TOKENS_ALPHANUMERIC, # Step2: use HashingVectorizer for automated tokenization and feature extraction
                                                 ngram_range = (1,1),
                                                 non_negative=True, 
                                                 norm=None, binary=True )), # Note here we use binary=True since our hack is to use tokenization to generate dummy variables  
                    ('dim_red', SelectKBest(k = 300)) # Step3: use dimension reduction to select 300 best features using chi2 as scoring function
                ]))
            ]

        )),# End of step: union, this is the fusion point to main pipeline, all features are numeric at this stage

        # Common steps:
        ("int", SparseInteractions(degree=2)), # Add polynomial interaction terms up to the second degree polynomial
        ("scaler",StandardScaler(with_mean=False)), # Standardize the features for a more gaussian distribution. 
        
        # A new feature union
        ("final_union",FeatureUnion(
            # Note that FeatureUnion() also accepts list of tuples, the first half of each tuple 
            # is the name of the transformer within the FeatureUnion

            transformer_list = [ ("dim_red_feature", SelectKBest(k = n_components)),
                                 ("dim_red_pca", TruncatedSVD(n_components = 20)) ]
        ))   
             
    ])# End of: userclick_pipeline_pcasearch

    # Fit and transform the X_train_balanced_sample set to get the features using new pipeline
    start = datetime.datetime.now()
    userclick_pipeline_pcasearch.fit(X_train_balanced,y_train_balanced)
    print("Completed pipeline fit using "+ str(n_components)+ " components, it took: " + str((process_time.seconds)/60) + " minutes.")
    
    start = datetime.datetime.now()
    X_train_balanced_trans_pl_pcasearch = userclick_pipeline_pcasearch.transform(X_train_balanced)
    process_time = datetime.datetime.now() - start
    print("Completed pipeline transform using "+ str(n_components)+ " components, it took: " + str((process_time.seconds)/60) + " minutes.")
    
    print("The shape of the transformed set is: " + str(X_train_balanced_trans_pl_pcasearch.shape))
    # Train the classifier and get estimates
    start = datetime.datetime.now()

    clf = LogisticRegression()
    clf.fit(X_train_balanced_trans_pl_pcasearch,y_train_balanced)

    process_time = datetime.datetime.now() - start
    print("Completed model fit and it took: " + str((process_time.seconds)/60) + " minutes.")

    probs = clf.predict_proba(X_train_balanced_trans_pl_pcasearch)[:,1]
    print("ROC score in the training set: " + str(roc_auc_score(y_train_balanced,probs)))

    # Transform the validation set
    start = datetime.datetime.now()
    X_val1_trans_pl_pcasearch = userclick_pipeline_pcasearch.transform(X_val1)
    process_time = datetime.datetime.now() - start
    print("Completed validation set transformation, it took: " + str((process_time.seconds)/60) + " minutes.")
    print("The shape of the transformed validation set is: " + str(X_val1_trans_pl_pcasearch.shape))
    probs = clf.predict_proba(X_val1_trans_pl_pcasearch)[:,1]
    print("ROC score in the validation set 1: " + str(roc_auc_score(y_val1,probs)))
    
    # Save the current pipeline
    filename = disk_directory + "userclick_pipeline_pca20_nfeatures" + str(n_components) + ".pkl"
    with open(filename,"wb") as f:
        pickle.dump(userclick_pipeline_pcasearch,f)
    print("Saved the pipeline search using: "+ str(n_components)+ " components " + "as :" + filename)
    
    print("Completed pipeline search using: "+ str(n_components) + " components.")
    print("--" * 40)

Starting pipeline search using: 20 components.
