# Feature Engineering with Pipelines

## Part I: Procedural Implementation

In [1]:
class MultiColumnLabelEncoder:
    '''
    Label-encodes several DataFrame columns in one call.

    Every selected column is encoded independently with a fresh
    preprocessing.LabelEncoder(). Nothing is learned in fit(): each
    call to transform() re-fits the encoders on the data it is given.
    `preprocessing` is looked up when transform() runs, so it must
    have been imported (from sklearn) before the first call.
    '''

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        '''No-op kept for the fit/transform protocol; returns self.'''
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.

        Returns an encoded copy; X itself is left untouched.
        '''
        output = X.copy()
        # DataFrame.iteritems() no longer exists in pandas 2.x, so the
        # "all columns" case walks a snapshot of the column labels instead.
        if self.columns is None:
            selected = list(output.columns)
        else:
            selected = self.columns
        for col in selected:
            output[col] = preprocessing.LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self,X,y=None):
        '''Equivalent to fit(X, y) followed by transform(X).'''
        return self.fit(X,y).transform(X)

In [2]:
import pandas as pd
import numpy as np
import datetime

from matplotlib import pyplot as plt
from functools import reduce
from sklearn import preprocessing

### Calculate the ratio of fraudulent transactions for each categorical variable

def calculate_ratio_fraud(analysis_df, sel_var):
    '''
    Args: 
        analysis_df: Dataframe with transaction level details; it must
            contain the columns 'user_id', 'class' and sel_var, with
            'class' coded as 0 (not fraud) / 1 (fraud)
        sel_var: variable of interest for ratio calculation (country, device, ...)
        
    Output:
        Dataframe with the columns 'user_id', sel_var,
        'ratio_fraud_<sel_var>' and 'num_trans_<sel_var>'. The counts
        behind both derived columns are numbers of distinct user_id
        values per sel_var value, not numbers of rows.
    '''
    counts = analysis_df.groupby([sel_var, 'class']).user_id.nunique()\
    .unstack(level = 1)\
    .reset_index()\
    .rename(columns = {0:'Not Fraud', 1: 'Fraud'}).fillna(0.0)

    # unstack() only creates a column for class values that actually occur.
    # When the data holds a single class, add the other count as zero
    # instead of failing with a KeyError below.
    for label in ('Not Fraud', 'Fraud'):
        if label not in counts.columns:
            counts[label] = 0.0

    total = counts['Fraud'] + counts['Not Fraud']
    counts['ratio_fraud_' + sel_var] = counts['Fraud']/total
    counts['num_trans_' + sel_var] = total
    return analysis_df[['user_id', sel_var]]\
            .merge(counts[[sel_var, 'ratio_fraud_' + sel_var, 'num_trans_' + sel_var]], on = sel_var)

def calculate_time_latency(df):
    '''
    Adds a 'time_latency' column: hours elapsed between signup and purchase.

    The column is written onto df itself (the caller's frame is modified)
    and that same object is returned. 'purchase_time' and 'signup_time'
    must already be datetime columns, since the difference is read
    through the .dt accessor.
    '''
    elapsed = df['purchase_time'] - df['signup_time']
    # seconds -> minutes -> hours
    df['time_latency'] = elapsed.dt.total_seconds()/60/60
    return df

def merge_multiple_dataframes(dfs, key, method):
    '''
    Joins a sequence of dataframes from left to right.

    Args: 
        dfs: list of dataframes to be merged
        key: list of column names to be used for the join
        method: merge type (inner, outer, left), passed to pd.merge as `how`
        
    Output:
        combined dataframe; a one-element list yields that element as is
    '''
    def _join(left, right):
        # one pairwise step of the fold
        return pd.merge(left, right, on = key, how = method)

    return reduce(_join, dfs)

def apply_label_encoding(df):
    '''Label-encodes every column of df with MultiColumnLabelEncoder and returns the result.'''
    encoder = MultiColumnLabelEncoder(columns = df.columns)
    return encoder.fit_transform(df)
    
def create_features(path_to_analysis_dataset):
    '''
    Builds the model-ready feature table from the raw analysis CSV.

    Args: 
        path_to_analysis_dataset: path to the analysis dataset (CSV)
        
    Output:
        Dataframe indexed by (user_id, device_id) holding the fraud
        ratios / counts per variable, purchase_value, class,
        time_latency and the label-encoded categorical columns
    '''
    time_format = '%m/%d/%Y %H:%M'
    categorical_cols = ['country', 'sex', 'browser', 'source']
    # Order matters: it fixes the column order of the merged frame.
    ratio_vars = ['device_id', 'country', 'sex', 'age', 'browser', 'source']

    raw_df = pd.read_csv(path_to_analysis_dataset)
    analysis_df = raw_df.drop('Unnamed: 0', axis = 1)

    ## Convert signup and purchase times to pandas datetime
    analysis_df.signup_time = pd.to_datetime(analysis_df.signup_time, format = time_format)
    analysis_df.purchase_time = pd.to_datetime(analysis_df.purchase_time, format = time_format)

    ## Fill missing values with NA
    analysis_df = analysis_df.fillna('NA')

    ## Calculate fraud ratios, one frame per variable
    ratio_frames = [calculate_ratio_fraud(analysis_df, var) for var in ratio_vars]

    ## Calculate latency between sign-up and purchase time
    ## (this also adds 'time_latency' to analysis_df itself)
    latency_df = calculate_time_latency(analysis_df)

    ## Merge all features on user_id
    frames_to_merge = ratio_frames + [
        analysis_df[['user_id', 'purchase_value', 'class']],
        latency_df[['user_id', 'time_latency']],
    ]
    feature_df = merge_multiple_dataframes(frames_to_merge, key = ['user_id'], method = 'outer')

    ## Swap the raw categorical columns for their label-encoded versions
    df_cat = apply_label_encoding(feature_df[categorical_cols])
    remaining = feature_df.drop(categorical_cols, axis = 1)
    combined = pd.concat([remaining, df_cat], axis = 1)
    return combined.set_index(['user_id', 'device_id'])


## Part II: Object-Oriented API

In [3]:
# Import Libraries as needed. You may adjust the aliases, if you choose
import pandas as pd
import numpy as np
import datetime
from functools import reduce
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.base import BaseEstimator, TransformerMixin

# Load the raw transaction data; the path is relative to the notebook's working directory.
data = pd.read_csv('Analysis_dataset.csv')

####  Exploratory Analysis:

In [4]:
# Dataset information: row count, column dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Unnamed: 0      120000 non-null  int64  
 1   user_id         120000 non-null  int64  
 2   signup_time     120000 non-null  object 
 3   purchase_time   120000 non-null  object 
 4   purchase_value  120000 non-null  int64  
 5   device_id       120000 non-null  object 
 6   source          120000 non-null  object 
 7   browser         120000 non-null  object 
 8   sex             120000 non-null  object 
 9   age             120000 non-null  int64  
 10  ip_address      120000 non-null  float64
 11  class           120000 non-null  int64  
 12  country         102582 non-null  object 
dtypes: float64(1), int64(5), object(7)
memory usage: 11.9+ MB


In [5]:
# First few rows of the dataset
data.head()

Unnamed: 0.1,Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country
0,0,285108,7/15/2015 4:36,9/10/2015 14:17,31,HZAKVUFTDOSFD,Direct,Chrome,M,49,2818400000.0,0,United States
1,1,131009,1/24/2015 12:29,4/13/2015 4:53,31,XGQAJSOUJIZCC,SEO,IE,F,21,3251268000.0,0,United Kingdom
2,2,328855,3/11/2015 0:54,4/5/2015 12:23,16,VCCTAYDCWKZIY,Direct,IE,M,26,2727760000.0,0,United States
3,3,229053,1/7/2015 13:19,1/9/2015 10:12,29,MFFIHYNXCJLEY,SEO,Chrome,M,34,2083420000.0,0,Korea Republic of
4,4,108439,2/8/2015 21:11,4/9/2015 14:26,26,WMSXWGVPNIFBM,Ads,FireFox,M,33,3207913000.0,0,Brazil


In [6]:
# Number of null values in each column
data.isnull().sum()

Unnamed: 0            0
user_id               0
signup_time           0
purchase_time         0
purchase_value        0
device_id             0
source                0
browser               0
sex                   0
age                   0
ip_address            0
class                 0
country           17418
dtype: int64

**Custom transformer `DropColTransformer`:** drops the columns specified in an optional `columns` constructor parameter.

In [7]:
class DropColTransformer(BaseEstimator, TransformerMixin):
    '''
    Pipeline step that removes columns from a DataFrame.

    columns: optional list of column names to drop. When left as None,
    only the 'Unnamed: 0' column is dropped.
    '''

    def __init__(self, columns=None):
        self.columns=columns # array of column names to drop

    def fit(self, X, y=None):
        '''Nothing to learn; returns self.'''
        return self

    def transform(self, X):
                
        '''
        Remove columns of X specified in self.columns using
        drop() function. If no column specified, remove 'Unnamed: 0'
        column.

        Returns a new DataFrame (X is not modified) and prints a short
        summary of what was removed.
        '''

        # Resolve the default in a local name. Writing it back to
        # self.columns would change the constructor parameter after the
        # first call, and scikit-learn reads those parameters back
        # (get_params / clone / repr) expecting the user's values.
        if self.columns is None:
            columns=['Unnamed: 0']
        else:
            columns=self.columns

        output=X.drop(columns=columns)

        print("Summary:")
        print("Removed Columns: {}".format(columns))
        return output

In [8]:
# Smoke check: fails with NameError if the class was not defined above
assert DropColTransformer

In [9]:
# Smoke check: the class can be instantiated with its default arguments
test = DropColTransformer()
assert type(test) is DropColTransformer

<br>

**Custom transformer `FillMissingTransformer`:** fills in the missing values in the dataset; an optional `value` parameter specifies what to substitute for them.

In [10]:
class FillMissingTransformer(BaseEstimator, TransformerMixin):
    '''
    Pipeline step that replaces missing values in a DataFrame.

    value: optional replacement for missing entries. When left as None,
    the string 'NA' is used.
    '''

    def __init__(self, value=None):
        self.value=value #value that specifies what value to substitute for missing values

    def fit(self, X, y=None):
        '''Nothing to learn; returns self.'''
        return self

    def transform(self, X):
        
        '''
        Replace all missing values with self.value using
        fillna() function. If no value is specified,
        replace missing values with 'NA'

        Returns a new DataFrame (X is not modified) and prints the
        replacement value that was used.
        '''

        # Keep the default local: overwriting self.value would alter the
        # constructor parameter that scikit-learn reads back through
        # get_params / clone / repr.
        if self.value is None:
            fill_value='NA'
        else:
            fill_value=self.value

        output=X.fillna(fill_value)

        print("Replaced all missing values with: {}".format(fill_value))

        return output

In [11]:
# Smoke check: fails with NameError if the class was not defined above
assert FillMissingTransformer

In [12]:
# Smoke check: the class can be instantiated with its default arguments
test = FillMissingTransformer()
assert type(test) is FillMissingTransformer

<br>

**Custom transformer `DateTransformer`:** casts date features to `pandas datetime` format; an optional constructor parameter `columns` specifies which columns to transform, and an optional parameter `strftime` specifies the format to parse.

In [13]:
class DateTransformer(BaseEstimator, TransformerMixin):
    '''
    Pipeline step that parses string columns into pandas datetimes.

    columns: optional list of columns to convert; None means
        'signup_time' and 'purchase_time'.
    strftime: optional format string handed to pd.to_datetime; None
        means '%m/%d/%Y %H:%M'.
    '''

    def __init__(self, columns=None, strftime=None):
        self.columns=columns #columns that need to transformed 
        self.strftime=strftime #format in which columns needs to be parsed

    def fit(self, X, y=None):
        '''Nothing to learn; returns self.'''
        return self

    def transform(self, X):
        
        '''
        Change data type of columns, specified in self.columns,
        to date format. If no columns are specified, transform 
        'purchase_time' and 'signup_time' columns. If no date 
        format is specified, keep the format as '%m/%d/%Y %H:%M'

        Returns a converted copy of X and prints the converted columns.
        '''

        # The two defaults are independent of each other, and both are
        # resolved into locals so the constructor parameters stay exactly
        # as the user passed them (scikit-learn reads them back through
        # get_params / clone / repr). `is None` is used instead of
        # `!= None`, which compares element-wise on array-like columns.
        if self.columns is None:
            columns=['signup_time', 'purchase_time']
        else:
            columns=self.columns

        if self.strftime is None:
            date_format='%m/%d/%Y %H:%M'
        else:
            date_format=self.strftime

        output= X.copy()

        #transforming
        for col in columns:
            output[col] = pd.to_datetime(output[col], format = date_format)

        print("Columns that are converted to datetime type: {}".format(columns))

        return output
    

In [14]:
# Smoke check: fails with NameError if the class was not defined above
assert DateTransformer

In [15]:
# Smoke check: the class can be instantiated with its default arguments
test = DateTransformer()
assert type(test) is DateTransformer

**Custom transformer `NumericalTransformer`:** calculates the fraud ratios and the signup-to-purchase latency.

In [16]:
class NumericalTransformer(BaseEstimator, TransformerMixin):
    '''
    Pipeline step that adds the numerical features: the
    signup-to-purchase latency in hours and, for each selected column,
    a fraud ratio and a count.

    columns: optional list of columns to compute fraud ratios for; None
    means 'device_id', 'country', 'age', 'sex', 'source' and 'browser'.
    '''

    def __init__(self, columns=None):
        self.columns=columns  #columns for which the fraud ratio should be calculated

    def fit(self, X, y=None):
        '''Nothing to learn; returns self.'''
        return self

    def transform(self, X):
        
        '''
        Calculate latency with the help of purchase time and 
        signup time. 
        Calculate fraud ratios for the columns specified in 
        self.columns. If no columns are specified, then calculate 
        fraud on columns-'device_id', 'country', 'age', 'sex',
        'source', and  'browser'.

        X must contain 'purchase_time', 'signup_time', 'user_id' and
        'class' besides the ratio columns. Returns a new DataFrame and
        prints what was calculated.
        '''
        
        output=X.copy()
        
        #latency
        #converting columns to datetime here again (in case pipeline does not
        #use DateTransformer for some special reason even then we should be able to
        #calculate latency)
        p_time=pd.to_datetime(output['purchase_time'], format = '%m/%d/%Y %H:%M')
        s_time=pd.to_datetime(output['signup_time'], format = '%m/%d/%Y %H:%M')
        output['time_latency'] = (p_time-s_time).dt.total_seconds()/60/60

        
        #calculating fraud ratios
        # The default is kept in a local name: writing it back to
        # self.columns would change the constructor parameter that
        # scikit-learn reads back through get_params / clone / repr.
        if self.columns is None: #if no columns are specified
            columns=['device_id', 'country', 'age', 'sex', 'source', 'browser']
        else:
            columns=self.columns
        
        for col in columns:
            output= self.calculate_ratio_fraud(output, col)  
        

        print("Columns for which the fraud ratio is calculated: {}".format(columns))
        print("Calculated latency.")

        return output
    
    
    #helping function to calculate fraud ratio
    def calculate_ratio_fraud(self, df, sel_var):
        
        '''
        Args: 
            df: Dataframe with transaction level details; needs the
                columns 'user_id', 'class' (0 = not fraud, 1 = fraud)
                and sel_var
            sel_var: variable of interest for ratio calculation

        Output:
            Dataframe that merges the ratio of fraudulent transaction specific to selected variable to df.
            Both added columns are based on counts of distinct user_id
            values per sel_var value.
        '''
        
        tmp = df.groupby([sel_var, 'class']).user_id.nunique().unstack(level = 1).reset_index()\
        .rename(columns = {0:'Not Fraud', 1: 'Fraud'}).fillna(0.0)    

        # unstack() only creates a column for class values that occur in
        # the data; add the absent one as zero instead of raising KeyError.
        for label in ('Not Fraud', 'Fraud'):
            if label not in tmp.columns:
                tmp[label] = 0.0

        tmp['ratio_fraud_' + sel_var] = tmp['Fraud']/(tmp['Fraud'] + tmp['Not Fraud'])
        tmp['num_trans_' + sel_var] = tmp['Fraud'] + tmp['Not Fraud']
        # NOTE(review): this is an inner merge, so the result gets a fresh
        # index and its row order is whatever merge produces; it is not
        # guaranteed to line up row-for-row with df. Confirm that no
        # downstream step relies on the original order.
        return df.merge(tmp[[sel_var, 'ratio_fraud_' + sel_var, 'num_trans_' + sel_var]], on = sel_var)    

In [17]:
# Smoke check: fails with NameError if the class was not defined above
assert NumericalTransformer

In [18]:
# Smoke check: the class can be instantiated with its default arguments
test = NumericalTransformer()
assert type(test) is NumericalTransformer

**Pipeline object `fullPipeline`:** concatenates all of the custom transformers.

In [19]:
# Build the pipeline: the steps run in the order listed, each one
# receiving the DataFrame returned by the step before it.
from sklearn.pipeline import Pipeline

fullPipeline = Pipeline(
    steps=[
        ('drop columns', DropColTransformer()),
        ('fill missing values', FillMissingTransformer()),
        ('change to date format', DateTransformer()),
        ('calculate fraud ratio and latency', NumericalTransformer()),
    ]
)


In [20]:
# Display the pipeline's repr (its ordered steps)
fullPipeline

Pipeline(steps=[('drop columns', DropColTransformer()),
                ('fill missing values', FillMissingTransformer()),
                ('change to date format', DateTransformer()),
                ('calculate fraud ratio and latency', NumericalTransformer())])

In [21]:
# Run the raw data through every pipeline step, then display the transformed frame
pipe_df=fullPipeline.fit_transform(data)
pipe_df

Summary:
Removed Columns: ['Unnamed: 0']
Replaced all missing values with: NA
Columns that are converted to datetime type: ['signup_time', 'purchase_time']
Columns for which the fraud ratio is calculated: ['device_id', 'country', 'age', 'sex', 'source', 'browser']
Calculated latency.


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,...,ratio_fraud_country,num_trans_country,ratio_fraud_age,num_trans_age,ratio_fraud_sex,num_trans_sex,ratio_fraud_source,num_trans_source,ratio_fraud_browser,num_trans_browser
0,285108,2015-07-15 04:36:00,2015-09-10 14:17:00,31,HZAKVUFTDOSFD,Direct,Chrome,M,49,2.818400e+09,...,0.096830,46184.0,0.056534,1079.0,0.095442,70126,0.105643,24242,0.099441,48652
1,271944,2015-02-27 09:53:00,2015-02-27 13:22:00,52,MYGTSESPGKWYT,Direct,Chrome,M,49,3.355398e+09,...,0.096830,46184.0,0.056534,1079.0,0.095442,70126,0.105643,24242,0.099441,48652
2,310118,2015-04-24 09:15:00,2015-08-21 11:38:00,18,DFPTDWCPIFUVW,Direct,Chrome,M,49,2.149598e+09,...,0.096830,46184.0,0.056534,1079.0,0.095442,70126,0.105643,24242,0.099441,48652
3,10860,2015-02-27 18:06:00,2015-03-25 08:48:00,77,DKENIFWLHRSFZ,Direct,Chrome,M,49,2.221886e+09,...,0.096830,46184.0,0.056534,1079.0,0.095442,70126,0.105643,24242,0.099441,48652
4,168544,2015-02-28 06:25:00,2015-05-29 14:53:00,27,AQTFHXEICDOZQ,Direct,Chrome,M,49,1.129217e+09,...,0.096830,46184.0,0.056534,1079.0,0.095442,70126,0.105643,24242,0.099441,48652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119995,263441,2015-05-08 20:57:00,2015-07-25 20:09:00,57,ADKOFUKXRXPQV,Ads,Opera,F,55,4.002250e+09,...,0.085142,17418.0,0.109422,329.0,0.091671,49874,0.091696,47461,0.091429,2975
119996,136421,2015-03-08 14:39:00,2015-04-07 05:58:00,14,OUOOYDNCETPVI,Ads,Opera,F,47,5.650443e+08,...,0.096830,46184.0,0.089015,1584.0,0.091671,49874,0.091696,47461,0.091429,2975
119997,43294,2015-03-06 12:23:00,2015-04-18 15:49:00,37,RHOZFEBVKGMVZ,Ads,Opera,F,54,1.509441e+09,...,0.103911,3580.0,0.104348,460.0,0.091671,49874,0.091696,47461,0.091429,2975
119998,110982,2015-02-20 22:31:00,2015-06-02 23:25:00,69,KTWITEKHABJAZ,Ads,Opera,F,54,3.025340e+09,...,0.091589,3341.0,0.104348,460.0,0.091671,49874,0.091696,47461,0.091429,2975


In [22]:
# Information regarding the transformed data
pipe_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120000 entries, 0 to 119999
Data columns (total 25 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   user_id                120000 non-null  int64         
 1   signup_time            120000 non-null  datetime64[ns]
 2   purchase_time          120000 non-null  datetime64[ns]
 3   purchase_value         120000 non-null  int64         
 4   device_id              120000 non-null  object        
 5   source                 120000 non-null  object        
 6   browser                120000 non-null  object        
 7   sex                    120000 non-null  object        
 8   age                    120000 non-null  int64         
 9   ip_address             120000 non-null  float64       
 10  class                  120000 non-null  int64         
 11  country                120000 non-null  object        
 12  time_latency           120000 non-null  floa

In [23]:
# Smoke check: the pipeline object is defined and truthy
assert fullPipeline

In [24]:
# Smoke check: fullPipeline is exactly a Pipeline instance (not a subclass)
test = fullPipeline
assert type(test) is Pipeline