In [134]:
# !pip install --upgrade pip --quiet
# !pip install -r requirements.txt --quiet

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import time
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score as roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from dateutil.rrule import rrule, DAILY
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()
import datetime



# Loading cleaned_data

In [2]:
def load_cleaned_data():
    """Read all input tables from CSV and return them as a 7-tuple.

    Returns, in this order: campaign_data, coupon_item_mapping_data,
    customer_demographics_data, customer_transaction_data, item_data,
    train_data, test_data. The campaign start/end dates and the transaction
    date are parsed as datetimes while reading.
    """
    # An empty base means every path below is resolved against the working directory.
    base_dir = ""

    def read_table(relative_path, **read_csv_options):
        # Join the base directory with the table's relative path and read it.
        return pd.read_csv(os.path.join(base_dir, relative_path), **read_csv_options)

    campaign_data = read_table('./cleaned_data/campaign_data.csv',
                               parse_dates=['start_date', 'end_date'])
    coupon_item_mapping_data = read_table('./cleaned_data/coupon_item_mapping_data.csv')
    customer_demographics_data = read_table('./cleaned_data/customer_demographics_data.csv')
    customer_transaction_data = read_table('./cleaned_data/customer_transaction_data.csv',
                                           parse_dates=['date'])
    item_data = read_table('./cleaned_data/item_data.csv')
    train_data = read_table('./cleaned_data/train_data.csv')
    test_data = read_table('../test_data/test_QyjYwdj.csv')

    return (campaign_data, coupon_item_mapping_data, customer_demographics_data,
            customer_transaction_data, item_data, train_data, test_data)

In [4]:
# Load every table once; the *_pure names hold the frames exactly as read from disk.
(campaign_data_pure,
 coupon_item_mapping_data_pure,
 customer_demographics_data_pure,
 customer_transaction_data_pure,
 item_data_pure,
 train_data_pure,
 test_data_pure) = load_cleaned_data()


In [5]:
# Work on independent copies of the frames read from CSV, so the *_pure versions
# stay exactly as loaded and never have to be re-read after a working frame is
# modified. (A plain assignment would only create a second name for the same
# object, and in-place changes would silently alter the *_pure frames too.)
# This is not the best solution if either the dataset is large or further
# computations require huge memory, because every table is held in memory twice.
campaign_data = campaign_data_pure.copy()
coupon_item_mapping_data = coupon_item_mapping_data_pure.copy()
customer_demographics_data = customer_demographics_data_pure.copy()
customer_transaction_data = customer_transaction_data_pure.copy()
item_data = item_data_pure.copy()
train_data = train_data_pure.copy()
test_data = test_data_pure.copy()


# Feature extraction utilities

In [8]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the requested columns of a DataFrame."""

    def __init__(self, attribute_names):
        # Column labels to keep; stored under the same name as the parameter.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return X indexed by the configured column labels."""
        selected_columns = self.attribute_names
        return X[selected_columns]

In [9]:
def one_hot_encoder(df, column_name):
    """Append one-hot indicator columns for `column_name` to `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `column_name`.
    column_name : str
        Categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        `df` with one extra column per category; the original column is kept.
    """
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoded_categories = encoder.fit_transform(df[[column_name]])

    # `get_feature_names` was replaced by `get_feature_names_out` in newer
    # scikit-learn releases; use whichever the installed version provides.
    if hasattr(encoder, 'get_feature_names_out'):
        encoded_column_names = encoder.get_feature_names_out([column_name])
    else:
        encoded_column_names = encoder.get_feature_names([column_name])

    # Reuse df's index so the column-wise concat lines up row by row even when
    # df does not carry a default 0..n-1 index.
    encoded_df = pd.DataFrame(encoded_categories.toarray(),
                              columns=encoded_column_names,
                              index=df.index)

    return pd.concat([df, encoded_df], axis=1)


In [11]:
def find_overlap_duration(start_date, end_date, intervals_df):
    """Count the days between `start_date` and `end_date` that fall inside at
    least one of the time intervals listed in `intervals_df`.

    `intervals_df` must expose `start_date` and `end_date` columns; each row is
    one interval. A day covered by several intervals is counted once.
    """
    window_days = set(rrule(DAILY, dtstart=start_date, until=end_date))

    # Start from the full window and strike out every day an interval covers.
    uncovered_days = set(window_days)
    for row_label in intervals_df.index:
        interval_days = rrule(DAILY,
                              dtstart=intervals_df.start_date[row_label],
                              until=intervals_df.end_date[row_label])
        uncovered_days.difference_update(interval_days)

    # Whatever was struck out is the overlap.
    return len(window_days) - len(uncovered_days)

In [None]:
def month_feature_binarizer(df):
    """Append twelve 0/1 columns ('Jan'..'Dec') derived from `df['months']`.

    Each entry of `df['months']` is expected to be an iterable of month numbers
    (1-12). The original 'months' column is kept.
    """
    month_numbers = list(range(1, 13))
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    # Fix the label set up front: without `classes` the binarizer only emits
    # columns for months that actually occur, which breaks the 12-name header
    # whenever some month is absent from the data.
    multi_label_binarizer = MultiLabelBinarizer(classes=month_numbers)
    months_encoded = multi_label_binarizer.fit_transform(df['months'])

    # Reuse df's index so the column-wise concat stays aligned row by row.
    months_encoded_df = pd.DataFrame(months_encoded, columns=month_names, index=df.index)
    return pd.concat([df, months_encoded_df], axis=1)

In [None]:
!mkdir feature_set

def save_feature_set(df, file_name):
    """Write `df` to 'feature_set/<file_name>.csv' without the index column.

    The target directory is created on demand, so the function works on a fresh
    checkout and on re-runs without depending on a shell `mkdir`.
    """
    output_dir = "feature_set"
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, file_name + ".csv"), index=False)

In [36]:
# Experimental directory is to save raw_features which are still not encoded and easily human-interpretable.
!mkdir feature_set/experimental

def save_experimental_feature_set(df, file_name):
    """Write `df` to 'feature_set/experimental/<file_name>.csv' without the index.

    The target directory (including its parent) is created on demand, so the
    function does not depend on a shell `mkdir` having succeeded.
    """
    output_dir = os.path.join("feature_set", "experimental")
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, file_name + ".csv"), index=False)

mkdir: feature_set/experimental: File exists


# 'campaign' based features' extraction 

In [15]:
def analyse_campaign_runs(campaign_data, custom_parameter=False):
    """Derive schedule-based features for every campaign.

    Parameters
    ----------
    campaign_data : pandas.DataFrame
        Needs the columns campaign_id, campaign_type, start_date and end_date.
        The input frame is left untouched; a copy is extended and returned.
    custom_parameter : bool
        Accepted for pipeline compatibility; not used.

    Returns
    -------
    pandas.DataFrame
        A copy of `campaign_data` with start_date/end_date converted to
        datetimes and these columns added:
        duration_in_days, months (list of distinct month numbers the campaign
        runs in), same_type_overlapping_campaigns,
        other_type_overlapping_campaigns, same_type_overlap_duration,
        other_type_overlap_duration and number_of_weekends (count of days whose
        isoweekday() is 6 or 7).
    """
    # Copy first so neither the date conversion nor the new columns leak back
    # into the caller's frame.
    campaign_data = campaign_data.copy()
    campaign_data['start_date'] = pd.to_datetime(campaign_data['start_date'])
    campaign_data['end_date'] = pd.to_datetime(campaign_data['end_date'])

    same_type_overlapping_campaigns = []
    other_type_overlapping_campaigns = []
    same_type_overlap_duration = []
    other_type_overlap_duration = []
    duration_in_days = []
    months = []
    number_of_weekends = []

    for i in campaign_data.index:
        campaign_id_in_question = campaign_data.campaign_id[i]
        campaign_type_in_question = campaign_data.campaign_type[i]
        start_date_in_question = campaign_data.start_date[i]
        end_date_in_question = campaign_data.end_date[i]

        # A campaign overlaps the one in question if it is already running on
        # the start date, or if it starts while the one in question is running.
        cond_running_at_start = (campaign_data.start_date <= start_date_in_question) \
            & (campaign_data.end_date >= start_date_in_question)
        cond_starts_during = (campaign_data.start_date >= start_date_in_question) \
            & (campaign_data.start_date <= end_date_in_question)
        cond_overlapping_campaigns = cond_starts_during | cond_running_at_start

        cond_same_type = campaign_data.campaign_type == campaign_type_in_question
        cond_not_itself = campaign_data.campaign_id != campaign_id_in_question

        # Same-type metrics (the campaign itself is excluded).
        same_type_campaigns = campaign_data.loc[
            cond_overlapping_campaigns & cond_same_type & cond_not_itself]
        same_type_overlapping_campaigns.append(same_type_campaigns.shape[0])
        same_type_overlap_duration.append(find_overlap_duration(
            start_date_in_question, end_date_in_question,
            same_type_campaigns[['start_date', 'end_date']]))

        # Other-type metrics.
        other_type_campaigns = campaign_data.loc[cond_overlapping_campaigns & ~cond_same_type]
        other_type_overlapping_campaigns.append(other_type_campaigns.shape[0])
        other_type_overlap_duration.append(find_overlap_duration(
            start_date_in_question, end_date_in_question,
            other_type_campaigns[['start_date', 'end_date']]))

        # Every calendar day the campaign runs, from start date to end date.
        dates_running = list(rrule(DAILY, dtstart=start_date_in_question,
                                   until=end_date_in_question))

        # Months running
        months.append(list(set(dt.month for dt in dates_running)))

        # Duration
        duration_in_days.append(len(dates_running))

        # Weekend days (Saturday = 6, Sunday = 7)
        number_of_weekends.append(sum(dt.isoweekday() >= 6 for dt in dates_running))

    # The lists were filled in index order, so attach them with the frame's own
    # index; a default 0..n-1 Series would misalign on any other index.
    new_columns = [
        ('duration_in_days', duration_in_days),
        ('months', months),
        ('same_type_overlapping_campaigns', same_type_overlapping_campaigns),
        ('other_type_overlapping_campaigns', other_type_overlapping_campaigns),
        ('same_type_overlap_duration', same_type_overlap_duration),
        ('other_type_overlap_duration', other_type_overlap_duration),
        ('number_of_weekends', number_of_weekends),
    ]
    for column_name, values in new_columns:
        campaign_data[column_name] = pd.Series(values, index=campaign_data.index)

    return campaign_data

In [43]:
def get_campaign_based_raw_features():
    """Run the campaign-run analysis over the global `campaign_data` frame.

    Returns the human-readable (not yet encoded) campaign features.
    """
    all_columns = list(campaign_data.columns.values)

    select_all_columns = DataFrameSelector(all_columns)
    analyse_runs = FunctionTransformer(analyse_campaign_runs, validate=False,
                                       kw_args={"custom_parameter": False})

    raw_feature_pipeline = Pipeline([
        ('attribute_selector', select_all_columns),
        ('campaign_runs_analyser', analyse_runs),
    ])

    return raw_feature_pipeline.fit_transform(campaign_data)

In [44]:
# Get the raw (not yet encoded) campaign features and save them to the
# 'feature_set/experimental' directory.
campaign_based_raw_features = get_campaign_based_raw_features()
# Save the frame computed on the line above; `campaign_based_features` does not
# exist yet at this point of a top-to-bottom run.
save_experimental_feature_set(campaign_based_raw_features, 'campaign_based_features')

In [180]:
def get_campaign_based_features():
    """Return the campaign features in encoded, model-ready form.

    The raw campaign features are month-binarized and one-hot encoded on
    'campaign_type'; the columns those encodings replace are then dropped.
    """
    encoding_steps = [
        ('month_labelizer', FunctionTransformer(month_feature_binarizer, validate=False)),
        ('one_hot_encoder', FunctionTransformer(one_hot_encoder, validate=False,
                                                kw_args={"column_name": 'campaign_type'})),
    ]
    encoding_pipeline = Pipeline(encoding_steps)

    encoded_features = encoding_pipeline.fit_transform(get_campaign_based_raw_features())

    columns_replaced_by_encodings = ['start_date', 'end_date', 'campaign_type', 'months']
    return encoded_features.drop(columns=columns_replaced_by_encodings)

In [181]:
# Build the encoded campaign features and write them to
# 'feature_set/campaign_based_features.csv'.
campaign_based_features = get_campaign_based_features()
save_feature_set(campaign_based_features, 'campaign_based_features')

# campaign_customer combination features

In [61]:
# Distinct (campaign, customer) pairs from train and from test, then their union.
campaign_customer_combinations_train = \
    train_data.loc[:, ['campaign_id', 'customer_id']].drop_duplicates()
campaign_customer_combinations_test = \
    test_data.loc[:, ['campaign_id', 'customer_id']].drop_duplicates()

campaign_customer_combinations = pd.concat(
    [campaign_customer_combinations_train, campaign_customer_combinations_test]
).drop_duplicates()


In [53]:
# Attach the campaign attributes to every (campaign, customer) pair. No join key
# is given, so pandas joins on the columns the two frames have in common.
campaign_customer_combinations = campaign_customer_combinations.merge(campaign_data)

In [135]:
def get_date_in_current_year(old_date):
    """Map a date onto the same month and day in the reference year 2020.

    Parameters
    ----------
    old_date : object with `.month` and `.day` (date, datetime, Timestamp)

    Returns
    -------
    datetime.date
        The date with its year replaced by 2020, or the sentinel 0 (after
        printing the offending value) when `old_date` cannot be converted,
        e.g. None or a missing timestamp.
    """
    try:
        new_date = datetime.date(year=2020, month=old_date.month, day=old_date.day)
    except (AttributeError, TypeError, ValueError):
        # Only conversion problems are tolerated; a bare `except` would also
        # swallow KeyboardInterrupt while this runs over a whole column.
        print("error in:", old_date)
        return 0
    return new_date

# customer_transaction_data_pure['dummy_date'] = customer_transaction_data_pure.date.apply(lambda x : get_date_in_current_year(x))


In [184]:
def analyse_transactions_by_customer_till_campaign_start(row):
    """Summarise a customer's transactions dated before a campaign's start date.

    Parameters
    ----------
    row : object with `customer_id` and `campaign_id` attributes.

    Reads the notebook globals `campaign_data` and `customer_transaction_data`.

    Returns
    -------
    list
        [campaign_id, customer_id, number of transactions,
         share of transactions with any discount,
         share of transactions with a coupon discount,
         sum of cost_price over all transactions,
         sum of cost_price over transactions with any discount,
         sum of cost_price over transactions with a coupon discount,
         mean of (coupon_discount + other_discount) / cost_price].
        The two shares are NaN when the customer has no earlier transaction.
    """
    customer_id_in_question = row.customer_id
    campaign_id_in_question = row.campaign_id
    start_date_in_question = campaign_data.loc[
        campaign_data.campaign_id == campaign_id_in_question, "start_date"].iloc[0]

    # DF filtering
    previous_transactions = customer_transaction_data.loc[
        (customer_transaction_data.customer_id == customer_id_in_question)
        & (customer_transaction_data.date < start_date_in_question)]

    # Discounts are stored as negative amounts. Both masks are built on, and
    # applied to, the same frame so no index re-alignment is involved.
    has_coupon_discount = previous_transactions.coupon_discount < 0
    has_any_discount = has_coupon_discount | (previous_transactions.other_discount < 0)

    with_any_discount = previous_transactions.loc[has_any_discount]
    with_coupon_discount = previous_transactions.loc[has_coupon_discount]

    # Number of transaction metrics
    no_of_transactions = len(previous_transactions)
    if no_of_transactions:
        percentage_with_any_discount = len(with_any_discount) / no_of_transactions
        percentage_with_coupon_discount = len(with_coupon_discount) / no_of_transactions
    else:
        # No history: the shares are undefined rather than zero.
        percentage_with_any_discount = np.nan
        percentage_with_coupon_discount = np.nan

    # Transaction amount metrics
    transaction_amount = previous_transactions.cost_price.sum()
    transaction_amount_with_any_discount = with_any_discount.cost_price.sum()
    transaction_amount_with_coupon_discount = with_coupon_discount.cost_price.sum()

    average_percent_discount = (
        (previous_transactions.coupon_discount + previous_transactions.other_discount)
        / previous_transactions.cost_price).mean()

    return [campaign_id_in_question,
            customer_id_in_question,
            no_of_transactions,
            percentage_with_any_discount,
            percentage_with_coupon_discount,
            transaction_amount,
            transaction_amount_with_any_discount,
            transaction_amount_with_coupon_discount,
            average_percent_discount]
     
    
    

In [185]:
def get_customer_transactions_till_campaign_start_based_features():
    """Build the pre-campaign transaction features for every row of the global
    `campaign_customer_combinations` frame (one output row per input row)."""
    feature_names = [
        'campaign_id',
        'customer_id',
        'no_of_transactions_by_customer_till_campaign_start',
        'percentage_of_transactions_by_customer_till_campaign_start_with_any_discount',
        'percentage_of_transactions_by_customer_till_campaign_start_with_coupon_discount',
        'transaction_amount_by_customer_till_campaign_start',
        'transaction_amount_by_customer_till_campaign_start_where_customer_got_any_discount',
        'transaction_amount_by_customer_till_campaign_start_where_customer_got_coupon_discount',
        'average_percent_discount_by_customer_till_campaign_start',
    ]

    def features_for_pair(row):
        # Label the analyser's list so each value becomes a named column.
        return pd.Series(analyse_transactions_by_customer_till_campaign_start(row),
                         index=feature_names)

    return campaign_customer_combinations.progress_apply(features_for_pair, axis=1)


In [216]:
def analyse_customer_redemption_record_till_campaign_start(row):
    """Compute the customer's redemption rate over rows whose end_date lies
    before the start of the campaign in `row`.

    Returns [campaign_id, customer_id, rate]; the rate is 0 when no such row
    has a redemption status.

    NOTE(review): reads the global `train_and_campaign_data`, which must provide
    customer_id, end_date and redemption_status - presumably train data joined
    with campaign data; confirm where it is defined.
    """
    campaign_id_in_question = row.campaign_id
    customer_id_in_question = row.customer_id

    is_this_campaign = campaign_data.campaign_id == campaign_id_in_question
    start_date_in_question = campaign_data.loc[is_this_campaign, "start_date"].iloc[0]

    is_earlier_record_of_customer = \
        (train_and_campaign_data.customer_id == customer_id_in_question) \
        & (train_and_campaign_data.end_date < start_date_in_question)
    past_redemption_status = \
        train_and_campaign_data.loc[is_earlier_record_of_customer].redemption_status

    records_seen = past_redemption_status.count()
    customer_redemption_record_till_campaign_start = \
        past_redemption_status.sum() / records_seen if records_seen != 0 else 0

    return [campaign_id_in_question,
            customer_id_in_question,
            customer_redemption_record_till_campaign_start]
    
    

In [214]:
def get_customer_redemption_record_till_campaign_start_features():
    """Build the past-redemption-rate feature for every row of the global
    `campaign_customer_combinations` frame."""
    feature_names = ['campaign_id',
                     'customer_id',
                     'customer_redemption_record_till_campaign_start']

    def features_for_pair(row):
        # Label the analyser's list so each value becomes a named column.
        return pd.Series(analyse_customer_redemption_record_till_campaign_start(row),
                         index=feature_names)

    return campaign_customer_combinations.progress_apply(features_for_pair, axis=1)


In [188]:
def analyse_customer_transactions_for_similar_past_period(row):
    """Summarise a customer's earlier transactions that fall on the same
    month/day combinations as the campaign's running period.

    Parameters
    ----------
    row : object with `customer_id` and `campaign_id` attributes.

    Reads the notebook globals `campaign_data` and `customer_transaction_data`;
    the latter must carry a 'dummy_date' column holding each transaction date
    mapped onto the year 2020.

    Returns
    -------
    list
        [campaign_id, customer_id, number of transactions,
         share of transactions with any discount,
         share of transactions with a coupon discount,
         sum of cost_price over all selected transactions,
         sum of cost_price over those with any discount,
         sum of cost_price over those with a coupon discount,
         mean of (coupon_discount + other_discount) / cost_price].
        The two shares are NaN when no transaction is selected.
    """
    customer_id_in_question = row.customer_id
    campaign_id_in_question = row.campaign_id

    is_this_campaign = campaign_data.campaign_id == campaign_id_in_question
    start_date_in_question = campaign_data.loc[is_this_campaign, "start_date"].iloc[0]
    end_date_in_question = campaign_data.loc[is_this_campaign, "end_date"].iloc[0]

    # Campaign days projected onto 2020, the same year 'dummy_date' uses.
    dates_in_question = [datetime.date(2020, dt.month, dt.day)
                         for dt in rrule(DAILY, dtstart=start_date_in_question,
                                         until=end_date_in_question)]

    cond_for_similar_period_transactions = \
        (customer_transaction_data.customer_id == customer_id_in_question) \
        & (customer_transaction_data.date < start_date_in_question) \
        & (customer_transaction_data.dummy_date.isin(dates_in_question))

    # Filter the same frame the mask was built on, so the result never depends
    # on a second frame happening to share the same index.
    similar_period_transactions = \
        customer_transaction_data.loc[cond_for_similar_period_transactions]

    # Discounts are stored as negative amounts.
    has_coupon_discount = similar_period_transactions.coupon_discount < 0
    has_any_discount = has_coupon_discount | (similar_period_transactions.other_discount < 0)

    with_any_discount = similar_period_transactions.loc[has_any_discount]
    with_coupon_discount = similar_period_transactions.loc[has_coupon_discount]

    # Number of transaction metrics
    no_of_transactions = len(similar_period_transactions)
    if no_of_transactions:
        percentage_with_any_discount = len(with_any_discount) / no_of_transactions
        percentage_with_coupon_discount = len(with_coupon_discount) / no_of_transactions
    else:
        # No matching history: the shares are undefined rather than zero.
        percentage_with_any_discount = np.nan
        percentage_with_coupon_discount = np.nan

    # Transaction amount metrics
    transaction_amount = similar_period_transactions.cost_price.sum()
    transaction_amount_with_any_discount = with_any_discount.cost_price.sum()
    transaction_amount_with_coupon_discount = with_coupon_discount.cost_price.sum()

    average_percent_discount = (
        (similar_period_transactions.coupon_discount
         + similar_period_transactions.other_discount)
        / similar_period_transactions.cost_price).mean()

    return [campaign_id_in_question,
            customer_id_in_question,
            no_of_transactions,
            percentage_with_any_discount,
            percentage_with_coupon_discount,
            transaction_amount,
            transaction_amount_with_any_discount,
            transaction_amount_with_coupon_discount,
            average_percent_discount]
    

In [189]:
def get_customer_transactions_for_similar_past_period_based_features():
    """Build the similar-past-period transaction features for every row of the
    global `campaign_customer_combinations` frame."""
    feature_names = [
        'campaign_id',
        'customer_id',
        'no_of_transactions_by_customer_for_similar_period',
        'percentage_of_transactions_by_customer_for_similar_period_with_any_discount',
        'percentage_of_transactions_by_customer_for_similar_period_with_coupon_discount',
        'transaction_amount_by_customer_for_similar_period',
        'transaction_amount_by_customer_for_similar_period_where_customer_got_any_discount',
        'transaction_amount_by_customer_for_similar_period_where_customer_got_coupon_discount',
        'average_percent_discount_by_customer_for_similar_period',
    ]

    def features_for_pair(row):
        # Label the analyser's list so each value becomes a named column.
        return pd.Series(analyse_customer_transactions_for_similar_past_period(row),
                         index=feature_names)

    return campaign_customer_combinations.progress_apply(features_for_pair, axis=1)

In [190]:
def get_campaign_customer_features():
    """Assemble all (campaign, customer) level features into one frame.

    Starts from the global `campaign_customer_combinations` and merges in, one
    after another:
      * the customer's redemption record before the campaign start,
      * the transaction summary for the similar past period,
      * the transaction summary for everything before the campaign start.

    Each feature set is computed exactly once. The similar-past-period
    features need the 'dummy_date' column on `customer_transaction_data`, so
    that column has to be present when this function is called.
    """
    feature_builders = (
        get_customer_redemption_record_till_campaign_start_features,
        get_customer_transactions_for_similar_past_period_based_features,
        # Previously the redemption-record builder was called a second time
        # here, which recomputed identical columns and left these features out.
        get_customer_transactions_till_campaign_start_based_features,
    )

    campaign_customer_features = campaign_customer_combinations
    for build_features in feature_builders:
        campaign_customer_features = campaign_customer_features.merge(build_features())

    return campaign_customer_features
    

In [219]:
## The similar-past-period features need a temporary 'dummy_date' column:
## add it, extract the features, then drop it again.
## TODO: move it inside a function.

customer_transaction_data['dummy_date'] = \
    customer_transaction_data.date.apply(get_date_in_current_year)

campaign_customer_features = get_campaign_customer_features()

customer_transaction_data = customer_transaction_data.drop(columns=['dummy_date'])

HBox(children=(IntProgress(value=0, max=6967), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6967), HTML(value='')))






HBox(children=(IntProgress(value=0, max=6967), HTML(value='')))




In [220]:
# Write the (campaign, customer) features to 'feature_set/campaign_customer_features.csv'.
save_feature_set(campaign_customer_features, 'campaign_customer_features')

# coupon_customer combination features

In [140]:
# Distinct (coupon, customer) pairs from train and from test, then their union.
coupon_customer_combinations_train = \
    train_data.loc[:, ['coupon_id', 'customer_id']].drop_duplicates()
coupon_customer_combinations_test = \
    test_data.loc[:, ['coupon_id', 'customer_id']].drop_duplicates()

coupon_customer_combinations = pd.concat(
    [coupon_customer_combinations_train, coupon_customer_combinations_test]
).drop_duplicates()


In [191]:
def analyse_customer_coupon_intersection(row):
    """Measure how much of a customer's purchase history a coupon covers.

    Parameters
    ----------
    row : object with `customer_id` and `coupon_id` attributes.

    Reads the notebook globals `customer_transaction_data`, `item_data_pure`
    and `coupon_item_mapping_data_pure`.

    Returns
    -------
    list
        [customer_id, coupon_id,
         number of purchased items / brands / categories also covered by the coupon,
         those three counts as a share of the items / brands / categories the
         customer purchased]. A share is NaN when the customer purchased nothing
         of that kind.
    """
    customer_id_in_question = row.customer_id
    coupon_id_in_question = row.coupon_id

    item_ids_purchased_by_customer = set(
        customer_transaction_data.loc[
            customer_transaction_data.customer_id == customer_id_in_question
        ].item_id.unique())
    item_ids_covered_by_coupon = set(
        coupon_item_mapping_data_pure.loc[
            coupon_item_mapping_data_pure.coupon_id == coupon_id_in_question
        ].item_id.unique())

    # Look the item attributes up once per side; brand and category both come
    # from the same rows.
    items_purchased = item_data_pure.loc[
        item_data_pure.item_id.isin(item_ids_purchased_by_customer)]
    items_covered = item_data_pure.loc[
        item_data_pure.item_id.isin(item_ids_covered_by_coupon)]

    brands_purchased_by_customer = set(items_purchased.brand.unique())
    categories_purchased_by_customer = set(items_purchased.category.unique())
    brands_covered_by_coupon = set(items_covered.brand.unique())
    categories_covered_by_coupon = set(items_covered.category.unique())

    def overlap_count_and_share(purchased, covered):
        # Size of the intersection, and that size relative to what was purchased.
        common_count = len(purchased & covered)
        share = common_count / len(purchased) if purchased else np.nan
        return common_count, share

    # Each share uses its own overlap count as numerator (the item share used
    # to be computed from the brand overlap by mistake).
    no_of_item_ids_in_common, percent_of_item_ids_in_common = \
        overlap_count_and_share(item_ids_purchased_by_customer, item_ids_covered_by_coupon)
    no_of_brands_in_common, percent_of_brands_in_common = \
        overlap_count_and_share(brands_purchased_by_customer, brands_covered_by_coupon)
    no_of_categories_in_common, percent_of_categories_in_common = \
        overlap_count_and_share(categories_purchased_by_customer, categories_covered_by_coupon)

    return [customer_id_in_question,
            coupon_id_in_question,
            no_of_item_ids_in_common,
            no_of_brands_in_common,
            no_of_categories_in_common,
            percent_of_item_ids_in_common,
            percent_of_brands_in_common,
            percent_of_categories_in_common]
    
    
    

In [192]:
def get_customer_coupon_intersection_features():
    """Build the purchase-history/coupon overlap features for every row of the
    global `coupon_customer_combinations` frame."""
    feature_names = [
        'customer_id',
        'coupon_id',
        'no_of_item_ids_purchased_by_customer_and_covered_by_coupon',
        'no_of_brands_purchased_by_customer_and_covered_by_coupon',
        'no_of_categories_purchased_by_customer_and_covered_by_coupon',
        'percent_of_item_ids_purchased_by_customer_and_covered_by_coupon',
        'percent_of_brands_purchased_by_customer_and_covered_by_coupon',
        'percent_of_categories_purchased_by_customer_and_covered_by_coupon',
    ]

    def features_for_pair(row):
        # The analyser is spelled `analyse_...` in this notebook; the former
        # `analyze_...` call referred to a name that is never defined.
        return pd.Series(analyse_customer_coupon_intersection(row), index=feature_names)

    customer_coupon_intersection_features = \
        coupon_customer_combinations.progress_apply(features_for_pair, axis=1)

    return customer_coupon_intersection_features

In [193]:
def get_coupon_customer_features():
    """Return the global (coupon, customer) pairs with their overlap features
    merged in."""
    intersection_features = get_customer_coupon_intersection_features()
    return coupon_customer_combinations.merge(intersection_features)

In [224]:
# Compute the (coupon, customer) features; this runs one analysis per pair.
coupon_customer_features = get_coupon_customer_features()

HBox(children=(IntProgress(value=0, max=125265), HTML(value='')))




In [225]:
# Write the (coupon, customer) features to 'feature_set/coupon_customer_features.csv'.
save_feature_set(coupon_customer_features, 'coupon_customer_features')

# customer based features

In [145]:
# Distinct customer ids from train and from test, then their union.
customer_ids_train = train_data.loc[:, ['customer_id']].drop_duplicates()
customer_ids_test = test_data.loc[:, ['customer_id']].drop_duplicates()

customer_ids = pd.concat(
    [customer_ids_train, customer_ids_test]
).drop_duplicates()


In [196]:
def analyse_customer_brand_type_preference(row):
    """Compute the share of 'Local' brand-type items among the distinct items a
    customer has bought.

    Parameters
    ----------
    row : object with a `customer_id` attribute.

    Reads the notebook globals `customer_transaction_data_pure` and
    `item_data_pure`.

    Returns
    -------
    list
        [customer_id, share]. The share is 0 when the customer bought items but
        none of them is 'Local', and NaN when no purchased item with a known
        brand type exists.
    """
    customer_id_in_question = row.customer_id

    item_ids_purchased_by_customer = set(
        customer_transaction_data_pure.loc[
            customer_transaction_data_pure.customer_id == customer_id_in_question
        ].item_id.unique())

    brand_type_purchased_by_customer = item_data_pure.loc[
        item_data_pure.item_id.isin(item_ids_purchased_by_customer)].brand_type

    items_with_known_brand_type = brand_type_purchased_by_customer.count()
    if items_with_known_brand_type == 0:
        customer_brand_type_preference = np.nan
    else:
        # Count matches directly: looking 'Local' up in value_counts() raises
        # KeyError for customers who never bought a Local-brand item.
        local_items = (brand_type_purchased_by_customer == 'Local').sum()
        customer_brand_type_preference = local_items / items_with_known_brand_type

    return [customer_id_in_question,
            customer_brand_type_preference]
    
    
    

In [197]:
def analyse_average_price_of_items_bought_by_customer(row):
    """Compute the mean `rate` over all transactions of one customer.

    Parameters
    ----------
    row : object with a `customer_id` attribute.

    Reads the notebook global `customer_transaction_data_pure`.

    Returns
    -------
    list
        [customer_id, mean rate]. The mean falls back to 0 if computing it
        raises an ordinary error.
    """
    customer_id_in_question = row.customer_id

    try:
        average_price_of_items_bought_by_customer = \
            customer_transaction_data_pure.loc[
                customer_transaction_data_pure.customer_id == customer_id_in_question
            ].rate.mean()
    except Exception:
        # Keep the best-effort fallback, but let KeyboardInterrupt/SystemExit
        # through so a long-running apply can still be stopped.
        average_price_of_items_bought_by_customer = 0

    return [customer_id_in_question,
            average_price_of_items_bought_by_customer]
    
    

In [198]:
def get_customer_brand_type_preference_features():
    """Build the local-brand preference feature for every row of the global
    `customer_ids` frame."""
    feature_names = ['customer_id',
                     'local_brand_type_preference']

    def features_for_customer(row):
        # Call the analyser defined in this notebook; the former
        # `get_customer_brand_type_preference` is never defined.
        return pd.Series(analyse_customer_brand_type_preference(row), index=feature_names)

    customer_brand_type_preference_features = \
        customer_ids.progress_apply(features_for_customer, axis=1)

    return customer_brand_type_preference_features

In [199]:
def get_average_price_of_items_bought_by_customer_features():
    """Build the per-customer average ``rate`` feature table.

    Applies ``analyse_average_price_of_items_bought_by_customer`` to every row of
    ``customer_ids`` (with a progress bar via ``progress_apply``).

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id`` and ``average_price_of_items_bought_by_customer``,
        one row per row of ``customer_ids``.
    """
    feature_columns = ['customer_id', 'average_price_of_items_bought_by_customer']

    # The row-level helper is defined as
    # analyse_average_price_of_items_bought_by_customer; the get_... spelling used
    # here before has no visible definition and fails with NameError on a fresh kernel.
    average_price_of_items_bought_by_customer_features = customer_ids.progress_apply(
        lambda row: pd.Series(analyse_average_price_of_items_bought_by_customer(row),
                              index=feature_columns),
        axis=1)

    return average_price_of_items_bought_by_customer_features

In [200]:
def get_customer_based_features():
    """Assemble all customer-level features into one table.

    Returns
    -------
    pandas.DataFrame
        ``customer_ids`` merged first with the brand-type preference features and
        then with the average-price features.
    """
    # Evaluation order is kept: brand preference first, then average price.
    brand_preference = get_customer_brand_type_preference_features()
    average_price = get_average_price_of_items_bought_by_customer_features()

    # merge() without `on` joins on the columns the two frames have in common.
    return customer_ids.merge(brand_preference).merge(average_price)

In [201]:
# Compute the customer-level feature table; each of the two feature builders it calls
# makes one progress_apply pass over customer_ids (hence the two progress bars below).
customer_based_features = get_customer_based_features()

HBox(children=(IntProgress(value=0, max=1582), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1582), HTML(value='')))




In [202]:
# Hand the finished table to save_feature_set under the name 'customer_based_features'.
# NOTE(review): save_feature_set is defined outside this section; presumably it persists
# the frame under that name - confirm against its definition.
save_feature_set(customer_based_features, 'customer_based_features')

# coupon based features

In [150]:
# Distinct coupon ids per split, then the de-duplicated union of both splits.
# The coupon-level features below are computed once per row of `coupon_ids`.
coupon_ids_train, coupon_ids_test = (
    split[['coupon_id']].drop_duplicates() for split in (train_data, test_data)
)

coupon_ids = pd.concat([coupon_ids_train, coupon_ids_test]).drop_duplicates()


In [167]:
def analyse_average_price_of_items_with_coupon(row):
    """Mean transaction ``rate`` of the items covered by one coupon.

    The coupon's item ids are read from ``coupon_item_mapping_data_pure``; the
    mean is taken over every row of ``customer_transaction_data_pure`` whose
    ``item_id`` is among them.

    Parameters
    ----------
    row : object exposing a ``coupon_id`` attribute (e.g. a row of ``coupon_ids``).

    Returns
    -------
    list
        ``[coupon_id, average_rate]``; ``average_rate`` is 0 when none of the
        coupon's items has a non-null ``rate`` in the transaction data.
    """
    coupon_id_in_question = row.coupon_id

    item_ids_covered_by_coupon = coupon_item_mapping_data_pure.loc[
        coupon_item_mapping_data_pure.coupon_id == coupon_id_in_question, 'item_id'].unique()

    rates_of_covered_items = customer_transaction_data_pure.loc[
        customer_transaction_data_pure.item_id.isin(item_ids_covered_by_coupon), 'rate']

    # Series.mean() returns NaN for an empty selection instead of raising, so the
    # former try/except never produced its 0 fallback; test for emptiness instead.
    if rates_of_covered_items.count() != 0:
        average_cost_price_of_items_with_coupon = rates_of_covered_items.mean()
    else:
        average_cost_price_of_items_with_coupon = 0

    return [coupon_id_in_question, average_cost_price_of_items_with_coupon]
    

In [168]:
def analyse_coupon_brand_type_bent(row):
    """Share of 'Local' brand types among the items covered by one coupon.

    The coupon's item ids are read from ``coupon_item_mapping_data_pure`` and
    their ``brand_type`` is looked up in ``item_data_pure``.

    Parameters
    ----------
    row : object exposing a ``coupon_id`` attribute (e.g. a row of ``coupon_ids``).

    Returns
    -------
    list
        ``[coupon_id, local_share]``. ``local_share`` is the number of matching
        ``item_data_pure`` rows whose ``brand_type`` is ``'Local'`` divided by the
        number of matching rows with a non-null ``brand_type``; it is 0 when there
        are no such rows.
    """
    coupon_id_in_question = row.coupon_id

    item_ids_covered_by_coupon = coupon_item_mapping_data_pure.loc[
        coupon_item_mapping_data_pure.coupon_id == coupon_id_in_question, 'item_id'].unique()

    brand_type_covered_by_coupon = \
        item_data_pure.loc[item_data_pure.item_id.isin(item_ids_covered_by_coupon), 'brand_type']

    # An explicit guard replaces the former bare `except:`, which also swallowed
    # unrelated errors (including a kernel interrupt) and turned them into a 0.
    known_brand_types = brand_type_covered_by_coupon.count()
    if known_brand_types != 0:
        coupon_brand_type_bent = \
            (brand_type_covered_by_coupon == 'Local').sum() / known_brand_types
    else:
        coupon_brand_type_bent = 0

    return [coupon_id_in_question, coupon_brand_type_bent]
    
    

In [169]:
def get_average_price_of_items_with_coupon_features():
    """Build the per-coupon average ``rate`` feature table.

    Applies ``analyse_average_price_of_items_with_coupon`` to every row of
    ``coupon_ids`` (with a progress bar via ``progress_apply``).

    Returns
    -------
    pandas.DataFrame
        Columns ``coupon_id`` and ``average_price_of_items_with_coupon``, one row
        per row of ``coupon_ids``.
    """
    feature_columns = ['coupon_id', 'average_price_of_items_with_coupon']

    # The row-level helper is defined as analyse_average_price_of_items_with_coupon;
    # the get_... spelling used here before has no visible definition and fails
    # with NameError on a fresh kernel.
    average_price_of_items_with_coupon_features = coupon_ids.progress_apply(
        lambda row: pd.Series(analyse_average_price_of_items_with_coupon(row),
                              index=feature_columns),
        axis=1)

    return average_price_of_items_with_coupon_features



In [205]:
def get_coupon_brand_type_bent_features():
    """Build the per-coupon 'Local' brand share feature table.

    Applies ``analyse_coupon_brand_type_bent`` to every row of ``coupon_ids``
    (with a progress bar via ``progress_apply``).

    Returns
    -------
    pandas.DataFrame
        Columns ``coupon_id`` and ``local_brand_type_bent``, one row per row of
        ``coupon_ids``.
    """
    feature_columns = ['coupon_id', 'local_brand_type_bent']

    # The row-level helper is defined as analyse_coupon_brand_type_bent; the
    # get_coupon_brand_type_bent spelling used here before has no visible
    # definition and fails with NameError on a fresh kernel.
    coupon_brand_type_bent_features = coupon_ids.progress_apply(
        lambda row: pd.Series(analyse_coupon_brand_type_bent(row),
                              index=feature_columns),
        axis=1)

    return coupon_brand_type_bent_features

In [203]:
def get_coupon_based_features():
    """Assemble all coupon-level features into one table.

    Returns
    -------
    pandas.DataFrame
        ``coupon_ids`` merged first with the brand-type bent features and then
        with the average-price features.
    """
    # Evaluation order is kept: brand-type bent first, then average price.
    brand_type_bent = get_coupon_brand_type_bent_features()
    average_price = get_average_price_of_items_with_coupon_features()

    # merge() without `on` joins on the columns the two frames have in common.
    return coupon_ids.merge(brand_type_bent).merge(average_price)

In [206]:
# Compute the coupon-level feature table; each of the two feature builders it calls
# makes one progress_apply pass over coupon_ids (hence the two progress bars below).
coupon_based_features = get_coupon_based_features()

HBox(children=(IntProgress(value=0, max=1116), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1116), HTML(value='')))




In [207]:
# Hand the finished table to save_feature_set under the name 'coupon_based_features'.
# NOTE(review): save_feature_set is defined outside this section; presumably it persists
# the frame under that name - confirm against its definition.
save_feature_set(coupon_based_features, 'coupon_based_features')

# campaign_coupon combination features

In [156]:
# Distinct (coupon_id, campaign_id) pairs per split, then the de-duplicated union.
# The campaign/coupon features below are computed once per pair.
campaign_coupon_combinations_train, campaign_coupon_combinations_test = (
    split[['coupon_id', 'campaign_id']].drop_duplicates()
    for split in (train_data, test_data)
)

campaign_coupon_combinations = pd.concat(
    [campaign_coupon_combinations_train, campaign_coupon_combinations_test]
).drop_duplicates()

In [208]:
def analyse_campaign_coupon_redemption_record_till_campaign_start(row):
    """Past redemption rate of a coupon, using only campaigns that ended earlier.

    Looks up the ``start_date`` of the row's campaign in ``campaign_data`` and
    averages ``redemption_status`` over the rows of ``train_and_campaign_data``
    that carry the same coupon and whose ``end_date`` is strictly before that
    start date.

    Parameters
    ----------
    row : object exposing ``campaign_id`` and ``coupon_id`` attributes.

    Returns
    -------
    list
        ``[campaign_id, coupon_id, rate]`` where ``rate`` is the sum of the
        selected ``redemption_status`` values divided by their non-null count,
        or 0 when nothing was selected.
    """
    campaign_id = row.campaign_id
    campaign_start = campaign_data.loc[
        campaign_data.campaign_id == campaign_id, "start_date"
    ].iloc[0]
    coupon_id = row.coupon_id

    same_coupon = train_and_campaign_data.coupon_id == coupon_id
    ended_before_start = train_and_campaign_data.end_date < campaign_start
    earlier_outcomes = \
        train_and_campaign_data.loc[same_coupon & ended_before_start].redemption_status

    n_outcomes = earlier_outcomes.count()
    # No earlier record for this coupon: report 0 rather than dividing by zero.
    redemption_rate = earlier_outcomes.sum() / n_outcomes if n_outcomes != 0 else 0

    return [campaign_id, coupon_id, redemption_rate]
    
    

In [209]:
def get_campaign_coupon_redemption_record_till_campaign_start_features():
    """Build the per (campaign, coupon) past-redemption-rate feature table.

    Applies ``analyse_campaign_coupon_redemption_record_till_campaign_start`` to
    every row of ``campaign_coupon_combinations`` (with a progress bar via
    ``progress_apply``).

    Returns
    -------
    pandas.DataFrame
        Columns ``campaign_id``, ``coupon_id`` and
        ``campaign_coupon_redemption_record_till_campaign_start``, one row per
        row of ``campaign_coupon_combinations``.
    """
    feature_columns = ['campaign_id',
                       'coupon_id',
                       'campaign_coupon_redemption_record_till_campaign_start']

    # The row-level helper is defined as
    # analyse_campaign_coupon_redemption_record_till_campaign_start; the get_...
    # spelling used here before has no visible definition and fails with
    # NameError on a fresh kernel.
    campaign_coupon_redemption_record_till_campaign_start_features = \
        campaign_coupon_combinations.progress_apply(
            lambda row: pd.Series(
                analyse_campaign_coupon_redemption_record_till_campaign_start(row),
                index=feature_columns),
            axis=1)

    return campaign_coupon_redemption_record_till_campaign_start_features

In [210]:
def get_campaign_coupon_features():
    """Assemble all (campaign, coupon)-level features into one table.

    Returns
    -------
    pandas.DataFrame
        ``campaign_coupon_combinations`` merged with the past-redemption-rate
        features.
    """
    redemption_record = get_campaign_coupon_redemption_record_till_campaign_start_features()

    # merge() without `on` joins on the columns the two frames have in common.
    return campaign_coupon_combinations.merge(redemption_record)

In [211]:
# Compute the (campaign, coupon)-level feature table; the feature builder it calls makes
# one progress_apply pass over campaign_coupon_combinations (progress bar shown below).
campaign_coupon_features = get_campaign_coupon_features()

HBox(children=(IntProgress(value=0, max=1358), HTML(value='')))




In [212]:
# Hand the finished table to save_feature_set under the name 'campaign_coupon_features'.
# NOTE(review): save_feature_set is defined outside this section; presumably it persists
# the frame under that name - confirm against its definition.
save_feature_set(campaign_coupon_features, 'campaign_coupon_features')