In [None]:
import pandas as pd
import numpy as np
import math
import json

%matplotlib inline

# Read the three input files. Each one is parsed as line-delimited JSON
# (`lines=True`), i.e. one record per line, into its own DataFrame.
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)
transcript = pd.read_json('data/transcript.json', orient='records', lines=True)

In [None]:
def id_mapper(df , column):
    '''
    Assign a sequential integer code (1, 2, 3, ...) to every distinct value of a column.

    Codes are handed out in order of first appearance in the column.

    INPUT:
        df - (DataFrame) frame that holds the column
        column - (str) name of the column whose values are encoded
    OUTPUT:
        coded_dict - (dict) original column value -> integer code
    '''
    coded_dict = {}

    for value in df[column]:
        # the next free code is always "number of codes issued so far + 1"
        coded_dict.setdefault(value, len(coded_dict) + 1)

    return coded_dict

In [None]:
def offer_type_mapper(df=portfolio):
    '''
    Create a data frame that relates each offer id with its (encoded) offer type and duration.

    INPUT:
        df - (DataFrame) - offers table with 'id', 'offer_type' and 'duration' columns;
             defaults to portfolio
    OUTPUT:
        offer_type - (DataFrame) - columns 'id', 'offer_type' (integer code) and 'duration'.
                     This is an independent frame, so callers can modify it freely.
        coded_dict - (dict) - offer type name -> integer code (see id_mapper)
    '''
    # get sequence to name type of offers
    coded_dict = id_mapper(df, 'offer_type')

    # `assign` returns a new frame rather than a slice of `df`, so later column
    # assignments by the caller do not raise SettingWithCopyWarning. `map` yields
    # integer codes directly instead of relying on `replace` downcasting an
    # object column.
    offer_type = df[['id' , 'offer_type' , 'duration']].assign(
        offer_type=df['offer_type'].map(coded_dict)
    )

    return offer_type , coded_dict

In [None]:
def arrange_events(df , offer_df):
    '''
    Rearrange the transcript: expand the 'value' dictionaries into columns, put a transaction
    and an 'offer completed' event of the same person and time on one row, add one column per
    event type holding the event time, and attach the offer information.

    INPUT:
        df - (DataFrame) - transcript-like frame with 'person', 'event', 'time' and 'value'
             columns, where 'value' holds dictionaries with the keys 'offer id', 'offer_id',
             'amount' and/or 'reward'
        offer_df - (DataFrame) - offer information with an 'offer id' column to merge on and
             an 'offer_type' column (renamed to 'offer type' in the result)
    OUTPUT:
        df - (DataFrame) - rearranged frame; 'person' is renamed to 'user id' and rows without
             an offer get the offer id 'no offer'
    '''

    # Expand the dictionaries in 'value' into columns. Building one DataFrame from the list of
    # dictionaries gives the same columns as `apply(pd.Series)` without creating a Series per row.
    value_columns = pd.DataFrame(df['value'].tolist(), index=df.index)
    df = pd.concat([df.drop(['value'], axis=1), value_columns], axis=1)

    # the offer identifier is stored under two different keys: merge them into 'offer id'
    df['offer id'] = df['offer id'].combine_first(df['offer_id'])
    df = df.drop(columns = ['offer_id'])

    # split into three dataFrames and then merge rows with transaction and offer completed in the same time
    df1 = df[df['event'] == 'offer completed'][['person' , 'event' , 'time' , 'offer id' , 'reward']]
    df2 = df[df['event'] == 'transaction'][['person' ,  'time' , 'amount']]
    df3 = df[df['event'] != 'offer completed']
    df3 = df3[df3['event'] != 'transaction'][['person' , 'event' , 'time' , 'offer id']]
    # merge the two dataFrames on time; transactions without a completion keep NaN event / offer id
    df_trans_completed = pd.merge(df1, df2, how='outer', on=['person', 'time'])
    # merge with main dataFrames
    df = pd.merge(df3, df_trans_completed, how='outer', on=['person', 'time' , 'event' , 'offer id'])

    # create columns of type of event with the value of time
    df = pd.concat([df, df.pivot_table(values='time', index=df.index, columns='event', aggfunc='first')], axis=1, sort=False)

    # fill NaN values in the offer id feature as 'no offer' to keep track of the transactions without an offer
    df['offer id'] = df['offer id'].fillna(value = 'no offer')
    df = df.rename(columns={'person':'user id'})

    # merge with offer_type dataframe (outer: offers without events and events without offers are both kept)
    df = pd.merge(df, offer_df, how='outer', on=['offer id'])
    df = df.rename(columns={'offer_type': 'offer type'})

    return df

In [None]:
def fill_amount(df, offer_type=2):
    '''
    Attach a transaction amount to received offers of one type and remove the used transactions.

    For every 'offer received' row whose 'offer type' equals offer_type, look for rows with
    offer id 'no offer' whose time lies between the reception time and reception time + duration
    (both inclusive). The amount of the first such row is written into the offer row (only if
    its amount is missing) and that 'no offer' row is dropped.

    INPUT:
        df - (DataFrame) - events with 'event', 'offer type', 'offer id', 'time', 'duration'
             and 'amount' columns. The frame is modified in place.
        offer_type - (int) - encoded offer type to process, default 2
             (presumably the informational offers; verify against the offer type dictionary)
    OUTPUT:
        df - (DataFrame) - the same frame, modified
    '''
    received = df.loc[(df['event'] == 'offer received') & (df['offer type'] == offer_type)]
    # loop invariant: rows that are not linked to any offer
    without_offer = df['offer id'] == 'no offer'

    fill_values = {}
    used_rows = []

    for label, start, duration in zip(received.index, received['time'], received['duration']):
        in_window = df.loc[without_offer & (df['time'] >= start) & (df['time'] <= start + duration)]
        if in_window.empty:
            # nothing to attach: the offer row keeps its missing amount
            continue
        used_rows.append(in_window.index[0])
        fill_values[label] = in_window['amount'].iloc[0]

    # Assign the result back to the column. Calling fillna(inplace=True) on `df["amount"]`
    # acts on a temporary object and does not update `df` under pandas Copy-on-Write.
    if fill_values:
        df['amount'] = df['amount'].fillna(fill_values)
    df.drop(index=used_rows, inplace=True)

    return df

In [None]:
def offer_merge(df):
    '''
    For one user, merge the rows that belong to the same offer into a single row.

    Rows are grouped by 'offer id' and their numeric columns are averaged. When the same offer
    was received more than once, every reception starts a new group, so each reception ends up
    in its own row. Non-numeric columns (e.g. 'user id', 'event') are kept as columns but are
    left empty in the result.

    INPUT:
        df - (DataFrame) - events of a single user with 'user id', 'offer id' and
             'offer received' columns (the rearranged transcript)
    OUTPUT:
        df - (DataFrame) - one row per received offer (plus one per offer id without receptions),
             with a fresh 0..n-1 index
        user_id - first value of the 'user id' column, or None when df has no rows
    '''
    user_ids = df['user id'].unique()
    user_id = user_ids[0] if len(user_ids) else None

    # start from an empty template so the result keeps every original column
    pieces = [df.head(0)]

    for offer in df['offer id'].unique().tolist():

        # create data frame of an offer
        offer_df = df[df['offer id'] == offer].copy()

        if offer_df['offer received'].count() > 1:
            # The same offer was received several times: number the receptions so that each
            # row is grouped with the most recent reception that precedes it (in row order).
            offer_df['flag'] = offer_df['offer received'].notna().cumsum()
            offer_df = (
                offer_df.groupby(['flag' , 'offer id'])
                .mean(numeric_only=True)
                .reset_index()
                .drop(columns='flag')
            )
        else:
            # numeric_only=True: string columns cannot be averaged
            offer_df = offer_df.groupby('offer id').mean(numeric_only=True).reset_index()

        pieces.append(offer_df)

    # one concat instead of DataFrame.append in a loop (append was removed in pandas 2.0)
    df = pd.concat(pieces, sort=False, ignore_index=True)

    return df , user_id

In [None]:
def check_completed_offers(df , user_id):
    '''
    For a given user, fill the missing values and drop the offers that were completed without
    being influenced by the offer (completed before being viewed, or completed but never viewed).

    INPUT:
        df - (DataFrame) - merged offers of one user with 'reward', 'amount', 'offer type',
             'user id', 'offer viewed' and 'offer completed' columns. The fills below are
             written into this frame.
        user_id - value used to fill missing 'user id' entries (skipped when None)
    OUTPUT:
        df - (DataFrame) - the filled rows that were kept
    '''

    # fill NaN values with 0 for offers that were not completed
    df[['reward' , 'amount']] = df[['reward' , 'amount']].fillna(value = 0)

    # fill with offer type 4, for transactions that are not related with an offer
    df['offer type'] = df['offer type'].fillna(value = 4)
    if user_id is not None:
        df['user id'] = df['user id'].fillna(value = user_id)

    viewed = df['offer viewed']
    completed = df['offer completed']
    # Comparisons with NaN are False, so `viewed > completed` only flags rows where both
    # times exist; the second term flags completions that were never viewed.
    not_influenced = (viewed > completed) | (viewed.isna() & completed.notna())

    # A boolean mask works for any index, unlike a positional loop over df.loc[0..n-1].
    return df.loc[~not_influenced].copy()

In [None]:
def get_events(df):
    '''
    For each user in profile, rearrange the transactions influenced by an offer
    and, for each type of offer, get:
    mean transaction amount, number of offers received, number of offers viewed and
    number of offers completed.
    note: it takes some time to process

    INPUT:
        df - (DataFrame) - rearranged transcript (see arrange_events) with a 'user id' column
    OUTPUT:
        amount_lst (lst) - list of dictionaries (one per user) with the mean amount for each offer type
        offers_lst (lst) - list of dictionaries with the number of offers received for each type
        offers_view_lst (lst) - list of dictionaries with the number of offers viewed for each type
        offers_completed_lst (lst) - list of dictionaries with the number of offers completed for each type

        Every dictionary has a 'user id' entry; the other keys are the offer types found for
        that user. A user without any row in df only gets the 'user id' entry.
    '''

    user_id_lst = profile['id'].tolist()
    amount_lst = []
    offers_lst = []
    offers_view_lst = []
    offers_completed_lst = []

    # Split the events by user once instead of scanning the whole frame for every user.
    events_by_user = df.groupby('user id', sort=False)
    users_with_events = events_by_user.groups

    for user in user_id_lst:

        amount = {'user id' : user}
        offers = {'user id' : user}
        offers_view = {'user id' : user}
        offers_completed = {'user id' : user}

        if user in users_with_events:
            # work on a copy: fill_amount modifies the frame it receives
            user_events = events_by_user.get_group(user).copy()
            user_events = fill_amount(user_events)
            user_events, user_id = offer_merge(user_events)
            user_events = check_completed_offers (user_events , user_id)

            # Select the column before aggregating so that only that column is reduced;
            # averaging the whole frame would also try to average string columns.
            by_type = user_events.groupby('offer type')
            amount.update(by_type['amount'].mean().to_dict())
            offers.update(by_type['offer id'].count().to_dict())
            offers_view.update(by_type['offer viewed'].count().to_dict())
            offers_completed.update(by_type['offer completed'].count().to_dict())

        amount_lst.append(amount)
        offers_lst.append(offers)
        offers_view_lst.append(offers_view)
        offers_completed_lst.append(offers_completed)

    return amount_lst , offers_lst , offers_view_lst , offers_completed_lst

In [None]:
def df_from_lst (lst):
    '''
    Create a dataframe from a list of dictionaries, without the column of offer type 4
    and with missing values replaced by 0.

    INPUT: lst (list) - list of dictionaries, one per row
    OUTPUT: df (DataFrame)
    '''

    # errors='ignore': no record may contain the key 4, in which case there is nothing to drop
    df = pd.DataFrame(lst).drop(columns=4, errors='ignore')

    return df.fillna(value = 0)

# main

In [None]:
# Add one indicator column per gender value, then remove the original
# 'gender' column together with 'became_member_on'.
profile_mod = (
    pd.concat([profile, pd.get_dummies(profile['gender'])], axis=1)
    .drop(columns=['gender', 'became_member_on'])
)

In [None]:
# Encode the offer types; multiply the duration by 24
# (presumably days -> hours, to match the transcript 'time' unit; verify)
# and name the id column like the one used in the transcript.
map_offer_type, dict_offer_type = offer_type_mapper(portfolio)
map_offer_type = (
    map_offer_type
    .assign(duration=map_offer_type['duration'] * 24)
    .rename(columns={'id': 'offer id'})
)

map_offer_type

In [None]:
# Rearrange the transcript and attach the encoded offer information
# (see arrange_events).
arrange_transcript = arrange_events(transcript , map_offer_type)

In [None]:
# Build the per-user summaries (one dictionary per user in each list).
# Note: this takes some time to execute, you can grab a coffee ;)
amount_lst , offers_lst , offers_view_lst , offers_completed_lst = get_events(arrange_transcript)

In [None]:
# Amount data: one row per user, one column per offer type
amount_type = pd.DataFrame(amount_lst).rename(
    columns={1: 'type 1', 2: 'type 2', 3: 'type 3' , 4: 'type 4'}
)

In [None]:
# Counts per user and offer type: received, viewed and completed offers
user_offers = df_from_lst (offers_lst).rename(
    columns={1: 'offers type 1', 2: 'offers type 2', 3: 'offers type 3'}
)
offers_viewed = df_from_lst (offers_view_lst).rename(
    columns={1: 'viewed type 1', 2: 'viewed type 2', 3: 'viewed type 3'}
)
offers_completed = df_from_lst (offers_completed_lst).rename(
    columns={1: 'completed type 1', 2: 'completed type 2', 3: 'completed type 3'}
)

In [None]:
# Join the amounts with the three count tables on the user id
amount_offer = (
    amount_type
    .merge(user_offers, how='inner' , on="user id")
    .merge(offers_viewed, how='inner' , on="user id")
    .merge(offers_completed, how='inner' , on="user id")
)

In [None]:
# Display the merged per-user table.
amount_offer

In [None]:
# Split into one dataset per offer type (type 4 only has the amount column)
amount_type_1 = amount_offer.loc[:, ['user id' , 'type 1' , 'offers type 1' , 'viewed type 1' , 'completed type 1']].copy()
amount_type_2 = amount_offer.loc[:, ['user id' , 'type 2' , 'offers type 2' , 'viewed type 2' , 'completed type 2']].copy()
amount_type_3 = amount_offer.loc[:, ['user id' , 'type 3' , 'offers type 3' , 'viewed type 3' , 'completed type 3']].copy()
amount_type_4 = amount_offer.loc[:, ['user id' , 'type 4']].copy()

In [None]:
# Attach the user profile to every dataset; 'id' duplicates 'user id' after the join
amount_type_1 = profile_mod.merge(
    amount_type_1, how='inner' , left_on="id" , right_on="user id"
).drop(columns=['id'])

amount_type_2 = profile_mod.merge(
    amount_type_2, how='inner' , left_on="id" , right_on="user id"
).drop(columns=['id'])

amount_type_3 = profile_mod.merge(
    amount_type_3, how='inner' , left_on="id" , right_on="user id"
).drop(columns=['id'])

amount_type_4 = profile_mod.merge(
    amount_type_4, how='inner' , left_on="id" , right_on="user id"
).drop(columns=['id'])

In [None]:
# For each dataset: remove the rows that contain NaN, then keep only the
# users with a non-zero number of viewed offers of that type.
amount_type_1 = (
    amount_type_1
    .dropna(axis=0)
    .loc[lambda frame: frame['viewed type 1'] != 0]
)

amount_type_2 = (
    amount_type_2
    .dropna(axis=0)
    .loc[lambda frame: frame['viewed type 2'] != 0]
)

amount_type_3 = (
    amount_type_3
    .dropna(axis=0)
    .loc[lambda frame: frame['viewed type 3'] != 0]
)

amount_type_4 = amount_type_4.dropna(axis=0)

In [None]:
# Write every dataset to its own CSV file, without the index column
for _frame, _csv_path in (
    (amount_type_1, 'data/amount_type_1.csv'),
    (amount_type_2, 'data/amount_type_2.csv'),
    (amount_type_3, 'data/amount_type_3.csv'),
    (amount_type_4, 'data/amount_type_4.csv'),
):
    _frame.to_csv(_csv_path , index=False)