In [1]:
import pandas as pd
import numpy as np
import altair as alt

from IPython.display import display

%matplotlib inline

In [2]:
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)
transcript = pd.read_json('data/transcript.json', orient='records', lines=True)

In [3]:
display(portfolio.shape)
display(profile.shape)
display(transcript.shape)

(10, 6)

(17000, 5)

(306534, 4)

In [4]:
display(portfolio.head(1))
display(profile.head(1))
display(transcript.head(1))

Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd


Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,


Unnamed: 0,person,event,value,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0


In [5]:
display(portfolio.info())
display(profile.info())
display(transcript.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reward      10 non-null     int64 
 1   channels    10 non-null     object
 2   difficulty  10 non-null     int64 
 3   duration    10 non-null     int64 
 4   offer_type  10 non-null     object
 5   id          10 non-null     object
dtypes: int64(3), object(3)
memory usage: 608.0+ bytes


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            14825 non-null  object 
 1   age               17000 non-null  int64  
 2   id                17000 non-null  object 
 3   became_member_on  17000 non-null  int64  
 4   income            14825 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 664.2+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   person  306534 non-null  object
 1   event   306534 non-null  object
 2   value   306534 non-null  object
 3   time    306534 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 9.4+ MB


None

The profile dataframe does contain nulls. Note age also has nulls encoded as 118.

## Investigating `portfolio` data

The schema for `portfolio` dataframe (copied verbatim from Udacity's provided description):

**portfolio.json**
* id (string) - offer id
* offer_type (string) - type of offer ie BOGO, discount, informational
* difficulty (int) - minimum required spend to complete an offer
* reward (int) - reward given for completing an offer
* duration (int) - time for offer to be open, in days
* channels (list of strings)

Note: BOGO stands for "buy one get one free"

In [6]:
# We can display the whole 10 lines
portfolio

Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,"[web, email, mobile, social]",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,"[web, email, mobile]",0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,"[web, email, mobile]",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,"[web, email]",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7
5,3,"[web, email, mobile, social]",7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2
6,2,"[web, email, mobile, social]",10,10,discount,fafdcd668e3743c1bb461111dcafc2a4
7,0,"[email, mobile, social]",0,3,informational,5a8bc65990b245e5a138643cd4eb9837
8,5,"[web, email, mobile, social]",5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d
9,2,"[web, email, mobile]",10,7,discount,2906b810c7d4411798c6938adc9daaa5


In [7]:
portfolio.describe()

Unnamed: 0,reward,difficulty,duration
count,10.0,10.0,10.0
mean,4.2,7.7,6.5
std,3.583915,5.831905,2.321398
min,0.0,0.0,3.0
25%,2.0,5.0,5.0
50%,4.0,8.5,7.0
75%,5.0,10.0,7.0
max,10.0,20.0,10.0


Possible data cleaning steps:
- dummy variable channels and offer type
- channels in encoded as a list

In [8]:
portfolio.offer_type.value_counts()

bogo             4
discount         4
informational    2
Name: offer_type, dtype: int64

In [114]:
def clean_portfolio(df, channel_types=['web', 'email', 'mobile', 'social']):
    """
    Cleans the raw portfolio dataframe through various cleaning steps.
    
    Input:
    df - the raw portfolio dataframe from portfolio.json
    channel_types - a list of str's for the different media channels
    
    Output:
    clean_df - the cleaned dataframe.
    
    Cleaning steps:
    - Takes channels column and converts it into 0/1's columns for each channel.
    - Reorder the columns.
    - Rename campaign id.    
    """
    # Get a dataframe for channel data
    # Iterates through each list in the .channels series
    # to pull out whether each channel is present or not.
    campaigns = []
    for campaign in df.channels:
        campaign = set(campaign)
        channels = []
        for channel in channel_types:
            if channel in campaign:
                channels.append(1)
            else:
                channels.append(0)
        campaigns.append(channels)

    channel_frame = pd.DataFrame(campaigns)
    channel_frame.columns = channel_types
    
    # Reorder the original dataframe as well replacing the channels column
    # with the new channel dataframe
    clean_df = pd.concat([df[['id']],
                          channel_frame,
                          df[['offer_type', 'duration', 'difficulty', 'reward']]],
                          axis=1)
    
    # Rename id as campaign_id
    clean_df = clean_df.rename(columns={'id':'offer_id'})
                          
    return clean_df

In [123]:
test_portfolio = clean_portfolio(portfolio)

In [124]:
test_portfolio.head()

Unnamed: 0,offer_id,web,email,mobile,social,offer_type,duration,difficulty,reward
0,ae264e3637204a6fb9bb56bc8210ddfd,0,1,1,1,bogo,7,10,10
1,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1,bogo,5,10,10
2,3f207df678b143eea3cee63160fa8bed,1,1,1,0,informational,4,0,0
3,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,1,0,bogo,7,5,5
4,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,1,0,0,discount,10,20,5


In [12]:
portfolio.head()

Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,"[web, email, mobile, social]",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,"[web, email, mobile]",0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,"[web, email, mobile]",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,"[web, email]",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7


In [13]:
def reconciliate_ids():
    """
    """
    pass

## Investigating `profile`

In [14]:
profile.head()

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,


In [15]:
profile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            14825 non-null  object 
 1   age               17000 non-null  int64  
 2   id                17000 non-null  object 
 3   became_member_on  17000 non-null  int64  
 4   income            14825 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 664.2+ KB


In [16]:
profile.describe()

Unnamed: 0,age,became_member_on,income
count,17000.0,17000.0,14825.0
mean,62.531412,20167030.0,65404.991568
std,26.73858,11677.5,21598.29941
min,18.0,20130730.0,30000.0
25%,45.0,20160530.0,49000.0
50%,58.0,20170800.0,64000.0
75%,73.0,20171230.0,80000.0
max,118.0,20180730.0,120000.0


In [17]:
profile.gender.value_counts()

M    8484
F    6129
O     212
Name: gender, dtype: int64

In [18]:
profile.gender.isna().mean()

0.12794117647058822

In [19]:
(profile.age == 118).mean()

0.12794117647058822

In [20]:
(profile.income.isna().mean())

0.12794117647058822

In [21]:
(profile.gender.isna() == (profile.age == 118)).all() == \
(profile.gender.isna() == profile.income.isna()).all()

True

All the NA's are together.

In [22]:
profile.became_member_on.isna().mean()

0.0

## Cleaning `profile`

Possible steps:
- clean up age by dummying out an NaN category?
- Standardize the NA's?
- convert to date times the became member on
- dummy variable NaN?
- fill NaNs?
- multiple ways of dealing with data.
- convert the ids, and cross reference.

In [23]:
test2 = profile.copy()

In [33]:
test2.head()

Unnamed: 0,gender,age,id,became_member_on,income
0,,,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55.0,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75.0,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,,a03223e636434f42ac4c3df47e8bac43,20170804,


In [26]:
test2['age'] = test2['age'].replace(118, np.nan)

In [34]:
test2['gender'] = test2['gender'].fillna(np.nan)

In [35]:
test2

Unnamed: 0,gender,age,id,became_member_on,income
0,,,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55.0,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75.0,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,,a03223e636434f42ac4c3df47e8bac43,20170804,
...,...,...,...,...,...
16995,F,45.0,6d5f3a774f3d4714ab0c092238f3a1d7,20180604,54000.0
16996,M,61.0,2cb4f97358b841b9a9773a7aa05a9d77,20180713,72000.0
16997,M,49.0,01d26f638c274aa0b965d24cefe3183f,20170126,73000.0
16998,F,83.0,9dc1421481194dcd9400aec7c9ae6366,20160307,50000.0


In [37]:
pd.to_datetime(test2['became_member_on'], format='%Y%m%d')

0       2017-02-12
1       2017-07-15
2       2018-07-12
3       2017-05-09
4       2017-08-04
           ...    
16995   2018-06-04
16996   2018-07-13
16997   2017-01-26
16998   2016-03-07
16999   2017-07-22
Name: became_member_on, Length: 17000, dtype: datetime64[ns]

In [38]:
profile.head()

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,


In [112]:
def clean_profile(df):
    """
    Cleans the raw portfolio dataframe through various cleaning steps.
    
    Input:
    df - the raw profile dataframe from profile.json
    
    Output:
    clean_df - the cleaned dataframe.
    
    Cleaning steps:
    - Standardize the different types of NAs (None, age == 118) into np.nan.
    - Convert 'became_member_on' into datetime objects.
    - Reorder the columns.
    - Rename the user id.
    """
    df = df.copy()
    # Standardize NAs to numpy nan
    df['age'] = df['age'].replace(118, np.nan)
    df['gender'] = df['gender'].fillna(np.nan)
    # Convert str to datetime
    df['became_member_on'] = pd.to_datetime(df['became_member_on'], format='%Y%m%d')    
    # Reorder the columns
    clean_df = df[['id', 'gender', 'age', 'income', 'became_member_on']]
    # Relabel id as user_id
    clean_df = clean_df.rename(columns={'id':'customer_id'})
    
    return clean_df
    

In [113]:
test_profile = clean_profile(profile)
test_profile.head()


Unnamed: 0,customer_id,gender,age,income,became_member_on
0,68be06ca386d4c31939f3a4f0e3dd783,,,,2017-02-12
1,0610b486422d4921ae7d2bf64640c50b,F,55.0,112000.0,2017-07-15
2,38fe809add3b4fcf9315a9694bb96ff5,,,,2018-07-12
3,78afa995795e4d85b5d9ceeca43f5fef,F,75.0,100000.0,2017-05-09
4,a03223e636434f42ac4c3df47e8bac43,,,,2017-08-04


## Investigating `transcript`



In [44]:
transcript.shape

(306534, 4)

In [45]:
transcript.head()

Unnamed: 0,person,event,value,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0
2,e2127556f4f64592b11af22de27a7932,offer received,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0
4,68617ca6246f4fbc85e91a2a49552598,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0


In [46]:
transcript.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   person  306534 non-null  object
 1   event   306534 non-null  object
 2   value   306534 non-null  object
 3   time    306534 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 9.4+ MB


In [47]:
transcript.describe()

Unnamed: 0,time
count,306534.0
mean,366.38294
std,200.326314
min,0.0
25%,186.0
50%,408.0
75%,528.0
max,714.0


In [48]:
transcript.event.value_counts()

transaction        138953
offer received      76277
offer viewed        57725
offer completed     33579
Name: event, dtype: int64

In [49]:
transcript[transcript.event == 'offer completed'].head()

Unnamed: 0,person,event,value,time
12658,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,offer completed,{'offer_id': '2906b810c7d4411798c6938adc9daaa5...,0
12672,fe97aa22dd3e48c8b143116a8403dd52,offer completed,{'offer_id': 'fafdcd668e3743c1bb461111dcafc2a4...,0
12679,629fc02d56414d91bca360decdfa9288,offer completed,{'offer_id': '9b98b8c7a33c4b65b9aebfe6a799e6d9...,0
12692,676506bad68e4161b9bbaffeb039626b,offer completed,{'offer_id': 'ae264e3637204a6fb9bb56bc8210ddfd...,0
12697,8f7dd3b2afe14c078eb4f6e6fe4ba97d,offer completed,{'offer_id': '4d5c57ea9a6940dd891ad53e9dbe8da0...,0


## transcript cleaning
- clean up value for offer id (they're different types for different things
- change person
- simplify event


In [53]:
type(transcript.value[12697])

dict

In [72]:
test3 = pd.DataFrame(transcript.value.to_list())

In [73]:
test3.head()

Unnamed: 0,offer id,amount,offer_id,reward
0,9b98b8c7a33c4b65b9aebfe6a799e6d9,,,
1,0b1e1539f2cc45b7b9fa7c272da2e1d7,,,
2,2906b810c7d4411798c6938adc9daaa5,,,
3,fafdcd668e3743c1bb461111dcafc2a4,,,
4,4d5c57ea9a6940dd891ad53e9dbe8da0,,,


In [80]:
test3['offer id'].isna().mean()

0.5628478406963012

In [79]:
test3['offer_id'].isna().mean()

0.8904558711268571

In [85]:
(test3['offer id'].notnull() & test3['offer_id'].notnull()).any()

False

In [90]:
test3['offer id'].notnull().sum()

134002

In [91]:
test3['offer_id'].notnull().sum()

33579

In [92]:
test3['offer id'].notnull().sum() + test3['offer_id'].notnull().sum()

167581

In [93]:
test3['offer_id'].combine_first(test3['offer id']).notnull().sum()

167581

In [106]:
(test3['offer_id'].notnull() & test3['offer id'].notnull()).any()

False

In [62]:
transcript.event.value_counts()

transaction        138953
offer received      76277
offer viewed        57725
offer completed     33579
Name: event, dtype: int64

In [71]:
transcript.event.str.replace(' ', '_').value_counts()

transaction        138953
offer_received      76277
offer_viewed        57725
offer_completed     33579
Name: event, dtype: int64

In [137]:
def clean_transcript(df):
    """
    Cleans the raw transcript dataframe through various cleaning steps.
    
    Input:
    df - the raw transcript dataframe from transcript.json
    
    Output:
    clean_df - the cleaned dataframe.
    
    Cleaning steps:
    - Replace spaces in event strings to underscores.
    - Expand out "value" column.
    """
    df = df.copy()
    # Replace space with underscore in event column
    df['event'] = df['event'].str.replace(' ', '_')
    
    # Expand the value column into multiple columns
    value_frame = pd.DataFrame(df['value'].to_list())
    # Combine 'offer id' and 'offer_id' columns
    # First get the non nulls from each and check that they don't overlap
    a = value_frame['offer id'].notnull()
    b = value_frame['offer_id'].notnull()
    assert ~((a & b).any()) # True if no overlap
    # Make the combined column and drop 'offer id'
    value_frame['offer_id'] = value_frame['offer_id'].combine_first(value_frame['offer id'])
    value_frame = value_frame.drop(labels=['offer id'], axis=1)
    
    # Concatenate and reorder the columns
    clean_df = pd.concat([df, value_frame], axis=1)
    clean_df = clean_df[['person', 'time', 'event', 'amount', 'reward', 'offer_id']]
    
    # Rename the 'person column'
    clean_df = clean_df.rename(columns={'person':'customer_id'})
    
    return clean_df
    

In [138]:
test_transcript = clean_transcript(transcript)

In [139]:
test_transcript.head()

Unnamed: 0,customer_id,time,event,amount,reward,offer_id
0,78afa995795e4d85b5d9ceeca43f5fef,0,offer_received,,,9b98b8c7a33c4b65b9aebfe6a799e6d9
1,a03223e636434f42ac4c3df47e8bac43,0,offer_received,,,0b1e1539f2cc45b7b9fa7c272da2e1d7
2,e2127556f4f64592b11af22de27a7932,0,offer_received,,,2906b810c7d4411798c6938adc9daaa5
3,8ec6ce2a7e7949b1bf142def7d0e0586,0,offer_received,,,fafdcd668e3743c1bb461111dcafc2a4
4,68617ca6246f4fbc85e91a2a49552598,0,offer_received,,,4d5c57ea9a6940dd891ad53e9dbe8da0


In [110]:
test4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   person    306534 non-null  object 
 1   time      306534 non-null  int64  
 2   event     306534 non-null  object 
 3   amount    138953 non-null  float64
 4   reward    33579 non-null   float64
 5   offer_id  167581 non-null  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 14.0+ MB


In [111]:
test4.describe()

Unnamed: 0,time,amount,reward
count,306534.0,138953.0,33579.0
mean,366.38294,12.777356,4.904137
std,200.326314,30.250529,2.886647
min,0.0,0.05,2.0
25%,186.0,2.78,2.0
50%,408.0,8.89,5.0
75%,528.0,18.07,5.0
max,714.0,1062.28,10.0


## Reconciliating IDs
There's two sets of ids: one belonging to each customer, and one belonging to each offer from a campaign. These are hashes, and it'd be easier to work with if they were simply numbers.

In [140]:
test_portfolio.head()

Unnamed: 0,offer_id,web,email,mobile,social,offer_type,duration,difficulty,reward
0,ae264e3637204a6fb9bb56bc8210ddfd,0,1,1,1,bogo,7,10,10
1,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1,bogo,5,10,10
2,3f207df678b143eea3cee63160fa8bed,1,1,1,0,informational,4,0,0
3,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,1,0,bogo,7,5,5
4,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,1,0,0,discount,10,20,5


In [141]:
test_profile.head()

Unnamed: 0,customer_id,gender,age,income,became_member_on
0,68be06ca386d4c31939f3a4f0e3dd783,,,,2017-02-12
1,0610b486422d4921ae7d2bf64640c50b,F,55.0,112000.0,2017-07-15
2,38fe809add3b4fcf9315a9694bb96ff5,,,,2018-07-12
3,78afa995795e4d85b5d9ceeca43f5fef,F,75.0,100000.0,2017-05-09
4,a03223e636434f42ac4c3df47e8bac43,,,,2017-08-04


In [142]:
test_transcript.head()

Unnamed: 0,customer_id,time,event,amount,reward,offer_id
0,78afa995795e4d85b5d9ceeca43f5fef,0,offer_received,,,9b98b8c7a33c4b65b9aebfe6a799e6d9
1,a03223e636434f42ac4c3df47e8bac43,0,offer_received,,,0b1e1539f2cc45b7b9fa7c272da2e1d7
2,e2127556f4f64592b11af22de27a7932,0,offer_received,,,2906b810c7d4411798c6938adc9daaa5
3,8ec6ce2a7e7949b1bf142def7d0e0586,0,offer_received,,,fafdcd668e3743c1bb461111dcafc2a4
4,68617ca6246f4fbc85e91a2a49552598,0,offer_received,,,4d5c57ea9a6940dd891ad53e9dbe8da0


In [147]:
# First, let's check the customer ids and offer ids are perfect intersections
set(test_profile['customer_id']).symmetric_difference(set(test_transcript['customer_id']))

set()

In [148]:
set(test_portfolio['offer_id']).symmetric_difference(set(test_transcript['offer_id']))

{nan}

In [149]:
test_transcript.offer_id.isna().mean()b

0.45330371182315826

In [None]:
# as the ids are the same between sets (except for the nan in the offer_id)
# we can create a key:numbering from the portfolio and the profile and use that
# for the transcript data.
