In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

Loading the 'customers' file:

In [49]:
customers = pd.read_csv('data/customers.csv')

In [38]:
print(customers.info())
print(customers.shape)
print(customers.describe)
print(customers.head())

Unnamed: 0,customer_id,became_member_on,gender,age,income
0,68be06ca386d4c31939f3a4f0e3dd783,20170212,,118,
1,0610b486422d4921ae7d2bf64640c50b,20170715,F,55,112000.0
2,38fe809add3b4fcf9315a9694bb96ff5,20180712,,118,
3,78afa995795e4d85b5d9ceeca43f5fef,20170509,F,75,100000.0
4,a03223e636434f42ac4c3df47e8bac43,20170804,,118,
...,...,...,...,...,...
16995,6d5f3a774f3d4714ab0c092238f3a1d7,20180604,F,45,54000.0
16996,2cb4f97358b841b9a9773a7aa05a9d77,20180713,M,61,72000.0
16997,01d26f638c274aa0b965d24cefe3183f,20170126,M,49,73000.0
16998,9dc1421481194dcd9400aec7c9ae6366,20160307,F,83,50000.0


In [172]:
customers['gender'].unique()

array(['F', 'M', 'O'], dtype=object)

There are 3 categories M - Male, F - Female, and O - Others, so having null values in the 'gender' column does not make sense.

Creating filters to fiter out the null values:

In [50]:
non_null_gender = customers['gender'].notnull()

Checking the rows with null values in 'gender' column just to verify we aren't losing anything important:

In [None]:
null_gender = customers['gender'].isnull()
customer_gender_is_null = customers[null_gender]
customer_gender_is_null.sort_values(by= 'became_member_on', ascending = False).values.tolist()

New dataframe after filtering out the null values:

In [51]:
customers = customers[non_null_gender]

Changing the datatype of column 'became_member_on' to string so that we can extract the date values into a date format of "year-month-day":

In [52]:
customers['became_member_on'] = customers['became_member_on'].astype(str)

extracting the date values and concatening them to the correct date format:

In [170]:
customers['became_member_on'] = customers['became_member_on'].str[:4] + '-' + customers['became_member_on'].str[4:6]+'-'+customers['became_member_on'].str[-2:]

Verifying the change has been successfully done:

In [None]:
customers.values.tolist()

In [57]:
customers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14825 entries, 1 to 16999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       14825 non-null  object 
 1   became_member_on  14825 non-null  object 
 2   gender            14825 non-null  object 
 3   age               14825 non-null  int64  
 4   income            14825 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 694.9+ KB


Finally, download the cleaned data as a new CSV file: 

In [171]:
# customers.to_csv('customers_cleaned.csv')

Next, loading the 'events' file:

In [59]:
events = pd.read_csv('data/events.csv')

In [None]:
print(events.info())
print(events.head())

Checking what are the values in column 'event':

In [156]:
events['event'].unique()

array(['offer received', 'offer viewed', 'transaction', 'offer completed'],
      dtype=object)

Extracting offer received and offer viewed events only and merging them into a separate new dataframe to get all offers received or viewed:

In [157]:
event_OR_filter = events['event'] == 'offer received'
event_OV_filter = events['event'] == 'offer viewed'
event_OR = events[event_OR_filter]
event_OV = events[event_OV_filter]
merged_OROV = pd.merge(event_OR,event_OV, how= 'outer')

This is the new dataframe that contains all offer received or viewed:

In [164]:
offer_received_or_viewed = merged_OROV

Finally, downloading the modified data as a new CSV file: 

In [166]:
offer_received_or_viewed.to_csv('offer_received_or_viewed.csv')

A transaction is considered for an offer only if the time of offer completed and transaction is same for a customer

Extracting offer completed and transaction events only in two separate Dataframes and then based on the criteria that the customer_id and time is same, we combine the two dataframes with an inner join to get all completed offer with associated transactions:

In [148]:
event_OC_filter = events['event'] == 'offer completed'
event_T_filter = events['event'] == 'transaction'
event_OC = events[event_OC_filter]
event_T = events[event_T_filter]
merged_OCT = pd.merge(event_OC, event_T, on=['customer_id', 'time'], how='inner')

This is the new dataframe that contains completed offers with related transaction based on same customer_id and time values:

In [167]:
offer_completed_with_transaction = merged_OCT

Finally, downloading the modified data as a new CSV file: 

In [168]:
offer_completed_with_transaction.to_csv('offer_completed_with_transaction.csv')

Next, loading the 'offers' file:

In [93]:
offers = pd.read_csv('data/offers.csv')

In [101]:
offers

Unnamed: 0,offer_id,offer_type,difficulty,reward,duration,channels
0,ae264e3637204a6fb9bb56bc8210ddfd,bogo,10,10,7,"['email', 'mobile', 'social']"
1,4d5c57ea9a6940dd891ad53e9dbe8da0,bogo,10,10,5,"['web', 'email', 'mobile', 'social']"
2,3f207df678b143eea3cee63160fa8bed,informational,0,0,4,"['web', 'email', 'mobile']"
3,9b98b8c7a33c4b65b9aebfe6a799e6d9,bogo,5,5,7,"['web', 'email', 'mobile']"
4,0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,20,5,10,"['web', 'email']"
5,2298d6c36e964ae4a3e7e9706d1fb8c2,discount,7,3,7,"['web', 'email', 'mobile', 'social']"
6,fafdcd668e3743c1bb461111dcafc2a4,discount,10,2,10,"['web', 'email', 'mobile', 'social']"
7,5a8bc65990b245e5a138643cd4eb9837,informational,0,0,3,"['email', 'mobile', 'social']"
8,f19421c1d4aa40978ebb69ca19b0e20d,bogo,5,5,5,"['web', 'email', 'mobile', 'social']"
9,2906b810c7d4411798c6938adc9daaa5,discount,10,2,7,"['web', 'email', 'mobile']"
