Importing the Pandas library

In [12]:
import pandas as pd

Upload data from CSV into the DataFrame

In [13]:
events = pd.read_csv('data/events.csv')
events =pd.DataFrame(events)

Get an overview of the dataset

In [14]:
print(events.shape)
print(events.info())
print(events.describe())
print(events.head())

(306534, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   customer_id  306534 non-null  object
 1   event        306534 non-null  object
 2   value        306534 non-null  object
 3   time         306534 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 9.4+ MB
None
                time
count  306534.000000
mean      366.382940
std       200.326314
min         0.000000
25%       186.000000
50%       408.000000
75%       528.000000
max       714.000000
                        customer_id           event  \
0  78afa995795e4d85b5d9ceeca43f5fef  offer received   
1  a03223e636434f42ac4c3df47e8bac43  offer received   
2  e2127556f4f64592b11af22de27a7932  offer received   
3  8ec6ce2a7e7949b1bf142def7d0e0586  offer received   
4  68617ca6246f4fbc85e91a2a49552598  offer received   

                                              value

Remove duplicates and missing values

In [15]:
events = events.drop_duplicates()
events = events.dropna()
print(events.shape)

(306137, 4)


Checking content of the 'event' column

In [16]:
events['event'].unique()

array(['offer received', 'offer viewed', 'transaction', 'offer completed'],
      dtype=object)

Checking content of the 'value' column

In [17]:
events['value'].unique()

array(["{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'}",
       "{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'}",
       "{'offer id': '2906b810c7d4411798c6938adc9daaa5'}", ...,
       "{'amount': 685.07}", "{'amount': 405.04}", "{'amount': 476.33}"],
      dtype=object)

Counting the different types of events:

In [18]:

offer_viewed = events[events['event'].isin(['offer viewed'])]
offer_completed = events[events['event'].isin(['offer completed'])]
transaction = events[events['event'].isin(['transaction'])]
print(offer_viewed['event'].count())
print(offer_completed['event'].count())
print(transaction['event'].count())

57725
33182
138953


There are 4 types of events recorded: offer received, offer viewed, offer completed, and transaction. These events will be separated into 4 separate tables: offer_received, offer_viewed, offer_completed, and transaction

#### Creating & transforming 'offer_received' dataset:
Extracting only the offer id from column 'value' and renaming the column to 'offer id':

In [19]:
offer_received = events[events['event'].isin(['offer received'])]
offer_received['value'] = offer_received['value'].str[14:-2]
offer_received.rename(columns = {'value' : 'offer id'}, inplace = True)
offer_received

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offer_received['value'] = offer_received['value'].str[14:-2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offer_received.rename(columns = {'value' : 'offer id'}, inplace = True)


Unnamed: 0,customer_id,event,offer id,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,9b98b8c7a33c4b65b9aebfe6a799e6d9,0
1,a03223e636434f42ac4c3df47e8bac43,offer received,0b1e1539f2cc45b7b9fa7c272da2e1d7,0
2,e2127556f4f64592b11af22de27a7932,offer received,2906b810c7d4411798c6938adc9daaa5,0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,fafdcd668e3743c1bb461111dcafc2a4,0
4,68617ca6246f4fbc85e91a2a49552598,offer received,4d5c57ea9a6940dd891ad53e9dbe8da0,0
...,...,...,...,...
257882,d087c473b4d247ccb0abfef59ba12b0e,offer received,ae264e3637204a6fb9bb56bc8210ddfd,576
257883,cb23b66c56f64b109d673d5e56574529,offer received,2906b810c7d4411798c6938adc9daaa5,576
257884,6d5f3a774f3d4714ab0c092238f3a1d7,offer received,2298d6c36e964ae4a3e7e9706d1fb8c2,576
257885,9dc1421481194dcd9400aec7c9ae6366,offer received,ae264e3637204a6fb9bb56bc8210ddfd,576


#### Creating & transforming 'offer_viewed' dataset:
Extracting only the offer id from column 'value' and renaming the column to 'offer id':

In [20]:
offer_viewed = events[events['event'].isin(['offer viewed'])]
offer_viewed['value'] = offer_viewed['value'].str[14:-2]
offer_viewed.rename(columns = {'value' : 'offer id'}, inplace = True)
offer_viewed


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offer_viewed['value'] = offer_viewed['value'].str[14:-2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offer_viewed.rename(columns = {'value' : 'offer id'}, inplace = True)


Unnamed: 0,customer_id,event,offer id,time
12650,389bc3fa690240e798340f5a15918d5c,offer viewed,f19421c1d4aa40978ebb69ca19b0e20d,0
12651,d1ede868e29245ea91818a903fec04c6,offer viewed,5a8bc65990b245e5a138643cd4eb9837,0
12652,102e9454054946fda62242d2e176fdce,offer viewed,4d5c57ea9a6940dd891ad53e9dbe8da0,0
12653,02c083884c7d45b39cc68e1314fec56c,offer viewed,ae264e3637204a6fb9bb56bc8210ddfd,0
12655,be8a5d1981a2458d90b255ddc7e0d174,offer viewed,5a8bc65990b245e5a138643cd4eb9837,0
...,...,...,...,...
306441,d56386cf344c4829bbf420d1895dca37,offer viewed,5a8bc65990b245e5a138643cd4eb9837,714
306450,9b51e8797290403b90d09d864dec4b94,offer viewed,3f207df678b143eea3cee63160fa8bed,714
306483,84fb57a7fe8045a8bf6236738ee73a0f,offer viewed,5a8bc65990b245e5a138643cd4eb9837,714
306490,abc4359eb34e4e2ca2349da2ddf771b6,offer viewed,3f207df678b143eea3cee63160fa8bed,714


#### Creating & transforming 'offer_completed' dataset:
Extracting the offer id and reward values from column 'value' and inserting into new columns 'offer id' and 'reward':

In [21]:
offer_completed = events[events['event'].isin(['offer completed'])]
offer_completed['offer id'] = offer_completed['value'].str[14:-15]
offer_completed['reward'] = offer_completed['value'].str[-2]
offer_completed['offer id'] = offer_completed['offer id'].map(lambda x: x.rstrip("'"))
offer_completed = offer_completed.drop(columns = ['value'])

offer_completed # Get an overview of the 'offer_completed' dataset

# offer_completed['value'].values.tolist() / To see entire content of the 'value' column

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offer_completed['offer id'] = offer_completed['value'].str[14:-15]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offer_completed['reward'] = offer_completed['value'].str[-2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offer_completed['offer id'] = offer_completed['offer id'].map(lambda x: x.rst

Unnamed: 0,customer_id,event,time,offer id,reward
12658,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,offer completed,0,2906b810c7d4411798c6938adc9daaa5,2
12672,fe97aa22dd3e48c8b143116a8403dd52,offer completed,0,fafdcd668e3743c1bb461111dcafc2a4,2
12679,629fc02d56414d91bca360decdfa9288,offer completed,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,5
12692,676506bad68e4161b9bbaffeb039626b,offer completed,0,ae264e3637204a6fb9bb56bc8210ddfd,0
12697,8f7dd3b2afe14c078eb4f6e6fe4ba97d,offer completed,0,4d5c57ea9a6940dd891ad53e9dbe8da0,0
...,...,...,...,...,...
306475,0c027f5f34dd4b9eba0a25785c611273,offer completed,714,2298d6c36e964ae4a3e7e9706d1fb8c2,3
306497,a6f84f4e976f44508c358cc9aba6d2b3,offer completed,714,2298d6c36e964ae4a3e7e9706d1fb8c2,3
306506,b895c57e8cd047a8872ce02aa54759d6,offer completed,714,fafdcd668e3743c1bb461111dcafc2a4,2
306509,8431c16f8e1d440880db371a68f82dd0,offer completed,714,fafdcd668e3743c1bb461111dcafc2a4,2


#### Creating & transforming 'transaction_made' dataset:
Extracting the amount values from column 'value' and renaming the column to 'transaction_amount':

In [22]:
transaction_made = events[events['event'].isin(['transaction'])]
transaction_made = transaction_made[transaction_made['value'].str.len() < 18]
transaction_made['value'] = transaction_made['value'].str[10:-1]
transaction_made.rename(columns = {'value' : 'Amount'}, inplace = True)
transaction_made['Amount']= transaction_made['Amount'].astype(float)
transaction_made['Amount']= transaction_made['Amount'].round(2)

In [25]:
transaction_made

Unnamed: 0,customer_id,event,Amount,time
12657,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,transaction,34.56,0
12659,54890f68699049c2a04d415abc25e717,transaction,13.23,0
12670,b2f1cd155b864803ad8334cdf13c4bd2,transaction,19.51,0
12671,fe97aa22dd3e48c8b143116a8403dd52,transaction,18.97,0
12678,629fc02d56414d91bca360decdfa9288,transaction,33.90,0
...,...,...,...,...
306526,24f56b5e1849462093931b164eb803b5,transaction,22.64,714
306528,5ca2620962114246ab218fc648eb3934,transaction,2.20,714
306530,68213b08d99a4ae1b0dcb72aebd9aa35,transaction,9.53,714
306531,a00058cf10334a308c68e7631c529907,transaction,3.61,714


In [24]:
offer_received.to_csv('event_offer_received.csv')
offer_viewed.to_csv('event_offer_viewed.csv')
offer_completed.to_csv('event_offer_completed.csv')
transaction_made.to_csv('event_transaction.csv')

ValueError: Cannot set a DataFrame with multiple columns to the single column combined_column