# Part 1: Analyze File Structure


In [1]:
import pandas as pd
import os
import gzip
import shutil
import warnings

# Notebook-wide configuration.
# Suppress every warning so library warnings do not clutter the cell outputs.
warnings.filterwarnings(action='ignore')
# Show all columns of wide frames and up to 150 rows of long ones.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)

## Number and Size of Files

First, we read the data directory from a config file: 

In [2]:
# Read the data directory from the first line of the config file.
# readline() keeps the trailing line break, which would otherwise become part
# of the directory name, so surrounding whitespace is stripped.
with open('config') as f:
    path = f.readline().strip()

Display the files and their sizes:

In [4]:
# Do not truncate the long file names when the frame is rendered.
pd.set_option('display.max_colwidth', None)

# Collect one (filename, size in KB) record per directory entry and build the
# frame once; concatenating inside the loop copies the whole frame on every
# iteration. Passing the column names up front also keeps an empty directory
# from failing on the column assignment.
records = []
for filename in os.listdir(path):
    full_path = os.path.join(path, filename)
    records.append((filename, round(os.path.getsize(full_path) / 1024)))

files = pd.DataFrame(records, columns=['filename', 'size [KB]'])
files.head()

Unnamed: 0,filename,size [KB]
0,SAS-Full-Raw-Data-Export-SCO-2015-generic-28060-2019_08_13_16_39_17.100.csv.gz,97671
1,SAS-Full-Raw-Data-Export-SCO-2015-generic-28060-2019_08_13_16_39_17.10.csv.gz,97677
2,SAS-Full-Raw-Data-Export-SCO-2015-generic-28060-2019_08_13_16_39_17.101.csv.gz,97681
3,SAS-Full-Raw-Data-Export-SCO-2015-generic-28060-2019_08_13_16_39_17.102.csv.gz,97670
4,SAS-Full-Raw-Data-Export-SCO-2015-generic-28060-2019_08_13_16_39_17.103.csv.gz,97678


In [7]:
# Summarise the listing: number of entries and their combined size,
# converted from KB to whole MB.
file_count = len(files)
total_size_mb = int(files['size [KB]'].sum() / 1024)
print("Number of Files: {}".format(file_count))
print("Total Size: {} MB".format(total_size_mb))

Number of Files: 237
Total Size: 22416 MB


## Analyze the structure
- we choose a sample file (nr 230 in the list)
- unzip it to a temporary file
- read the first chunk of 10000 entries


In [15]:
# Temporary location for the decompressed sample. Built with os.path.join so
# the separator matches the platform instead of hard-coding backslashes.
tmp_file = os.path.join('.', 'data', 'tmp.csv')
# NOTE(review): os.listdir returns entries in arbitrary order, so index 230 is
# not guaranteed to select the same file on every machine - confirm whether a
# fixed file name or a sorted listing is wanted here.
sample_filename = os.listdir(path)[230]

# Make sure the target directory exists before writing into it.
os.makedirs(os.path.dirname(tmp_file), exist_ok=True)

# unzip to tmp_file
with gzip.open(os.path.join(path, sample_filename)) as f_in, open(tmp_file, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

# Read lazily in chunks of 10000 rows and keep only the first chunk.
reader = pd.read_csv(tmp_file, sep=';', chunksize=10000)
chunk = next(reader)

## column names

In [16]:
chunk.dtypes

user.Id                                                    int64
user.Email                                                object
user.MobileNumber                                        float64
user.FaxNumber                                           float64
user.Identifier                                           object
sentToMTA.Timestamp                                       object
sentToMTA.ExternalTransactionId                          float64
sentToMTA.SendoutChannel                                  object
skip.Timestamp                                            object
skip.Category                                             object
skip.ExternalTransactionId                               float64
bounce.Timestamp                                          object
bounce.Category                                           object
feedback.Timestamp                                       float64
feedback.Category                                        float64
render.Timestamp         

## some entries...
But first, we drop columns that contain personal data.

In [17]:
# Remove directly identifying columns before showing any rows.
# errors='ignore' keeps the cell idempotent: `chunk` is overwritten here, so
# re-running the cell would otherwise raise a KeyError for the columns that
# were already dropped.
# NOTE(review): the rendered rows still show other user.* fields (e.g.
# user.FirstName, user.DateOfBirth) - confirm whether these should be dropped
# as well before sharing the notebook.
chunk = chunk.drop(
    columns=['user.Email', 'user.MobileNumber', 'user.LastName'],
    errors='ignore',
)
chunk.head()

Unnamed: 0,user.Id,user.FaxNumber,user.Identifier,sentToMTA.Timestamp,sentToMTA.ExternalTransactionId,sentToMTA.SendoutChannel,skip.Timestamp,skip.Category,skip.ExternalTransactionId,bounce.Timestamp,bounce.Category,feedback.Timestamp,feedback.Category,render.Timestamp,render.GeoLocation.ISOCountryCode,render.GeoLocation.RegionCode,render.GeoLocation.City,render.GeoLocation.Latitude,render.GeoLocation.Longitude,render.UserAgent.Platform,render.UserAgent.OperatingSystemName,render.UserAgent.OperatingSystemVersion,render.UserAgent.ClientName,render.UserAgent.ClientVersion,render.UserAgent.RawString,click.Timestamp,click.Link.Id,click.Link.Category.Id,click.Link.Category.Name,click.Link.Url,click.Link.SystemLinkType,click.Link.Type,click.GeoLocation.ISOCountryCode,click.GeoLocation.RegionCode,click.GeoLocation.City,click.GeoLocation.Latitude,click.GeoLocation.Longitude,click.UserAgent.Platform,click.UserAgent.OperatingSystem,click.UserAgent.OperatingSystemVersion,click.UserAgent.ClientName,click.UserAgent.ClientVersion,click.UserAgent.RawString,unsubscribe.Timestamp,forward.Timestamp,conversion.cp.Timestamp,conversion.cp.Id,conversion.cp.Name,conversion.cp.OrderId,conversion.cp.ItemCount,conversion.cp.Amount,conversion.cp.RequestParameters,group.Id,group.Name,group.Email,group.Category.Id,group.Category.Name,message.Id,message.Name,message.Category.Id,message.Category.Name,message.Subject.Unresolved,message.ExternalId,message.Type,message.variation.Id,selection.Id,selection.Name,record.Timestamp,record.Type,record.sentToMTA.Timestamp,record.ExternalTransactionId,sendout.Type,sendout.AddresseeRole,sendout.RecipientType,user.DateOfBirth,user.FirstName,user.ISOCountryCode,user.ISOLanguageCode,user.Nickname,user.PartnerId,user.Source,user.TimeZone,user.Title,user.ZipCode,user.CustomAttribute['BeSCity'],user.CustomAttribute['BeSHome'],user.CustomAttribute['BeSHome2'],user.CustomAttribute['BeSHome3'],user.CustomAttribute['BeSStreet'],user.CustomAttribute['BesZip'],user.Custo
mAttribute['BIC'],user.CustomAttribute['City'],user.CustomAttribute['Coupon'],user.CustomAttribute['Coupon02'],user.CustomAttribute['IBAN'],user.CustomAttribute['Last_Open_Any_Newsletter_Date'],user.CustomAttribute['PartnerID'],user.CustomAttribute['ProductID'],user.CustomAttribute['SalesOrganization'],user.CustomAttribute['SCSMembernumber'],user.CustomAttribute['SCS_Boutique'],user.CustomAttribute['SCS_Boutique_City'],user.CustomAttribute['SCS_Boutique_Name1'],user.CustomAttribute['SCS_Boutique_Name2'],user.CustomAttribute['SCS_Boutique_Street'],user.CustomAttribute['SCS_Due_Date'],user.CustomAttribute['SCS_Entry_Date'],user.CustomAttribute['SCS_Expiry_Dat'],user.CustomAttribute['SCS_Magazine_Language_ISO'],user.CustomAttribute['SSFCardNumber'],user.CustomAttribute['Street'],user.CustomAttribute['Street2'],user.CustomAttribute['Street3'],user.CustomAttribute['Telephone'],user.CustomAttribute['TitleKey']
0,9225239026,,,2018-09-15T07:33:20Z,,EMAIL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,900291830,France_all,fr_all@newsletter.swarovski-newsletter.com,241,General,2700098965,180915_fr_FR_FW18_RemixNecklaces_GwPPwP_All_no_TestN,,,Boîte à bijoux Remix offerte │Nouveautés : zoom sur les colliers,,normal,,,,2018-09-15T07:33:20Z,SentToMTA,2018-09-15T07:33:20Z,,MAJOR,NORMAL,TO,1986-06-05T02:00:00.000+0200,Tan,FR,fr,,,i,,2.0,,,,,,,,,,BD18Q2_FR_JY8YP4,,,2017-12-22T06:18:59Z,C032808108,,,,,,,,,,,,,,,,,,
1,9225239026,,,2018-09-04T08:46:36Z,,EMAIL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,900291830,France_all,fr_all@newsletter.swarovski-newsletter.com,241,General,2700098092,180904_fr_FR_FW18_RemixCharms_GwPPwP_All_no_TestN,,,Boîte à bijoux offerte ! Comment personnaliser votre style cette saison,,normal,,,,2018-09-04T08:46:36Z,SentToMTA,2018-09-04T08:46:36Z,,MAJOR,NORMAL,TO,1986-06-05T02:00:00.000+0200,Tan,FR,fr,,,i,,2.0,,,,,,,,,,BD18Q2_FR_JY8YP4,,,2017-12-22T06:18:59Z,C032808108,,,,,,,,,,,,,,,,,,
2,9225239026,,,2018-08-29T08:46:44Z,,EMAIL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,900291830,France_all,fr_all@newsletter.swarovski-newsletter.com,241,General,2700097824,180829_fr_FR_FW18_RemixNewCollection_GwPPwP_All_no_TestN,,,Boîte à bijoux offerte ! Découvrez les nouveautés Swarovski Remix Collection,,normal,,,,2018-08-29T08:46:44Z,SentToMTA,2018-08-29T08:46:44Z,,MAJOR,NORMAL,TO,1986-06-05T02:00:00.000+0200,Tan,FR,fr,,,i,,2.0,,,,,,,,,,BD18Q2_FR_JY8YP4,,,2017-12-22T06:18:59Z,C032808108,,,,,,,,,,,,,,,,,,
3,9225239026,,,2018-08-25T08:46:01Z,,EMAIL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,900291830,France_all,fr_all@newsletter.swarovski-newsletter.com,241,General,2700097605,180825_fr_FR_FW18_IconicUGC_None_All_no_TestN,,,Découvrez l'Instagram de Swarovski,,normal,,,,2018-08-25T08:46:01Z,SentToMTA,2018-08-25T08:46:01Z,,MAJOR,NORMAL,TO,1986-06-05T02:00:00.000+0200,Tan,FR,fr,,,i,,2.0,,,,,,,,,,BD18Q2_FR_JY8YP4,,,2017-12-22T06:18:59Z,C032808108,,,,,,,,,,,,,,,,,,
4,9225239026,,,2018-08-20T08:31:09Z,,EMAIL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,900291830,France_all,fr_all@newsletter.swarovski-newsletter.com,241,General,2700096897,180820_fr_FR_Outlet18_Outlet2H_Outlet_All_no_TestN,,,OUTLET / Trouvez le look estival parfait,,normal,,,,2018-08-20T08:31:09Z,SentToMTA,2018-08-20T08:31:09Z,,MAJOR,NORMAL,TO,1986-06-05T02:00:00.000+0200,Tan,FR,fr,,,i,,2.0,,,,,,,,,,BD18Q2_FR_JY8YP4,,,2017-12-22T06:18:59Z,C032808108,,,,,,,,,,,,,,,,,,


## Timestamps

Timestamps mark the events
- sentToMTA
- skipped
- bounced
- feedback
- render
- clicked
- forwarded
- unsubscribed
- conversion

We want to check if these events are disjoint

In [18]:
# Count, for every column whose name contains 'Timestamp', how many records of
# the chunk have a value in it. Series.count() counts the non-missing entries
# directly, without building a filtered copy of the frame.
# (The previous unused `sum = 0` assignment was removed because it shadowed
# the builtin sum() for the rest of the notebook.)
timestamps = [
    [col_name, int(chunk[col_name].count())]
    for col_name in chunk.columns
    if 'Timestamp' in col_name
]

pd_timestamps = pd.DataFrame(timestamps, columns=['event', 'count'])
display(pd_timestamps)
print('\nsum = ' + str(pd_timestamps['count'].sum()))

Unnamed: 0,event,count
0,sentToMTA.Timestamp,8042
1,skip.Timestamp,214
2,bounce.Timestamp,2
3,feedback.Timestamp,0
4,render.Timestamp,1385
5,click.Timestamp,349
6,unsubscribe.Timestamp,8
7,forward.Timestamp,0
8,conversion.cp.Timestamp,0
9,record.Timestamp,10000



sum = 30000


## Findings
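- every record has `record.Timestamp` and `record.sentToMTA.Timestamp` set (10000 each, the chunk size)
- the counts of the remaining timestamps add up to 10000 as well, which suggests that exactly one of the remaining Timestamps is set in each record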