# Part 1: Analyze File Structure


In [1]:
import pandas as pd
import os
import gzip
import shutil
import warnings

# Notebook-wide settings: silence all warnings and widen the pandas display limits.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)  # None = no limit on displayed columns
pd.set_option('display.max_rows', 150)

## Number and Size of Files

First, we read the data directory from a config file: 

In [2]:
# The first line of the 'config' file holds the data directory.
with open('config') as f:
    # readline() keeps the line terminator; strip it (and surrounding blanks)
    # so the value can be used directly as a filesystem path.
    path = f.readline().strip()

Display the files and their sizes:

In [8]:
# One record per entry of the data directory: a placeholder name built from the
# entry's position in the listing ('file<i>.csv', not the real file name) and
# the size on disk rounded to whole KB.
file_records = [
    ('file' + str(i) + '.csv',
     round(os.path.getsize(os.path.join(path, filename)) / 1024))
    for i, filename in enumerate(os.listdir(path))
]

# Build the frame in a single step instead of concatenating inside the loop;
# passing the column names here also keeps an empty directory from failing.
files = pd.DataFrame(file_records, columns=['filename', 'size [KB]'])
files.head()

Unnamed: 0,filename,size [KB]
0,file0.csv,97671
1,file1.csv,97677
2,file2.csv,97681
3,file3.csv,97670
4,file4.csv,97678


In [9]:
# Summarise the inventory: number of entries and their combined size in whole MB.
print(f"Number of Files: {len(files)}")
print(f"Total Size: {int(files['size [KB]'].sum()/1024)} MB")

Number of Files: 237
Total Size: 22416 MB


## Analyze the structure
- we choose a sample file (index 230 in the directory listing)
- decompress it (gzip) to a temporary file
- read the first chunk of 10000 rows


In [30]:
# Scratch file that receives the decompressed sample. Built with os.path.join so
# the separator fits the host OS (on Windows this is still '.\\data\\tmp.csv').
tmp_file = os.path.join('.', 'data', 'tmp.csv')
# NOTE: os.listdir() returns entries in arbitrary order, so index 230 is not
# guaranteed to pick the same file on every machine.
sample_filename = os.listdir(path)[230]

# make sure the directory for the scratch file exists
os.makedirs(os.path.dirname(tmp_file), exist_ok=True)

# unzip to tmp_file
with gzip.open(os.path.join(path, sample_filename)) as f_in, open(tmp_file, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

# Read the ';'-separated file lazily in chunks of 10000 rows; keep the first chunk.
reader = pd.read_csv(tmp_file, sep=';', chunksize=10000)
chunk = next(reader)

Drop the columns in which every value is NA:

In [31]:
# Keep only the columns that hold at least one non-NA value in this chunk.
chunk = chunk.dropna(axis='columns', how='all')

## Some entries
Before showing any rows, we drop the columns that contain personal data.

In [32]:
# Remove personal data before any rows are displayed.
# Step 1: drop every column whose name contains 'user'. The match is
# case-sensitive, so the '*.UserAgent.*' columns are kept.
chunk = chunk.loc[:, ~chunk.columns.str.contains('user')]
# Step 2: drop further columns that are named explicitly.
# errors='ignore' because a listed column may already be missing: the previous
# step removes columns that are entirely NA, which depends on the data.
chunk = chunk.drop(
    columns=['group.Email', 'message.Subject.Unresolved', 'message.Name'],
    errors='ignore',
)
chunk.head()

Unnamed: 0,sentToMTA.Timestamp,sentToMTA.SendoutChannel,skip.Timestamp,skip.Category,bounce.Timestamp,bounce.Category,render.Timestamp,render.GeoLocation.ISOCountryCode,render.GeoLocation.RegionCode,render.GeoLocation.City,render.GeoLocation.Latitude,render.GeoLocation.Longitude,render.UserAgent.Platform,render.UserAgent.OperatingSystemName,render.UserAgent.OperatingSystemVersion,render.UserAgent.ClientName,render.UserAgent.ClientVersion,render.UserAgent.RawString,click.Timestamp,click.Link.Id,click.Link.Category.Id,click.Link.Category.Name,click.Link.Url,click.Link.Type,click.GeoLocation.ISOCountryCode,click.GeoLocation.RegionCode,click.GeoLocation.City,click.GeoLocation.Latitude,click.GeoLocation.Longitude,click.UserAgent.Platform,click.UserAgent.OperatingSystem,click.UserAgent.OperatingSystemVersion,click.UserAgent.ClientName,click.UserAgent.ClientVersion,click.UserAgent.RawString,unsubscribe.Timestamp,group.Id,group.Name,group.Category.Id,group.Category.Name,message.Id,message.Type,message.variation.Id,selection.Id,selection.Name,record.Timestamp,record.Type,record.sentToMTA.Timestamp,sendout.Type,sendout.AddresseeRole,sendout.RecipientType
0,2018-09-15T07:33:20Z,EMAIL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,900291830,France_all,241,General,2700098965,normal,,,,2018-09-15T07:33:20Z,SentToMTA,2018-09-15T07:33:20Z,MAJOR,NORMAL,TO
1,2018-09-04T08:46:36Z,EMAIL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,900291830,France_all,241,General,2700098092,normal,,,,2018-09-04T08:46:36Z,SentToMTA,2018-09-04T08:46:36Z,MAJOR,NORMAL,TO
2,2018-08-29T08:46:44Z,EMAIL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,900291830,France_all,241,General,2700097824,normal,,,,2018-08-29T08:46:44Z,SentToMTA,2018-08-29T08:46:44Z,MAJOR,NORMAL,TO
3,2018-08-25T08:46:01Z,EMAIL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,900291830,France_all,241,General,2700097605,normal,,,,2018-08-25T08:46:01Z,SentToMTA,2018-08-25T08:46:01Z,MAJOR,NORMAL,TO
4,2018-08-20T08:31:09Z,EMAIL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,900291830,France_all,241,General,2700096897,normal,,,,2018-08-20T08:31:09Z,SentToMTA,2018-08-20T08:31:09Z,MAJOR,NORMAL,TO


## Column names and dtypes

In [33]:
# Show the dtype pandas inferred for each remaining column of the sample chunk.
chunk.dtypes

sentToMTA.Timestamp                         object
sentToMTA.SendoutChannel                    object
skip.Timestamp                              object
skip.Category                               object
bounce.Timestamp                            object
bounce.Category                             object
render.Timestamp                            object
render.GeoLocation.ISOCountryCode           object
render.GeoLocation.RegionCode               object
render.GeoLocation.City                     object
render.GeoLocation.Latitude                float64
render.GeoLocation.Longitude               float64
render.UserAgent.Platform                   object
render.UserAgent.OperatingSystemName        object
render.UserAgent.OperatingSystemVersion     object
render.UserAgent.ClientName                 object
render.UserAgent.ClientVersion              object
render.UserAgent.RawString                  object
click.Timestamp                             object
click.Link.Id                  

## Timestamps

Timestamps mark the events
- sentToMTA
- skipped
- bounced
- feedback
- render
- clicked
- forwarded
- unsubscribed
- conversion

We want to check whether these events are mutually exclusive, i.e. whether each record carries the timestamp of only one event.

In [34]:
# For every column whose name contains 'Timestamp', count the rows of the chunk
# that have a value in it. (The former `sum = 0` was unused and shadowed the
# builtin `sum`, so it has been removed.)
timestamps = [
    [col_name, int(chunk[col_name].notna().sum())]
    for col_name in chunk.columns
    if 'Timestamp' in col_name
]

pd_timestamps = pd.DataFrame(timestamps, columns=['event', 'count'])
display(pd_timestamps)
# Total of all per-column counts, to compare against the number of rows.
print('\nsum = ' + str(pd_timestamps['count'].sum()))

Unnamed: 0,event,count
0,sentToMTA.Timestamp,8042
1,skip.Timestamp,214
2,bounce.Timestamp,2
3,render.Timestamp,1385
4,click.Timestamp,349
5,unsubscribe.Timestamp,8
6,record.Timestamp,10000
7,record.sentToMTA.Timestamp,10000



sum = 30000


## Findings
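- every record has `record.Timestamp` and `record.sentToMTA.Timestamp` set (10000 of 10000 each)
- the counts of the remaining timestamps add up to 10000 (8042 + 214 + 2 + 1385 + 349 + 8), which is consistent with exactly one of them being set in each record; a per-record check would be needed to confirm this strictly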