# Importing packages

In this exercise, we will use the **pandas** software library for data manipulation and analysis.

In [None]:
import pandas

# Reading files

First, we define the file names of the CRM data
* **cons_filename** contains the Constituent Information
* **email_filename** contains the Constituent Email Addresses
* **subs_filename** contains the Constituent Subscription Status

In [None]:
# All three CRM extracts sit under the same S3 snapshot folder.
_CRM_SNAPSHOT_URL = 'https://als-hiring.s3.amazonaws.com/fake_data/2020-07-01_17%3A11%3A00/'

cons_filename = _CRM_SNAPSHOT_URL + 'cons.csv'
email_filename = _CRM_SNAPSHOT_URL + 'cons_email.csv'
subs_filename = _CRM_SNAPSHOT_URL + 'cons_email_chapter_subscription.csv'

Then, we read each of those files with the pandas *read_csv* function and show the first rows of each dataframe.

In [26]:
# Fetch the constituent CSV; the first row of the file supplies the column names.
cons_df = pandas.read_csv(
    cons_filename,
    sep=',',
    header=0,
)
cons_rows, cons_cols = cons_df.shape
print("The Constituent Information dataframe contains %d rows and %d columns" % (cons_rows, cons_cols))
cons_df.head()

The Constituent Information dataframe contains 700000 rows and 29 columns


Unnamed: 0,cons_id,prefix,firstname,middlename,lastname,suffix,salutation,gender,birth_dt,title,...,change_password_next_login,consent_type_id,create_dt,create_app,create_user,modified_dt,modified_app,modified_user,status,note
0,1,,,Lee,,MD,,E,,vSkSIzEQJdXnqeTTTXSG,...,0,5958,"Fri, 1983-08-26 06:02:03",1484,6162,"Sun, 2015-12-27 09:28:02",4022,6349,1,
1,2,,,,,II,boFqBKgLlSgEZsFrgCZd,E,"Mon, 2004-11-15",,...,1,4236,"Mon, 1979-03-05 21:08:54",4176,5476,"Tue, 1989-06-20 13:28:57",9010,5698,1,
2,3,,,David,King,,,D,"Fri, 1994-04-08",bxGxufoNzpKvjwNIxgRj,...,1,1263,"Fri, 2008-08-22 19:20:28",4702,8239,"Fri, 2020-06-05 18:13:57",8837,1175,1,
3,4,Ms.,Mark,Gregg,,,,J,,,...,0,949,"Sun, 1984-04-29 11:18:18",7096,7875,"Tue, 2012-01-31 07:26:35",7529,3984,1,
4,5,,,Logan,,DDS,ArvPJuEozHPaqbirovMi,D,"Sun, 2008-08-10",kXKlKCyCBoXUBVejPOVO,...,0,7126,"Thu, 2007-07-19 18:28:09",706,5727,"Tue, 1971-04-27 06:53:53",9356,6062,1,


In [27]:
# Fetch the constituent email CSV; the first row of the file supplies the column names.
email_df = pandas.read_csv(
    email_filename,
    sep=',',
    header=0,
)
email_rows, email_cols = email_df.shape
print("The Constituent Email Addresses dataframe contains %d rows and %d columns" % (email_rows, email_cols))
email_df.head()

The Constituent Email Addresses dataframe contains 1400000 rows and 16 columns


Unnamed: 0,cons_email_id,cons_id,cons_email_type_id,is_primary,email,canonical_local_part,domain,double_validation,create_dt,create_app,create_user,modified_dt,modified_app,modified_user,status,note
0,1,548198,3361,1,xmartinez@vincent.com,,gmail.com,,"Wed, 1994-01-26 23:49:16",4072,9954,"Sat, 2014-04-19 19:10:39",1990,7595,1,
1,2,491137,2474,1,hmiller@haynes.biz,jqCyozTDojYuylQPTHfm,hotmail.com,,"Thu, 1999-12-09 06:18:27",1600,5716,"Sat, 1984-07-14 05:55:27",4686,3248,1,
2,3,413429,5175,1,aaron64@yahoo.com,FCBeBiVoqnnKDWjnllhN,yahoo.com,kRLZexQEwYIMbwFNvQxg,"Wed, 1992-11-18 16:46:27",7358,2685,"Sun, 1995-12-24 13:13:01",3857,7405,1,
3,4,347346,4117,1,wyattvincent@hotmail.com,,gmail.com,zSbfmlqXimGyWVBUGdQg,"Sat, 1983-11-26 16:49:14",881,3444,"Sun, 1975-01-19 14:32:56",8713,7713,1,
4,5,443000,6781,1,tspencer@hotmail.com,VaQIYlKcUkIywkKKEptD,gmail.com,,"Wed, 2000-11-15 13:28:34",5380,5456,"Sun, 1994-03-13 16:38:37",765,8618,1,


In [28]:
# Fetch the subscription-status CSV; the first row of the file supplies the column names.
subs_df = pandas.read_csv(
    subs_filename,
    sep=',',
    header=0,
)
subs_rows, subs_cols = subs_df.shape
print("The Constituent Subscription Status dataframe contains %d rows and %d columns" % (subs_rows, subs_cols))
subs_df.head()

The Constituent Subscription Status dataframe contains 350000 rows and 6 columns


Unnamed: 0,cons_email_chapter_subscription_id,cons_email_id,chapter_id,isunsub,unsub_dt,modified_dt
0,1,332188,1,1,"Sat, 1971-06-12 15:38:44","Thu, 1990-06-28 10:54:20"
1,2,536526,1,1,"Wed, 2006-07-12 01:50:45","Thu, 1979-09-20 06:02:35"
2,3,134711,1,1,"Tue, 1987-01-06 13:05:15","Sun, 1974-03-03 15:11:50"
3,4,660345,1,1,"Sat, 2016-08-06 11:06:09","Wed, 1995-09-13 23:45:03"
4,5,184268,1,1,"Sun, 2000-05-28 02:20:45","Sat, 1983-12-10 08:09:58"


# Exercise 1

First, we do some data manipulation on each of the 3 files. Each resulting variable name carries a 'sub_' prefix to indicate that it holds a subset of the original data.

For the **Constituent Information** dataframe, we keep the following relevant columns:
* **cons_id** is the unique identifier of the constituent
* **source** is the source code of the data
* **create_dt** is the person creation datetime
* **modified_dt** is the person updated datetime

In [8]:
# Columns of the constituent table that the later cells use.
cons_keep_columns = ['cons_id', 'source', 'create_dt', 'modified_dt']
sub_cons_df = cons_df[cons_keep_columns]

For the **Constituent Email Addresses** dataframe, we first keep the primary email addresses and then we select the following relevant columns:
* **cons_email_id** is the unique identifier of the constituent's email
* **cons_id** is the unique identifier of the constituent
* **email** is the constituent's email address

In [None]:
# Keep only rows flagged as the primary address, then narrow to the id and email columns.
is_primary_email = email_df['is_primary'] == 1
sub_email_df = (
    email_df.loc[is_primary_email, ['cons_email_id', 'cons_id', 'email']]
    .reset_index(drop=True)
)

For the **Constituent Subscription Status** dataframe, we are interested in subscriptions with chapter_id = 1 and we select the following relevant columns:
* **cons_email_id** is the unique identifier of the constituent's email
* **cons_email_chapter_subscription_id** is the unique identifier of the constituent's email subscription
* **isunsub** indicates whether the constituent's email address is unsubscribed

In [None]:
# Restrict the subscription table to chapter 1. Note that this rebinds subs_df
# itself, so the unfiltered frame is no longer reachable after this cell runs.
in_chapter_one = subs_df['chapter_id'] == 1
subs_df = subs_df.loc[in_chapter_one].reset_index(drop=True)

# Narrow to the join key, the subscription id and the unsubscribe flag.
subs_keep_columns = ['cons_email_id', 'cons_email_chapter_subscription_id', 'isunsub']
sub_subs_df = subs_df[subs_keep_columns]

In [9]:
# Attach the primary email to each constituent; the left join keeps constituents
# that have no matching email row.
cons_email_df = sub_cons_df.merge(sub_email_df, on='cons_id', how='left')
print(cons_email_df.shape)

# Attach the chapter-1 subscription row to each email, again keeping unmatched rows.
cons_email_subs_df = cons_email_df.merge(sub_subs_df, on='cons_email_id', how='left')
print(cons_email_subs_df.shape)

(700000, 6)
(700000, 8)


In [10]:
# Source column -> output column for the people table; key order sets the column order.
people_columns = {
    "email": "email",
    "source": "code",
    "isunsub": "is_unsub",
    "create_dt": "created_dt",
    "modified_dt": "updated_dt",
}
people_df = cons_email_subs_df[list(people_columns)].rename(columns=people_columns)

In [11]:
# Rows that found no chapter-1 subscription in the left merge have a missing
# is_unsub; fill those with 0.0. Reassigning (rather than inplace=True) keeps
# the cell free of in-place mutation.
people_df = people_df.fillna(value={'is_unsub': 0.0})

# The preview must be the last expression, otherwise the notebook discards it.
people_df.head()

In [12]:
# Raw values look like "Fri, 1983-08-26 06:02:03": abbreviated weekday, date, time.
created_raw = people_df['created_dt']
people_df['created_dt'] = pandas.to_datetime(created_raw, format='%a, %Y-%m-%d %H:%M:%S')

In [13]:
# Raw values look like "Sun, 2015-12-27 09:28:02": abbreviated weekday, date, time.
updated_raw = people_df['updated_dt']
people_df['updated_dt'] = pandas.to_datetime(updated_raw, format='%a, %Y-%m-%d %H:%M:%S')

In [14]:
# Write the people table to disk, leaving out the positional row index.
people_df.to_csv(path_or_buf='people.csv', index=False)

# Exercise 2

In [18]:
# Strip the time of day from the creation timestamp so rows can be bucketed by calendar date.
created_day = people_df['created_dt'].dt.date
people_df['acquisition_date'] = created_day

# Count rows per acquisition date; the result is a Series indexed by date.
acquisition_df = people_df.groupby(by=['acquisition_date']).size()

In [19]:
# Show the number of rows counted for each acquisition date.
acquisition_df

acquisition_date
1970-01-01    34
1970-01-02    38
1970-01-03    32
1970-01-04    39
1970-01-05    44
              ..
2020-06-27    39
2020-06-28    38
2020-06-29    30
2020-06-30    47
2020-07-01    29
Length: 18445, dtype: int64

In [22]:
# acquisition_df is a Series whose index holds the acquisition dates. Writing it
# with index=False alone drops the dates and leaves only the counts in the file,
# so promote the index to a regular column first and write both fields.
acquisition_facts_df = acquisition_df.reset_index(name='acquisitions')
acquisition_facts_df.to_csv('acquisition_facts.csv', index=False)