In [35]:
import pandas as pd
import sys

sys.path.insert(1, '../scripts/')
from s3_support import *

In [36]:
# Locate the raw Task export via the s3_support helper and load it.
# NOTE(review): get_file_url is assumed to return something pd.read_csv can
# open (URL or path) -- confirm against s3_support.
url = get_file_url("sfc-export", "Task.csv")
df = pd.read_csv(
    url,
    low_memory=False,       # parse the file in one pass instead of in chunks
    encoding="ISO-8859-1",  # the export is read as ISO-8859-1
)

In [37]:
# Columns removed from the Task frame before any analysis below.
drop_cols = ["CallDurationInSeconds", "CallType", "CallDisposition", "CallObject", "RecurrenceActivityId", 
             "IsRecurrence", "RecurrenceStartDateOnly", "RecurrenceEndDateOnly", "RecurrenceTimeZoneSidKey", 
             "RecurrenceType", "RecurrenceInterval", "RecurrenceDayOfWeekMask", "RecurrenceDayOfMonth", 
             "RecurrenceInstance", "RecurrenceMonthOfYear", "RecurrenceRegeneratedType", "bizible2__BizibleId__c", 
             "bizible2__Bizible_Touchpoint_Date__c", "Import2_Id__c", "IsDeleted", "IsArchived"]

# Rebind instead of mutating in place so the cell can be re-run safely: the
# previous in-place drop raised KeyError on a second execution because the
# columns were already gone. errors="ignore" skips any column that is absent.
df = df.drop(columns=drop_cols, errors="ignore")

In [38]:
# Quick size check after dropping columns: row count, column count, column names.
n_rows, n_cols = df.shape
print("{} rows".format(n_rows))
print("Columns ({}):".format(n_cols))
print(df.columns.tolist())

534469 rows
Columns (26):
['Id', 'WhoId', 'WhatId', 'WhoCount', 'WhatCount', 'Subject', 'ActivityDate', 'Status', 'Priority', 'OwnerId', 'Description', 'Type', 'AccountId', 'IsClosed', 'CreatedDate', 'CreatedById', 'LastModifiedDate', 'LastModifiedById', 'SystemModstamp', 'EmailMessageId', 'ActivityOriginType', 'ReminderDateTime', 'IsReminderSet', 'CompletedDateTime', 'Time_Zone__c', 'Activity_Type__c']


In [39]:
# Identifier columns that are left out of the summaries below.
ids_to_ignore = ["Id", "WhoId", "WhatId", "EmailMessageId", "AccountId"]
# Candidate grouping columns (defined here, not used in this cell).
group_columns = ["Status", "Priority", "OwnerId", "Type"]
# TODO: add CompletedDateTime - CreatedDate to get an approximate duration

# Summary statistics for every column that is not an identifier.
df.loc[:, ~df.columns.isin(ids_to_ignore)].describe()

Unnamed: 0,WhoCount,WhatCount,IsClosed,ActivityOriginType,IsReminderSet
count,534469.0,534469.0,534469.0,147384.0,534469.0
mean,0.943914,0.165686,0.993895,2.669951,0.00182
std,0.259145,0.371799,0.077896,1.963318,0.042628
min,0.0,0.0,0.0,1.0,0.0
25%,1.0,0.0,1.0,1.0,0.0
50%,1.0,0.0,1.0,1.0,0.0
75%,1.0,0.0,1.0,5.0,0.0
max,22.0,1.0,1.0,5.0,1.0


In [40]:
# Distinct raw values of Task.Type (including NaN), in order of first appearance.
pd.unique(df['Type'])

array([nan, 'Pre-Demo Follow Up', 'Initial Contact',
       'Post-Demo Follow Up', 'Web Demo Completed', 'Live Demo Completed',
       'Demo Scheduled', 'Email', 'Call', 'Lead Qualification',
       'Conversation', 'Demo/Meeting', '60_Day_Follow_Up_Call',
       'Phone Call', 'COS - 45 Day Call', 'Training'], dtype=object)

In [41]:
# Every non-identifier column; printed so the reader can see what was considered.
cols_to_look_at = [c for c in df.columns if c not in ids_to_ignore]

print("looking at columns: {}".format(cols_to_look_at))

# Average the numeric columns per Type.
# - The grouping key is excluded from the selection so it is not aggregated
#   against itself.
# - numeric_only=True is passed explicitly: the selection contains text/date
#   columns, and pandas >= 2.0 raises TypeError on them instead of silently
#   dropping them as older versions did.
group_key = 'Type'
value_cols = [c for c in cols_to_look_at if c != group_key]
df.groupby(group_key)[value_cols].mean(numeric_only=True).reset_index()

looking at columns: ['WhoCount', 'WhatCount', 'Subject', 'ActivityDate', 'Status', 'Priority', 'OwnerId', 'Description', 'Type', 'IsClosed', 'CreatedDate', 'CreatedById', 'LastModifiedDate', 'LastModifiedById', 'SystemModstamp', 'ActivityOriginType', 'ReminderDateTime', 'IsReminderSet', 'CompletedDateTime', 'Time_Zone__c', 'Activity_Type__c']


Unnamed: 0,Type,WhoCount,WhatCount,IsClosed,ActivityOriginType,IsReminderSet
0,60_Day_Follow_Up_Call,0.349127,0.935162,0.900249,5.0,0.002494
1,COS - 45 Day Call,1.0,1.0,0.0,,0.0
2,Call,0.835058,0.27335,0.984445,4.982759,0.0087
3,Conversation,0.812317,0.371457,0.987292,5.0,0.008798
4,Demo Scheduled,0.901425,0.750594,0.95962,4.942363,0.005938
5,Demo/Meeting,0.574257,0.787129,0.985149,5.0,0.009901
6,Email,0.661006,0.671195,0.977287,3.269294,0.009552
7,Initial Contact,0.87984,0.592123,0.996662,2.172982,0.000668
8,Lead Qualification,0.970455,0.067045,0.99375,4.721739,0.003409
9,Live Demo Completed,0.793103,0.844828,0.982759,5.0,0.017241


# Cleaning up Task.Type

In [42]:
# How many tasks have no Type at all, followed by the frequency of each Type.
print("Is NA: {}".format(df['Type'].isna().sum()))
df['Type'].value_counts()

Is NA: 419874


Call                     54710
Pre-Demo Follow Up       33352
Post-Demo Follow Up      13696
Email                     4711
Web Demo Completed        2339
Lead Qualification        1760
Initial Contact           1498
Conversation              1023
Demo Scheduled             842
60_Day_Follow_Up_Call      401
Demo/Meeting               202
Live Demo Completed         58
Training                     1
COS - 45 Day Call            1
Phone Call                   1
Name: Type, dtype: int64

In [43]:
# Distribution of Status within each Type.
df['Status'].groupby(df['Type']).value_counts()

Type                   Status     
60_Day_Follow_Up_Call  Completed        361
                       Open              40
COS - 45 Day Call      Open               1
Call                   Completed      53859
                       Open             842
                       Not Started        9
Conversation           Completed       1010
                       Open              12
                       Not Started        1
Demo Scheduled         Completed        808
                       Open              34
Demo/Meeting           Completed        199
                       Open               3
Email                  Completed       4604
                       Open             106
                       Not Started        1
Initial Contact        Completed       1493
                       Open               5
Lead Qualification     Completed       1749
                       Open              11
Live Demo Completed    Completed         57
                       Open              

In [44]:
# Replace a missing Type with the literal placeholder 'None', which infer_type keys on.
df['Type'] = df['Type'].where(df['Type'].notna(), 'None')

# Existing Type values that are folded into a canonical label.
_TYPE_ALIASES = {
    'Web Demo Completed': 'Demo Completed',
    'Live Demo Completed': 'Demo Completed',
    'Demo/Meeting': 'Demo Completed',
    'Phone Call': 'Call',
    'Conversation': 'Meeting',
}

# Placeholder string that stands for "no Type recorded".
_MISSING_TYPE = 'None'

# Ordered subject rules: (label, case-sensitive phrases, lower-case phrases).
# The first rule with any matching phrase wins, so the order is significant
# (e.g. any subject containing "email" is labelled Email before the
# Prepared Materials phrases are considered).
# 'Level Up Your Social' is intentionally absent: it is subsumed by 'Level Up'.
_SUBJECT_RULES = [
    ('One Pager Campaign',
     ('One-Pager',),
     ()),
    ('Lead Submitted Form',
     ('Submitted Form', 'Contact Us', 'Received Your Message', 'Request Web Lead'),
     ()),
    ('Email',
     ('Thanks for your interest', 'Qgiv Sign-Up Link', 'New Customer Inquiry'),
     ('email',)),
    ('Prepared Materials',
     ('Mobile Field Guide', 'Level Up', 'How One Nonprofit Exceed', 'Tips for Managing',
      'Sabotaging Your Peer', 'Mobile Fundraising Field '),
     ('autism nonprofit boosts donations', '$10 limit on text giving', 'infographic',
      'ebook', 'case study', 'how can we help')),
    ('Recovery',
     ('Are you still interested', 'Sign-up Reminder'),
     ()),
    ('Initial Contact',
     (),
     ('contact prospect',)),
    ('Call',
     (),
     ('call', 'spoke to')),
]


def _infer_type_from_subject(subject):
    """Return the label implied by a task subject, or None when no rule matches."""
    subj = str(subject)          # a missing subject becomes 'nan'/'None' and matches nothing
    subj_lower = subj.lower()    # lower-cased once for all case-insensitive phrases

    for label, exact_phrases, lowered_phrases in _SUBJECT_RULES:
        if any(p in subj for p in exact_phrases) or any(p in subj_lower for p in lowered_phrases):
            return label

    # Last resort, same label as the final rule: treat "follow-up" and
    # "follow up" alike by reading hyphens as spaces.
    if 'follow up' in subj_lower.replace('-', ' '):
        return 'Call'
    return None


def infer_type(r):
    """Normalise a task's Type, inferring it from the Subject when it is missing.

    Parameters
    ----------
    r : mapping-like (e.g. a DataFrame row)
        Must provide ``r['Type']`` and ``r['Subject']``.

    Returns
    -------
    The canonical label for a known alias, a label inferred from the subject
    when Type is missing (the string 'None', NaN or None), and otherwise the
    original ``r['Type']`` value unchanged.
    """
    task_type = r['Type']

    canonical = _TYPE_ALIASES.get(task_type)
    if canonical is not None:
        return canonical

    # pd.isna is evaluated first so that pd.NA is never used in a boolean
    # comparison. Accepting NaN/None here means the function does not depend
    # on a prior fillna('None') having been run.
    if pd.isna(task_type) or task_type == _MISSING_TYPE:
        inferred = _infer_type_from_subject(r['Subject'])
        if inferred is not None:
            return inferred

    return task_type

# Re-label every task row by row; infer_type receives each row as a Series.
df['Type'] = df.apply(infer_type, axis='columns')

In [45]:
# Frequency of each Type after clean-up, most common first.
df['Type'].value_counts(sort=True, ascending=False)

Email                    403295
Call                      67350
Pre-Demo Follow Up        33352
Post-Demo Follow Up       13696
Prepared Materials         3491
Demo Completed             2599
None                       2285
Lead Qualification         1760
Lead Submitted Form        1571
Initial Contact            1568
One Pager Campaign         1162
Meeting                    1023
Demo Scheduled              842
60_Day_Follow_Up_Call       401
Recovery                     72
Training                      1
COS - 45 Day Call             1
Name: Type, dtype: int64

In [46]:
# Inspect the last 50 tasks that still carry the 'None' placeholder, to spot
# subject patterns the rules do not cover yet.
df.loc[df['Type'] == 'None', ['Subject', 'Description', 'Type']].tail(50)

Unnamed: 0,Subject,Description,Type
510331,check on application,,
510363,respond to vm,,
510385,inbound vm,left a message curious to hear what I have to ...,
510415,Web Meeting,He was very happy with the Hobnob Demonstratio...,
510424,delete short code,,
510434,FU if no response,Regarding larger corporate entity Sisters of C...,
510450,check on application,,
510488,In Person Meeting,They are really excited about the text giving ...,
510502,Signup FU,If no repsonse.,
510517,inbound vm,Left a vm stating that he is interested in tex...,


In [47]:
# Frequency of each task Status, most common first.
df['Status'].value_counts(sort=True, ascending=False)

Completed      531206
Open             3250
Not Started        13
Name: Status, dtype: int64

In [50]:
# Columns kept in the cleaned export.
cols = [
    'Id', 'WhoId', 'WhatId', 'Subject',
    'OwnerId', 'Description', 'Type', 'AccountId',
    'CreatedDate', 'CreatedById', 'SystemModstamp',
]
# Hand the reduced frame to the s3_support helper for writing.
# NOTE(review): the bucket/key meaning of the first two arguments is assumed
# from the captured "uploading to S3" output -- confirm against s3_support.
save_dataframe_to_file("sfc-export", "tasks.clean.csv", df.loc[:, cols])

uploading to S3
Done
