In [110]:
import pandas as pd
import sys

sys.path.insert(1, '../scripts/')
from s3_support import *

# Load Data From S3

In [111]:
# Resolve a URL for "Event.csv" under "sfc-export" (get_file_url is presumably
# provided by the s3_support wildcard import - confirm), then load it.
url = get_file_url("sfc-export", "Event.csv")

# low_memory=False has pandas parse the file in one pass rather than in chunks.
df = pd.read_csv(
    url,
    encoding="ISO-8859-1",
    low_memory=False,
)

# Drop Irrelevant Columns

In [112]:
# Columns removed before analysis (location/privacy flags, recurrence and
# reminder settings, Bizible fields).
drop_cols = [
    "Location", "IsPrivate", "ShowAs", "IsDeleted", "IsChild", "IsRecurrence",
    "RecurrenceStartDateTime", "RecurrenceEndDateOnly", "RecurrenceTimeZoneSidKey",
    "RecurrenceType", "RecurrenceInterval", "RecurrenceDayOfWeekMask",
    "RecurrenceDayOfMonth", "RecurrenceInstance", "RecurrenceMonthOfYear",
    "ReminderDateTime", "IsReminderSet", "RecurrenceActivityId",
    "bizible2__BizibleId__c", "bizible2__Bizible_Touchpoint_Date__c",
]

# Reassign instead of dropping in place: with inplace=True this cell could only
# run once per kernel, because a second run raised KeyError for columns that were
# already gone. errors="ignore" skips any listed column that is not present.
df = df.drop(columns=drop_cols, errors="ignore")

# Unused exploration, kept for reference:
# df['CreatedDate'] = pd.to_datetime(df['CreatedDate'])
# after_2016 = df[df['CreatedDate'].dt.year >= 2016]
# len(after_2016)

# Exploratory Analysis

In [113]:
# Print the frame's dimensions and column names; the describe() summary is the
# cell's displayed result.
row_count, column_count = df.shape
print("{} rows".format(row_count))
print("Columns ({}):".format(column_count))
print(df.columns.tolist())
df.describe()

43663 rows
Columns (26):
['Id', 'WhoId', 'WhatId', 'WhoCount', 'WhatCount', 'Subject', 'IsAllDayEvent', 'ActivityDateTime', 'ActivityDate', 'DurationInMinutes', 'Description', 'AccountId', 'OwnerId', 'Type', 'IsGroupEvent', 'GroupEventType', 'CreatedDate', 'CreatedById', 'LastModifiedDate', 'LastModifiedById', 'SystemModstamp', 'IsArchived', 'ProposedEventTimeframe', 'Time_Zone__c', 'Import2_Id__c', 'Activity_Type__c']


Unnamed: 0,WhoCount,WhatCount,IsAllDayEvent,DurationInMinutes,IsGroupEvent,GroupEventType,IsArchived
count,43663.0,43663.0,43663.0,43663.0,78.0,78.0,43663.0
mean,0.810228,0.189932,0.000115,5.675515,0.948718,0.948718,0.913359
std,0.39428,0.392252,0.010701,88.439409,0.31817,0.31817,0.281311
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,1.0,1.0,1.0
50%,1.0,0.0,0.0,0.0,1.0,1.0,1.0
75%,1.0,0.0,0.0,0.0,1.0,1.0,1.0
max,2.0,1.0,1.0,1440.0,2.0,2.0,1.0


In [114]:
def print_column_descriptions(frame=None, ids_to_ignore=("Id", "WhoId", "WhatId")):
    """Print the name and ``describe()`` summary of each non-ID column.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to summarise. When omitted, the notebook-level ``df`` is used,
        which is what the original zero-argument call did.
    ids_to_ignore : collection of str, optional
        Column names to skip; defaults to the three identifier columns.

    Returns
    -------
    None
        The summaries are written to standard output only.
    """
    if frame is None:
        # Looked up at call time so the function follows whatever `df` currently is.
        frame = df

    for c in frame.columns:
        if c in ids_to_ignore:
            continue
        print(c)
        print(frame[c].describe())
        print("\n")

# print_column_descriptions()

In [115]:
# df['Description']

In [116]:
# Frequency of each tagged Type; dropna=True (the default) leaves untagged rows out.
df["Type"].value_counts(dropna=True)

Call            35
Demo            24
Meeting         10
Email            1
Cancellation     1
Name: Type, dtype: int64

In [117]:
# df['Subject']

In [118]:
# Share of each Type, with missing values counted under the label "None".
(
    df["Type"]
    .fillna("None")
    .value_counts(normalize=True)
)

None            0.998374
Call            0.000802
Demo            0.000550
Meeting         0.000229
Email           0.000023
Cancellation    0.000023
Name: Type, dtype: float64

In [119]:
# Last 25 descriptions among the rows whose Type is missing.
df.loc[df["Type"].isna(), "Description"].tail(25)

43638                                                                             Spoke with her at their Bike Denver membership drive. She seemed very interested and asked for an email. They are currently using Acceptiva and aren't very happy with the service. Also, they will be redoing their website and would like more information.  I should offer them a discount on the startup cost, maybe 1/2 or full off. See how it goes. Others from Bike Denver were also interested in Qgiv, they were very receptive.
43639                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

_@TODO_ Infer event types from the Subject/Description text. More than 99% of events have no Type set, but many of them carry descriptive text.

Task Types:
   - Email
   - Call                      67301
   - Pre-Demo Follow Up        33352
   - Post-Demo Follow Up       13696
   - Prepared Materials         3491
   - Web Demo Completed         2339
   - None                       2333
   - Lead Qualification         1760
   - Lead Submitted Form        1571
   - Initial Contact            1568
   - One Pager Campaign         1162
   - Conversation               1023 (Should this be merged into Call?)
   - Demo Scheduled              842
   - 60_Day_Follow_Up_Call       401
   - Demo/Meeting                202
   - Recovery                     72
   - Live Demo Completed          58
   - Phone Call                    1
   - Training                      1
   - COS - 45 Day Call             1

# Infer Type From Data

In [120]:
# Lower-case phrases that mark an event as an email. infer_type tests them as
# substrings of the lower-cased Subject/Description, so "fw" also matches any
# text that merely contains those two letters (including "fwd").
email_keywords = [
    "email",
    "dear",
    "new customer inquiry",
    "thanks for your interest",
    "let me know if you have any",
    "fwd",
    "fw",
]

# Lower-case phrases that mark an event as a phone call. infer_type tests them as
# substrings of the lower-cased Subject/Description.
# Each phrase is listed once: the original list repeated "talked with",
# "spoke with" and "out of office", which cost extra scans without matching
# anything new.
call_keywords = [
    "call",
    "left a message",
    "left message",
    "left him a message",
    "left her a message",
    "left msg",
    "vm",
    "voice mail",
    "spoke with",
    "number not in",
    "# not in",
    "number is not in",
    "# is not in",
    "no answer",
    "on hold",
    "isn't in the office",
    "not in the office",
    "out of the office",
    "out of office",
    "spoke to",
    "not in on",
    "was not in",
    "this number was",
    "number does not work",
    "talked with",
    "could not leave a message",
    "mailbox is full",
    "should be in the office",
    "he was in a meeting",
    "he was going into a meeting",
    "# is for",
    "that answered",
    "number is wrong",
    "busy signal",
    "numbers are for",
    "disconnected #",
    "kept ringing",
    "was in the middle of a meeting",
    "took my number",
    "could not get through",
    "wasn't home",
    "not in their office",
    "not available to talk",
    "left another message",
    "left follow up message",
    "left a general message",
    "just continued to ring",
    "busy again",
    "busy all day",
    "chatted with",
    "checked in with",
    "talked to",
    "out of town",
    "get back to",
]

# Keyword lists for the demo / lead / follow-up categories. infer_type tests every
# phrase as a substring of the lower-cased Subject/Description, so all phrases are
# written in lower case.

pre_demo_follow_up_keywords = ["pre demo follow up"]

post_demo_follow_up_keywords = ["post demo follow up"]

demo_completed_keywords = [
    "demo completed",
    "completed demo",
    "the demo",
    "in person demo",
    "good demo",
    "bad demo",
    "demo went",
    "they seemed very happy with the system",
    "went through demo",
    "went through a demo",
    "walked through a demo",
]

prepared_materials_keywords = ["prepared materials"]

web_demo_completed_keywords = [
    "web demo completed",
    "please join my meeting from your computer",
]

# Intentionally left empty in the original: no phrase maps to this category yet.
lead_qualification_keywords = []

lead_submitted_form_keywords = [
    "submitted form",
    "contact us",
    # Spelling kept exactly as in the original list, since it is the text
    # that gets matched.
    "recieved your message",
    "request web lead",
]

initial_contact_keywords = ["initial contact", "main contact"]

one_pager_campaign_keywords = ["one pager", "one-pager"]

demo_scheduled_keywords = [
    "demo scheduled",
    "scheduled demo",
    "scheduled a demo",
    "set up a meeting",
    "would like to see a demo",
    "moving forward with a demo",
    "demo rescheduled",
    "would like a demo",
    "interested in a demo",
    "would like a comparison",
    "signed up for a demo",
    "would like to go through a demo",
    "would like to see our service",
    "on for a demo",
    "on for a hobnob demo",
    "scheduled a phone meeting",
    "would like to set up a hobnob demo",
    "wants to talk about",
    "wants to see",
    "wants a demo",
]

sixty_day_follow_up_keywords = [
    "60 day follow up",
    "sixty day follow up",
    "sign up follow up",
    "signup follow up",
    "60 day checkin",
    "signup form",
]

training_keywords = ["training"]

interested_keywords = [
    "are interested",
    "was interested",
    "very interested",
    "is interested",
    "interested in",
    "wants more information",
    "will be discussing",
    "might be interested",
    "seemed interested",
    "still interested",
    "would like to learn more",
    "would like info",
    "wants more info",
]

# Lower-case phrases that mark a prospect as not interested. infer_type tests them
# as substrings of the lower-cased Subject/Description.
# Fix: the original had no comma after 'too expensive for them', so Python joined
# it with the next literal into 'too expensive for themusing classy' and neither
# phrase could match. Every entry now ends with a comma to prevent a repeat.
not_interested_keywords = [
    "not interested",
    "no interest",
    "they backed out",
    "no longer interested",
    "doesn't seem interested",
    "doesn't sound too interested",
    "went with",
    "not that interested",
    "doesn't think it would be well received",
    "satisfied with authorize.net",
    "does not think they will move forward",
    "using paypal",
    "staying with",
    "using donor perfect",
    "custom solution",
    "no longer a direction that they are going",
    "decided to go in a different direction",
    "moving in a different direction",
    "currently using blackbaud",
    "are currently ok with",
    "using vanco",
    "using shelby",
    "currently satisfied with their solution",
    "she made the decision on another vendor",
    "hung up on me",
    "he hung up",
    "she hung up",
    "currently using",
    "too expensive for them",
    "using classy",
    "using stay classy",
    "using acs",
    "are not looking to change",
    "happy with their event service",
    "isn't a good fit",
    "have decided to go with",
    "satisfied with their current solution",
    "decided not to move forward",
    "decided to move forward with another",
    "using fellowship one",
    # Spelling kept exactly as in the original list, since it is the text
    # that gets matched.
    "doesn't seem to interested",
    "looking into different services",
    "using secure give",
]

# Keyword lists for the remaining categories. infer_type tests every phrase as a
# substring of the lower-cased Subject/Description, so all phrases are written in
# lower case.

# TODO: Maybe break these up?
not_ready_keywords = [
    "not ready",
    "needs some time",
    "they will be able to move forward in",
    "asked me to check back",
    "contact me again in",
    "check back with",
    "give her at least a",
    "check back",
    "asked for a day",
    "asked for a week",
    "asked for a couple of weeks",
    "on vacation",
    "out for the next",
    "will get back with me",
    "isn't ready",
]

# TODO: Maybe break these up?
sale_closed_keywords = [
    "moving forward",
    "we finalized things",
    "application received",
    "application has been received",
    "have been approved",
    "they should be all set up",
    "setting up an account",
    "start the application process",
    "they are live",
    "they are now live",
    "going through the signup",
    "going through signup",
    "in the signup process",
    "go through the signup",
    "page is finished",
    "they are submitted",
    "they are signed up",
    "they are all signed up",
    "will be sending their information in",
    "being submitted right now",
    "going through the application right now",
    "going to sign up",
    "ready to move forward",
    "going with qgiv",
    "they are activated",
    "they are signing up",
    "want to move forward with qgiv",
    "completed sign up",
    "application submitted",
    "received application",
    "sent online application",
    "signup process",
    "approval process",
    "gone through signup",
]

client_missed_demo_keywords = [
    "missed demo",
    "no show for demo",
    "no show for the hobnob demo",
]

webinar_keywords = ["webinar"]

cancellation_keywords = ["cancelled", "cancellation", "cancel"]

downgrade_keywords = [
    "downgrade",
    "downgraded",
    "downgrading",
    "from grow to start",
    "from engage to start",
]

upgrade_keywords = ["upgrade", "upgraded", "upgrading"]

kiosk_activity_keywords = ["kiosk"]

mobile_vt_activity = ["mobile vt activity"]

page_skin_keywords = ["page skin", "page skinning", "page skinned"]

tshirt_campaign_keywords = ["tshirt campaign"]

control_panel_migration_keywords = [
    "new control panel",
    "migrated to new cp",
    "switched control panel",
    "switched cp",
]

noise_keywords = [
    "birthday cake",
    "town hall meeting",
    "group insurance",
    "try back",
    "contact me back",
    "will be back",
]

meeting_keywords = [
    "meeting with",
    "meeting on",
    "meeting for",
    "met with",
]


def infer_type(r):
    """Return the activity type for one event row, inferring it from text if untagged.

    Parameters
    ----------
    r : row-like (e.g. a pandas Series from ``df.apply(..., axis=1)``)
        Must provide the keys 'Type', 'Subject' and 'Description'.

    Returns
    -------
    str
        * 'Demo Completed' when the existing Type equals 'demo' (case-insensitive);
        * the existing Type unchanged when it is any other string except 'None';
        * otherwise the label of the first keyword list (in the order given
          below) with a phrase found in the lower-cased Subject or Description;
        * 'None' when nothing matches.

    Notes
    -----
    A non-string Type (for example NaN from an empty CSV cell) is treated as
    untagged; the original called ``.lower()`` on it and raised AttributeError.

    Subject and Description are converted with ``str()``, so a missing value is
    searched as the text 'nan'.

    NOTE(review): 'Interested' is tested before 'Not Interested', so text such as
    "not interested in ..." matches the phrase 'interested in' first and is
    labelled 'Interested'. The order is preserved here; confirm it is intended.
    """
    existing_type = r['Type']
    if isinstance(existing_type, str) and existing_type != 'None':
        return 'Demo Completed' if existing_type.lower() == 'demo' else existing_type

    # Lower-case the searchable text once per row instead of once per keyword list.
    subject = str(r['Subject']).lower()
    description = str(r['Description']).lower()

    def contains(keywords):
        return any(keyword in subject or keyword in description for keyword in keywords)

    # Ordered (label, keywords) rules; the first match wins. The table is built
    # inside the function so the keyword lists are looked up at call time, which
    # means re-running the keyword cell takes effect without redefining this function.
    rules = (
        ('T-Shirt Campaign', tshirt_campaign_keywords),
        ('Control Panel Migration', control_panel_migration_keywords),
        ('Pre-Demo Follow Up', pre_demo_follow_up_keywords),
        ('Post-Demo Follow Up', post_demo_follow_up_keywords),
        ('Demo Completed', demo_completed_keywords),
        ('Demo Completed', web_demo_completed_keywords),
        ('Prepared Materials', prepared_materials_keywords),
        ('Lead Qualification', lead_qualification_keywords),
        ('Lead Submitted Form', lead_submitted_form_keywords),
        ('Initial Contact', initial_contact_keywords),
        ('One Pager Campaign', one_pager_campaign_keywords),
        ('Demo Scheduled', demo_scheduled_keywords),
        ('60 Day Follow Up', sixty_day_follow_up_keywords),
        ('Training', training_keywords),
        ('Interested', interested_keywords),
        ('Not Interested', not_interested_keywords),
        ('Not Ready', not_ready_keywords),
        ('Sale Closed', sale_closed_keywords),
        ('Client Missed Demo', client_missed_demo_keywords),
        ('Webinar', webinar_keywords),
        ('Cancellation', cancellation_keywords),
        ('Downgrade', downgrade_keywords),
        ('Upgrade', upgrade_keywords),
        ('Kiosk Activity', kiosk_activity_keywords),
        ('Mobile VT Activity', mobile_vt_activity),
        ('Page Skin', page_skin_keywords),
        ('Email', email_keywords),
        ('Call', call_keywords),
        ('Meeting', meeting_keywords),
        ('Noise', noise_keywords),
    )

    for label, keywords in rules:
        if contains(keywords):
            return label
    return 'None'

In [121]:
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['Type']: that chained in-place call acts on an intermediate Series, is
# deprecated in pandas 2.x and leaves df unchanged under Copy-on-Write, which
# would hand NaN Types to infer_type.
df['Type'] = df['Type'].fillna("None")

# Replace each row's Type with the existing or inferred label.
df['Type'] = df.apply(infer_type, axis=1)

In [122]:
# Share of each Type after inference, with any missing value counted as "None".
(
    df["Type"]
    .fillna("None")
    .value_counts(normalize=True)
)

Call                       0.502714
None                       0.167510
Training                   0.048989
Interested                 0.047706
Email                      0.045164
Demo Completed             0.034949
Kiosk Activity             0.034377
Not Interested             0.028491
Sale Closed                0.022330
Demo Scheduled             0.014452
Not Ready                  0.012184
Cancellation               0.010489
Downgrade                  0.009001
Upgrade                    0.007191
Mobile VT Activity         0.002886
Meeting                    0.002222
Control Panel Migration    0.001970
Initial Contact            0.001947
Lead Submitted Form        0.001672
T-Shirt Campaign           0.001397
Noise                      0.001237
Webinar                    0.000435
Client Missed Demo         0.000321
Page Skin                  0.000275
60 Day Follow Up           0.000092
Name: Type, dtype: float64

In [123]:
# Widen text columns so long subjects/descriptions are not truncated, then show
# the last 100 rows that are still labelled 'None'.
pd.set_option("display.max_colwidth", 500)
df.loc[df["Type"] == "None", ["Subject", "Description", "Type"]].tail(100)

Unnamed: 0,Subject,Description,Type
42887,Note: Received the application - let Kris know we were missing a couple of signatures.,Received the application - let Kris know we were missing a couple of signatures.,
42889,Note: Is no longer there. Kristi has replaced her.,Is no longer there. Kristi has replaced her.,
42899,Note: He doesn't deal with anything web based.,He doesn't deal with anything web based.,
42901,Note: office is closed,office is closed,
42903,Note: using authorize.net it looks like,using authorize.net it looks like,
42921,Note: He is not the decision maker but said that if they are looking to move forward at some point they will get in contact with me.,He is not the decision maker but said that if they are looking to move forward at some point they will get in contact with me.,
42923,"Note: Using RaisersEdge - brand new to them. However, they are always looking for new options.","Using RaisersEdge - brand new to them. However, they are always looking for new options.",
42941,Note: That went very well and he thinks he should be able to convince the board to go with us... he asked for a rate card and an application,That went very well and he thinks he should be able to convince the board to go with us... he asked for a rate card and an application,
42942,"Note: comments: We are starting an online donation system, where members will sign up to donate at least $10 every month each month. Will this system also be able to accomodate online purchases, for example selling t-shirts online?","comments: We are starting an online donation system, where members will sign up to donate at least $10 every month each month. Will this system also be able to accomodate online purchases, for example selling t-shirts online?",
42944,"Note: Sarah was having issues with the static event URL's asking for ADDITIONAL information and then Participant Information x's the number of tickets purchased. This has been resolved, I let Sarah know.","Sarah was having issues with the static event URL's asking for ADDITIONAL information and then Participant Information x's the number of tickets purchased. This has been resolved, I let Sarah know.",


# Storing cleaned & reduced dataset to S3

In [128]:
# Columns kept in the cleaned export.
cols = [
    "Id",
    "WhoId",
    "WhatId",
    "Subject",
    "OwnerId",
    "Description",
    "Type",
    "AccountId",
    "CreatedDate",
    "CreatedById",
    "SystemModstamp",
]

# save_dataframe_to_file is presumably provided by the s3_support wildcard
# import - confirm. It receives only the selected columns.
save_dataframe_to_file("sfc-export", "events.clean.csv", df[cols])

uploading to S3
Done
