In [1]:
import pandas as pd
import sys

sys.path.insert(1, '../../../scripts/')
from s3_support import *

# Load the data

In [2]:
# Look up where the opportunity-history export lives, then parse it.
# NOTE(review): get_file_url comes from s3_support (star import); presumably it
# returns something pd.read_csv can open for the given bucket/key — confirm.
url = get_file_url("sfc-export", "OpportunityHistory.csv")
df = pd.read_csv(
    url,
    encoding="ISO-8859-1",  # decode the export as Latin-1
    low_memory=False,       # parse the file in one pass rather than in chunks
)

# Clean up the dataframe and drop rows created before 2016

In [3]:
# Narrow the frame to the four columns used below.
df = df.loc[:, ['OpportunityId', 'CreatedDate', 'StageName', 'Probability']]
# Parse the creation timestamps so the .dt accessor can be used.
df = df.assign(CreatedDate=pd.to_datetime(df['CreatedDate']))
# Keep history rows created in 2016 or later.
df = df.loc[df['CreatedDate'].dt.year >= 2016]

In [4]:
# Peek at the first rows to sanity-check the column selection and the date filter.
df.head()

Unnamed: 0,OpportunityId,CreatedDate,StageName,Probability
631,0063100000c2VnNAAU,2016-04-06 13:21:29,Signup,90
632,0063100000YzS5wAAF,2016-04-06 14:41:12,Talking/Waiting,20
633,0063100000YzS5wAAF,2016-04-06 14:41:12,Talking/Waiting,20
634,0063100000YzS5wAAF,2016-04-06 14:41:30,Talking/Waiting,20
635,0063100000YzSFIAA3,2016-04-06 14:56:18,Talking/Waiting,20


In [5]:
# Distinct stage labels in the filtered history; each one becomes a column
# when the history is pivoted further down.
df['StageName'].unique()

array(['Signup', 'Talking/Waiting', 'Closed Lost',
       'Demo/Meeting Completed', 'Closed Won', 'Demo Scheduled',
       'Technical Win', 'Discovery', 'Sales Hand-off', 'Demo Completed'],
      dtype=object)

# Creating date ranges

In [6]:
# Columns kept in the final per-opportunity table, in output order.
# 'OpportunityId' is the row key rather than a stage. 'Initial Contact' is a
# derived column (earliest timestamp across all stages) and 'Onboarding' is the
# renamed 'Sales Hand-off' stage; both are created in later cells.
final_stages = [
    'OpportunityId',
    'Initial Contact',
    'Demo Scheduled',
    'Demo Completed',
    'Signup',
    'Closed Won',
    'Closed Lost',
    'Onboarding'
]

In [7]:
# Long format: earliest CreatedDate per (opportunity, stage) pair.
first_seen = (
    df.groupby(['OpportunityId', 'StageName'])['CreatedDate']
    .min()
    .reset_index()
)

# Wide format: one row per opportunity, one timestamp column per stage.
stages = first_seen.pivot(
    index="OpportunityId",
    columns="StageName",
    values="CreatedDate",
).reset_index()

# Initial contact is the earliest timestamp found in any stage column.
stages['Initial Contact'] = stages.drop(columns='OpportunityId').min(axis=1)

# Combine `Demo Completed` and `Demo/Meeting Completed`

In [8]:
# Fold the 'Demo/Meeting Completed' label into 'Demo Completed'.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column selection is a chained inplace update, which newer pandas warns
# about and which does not modify `stages` under Copy-on-Write.
# The membership check makes the cell safe to re-run after the source column
# has already been dropped.
if 'Demo/Meeting Completed' in stages.columns:
    stages['Demo Completed'] = stages['Demo Completed'].fillna(stages['Demo/Meeting Completed'])
    stages = stages.drop(columns='Demo/Meeting Completed')

# Rename Sales Hand-off to Onboarding

In [9]:
# Relabel the 'Sales Hand-off' stage column as 'Onboarding'.
stages = stages.rename(columns={'Sales Hand-off': 'Onboarding'})

# Drop irrelevant columns

In [10]:
# Keep only the columns listed in final_stages (in that order) and discard
# rows that are exact duplicates of an earlier row.
stages = stages[final_stages].drop_duplicates()

In [11]:
stages.head()

StageName,OpportunityId,Initial Contact,Demo Scheduled,Demo Completed,Signup,Closed Won,Closed Lost,Onboarding
0,0063100000XzTINAA3,2016-01-13 18:05:32,NaT,NaT,NaT,NaT,2016-12-19 23:51:23,NaT
1,0063100000Y01KhAAJ,2016-01-07 17:55:34,NaT,2016-06-06 22:13:40,NaT,NaT,2016-06-22 16:34:10,NaT
2,0063100000Y08uaAAB,2016-01-05 21:53:05,NaT,NaT,NaT,2016-01-05 21:53:05,NaT,NaT
3,0063100000Y0RSkAAN,2016-02-29 18:36:11,NaT,NaT,NaT,NaT,2016-03-17 15:17:03,NaT
4,0063100000Y0WlrAAF,2016-01-08 20:17:00,NaT,NaT,2016-01-08 20:17:00,NaT,2016-03-30 14:37:36,NaT


In [17]:
# How often do two consecutive stages carry the exact same timestamp?
# A missing timestamp (NaT) never compares equal, so rows lacking either stage
# are not counted.
len_all = len(stages)

len_contact_sched = int((stages['Initial Contact'] == stages['Demo Scheduled']).sum())
len_sched_comp = int((stages['Demo Scheduled'] == stages['Demo Completed']).sum())
len_contact_comp = int((stages['Initial Contact'] == stages['Demo Completed']).sum())

# Displayed as (share, raw count, share); the middle value is a count, not a percentage.
(
    "{:.2f}%".format(len_contact_sched / len_all * 100.),
    len_sched_comp,
    "{:.2f}%".format(len_contact_comp / len_all * 100.),
)

('0.81%', 0, '3.00%')

# Store date ranges

In [12]:
# Persist the per-opportunity stage timestamps.
# NOTE(review): judging by the execution counts, this ran before the Status and
# duration columns were added further down, so the stored file presumably holds
# only the final_stages columns — confirm before relying on it.
save_dataframe_to_file('sfc-export', 'opportunity_date_ranges.csv', stages)

uploading to S3
Done


# Exploring wins/losses

In [23]:
def infer_final_state(r):
    """Classify one opportunity row by its close timestamps.

    Parameters
    ----------
    r : mapping-like (e.g. a DataFrame row)
        Must provide 'Closed Won' and 'Closed Lost' entries.

    Returns
    -------
    str
        'Won' if 'Closed Won' is set (this takes precedence when both are set),
        'Lost' if only 'Closed Lost' is set, otherwise 'Unknown'.
    """
    if pd.notnull(r['Closed Won']):
        return 'Won'
    if pd.notnull(r['Closed Lost']):
        return 'Lost'
    # Neither close timestamp is present. The former trailing `else: return None`
    # could never be reached, so it has been removed.
    return 'Unknown'

# Label every opportunity, row by row, with its inferred outcome
# ('Won', 'Lost' or 'Unknown').
stages['Status'] = stages.apply(infer_final_state, axis=1)

In [14]:
# Number of opportunities per inferred outcome.
stages['Status'].value_counts()

Lost       3056
Won        2890
Unknown     448
Name: Status, dtype: int64

In [15]:
# Spot-check that the new Status column agrees with the close timestamps.
stages.head(3)

StageName,OpportunityId,Initial Contact,Demo Scheduled,Demo Completed,Signup,Closed Won,Closed Lost,Onboarding,Status
0,0063100000XzTINAA3,2016-01-13 18:05:32,NaT,NaT,NaT,NaT,2016-12-19 23:51:23,NaT,Lost
1,0063100000Y01KhAAJ,2016-01-07 17:55:34,NaT,2016-06-06 22:13:40,NaT,NaT,2016-06-22 16:34:10,NaT,Lost
2,0063100000Y08uaAAB,2016-01-05 21:53:05,NaT,NaT,NaT,2016-01-05 21:53:05,NaT,NaT,Won


In [25]:
def get_initial_contact_to_close(r):
    """Return the time elapsed from 'Initial Contact' to the row's close.

    'Closed Won' is checked first, then 'Closed Lost'; the first one that is
    set is used as the close timestamp. Returns None when neither is set.
    """
    for close_col in ('Closed Won', 'Closed Lost'):
        closed_at = r[close_col]
        if pd.isnull(closed_at):
            continue
        return closed_at - r['Initial Contact']
    return None

# (new column, earlier stage, later stage) for each consecutive hand-off.
stage_transitions = [
    ('Initial Contact to Demo Scheduled', 'Initial Contact', 'Demo Scheduled'),
    ('Demo Scheduled to Demo Completed', 'Demo Scheduled', 'Demo Completed'),
    ('Demo Completed to Signup', 'Demo Completed', 'Signup'),
    ('Signup to Onboarding', 'Signup', 'Onboarding'),
]
for label, start_col, end_col in stage_transitions:
    # Elapsed time between the two stages; missing if either timestamp is missing.
    stages[label] = stages[end_col] - stages[start_col]

# The end-to-end duration depends on which close column is set, so it is computed per row.
stages['Initial Contact to Close'] = stages.apply(get_initial_contact_to_close, axis=1)

In [26]:
# Every duration column is named "<stage> to <stage>", so select them by that pattern.
comparison_cols = [c for c in stages.columns if ' to ' in c]

# Reduce each timedelta column to a whole number of days.
# The dtype guard makes the cell safe to re-run: once a column has been
# converted it is numeric, and calling .dt on it again would raise.
for c in comparison_cols:
    if pd.api.types.is_timedelta64_dtype(stages[c]):
        stages[c] = stages[c].dt.days

# Mean duration (in days) of each hand-off, split by outcome.
stages.groupby('Status')[comparison_cols].mean()

StageName,Initial Contact to Demo Scheduled,Demo Scheduled to Demo Completed,Demo Completed to Signup,Signup to Onboarding,Initial Contact to Close
Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Lost,2.687575,15.205742,23.910448,15.0,88.054647
Unknown,0.133333,156.875,103.5,,
Won,2.06563,4.617188,29.817539,5.5,32.695848


In [18]:
# Number of non-null durations per outcome, i.e. the sample size for each column.
stages.groupby('Status')[comparison_cols].count()

StageName,Initial Contact to Demo Scheduled,Demo Scheduled to Demo Completed,Demo Completed to Signup,Signup to Onboarding,Initial Contact to Close
Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Lost,829,627,67,1,3056
Unknown,45,8,2,0,0
Won,579,512,707,54,2890


In [20]:
# Spread of each duration per outcome; groups with fewer than two observations yield NaN.
stages.groupby('Status')[comparison_cols].std()

StageName,Initial Contact to Demo Scheduled,Demo Scheduled to Demo Completed,Demo Completed to Signup,Signup to Onboarding,Initial Contact to Close
Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Lost,31.99819,73.657012,48.082546,,107.577043
Unknown,0.894427,222.397609,17.67767,,
Won,17.489468,20.45983,53.533785,9.524051,55.988013


In [101]:
# Share of Won opportunities where at least one demo hand-off took longer than
# 1 day (first value) or 2 days (second value). A missing duration compares
# False, so it never counts as slow.
# NOTE: the std*_losses names are kept even though this cell counts Won deals.
status_msk = stages['Status'] == 'Won'
len_all = int(status_msk.sum())

demo_scheduled_msk = stages['Initial Contact to Demo Scheduled'].gt(1)
demo_completed_msk = stages['Demo Scheduled to Demo Completed'].gt(1)
std1_losses = int((status_msk & (demo_scheduled_msk | demo_completed_msk)).sum())

demo_scheduled_msk = stages['Initial Contact to Demo Scheduled'].gt(2)
demo_completed_msk = stages['Demo Scheduled to Demo Completed'].gt(2)
std2_losses = int((status_msk & (demo_scheduled_msk | demo_completed_msk)).sum())

std1_losses / float(len_all), std2_losses / float(len_all)

(0.10622837370242215, 0.08581314878892733)

In [104]:
# Share of Lost opportunities where at least one demo hand-off took longer than
# 1 day (first value) or 2 days (second value). A missing duration compares
# False, so it never counts as slow.
status_msk = stages['Status'] == 'Lost'
len_all = int(status_msk.sum())

demo_scheduled_msk = stages['Initial Contact to Demo Scheduled'].gt(1)
demo_completed_msk = stages['Demo Scheduled to Demo Completed'].gt(1)
std1_losses = int((status_msk & (demo_scheduled_msk | demo_completed_msk)).sum())

demo_scheduled_msk = stages['Initial Contact to Demo Scheduled'].gt(2)
demo_completed_msk = stages['Demo Scheduled to Demo Completed'].gt(2)
std2_losses = int((status_msk & (demo_scheduled_msk | demo_completed_msk)).sum())

std1_losses / float(len_all), std2_losses / float(len_all)

(0.1407068062827225, 0.12172774869109948)

In [29]:
# Fraction of Lost opportunities that have a value for each duration column.
lost_rows = stages[stages['Status'] == 'Lost']
lost_rows[comparison_cols].count() / len(lost_rows)

StageName
Initial Contact to Demo Scheduled    0.271270
Demo Scheduled to Demo Completed     0.205170
Demo Completed to Signup             0.021924
Signup to Onboarding                 0.000327
Initial Contact to Close             1.000000
dtype: float64

In [19]:
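# NOTE(review): left disabled. Enabling it would write `stages` (now including
# Status and the day-count columns) to the same key as the earlier save —
# confirm whether that is wanted, otherwise remove this cell.
#save_dataframe_to_file('sfc-export', 'opportunity_date_ranges.csv', stages)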