In [1]:
import pandas as pd
import sys

sys.path.insert(1, '../../../scripts/')
from s3_support import *

In [2]:
_ = list_files("sfc-export", search_key="Opp")

Opportunity.csv (8MB)
OpportunityContactRole.csv (1MB)
OpportunityHistory.csv (14MB)
OpportunityLineItem.csv (2MB)
OpportunityLineItemSchedule.csv (1MB)
leads_to_opportunities.csv (1MB)
opportunity_date_ranges.csv (1MB)
--------------------------------------------------
Matched files: 7 files (0.0GB)
Bucket sfc-export contains 100 files (3.5GB)


In [95]:
url = get_file_url("sfc-export", "OpportunityHistory.csv")
df = pd.read_csv(url, encoding="ISO-8859-1", low_memory=False)

In [96]:
df.columns

Index(['Id', 'OpportunityId', 'CreatedById', 'CreatedDate',
       'CreatedDateForInsert', 'StageName', 'Amount', 'ExpectedRevenue',
       'CloseDate', 'Probability', 'FromForecastCategory', 'ForecastCategory',
       'PrevForecastUpdate', 'FromOpportunityStageName',
       'PrevOpportunityStageUpdate', 'ValidThroughDate', 'SystemModstamp',
       'IsDeleted'],
      dtype='object')

In [97]:
df = df[['OpportunityId', 'CreatedDate', 'StageName', 'Probability']]
df['CreatedDate'] = pd.to_datetime(df['CreatedDate'])
df = df[df['CreatedDate'].dt.year>=2016]

In [98]:
df['CreatedDate'].min(), df['CreatedDate'].max()

(Timestamp('2016-01-04 14:13:54'), Timestamp('2019-11-18 14:06:01'))

In [99]:
df['StageName'].unique()

array(['Signup', 'Talking/Waiting', 'Closed Lost',
       'Demo/Meeting Completed', 'Closed Won', 'Demo Scheduled',
       'Technical Win', 'Discovery', 'Sales Hand-off', 'Demo Completed'],
      dtype=object)

# Looking at time differences from a start date

We can get the date created from the Oppoortunity records but __being lazy here and using the first history entry as the start date__. From there, we will calculate the time difference from the first entry of another stage type in order to calculate a time difference from start to the given stage.

We will first consolidate stages down to the following:

1. Initial contact
2. Demo scheduled
3. Demo completed
4. Signup
5. Onboarding

In [202]:
# set the start date from the earliest date available for the given opportunity
opp_stages = pd.DataFrame(df.groupby(['OpportunityId', 'StageName'])['CreatedDate'].min()).reset_index()
opp_stages.head()

# consolidate stages
def consolidate_stages(stagename):
    if stagename in ['Discovery', 'Talking/Waiting']:
        return 'Initial Contact'
    elif stagename in ['Demo Completed', 'Demo/Meeting Completed']:
        return 'Demo Completed'
    elif stagename == 'Technical Win':
        return 'Closed Won'
    elif stagename == 'Sales Hand-off':
        return 'Onboarding'
    else:
        return stagename
    
opp_stages['StageName'] = opp_stages['StageName'].apply(consolidate_stages)

# Remove duplicated stages for a given opportunity (1853 entries were duplicated)
mask = opp_stages[['OpportunityId', 'StageName']].duplicated()
opp_stages = opp_stages[~mask]

opp_stages = opp_stages.pivot(index="OpportunityId", columns="StageName", values="CreatedDate").reset_index()
opp_stages['Start'] = opp_stages[[c for c in opp_stages.columns if c != 'OpportunityId']].min(axis=1)

In [185]:
opp_stages.head()

StageName,OpportunityId,Closed Lost,Closed Won,Demo Completed,Demo Scheduled,Initial Contact,Onboarding,Signup,Start
0,0063100000XzTINAA3,2016-12-19 23:51:23,NaT,NaT,NaT,2016-01-13 18:05:32,NaT,NaT,2016-01-13 18:05:32
1,0063100000Y01KhAAJ,2016-06-22 16:34:10,NaT,2016-06-06 22:13:40,NaT,2016-01-07 17:55:34,NaT,NaT,2016-01-07 17:55:34
2,0063100000Y08uaAAB,NaT,2016-01-05 21:53:05,NaT,NaT,NaT,NaT,NaT,2016-01-05 21:53:05
3,0063100000Y0RSkAAN,2016-03-17 15:17:03,NaT,NaT,NaT,2016-02-29 18:36:11,NaT,NaT,2016-02-29 18:36:11
4,0063100000Y0WlrAAF,2016-03-30 14:37:36,NaT,NaT,NaT,NaT,NaT,2016-01-08 20:17:00,2016-01-08 20:17:00


In [204]:
len(opp_stages), len(opp_stages.dropna())

(6394, 2)

In [205]:
# Average time spent between each stage

In [187]:
# set the time diff column from start date for each column
stage_cols = [c for c in opp_stages if c != 'OpportunityId']
diff_cols = []
for c in stage_cols:
    opp_stages["{} from Start".format(c)] = opp_stages[c] - opp_stages['Start']
    diff_cols.append("{} from Start".format(c))

In [188]:
# round time delta to days & cleanup column names
stage_days = opp_stages[diff_cols].mean().dt.days.reset_index()
stage_days.columns = ['Stage', 'Days from Start']

In [189]:
# append observation counts to time differences
stage_days = stage_days.merge(opp_stages[diff_cols].count().reset_index(), left_on='Stage', right_on='StageName')
stage_days.drop('StageName', axis=1, inplace=True)
stage_days.columns = ['Stage', 'Days from Start All', 'Count All']

In [197]:
stage_days

Unnamed: 0,Stage,Days from Start All,Count All
0,Closed Lost from Start,87,3163
1,Closed Won from Start,28,3420
2,Demo Completed from Start,9,3799
3,Demo Scheduled from Start,2,1453
4,Initial Contact from Start,0,5682
5,Onboarding from Start,40,123
6,Signup from Start,20,1668
7,Start from Start,0,6394


### Losses

Repeating the previous process on just losses. We will look at the time differences from the earliest observed date for the given opportunity to each available stage date.

In [199]:
# losses
losses = opp_stages[opp_stages['Closed Won'].isna()]
opp_stages_losses = losses[diff_cols].mean().dt.days.reset_index()
opp_stages_losses.columns = ['Stage', 'Days from Start']

opp_stages_losses = opp_stages_losses.merge(losses[diff_cols].count().reset_index(), left_on="Stage", right_on="StageName")
opp_stages_losses.drop('StageName', axis=1, inplace=True)
opp_stages_losses.columns = ['Stage', 'Days from Start Losses', 'Count Losses']

opp_stages_losses

Unnamed: 0,Stage,Days from Start Losses,Count Losses
0,Closed Lost from Start,88.0,2596
1,Closed Won from Start,,0
2,Demo Completed from Start,13.0,2056
3,Demo Scheduled from Start,2.0,794
4,Initial Contact from Start,0.0,2750
5,Onboarding from Start,0.0,1
6,Signup from Start,9.0,93
7,Start from Start,0.0,2974


### Wins

Repeating the previous process on just losses. We will look at the time differences from the earliest observed date for the given opportunity to each available stage date.

In [192]:
# wins
wins = opp_stages[opp_stages['Closed Lost'].isna()]
opp_stages_wins = wins[diff_cols].mean().dt.days.reset_index()
opp_stages_wins.columns = ['Stage', 'Days from Start']

opp_stages_wins = opp_stages_wins.merge(wins[diff_cols].count().reset_index(), left_on="Stage", right_on="StageName")
opp_stages_wins.drop('StageName', axis=1, inplace=True)
opp_stages_wins.columns = ['Stage', 'Days from Start Wins', 'Count Wins']

opp_stages_wins

Unnamed: 0,Stage,Days from Start Wins,Count Wins
0,Closed Lost from Start,,0
1,Closed Won from Start,28.0,2853
2,Demo Completed from Start,7.0,1782
3,Demo Scheduled from Start,1.0,594
4,Initial Contact from Start,0.0,2731
5,Onboarding from Start,38.0,115
6,Signup from Start,18.0,1411
7,Start from Start,0.0,3231


### All data

Now join the 3 dataframes for time differences and counts to see it all in a single view

In [195]:
combined = stage_days.merge(opp_stages_wins, on="Stage").merge(opp_stages_losses, on="Stage")

Unnamed: 0,Stage,Days from Start All,Count All,Days from Start Wins,Count Wins,Days from Start Losses,Count Losses
0,Closed Lost from Start,87,3163,,0,88.0,2596
1,Closed Won from Start,28,3420,28.0,2853,,0
2,Demo Completed from Start,9,3799,7.0,1782,13.0,2056
3,Demo Scheduled from Start,2,1453,1.0,594,2.0,794
4,Initial Contact from Start,0,5682,0.0,2731,0.0,2750
5,Onboarding from Start,40,123,38.0,115,0.0,1
6,Signup from Start,20,1668,18.0,1411,9.0,93
7,Start from Start,0,6394,0.0,3231,0.0,2974


In [None]:
# TODO: Get everage number of days spend at each stage, get how many got stuck in one stage for more than 15 days