In [13]:
import pandas as pd
import numpy as np

In [2]:
# Load the sales data CSV, then parse the three date columns (two-digit-year
# month/day/year strings) into datetimes so they are easier to manipulate.
df = pd.read_csv('OneDrive_1_01-12-2023/Protenus Sales Training Data.csv', low_memory=False)
for date_column in ('Snapshot_Date', 'Opportunity_Created_Date', 'Opportunity_Close_Date'):
    df[date_column] = pd.to_datetime(df[date_column], format='%m/%d/%y')

In [3]:
# Function to extract quarter and year from a date
def get_quarter_year_from_date(date):
    """Return a quarter-year label such as ``'Q4-2019'`` for a date.

    Parameters
    ----------
    date : object exposing ``.quarter`` and ``.year`` (e.g. ``pandas.Timestamp``)
        The date to label. Missing values (``NaT``/``None``/NaN) are accepted.

    Returns
    -------
    str or None
        ``'Q<quarter>-<year>'`` for a real date, or ``None`` when the date is
        missing.
    """
    # NaT reports NaN for both .quarter and .year, which would otherwise be
    # formatted into the meaningless label 'Qnan-nan'; surface it as missing.
    if pd.isna(date):
        return None
    return f'Q{date.quarter}-{date.year}'

# Label the snapshot date and the opportunity close date with their quarter-year,
# storing each label in a new column next to the original date column.
for source_column, quarter_column in (
    ('Snapshot_Date', 'Snapshot_Quarter'),
    ('Opportunity_Close_Date', 'Opportunity_Close_Quarter'),
):
    df[quarter_column] = df[source_column].apply(get_quarter_year_from_date)

In [4]:
# Preview the first five rows, including the two derived quarter columns.
df.head(n=5)

Unnamed: 0,Snapshot_Date,Opportunity_ID,Opportunity_Stage,Opportunity_Type,Opportunity_Annual_Amount,Opportunity_Created_Date,Opportunity_Close_Date,Opportunity_Product,Account_Electronic_Medical_Record_System,Account_Hospital_Type,...,Account_Zip_Code,Account_Number_of_Hospitals,Account_Number_of_Beds,Account_Number_of_Affiliated_Physicians,Account_Number_of_Employee,Account_Cash_on_Hand,Account_Total_Patient_Revenue,Account_Net_Patient_Revenue,Snapshot_Quarter,Opportunity_Close_Quarter
0,2019-10-01,0064100000TL1Fw,Stage 3 - Pricing,New Business,,2018-09-12,2019-11-12,Diversion Monitoring,Epic,Childrens Hospital,...,43205.0,2.0,694.0,2196.0,7282.0,85077078.0,3652792000.0,2048661000.0,Q4-2019,Q4-2019
1,2019-10-01,0064100000TKWAx,Stage 3 - Pricing,New Business,,2018-09-07,2019-12-12,Privacy Monitoring,Epic,Health System,...,96813.0,4.0,602.0,1829.0,4443.0,-242188.0,3560499000.0,1340713000.0,Q4-2019,Q4-2019
2,2019-10-01,0064100000TKw69,Stage 2 - Scoping,New Business,,2018-09-11,2020-04-15,Diversion Monitoring,Epic,Health System,...,49503.0,10.0,1901.0,4000.0,,982985681.0,7869236000.0,3290488000.0,Q4-2019,Q2-2020
3,2019-10-01,0064100000TK0q9,Stage 2 - Scoping,New Business,,2018-08-29,2020-02-22,Diversion Monitoring,Epic,Health System,...,43604.0,12.0,1594.0,2481.0,,165601560.0,10756540000.0,2468940000.0,Q4-2019,Q1-2020
4,2019-10-01,0064100000QhXXx,Stage 3 - Pricing,New Business,,2018-06-26,2019-12-10,Privacy Monitoring,Epic,Health System,...,53226.0,10.0,1259.0,2666.0,8197.0,20765609.0,10247960000.0,3881722000.0,Q4-2019,Q4-2019


In [6]:
# Stage history for one opportunity: number of rows recorded in each stage.
# A single .loc selection (row mask + column) avoids chained indexing.
df.loc[df['Opportunity_ID'] == '0064100000TL1Fw', 'Opportunity_Stage'].value_counts()


Opportunity_Stage
Stage 2 - Scoping    27
Stage 3 - Pricing     5
Closed Lost           1
Name: count, dtype: int64

In [9]:
# Number of rows in each opportunity stage across the whole data set.
df.loc[:, 'Opportunity_Stage'].value_counts()

Opportunity_Stage
Stage 3 - Pricing         3648
Stage 2 - Scoping         2187
Stage 1 - Evaluating      1365
Stage 4 - Verbal / VOC     568
Closed Lost                526
Stage 5 - Contracting      339
Closed Won                 175
Stage 0 - Prospecting       36
Name: count, dtype: int64

In [19]:
# The eight stage labels present in the Opportunity_Stage column.
stages = [
    'Stage 3 - Pricing', 'Stage 2 - Scoping', 'Stage 1 - Evaluating', 'Stage 4 - Verbal / VOC',
    'Closed Lost', 'Stage 5 - Contracting', 'Closed Won', 'Stage 0 - Prospecting',
]

# Work out each indicator column name once: spaces, slashes and hyphens in the
# stage label are each replaced by an underscore, then prefixed with 'Stage_Sum_'.
stage_columns = {}
for stage_label in stages:
    safe_label = stage_label.replace(" ", "_").replace("/", "_").replace("-", "_")
    stage_columns[stage_label] = f'Stage_Sum_{safe_label}'

# Add one 0/1 column per stage marking the rows that are in that stage.
for stage_label, indicator_column in stage_columns.items():
    df[indicator_column] = (df['Opportunity_Stage'] == stage_label).astype(int)

# Summing the indicators per Opportunity_ID gives the number of rows each
# opportunity has in every stage.
result = (
    df.groupby('Opportunity_ID')
    .agg({indicator_column: 'sum' for indicator_column in stage_columns.values()})
    .reset_index()
)


In [20]:
# Show the per-opportunity stage counts: one row per Opportunity_ID and one
# Stage_Sum_* column per stage.
result

Unnamed: 0,Opportunity_ID,Stage_Sum_Stage_3___Pricing,Stage_Sum_Stage_2___Scoping,Stage_Sum_Stage_1___Evaluating,Stage_Sum_Stage_4___Verbal___VOC,Stage_Sum_Closed_Lost,Stage_Sum_Stage_5___Contracting,Stage_Sum_Closed_Won,Stage_Sum_Stage_0___Prospecting
0,0061K00000b3JMy,0,16,0,0,1,0,0,0
1,0061K00000b3Qsm,31,1,2,1,0,2,1,0
2,0061K00000b3uDh,2,0,0,0,1,0,0,0
3,0061K00000bI3DY,0,0,0,3,0,0,1,0
4,0061K00000bJS0n,2,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
856,0064100000TKw69,0,14,0,0,1,0,0,0
857,0064100000TL1Fw,5,27,0,0,1,0,0,0
858,0064100000a4MNO,0,0,0,1,1,0,0,0
859,0064100000a6Ou1,5,0,0,0,1,0,0,0


In [23]:
# Sanity check: the per-opportunity Scoping counts should add back up to the
# total number of 'Stage 2 - Scoping' rows in the original data.
result['Stage_Sum_Stage_2___Scoping'].sum()

2187