In [47]:
import pandas as pd
import numpy as np

In [48]:
df = pd.read_csv('Protenus Sales Training Data.csv', low_memory=False)
df['Snapshot_Date'] = pd.to_datetime(df['Snapshot_Date'], format='%m/%d/%y')
df['Opportunity_Created_Date'] = pd.to_datetime(df['Opportunity_Created_Date'], format='%m/%d/%y')
df['Opportunity_Close_Date'] = pd.to_datetime(df['Opportunity_Close_Date'], format='%m/%d/%y')

In [49]:
def get_quarter_year_from_date(date):
    return 'Q' + str(date.quarter) + '-' + str(date.year)

df['Snapshot_Quarter'] = df['Snapshot_Date'].apply(get_quarter_year_from_date)
df['Opportunity_Close_Quarter'] = df['Opportunity_Close_Date'].apply(get_quarter_year_from_date)



In [50]:
df['true_close_q'] = None

for index, curr_row in df.iterrows():
    curr_id = curr_row['Opportunity_ID']
    pred_close_q = curr_row['Opportunity_Close_Quarter']

    curr_opp_df = df[df['Opportunity_ID'] == curr_id].copy()

    closed_row = curr_opp_df[(curr_opp_df['Opportunity_Stage'] == 'Closed Won') | \
                            (curr_opp_df['Opportunity_Stage'] == 'Closed Lost')]
    
    if len(closed_row) > 0:
        true_close_q = closed_row['Opportunity_Close_Quarter'].values[0]
        df.loc[index, 'true_close_q'] = true_close_q

df['pred_q_correct'] = df['true_close_q'] == df['Opportunity_Close_Quarter']
df['pred_q_correct'] = df['pred_q_correct'].astype(int)

In [60]:
def add_one_quarter(date_str):
    quarter = int(date_str[1])
    year = int(date_str[3:])
    if quarter == 4:
        quarter = 1
        year += 1
    else:
        quarter += 1
    return 'Q' + str(quarter) + '-' + str(year)

df['next_quarter'] = df['Snapshot_Quarter'].apply(add_one_quarter)
df['closed_next_q'] = df['Opportunity_Close_Quarter'] == df['next_quarter']
df['closed_next_q'] = df['closed_next_q'].astype(int)
df['pred_q_plus_1_correct'] = (df['true_close_q'] == df['next_quarter']) | (df['pred_q_correct'] == 1)

In [61]:
df.columns

Index(['Snapshot_Date', 'Opportunity_ID', 'Opportunity_Stage',
       'Opportunity_Type', 'Opportunity_Annual_Amount',
       'Opportunity_Created_Date', 'Opportunity_Close_Date',
       'Opportunity_Product', 'Account_Electronic_Medical_Record_System',
       'Account_Hospital_Type', 'Account_Fiscal_Year_End', 'Account_State',
       'Account_Zip_Code', 'Account_Number_of_Hospitals ',
       'Account_Number_of_Beds', 'Account_Number_of_Affiliated_Physicians',
       'Account_Number_of_Employee', 'Account_Cash_on_Hand',
       'Account_Total_Patient_Revenue', 'Account_Net_Patient_Revenue',
       'Snapshot_Quarter', 'Opportunity_Close_Quarter', 'true_close_q',
       'pred_q_correct', 'next_quarter', 'closed_next_q',
       'pred_q_plus_1_correct'],
      dtype='object')

In [62]:
# Apply one-hot encoding to categorical columns
dummy_cols = ['Opportunity_Stage', 'Opportunity_Type', 'Opportunity_Product', 'Account_Electronic_Medical_Record_System', 'Account_Hospital_Type']
df_dummies = pd.get_dummies(df, columns=dummy_cols)


In [64]:
# Train a model to predict whether closed in next quarter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# X = df_dummies.drop(['Opportunity_ID', 'Opportunity_Stage', 'Opportunity_Created_Date', 'Opportunity_Close_Date', 'Snapshot_Date', 'Snapshot_Quarter', 'Opportunity_Close_Quarter', 'true_close_q', 'next_quarter', 'closed_next_q'], axis=1)
# Get cols in df_dummies that loook like dummy cols
dummy_cols = [col for col in df_dummies.columns if col.startswith('Opportunity_Stage') or col.startswith('Opportunity_Type') or col.startswith('Opportunity_Product') or col.startswith('Account_Electronic_Medical_Record_System') or col.startswith('Account_Hospital_Type')]
X = df_dummies[dummy_cols]
y = df_dummies['next_quarter']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)


0.09723007348784624

In [59]:
df_dummies.columns

Index(['Snapshot_Date', 'Opportunity_ID', 'Opportunity_Annual_Amount',
       'Opportunity_Created_Date', 'Opportunity_Close_Date',
       'Account_Fiscal_Year_End', 'Account_State', 'Account_Zip_Code',
       'Account_Number_of_Hospitals ', 'Account_Number_of_Beds',
       'Account_Number_of_Affiliated_Physicians', 'Account_Number_of_Employee',
       'Account_Cash_on_Hand', 'Account_Total_Patient_Revenue',
       'Account_Net_Patient_Revenue', 'Snapshot_Quarter',
       'Opportunity_Close_Quarter', 'true_close_q', 'pred_q_correct',
       'next_quarter', 'closed_next_q', 'Opportunity_Stage_Closed Lost',
       'Opportunity_Stage_Closed Won',
       'Opportunity_Stage_Stage 0 - Prospecting',
       'Opportunity_Stage_Stage 1 - Evaluating',
       'Opportunity_Stage_Stage 2 - Scoping',
       'Opportunity_Stage_Stage 3 - Pricing',
       'Opportunity_Stage_Stage 4 - Verbal / VOC',
       'Opportunity_Stage_Stage 5 - Contracting',
       'Opportunity_Type_Existing Business', 'Opportunit

In [34]:
df.columns

Index(['Snapshot_Date', 'Opportunity_ID', 'Opportunity_Stage',
       'Opportunity_Type', 'Opportunity_Annual_Amount',
       'Opportunity_Created_Date', 'Opportunity_Close_Date',
       'Opportunity_Product', 'Account_Electronic_Medical_Record_System',
       'Account_Hospital_Type', 'Account_Fiscal_Year_End', 'Account_State',
       'Account_Zip_Code', 'Account_Number_of_Hospitals ',
       'Account_Number_of_Beds', 'Account_Number_of_Affiliated_Physicians',
       'Account_Number_of_Employee', 'Account_Cash_on_Hand',
       'Account_Total_Patient_Revenue', 'Account_Net_Patient_Revenue',
       'Snapshot_Quarter', 'Opportunity_Close_Quarter', 'pred_q_correct',
       'true_close_q', 'next_quarter', 'closed_next_q'],
      dtype='object')

In [22]:
closed_row

Snapshot_Date                                2022-08-01 00:00:00
Opportunity_ID                                   0064100000TL1Fw
Opportunity_Stage                                    Closed Lost
Opportunity_Type                                    New Business
Opportunity_Annual_Amount                            317822.6165
Opportunity_Created_Date                     2018-09-12 00:00:00
Opportunity_Close_Date                       2022-07-07 00:00:00
Opportunity_Product                         Diversion Monitoring
Account_Electronic_Medical_Record_System                    Epic
Account_Hospital_Type                         Childrens Hospital
Account_Fiscal_Year_End                                 12/31/23
Account_State                                                 OH
Account_Zip_Code                                         43205.0
Account_Number_of_Hospitals                                  2.0
Account_Number_of_Beds                                     694.0
Account_Number_of_Affilia

In [23]:
closed_q

'Q3-2022'

In [10]:
df['Opportunity_Stage'].unique()

array(['Stage 2 - Scoping', 'Closed Lost', 'Stage 3 - Pricing',
       'Stage 1 - Evaluating', 'Stage 4 - Verbal / VOC',
       'Stage 5 - Contracting', 'Closed Won', 'Stage 0 - Prospecting'],
      dtype=object)

In [5]:
df.columns

Index(['Snapshot_Date', 'Opportunity_ID', 'Opportunity_Stage',
       'Opportunity_Type', 'Opportunity_Annual_Amount',
       'Opportunity_Created_Date', 'Opportunity_Close_Date',
       'Opportunity_Product', 'Account_Electronic_Medical_Record_System',
       'Account_Hospital_Type', 'Account_Fiscal_Year_End', 'Account_State',
       'Account_Zip_Code', 'Account_Number_of_Hospitals ',
       'Account_Number_of_Beds', 'Account_Number_of_Affiliated_Physicians',
       'Account_Number_of_Employee', 'Account_Cash_on_Hand',
       'Account_Total_Patient_Revenue', 'Account_Net_Patient_Revenue'],
      dtype='object')