### Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

### Importing Data

In [2]:
opportunities_original = pd.read_csv('Opportunity.csv', encoding = 'unicode_escape')

features_to_keep = ['Amount', 'Days_in_Quarter__c', 'ExpectedRevenue',
                    'FiscalQuarter', 'ForecastCategory', 'HasOpportunityLineItem', 'InvoiceTotalIncludingVATFreight__c', 
                    'IsClosed', 'IsWon', 'LineItemCount__c', 'NEW_Total_ACV__c', 'OwnerId', 'Probability', 
                    'Spigit_Rep_Assesment__c', 'Spigit_Stage__c', 'StageName']

opportunities = opportunities_original.copy()
opportunities = opportunities[features_to_keep]

### Data Processing

In [3]:
#Converting into categories
opportunities.StageName = opportunities.StageName.astype('category')
opportunities.Spigit_Stage__c = opportunities.Spigit_Stage__c.astype('category')
opportunities.Spigit_Rep_Assesment__c = opportunities.Spigit_Rep_Assesment__c.astype('category')
opportunities.OwnerId = opportunities.OwnerId.astype('category')
opportunities.OwnerId = opportunities.OwnerId.cat.codes
opportunities.ForecastCategory = opportunities.ForecastCategory.astype('category')

#Dummies
opportunities.StageName = pd.get_dummies(opportunities.StageName)
opportunities.Spigit_Stage__c = pd.get_dummies(opportunities.Spigit_Stage__c)
opportunities.Spigit_Rep_Assesment__c = pd.get_dummies(opportunities.Spigit_Rep_Assesment__c)
opportunities.ForecastCategory = pd.get_dummies(opportunities.ForecastCategory)

#Handling Missing values
opportunities.Amount = opportunities.Amount.fillna(opportunities.Amount.mean())
opportunities.ExpectedRevenue = opportunities.ExpectedRevenue.fillna(opportunities.ExpectedRevenue.mean())

In [4]:
print(opportunities.info())
print(opportunities.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4181 entries, 0 to 4180
Data columns (total 16 columns):
Amount                                4181 non-null float64
Days_in_Quarter__c                    4181 non-null int64
ExpectedRevenue                       4181 non-null float64
FiscalQuarter                         4181 non-null int64
ForecastCategory                      4181 non-null uint8
HasOpportunityLineItem                4181 non-null bool
InvoiceTotalIncludingVATFreight__c    4181 non-null float64
IsClosed                              4181 non-null bool
IsWon                                 4181 non-null bool
LineItemCount__c                      4181 non-null int64
NEW_Total_ACV__c                      4181 non-null float64
OwnerId                               4181 non-null int8
Probability                           4181 non-null int64
Spigit_Rep_Assesment__c               4181 non-null uint8
Spigit_Stage__c                       4181 non-null uint8
StageName          

### Creating copy of opportunities dataframe for open deals

In [5]:
open_opportunities = opportunities.copy()

## For Closed opportunities

In [6]:
opportunities = opportunities[opportunities.IsClosed == True]

#### Splitting Data

In [7]:
X = opportunities.loc[:, opportunities.columns != 'IsWon']
y = opportunities['IsWon']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

#### Random Forest algorithm

In [8]:
rf = RandomForestClassifier(n_estimators=10000, random_state=0)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

#### Identify important features

In [9]:
for feature in zip(X, rf.feature_importances_):
    print(feature)

('Amount', 0.024721062880907124)
('Days_in_Quarter__c', 0.007459375812780178)
('ExpectedRevenue', 0.2834060451437956)
('FiscalQuarter', 0.001238041640114168)
('ForecastCategory', 0.0)
('HasOpportunityLineItem', 0.024028288412055038)
('InvoiceTotalIncludingVATFreight__c', 0.02905959777165972)
('IsClosed', 0.0)
('LineItemCount__c', 0.02487511109386473)
('NEW_Total_ACV__c', 0.005698974451708831)
('OwnerId', 0.005920863120899692)
('Probability', 0.5371922210248334)
('Spigit_Rep_Assesment__c', 0.01682078014787047)
('Spigit_Stage__c', 0.00015053424884069018)
('StageName', 0.03942910425066806)


#### Selecting important features and rebuilding Random Forest

In [10]:
features_to_keep_v1 = ['Amount', 'Days_in_Quarter__c', 'ExpectedRevenue',
                    'FiscalQuarter', 'HasOpportunityLineItem', 'InvoiceTotalIncludingVATFreight__c', 
                    'IsClosed', 'IsWon', 'LineItemCount__c', 'NEW_Total_ACV__c', 'OwnerId', 'Probability', 
                    'Spigit_Rep_Assesment__c', 'StageName']
opportunities = opportunities[features_to_keep_v1]

opportunities = opportunities[opportunities.IsClosed == True]

X = opportunities.loc[:, opportunities.columns != 'IsWon']
y = opportunities['IsWon']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

rf1 = RandomForestClassifier(n_estimators=10000, random_state=0)
rf1.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [11]:
print(type(X_train))

<class 'pandas.core.frame.DataFrame'>


In [12]:
X_train.to_csv('input_data.csv')

### Prediction and Accuracy

In [13]:
y_pred = rf1.predict(X_test)
accuracy_score(y_test, y_pred)

1.0

In [14]:
X_test.shape

(844, 13)

In [15]:
closed_predicted = pd.DataFrame(y_pred)
closed_predicted.shape

(844, 1)

In [16]:
#validation_dataset

validation_dataset = pd.concat([X_test.reset_index(drop=True), closed_predicted], axis=1)
validation_dataset.to_csv('closed_deals_validation_data.csv')

## Open Opportunities

In [17]:
open_opportunities = open_opportunities[open_opportunities.IsClosed == False]

In [18]:
open_opportunities = open_opportunities[features_to_keep_v1]

In [19]:
open_opportunities.shape

(807, 14)

In [20]:
X_open = open_opportunities.loc[:, open_opportunities.columns != 'IsWon']
y_open = open_opportunities['IsWon']

In [21]:
y_pred_open = rf1.predict(X_open)
accuracy_score(y_open, y_pred_open)

0.9553903345724907

In [22]:
open_predicted = pd.DataFrame(y_pred_open)

In [23]:
open_deals_validation = pd.concat([X_open.reset_index(drop=True), open_predicted], axis=1)
open_deals_validation.to_csv('open_deals_validation_data.csv')