# Winning Opportunity prediction

## Importing basic libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
# NOTE(review): this hides every warning category for the whole session,
# including deprecation notices from pandas / scikit-learn — confirm that a
# blanket filter is really wanted.
warnings.simplefilter('ignore')

## Loading Data

In [2]:
# Read the raw export once and keep it untouched; the cells below work on
# the `opportunities` copy.
# NOTE(review): the 'unicode_escape' codec also interprets backslash sequences
# found in the data — confirm this is the intended encoding for the file.
opportunities_original = pd.read_csv(
    filepath_or_buffer='Opportunity.csv',
    encoding='unicode_escape',
)
opportunities = opportunities_original.copy(deep=True)

In [3]:
# `null_counts` was deprecated in pandas 1.2 and removed in 2.0; `show_counts`
# is its replacement. It is passed explicitly so the per-column non-null counts
# are printed even for a wide frame like this one.
opportunities.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4181 entries, 0 to 4180
Data columns (total 185 columns):
Account_Development_Rep__c                941 non-null object
AccountId                                 4181 non-null object
Americas_Partner_Select__c                1 non-null object
Amount                                    3431 non-null float64
Amount_Billed_To_Date__c                  4181 non-null int64
Approved_Discount__c                      4181 non-null int64
Audit_Flag__c                             0 non-null float64
Bill_To_Customer__c                       222 non-null object
Business_Issues__c                        477 non-null object
CampaignId                                168 non-null object
CDR__c                                    0 non-null float64
CDT_Next_Step__c                          0 non-null float64
CDT_Next_Step_Due_Date__c                 0 non-null float64
CDT_Owner__c                              13 non-null object
CDT_Status__c               

In [4]:
# Number of missing values per column (`isna` is the same check as `isnull`).
opportunities.isna().sum()

Account_Development_Rep__c           3240
AccountId                               0
Americas_Partner_Select__c           4180
Amount                                750
Amount_Billed_To_Date__c                0
Approved_Discount__c                    0
Audit_Flag__c                        4181
Bill_To_Customer__c                  3959
Business_Issues__c                   3704
CampaignId                           4013
CDR__c                               4181
CDT_Next_Step__c                     4181
CDT_Next_Step_Due_Date__c            4181
CDT_Owner__c                         4168
CDT_Status__c                        4181
Closed_Lost_Reason__c                2785
CloseDate                               0
Collaboration_Primary_Interest__c       0
Connect_Sales_Decline_Reason__c      4177
CreatedById                             0
CreatedDate                             0
Credit_Card_Cardholders_Name__c      4180
Credit_Card_CVV__c                   4181
Credit_Card_Expiration_Date__c    

## Data Preparation

In [5]:
# Columns kept for modelling. 'IsWon' is the prediction target and 'IsClosed'
# is used further down to separate closed deals from open ones.
# NOTE(review): 'Probability', 'ForecastCategory', 'ForecastCategoryName' and
# 'ExpectedRevenue' look like they may be derived from the deal outcome —
# confirm they do not leak the target (the recorded hold-out accuracy is 1.0).
req_features = ['Amount', 'Days_in_Quarter__c', 'End_Customer_Region__c', 'ExpectedRevenue', 'FiscalQuarter', 
                'FiscalYear', 'ForecastCategory', 'ForecastCategoryName', 'HasOpportunityLineItem', 
                'InvoiceTotalIncludingVATFreight__c', 'IsClosed', 'IsSplit', 'LineItemCount__c', 'NEW_Total_ACV__c', 
                'Probability', 'PushCount__c', 'Spigit_Channel_Type__c', 'Spigit_Community_Type__c', 
                'Spigit_Rep_Assesment__c', 'Total_ACV__c', 'Total_Spigit_Consulting__c', 'Total_Spigit_Consulting_old__c', 
                'Total_Spigit_Subscriptions__c', 'Total_Spigit_Subscriptions_old__c', 'Total_Subscriptions__c', 
                'TotalOpportunityQuantity', 'Type', 'IsWon']

# Take an explicit copy: later cells assign into this frame, and assigning into
# a column subset of another frame triggers SettingWithCopyWarning.
opportunities = opportunities[req_features].copy()

### Dealing with missing values

In [6]:
# Categorical columns: treat a missing value as its own 'MISSING' category.
replace_with_MISSING = ['End_Customer_Region__c', 'Spigit_Channel_Type__c', 'Spigit_Community_Type__c', 
                        'Spigit_Rep_Assesment__c', 'Type']
opportunities[replace_with_MISSING] = opportunities[replace_with_MISSING].fillna('MISSING')


# Numeric columns: fill gaps with the column mean.
# 'End_Customer_Region__c' is deliberately NOT listed here: it is a text column
# already filled with 'MISSING' above, and taking the mean of a text column
# raises TypeError on pandas >= 2.0.
replace_with_mean = ['Amount', 'ExpectedRevenue', 'PushCount__c', 'Total_Spigit_Consulting_old__c', 
                     'Total_Spigit_Subscriptions_old__c', 'TotalOpportunityQuantity']
column_means = opportunities[replace_with_mean].mean()
opportunities[replace_with_mean] = opportunities[replace_with_mean].fillna(column_means)

## Scaling Data

In [7]:
from sklearn.preprocessing import StandardScaler
# Standardising scaler (centre on the mean, scale to unit variance); these are
# the library defaults, spelled out for the reader.
scaler = StandardScaler(with_mean=True, with_std=True)

In [8]:
# Standardise every int64/float64 column; columns of any other dtype are not
# selected and therefore stay as they are.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split — confirm this is acceptable.
numerics = ['int64', 'float64']
newdf = opportunities.select_dtypes(include=numerics)
numerical_colums = newdf.columns
scaled_values = scaler.fit_transform(newdf)
opportunities[numerical_colums] = scaled_values

## Converting object columns to dummy variables

In [9]:
# Cast every object column to 'category', then one-hot encode the frame.
# drop_first=True drops the first level of each encoded column.
newdf_O = opportunities.select_dtypes(include='object')
object_columns = newdf_O.columns

category_dtypes = {column: 'category' for column in object_columns}
opportunities = opportunities.astype(category_dtypes)

opportunities = pd.get_dummies(opportunities, drop_first=True)

## For Open Opportunities

In [10]:
# Snapshot of the fully prepared frame, taken before `opportunities` is
# narrowed to closed deals; the open deals are filtered from this copy later.
open_opportunities = opportunities.copy(deep=True)

## Closed opportunities

In [11]:
# Keep only the rows flagged as closed; these are used for training/testing.
closed_mask = opportunities['IsClosed'] == True
opportunities = opportunities[closed_mask]

#### Splitting data into Train-Test

In [12]:
# 'IsWon' is the target; every other column is used as a predictor.
# NOTE(review): the predictors still include 'IsClosed' (the same value for
# every row after the filter above) and outcome-looking fields such as
# 'Probability' — confirm there is no target leakage.
y = opportunities['IsWon']
X = opportunities.drop(columns='IsWon')

# 75 % / 25 % split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

#### Applying Random Forest

In [13]:
# Random forest with 10000 trees and a fixed seed, fitted on the training split.
forest_params = {'n_estimators': 10000, 'random_state': 0}
rf = RandomForestClassifier(**forest_params)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

#### Model Accuracy

In [14]:
# Predict the held-out closed deals and report the share predicted correctly.
y_pred = rf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

1.0

## Open Opportunities

In [15]:
# Keep only the rows that are not flagged as closed.
still_open = open_opportunities['IsClosed'] == False
open_opportunities = open_opportunities[still_open]

In [16]:
# Same predictor/target separation as for the closed deals.
y_open = open_opportunities['IsWon']
X_open = open_opportunities.drop(columns='IsWon')

In [17]:
# Score the open deals with the model trained on closed deals.
# NOTE(review): presumably open deals have no final outcome yet, so 'IsWon'
# may not be a meaningful ground truth here — confirm before reading this
# number as an accuracy.
y_pred_open = rf.predict(X_open)
accuracy_score(y_true=y_open, y_pred=y_pred_open)

0.9553903345724907

# Exporting Data

#### Input Data

In [20]:
# Write the training predictors (including their index) to disk.
X_train.to_csv(path_or_buf='input.csv')

In [21]:
# Put the test predictors and their predictions side by side and export them.
# The test index is reset so both frames share a 0..n-1 index and line up
# row by row in the concat.
closed_predicted = pd.DataFrame(y_pred)
test_features = X_test.reset_index(drop=True)
val_df = pd.concat([test_features, closed_predicted], axis='columns')
val_df.to_csv('closedVal.csv')

In [22]:
# Put the open-deal predictors and their predictions side by side and export
# them. The index is reset so both frames share a 0..n-1 index and line up
# row by row in the concat.
open_predicted = pd.DataFrame(y_pred_open)
open_features = X_open.reset_index(drop=True)
open_deals_validation = pd.concat([open_features, open_predicted], axis='columns')
open_deals_validation.to_csv('openVal.csv')