### Step 1: Reading and Understanding the Data

In [51]:
# Supress Warnings

import warnings
warnings.filterwarnings('ignore')

In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from sklearn.impute import SimpleImputer
import datetime as dt



In [53]:
lead = pd.read_csv('research_joint_data.csv', encoding = 'latin', error_bad_lines=False)
lead.head()

Unnamed: 0,Id,StageName,Status_Reason__c,RecordType.Name,Final_Record_Type__c,RICE_Supported__c,CreatedDate1,CreatedDate,CloseDate,Actual_Close_Date__c,...,emails,calls,meetings,tasks,events,other,total_contacts,days_till_first_contact,contacts_first_month,contacts_three_month
0,0062e000002Hc2PAAS,Closed Lost,Customer No Longer Interested,Custom Education (MSPACE Included),Custom Education (MSPACE Included),RIC-BD&I,2019-11-07,2019-11-07T00:59:46.000Z,31/8/20,30/4/20,...,,,,,,,,,,
1,0062e000002HFaaAAG,Post Award,,Parent Grant,Parent Grant,RIC-RE&D,2018-09-19,2018-09-19T04:32:55.000Z,31/12/49,,...,,,,,,,,,,
2,0062e000002HFabAAG,Post Award,,Parent Grant,Parent Grant,RIC-RE&D,2018-09-19,2018-09-19T04:32:19.000Z,22/6/20,22/6/20,...,,,,,,,,,,
3,0062e000002HFacAAG,Closed,,Parent Grant,Parent Grant,,2016-07-06,2016-07-06T05:03:00.000Z,31/12/49,,...,,,,,,,,,,
4,0062e000002HFadAAG,Closed,,Parent Grant,Parent Grant,,2016-07-21,2016-07-21T23:56:54.000Z,31/12/49,,...,,,,,,,,,,


In [54]:
# inspect lead dataframe

print("*********************************  Info *******************************************") 
print(lead.info())
print("*********************************  Shape ******************************************") 
print(lead.shape)
print("**************************** Columns having null values *****************************")
print(lead.isnull().any())
print("**************************** Describe *****************************")
lead.describe()

*********************************  Info *******************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7597 entries, 0 to 7596
Data columns (total 46 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Id                                7597 non-null   object 
 1   StageName                         7597 non-null   object 
 2   Status_Reason__c                  7288 non-null   object 
 3   RecordType.Name                   7597 non-null   object 
 4   Final_Record_Type__c              7577 non-null   object 
 5   RICE_Supported__c                 7569 non-null   object 
 6   CreatedDate1                      7597 non-null   object 
 7   CreatedDate                       7597 non-null   object 
 8   CloseDate                         7597 non-null   object 
 9   Actual_Close_Date__c              6293 non-null   object 
 10  Amount                            7365 non-null 

Unnamed: 0,Amount,Estimated_Project_Total_Value__c,Booked_Revenue__c,Actual_Project_Total_Value__c,UoM_Organisation_Level__c,emails,calls,meetings,tasks,events,other,total_contacts,days_till_first_contact,contacts_first_month,contacts_three_month
count,7365.0,4934.0,3113.0,3173.0,0.0,588.0,588.0,588.0,588.0,588.0,588.0,588.0,588.0,588.0,588.0
mean,416618.2,19493870.0,197417.1,364955.8,,9.52551,0.0,0.0,0.0,0.0,0.0,9.52551,340136100.0,4.943878,6.556122
std,2453104.0,739611500.0,1273635.0,3424244.0,,20.664622,0.0,0.0,0.0,0.0,0.0,20.664622,5827149000.0,12.428134,14.904874
min,0.0,0.0,0.0,0.0,,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-259.0,0.0,0.0
25%,0.0,4743.75,0.0,0.0,,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
50%,17820.0,60000.0,7530.0,7500.0,,3.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,1.0,2.0
75%,150000.0,300000.0,87000.0,90000.0,,8.0,0.0,0.0,0.0,0.0,0.0,8.0,69.75,4.0,6.0
max,80000000.0,30000000000.0,51300000.0,130000000.0,,218.0,0.0,0.0,0.0,0.0,0.0,218.0,100000000000.0,136.0,138.0


# Step 2: Data Cleaning

In [55]:
round(100*(lead.isnull().sum().sort_values(ascending=False)/len(lead.index)), 2)

UoM_Organisation_Level__c           100.00
Supporting_Faculty_2__c              99.07
Supporting_Faculty_1__c              95.60
CE_Course_Audience_Type__c           94.79
CE_Course_Type__c                    93.93
calls                                92.26
contacts_first_month                 92.26
emails                               92.26
contacts_three_month                 92.26
meetings                             92.26
tasks                                92.26
events                               92.26
other                                92.26
total_contacts                       92.26
days_till_first_contact              92.26
Customer_Contact__c                  88.07
Parent_Opportunity__c                87.07
ParentId                             84.99
Lead_Department__c                   75.89
Booked_Revenue__c                    59.02
Actual_Project_Total_Value__c        58.23
Industry_Sub_Type__c                 44.73
Lead_School__c                       43.77
BD_Division

### Check if there are any duplicate values in the dataset

In [56]:
lead[lead.duplicated(keep=False)]

Unnamed: 0,Id,StageName,Status_Reason__c,RecordType.Name,Final_Record_Type__c,RICE_Supported__c,CreatedDate1,CreatedDate,CloseDate,Actual_Close_Date__c,...,emails,calls,meetings,tasks,events,other,total_contacts,days_till_first_contact,contacts_first_month,contacts_three_month


There are no duplicate values in the data.

### (1) Transform to 1yes/0no

In [57]:
lead[["ParentId","Parent_Opportunity__c"]]=lead[["ParentId","Parent_Opportunity__c"]].notnull().astype(int)

In [58]:
#### lead[["Actual_Close_Date__c"]]=lead[["Actual_Close_Date__c"]].notnull().astype(int)

### (2) Drop 3 columns

In [59]:
#lead = lead.drop(["Lead_School__c","Industry_Sub_Type__c"],axis=1)

In [60]:
#check null values
round(100*(lead.isnull().sum().sort_values(ascending=False)/len(lead.index)), 2)

UoM_Organisation_Level__c           100.00
Supporting_Faculty_2__c              99.07
Supporting_Faculty_1__c              95.60
CE_Course_Audience_Type__c           94.79
CE_Course_Type__c                    93.93
total_contacts                       92.26
days_till_first_contact              92.26
contacts_first_month                 92.26
other                                92.26
contacts_three_month                 92.26
emails                               92.26
calls                                92.26
meetings                             92.26
tasks                                92.26
events                               92.26
Customer_Contact__c                  88.07
Lead_Department__c                   75.89
Booked_Revenue__c                    59.02
Actual_Project_Total_Value__c        58.23
Industry_Sub_Type__c                 44.73
Lead_School__c                       43.77
BD_Division__c                       38.96
Estimated_Project_Total_Value__c     35.05
BD_Cluster_

### (3) Delete opportunities without account id

In [61]:
lead.dropna(subset=['AccountId'],inplace=True)

In [62]:
round(100*(lead.isnull().sum().sort_values(ascending=False)/len(lead.index)), 2)

UoM_Organisation_Level__c           100.00
Supporting_Faculty_2__c              99.02
Supporting_Faculty_1__c              95.47
CE_Course_Audience_Type__c           94.52
CE_Course_Type__c                    93.61
calls                                91.61
contacts_first_month                 91.61
contacts_three_month                 91.61
meetings                             91.61
tasks                                91.61
events                               91.61
other                                91.61
total_contacts                       91.61
days_till_first_contact              91.61
emails                               91.61
Customer_Contact__c                  86.75
Lead_Department__c                   75.58
Booked_Revenue__c                    55.94
Actual_Project_Total_Value__c        55.07
Lead_School__c                       43.30
BD_Division__c                       39.31
Industry_Sub_Type__c                 38.61
BD_Cluster__c                        34.18
Estimated_P

### (4) CreatedDate, CloseDate change string to time, seperate year, month

In [63]:
lead.loc[:,"CloseDate"]

0        31/8/20
1       31/12/49
2        22/6/20
3       31/12/49
4       31/12/49
          ...   
7591    30/10/20
7592     30/9/20
7593     30/9/20
7595    31/12/20
7596    30/11/20
Name: CloseDate, Length: 6840, dtype: object

In [64]:
lead.loc[:,'CloseDate']=pd.to_datetime(lead.loc[:,'CloseDate'],
                                           format='%d/%m/%y', 
                                           errors='coerce')


In [65]:
lead['CloseYear'] = pd.DatetimeIndex(lead['CloseDate']).year  
lead.loc[:,"CloseYear"]

0       2020
1       2049
2       2020
3       2049
4       2049
        ... 
7591    2020
7592    2020
7593    2020
7595    2020
7596    2020
Name: CloseYear, Length: 6840, dtype: int64

In [66]:
lead['CloseMonth'] = pd.DatetimeIndex(lead['CloseDate']).month                                          

In [67]:
lead.loc[:,"CreatedDate"]

0       2019-11-07T00:59:46.000Z
1       2018-09-19T04:32:55.000Z
2       2018-09-19T04:32:19.000Z
3       2016-07-06T05:03:00.000Z
4       2016-07-21T23:56:54.000Z
                  ...           
7591    2020-08-06T00:24:37.000Z
7592    2020-08-06T01:33:22.000Z
7593    2020-08-06T01:40:55.000Z
7595    2020-08-06T11:13:34.000Z
7596    2020-08-06T14:36:01.000Z
Name: CreatedDate, Length: 6840, dtype: object

In [68]:
lead['CreatedYear'] = pd.DatetimeIndex(lead['CreatedDate']).year                                          

In [69]:
lead.loc[:,"CreatedYear"]

0       2019
1       2018
2       2018
3       2016
4       2016
        ... 
7591    2020
7592    2020
7593    2020
7595    2020
7596    2020
Name: CreatedYear, Length: 6840, dtype: int64

In [70]:
lead['CreatedMonth'] = pd.DatetimeIndex(lead['CreatedDate']).month 

In [71]:
lead.loc[:,"CreatedMonth"]

0       11
1        9
2        9
3        7
4        7
        ..
7591     8
7592     8
7593     8
7595     8
7596     8
Name: CreatedMonth, Length: 6840, dtype: int64

In [72]:
lead['year_length']=lead['CloseYear']-lead['CreatedYear']

### fix year_length

In [75]:
lead["CreatedDate1"] = pd.to_datetime(lead["CreatedDate1"], format='%Y/%m/%d')
lead.loc[:,"CreatedDate1"] 

0      2019-11-07
1      2018-09-19
2      2018-09-19
3      2016-07-06
4      2016-07-21
          ...    
7591   2020-08-06
7592   2020-08-06
7593   2020-08-06
7595   2020-08-06
7596   2020-08-06
Name: CreatedDate1, Length: 6840, dtype: datetime64[ns]

In [32]:
lead["CloseDate"] = pd.to_datetime(lead["CloseDate"], format='%Y/%m/%d')
lead.loc[:,"CloseDate"]

0      2020-08-31
1      2049-12-31
2      2020-06-22
3      2049-12-31
4      2049-12-31
          ...    
7591   2020-10-30
7592   2020-09-30
7593   2020-09-30
7595   2020-12-31
7596   2020-11-30
Name: CloseDate, Length: 6840, dtype: datetime64[ns]

In [76]:

lead['days']=(lead['CloseDate']-lead["CreatedDate1"]).dt.days
#lead['days_diff']=(lead['act_closeDate']-Opportunity_raw['createDate']).dt.days


### (5) Industry

In [77]:
lead['Industry'].value_counts()

Services                                    2457
Health                                      1530
Public Administration                        963
Defence                                      477
Technology: IT & Telephone Services          263
Agriculture & Horticulture                   259
Manufacturing & R&D                          212
Energy & Utilities                           191
Construction & Construction Materials        137
Transportation & Logistics                   108
Mining & Refining                             73
Materials: Chemicals & Industrial Metals      50
Education                                     43
Veterinary                                    35
Mining Equipment and Technology Services      17
Transportation                                 8
Aquaculture & Fisheries                        6
Forestry & Paper                               6
Advertising, Marketing and PR                  3
Name: Industry, dtype: int64

In [78]:
lead['Industry'].isnull().sum()

2

In [79]:
lead[lead['Industry'].isin([np.nan])]

Unnamed: 0,Id,StageName,Status_Reason__c,RecordType.Name,Final_Record_Type__c,RICE_Supported__c,CreatedDate1,CreatedDate,CloseDate,Actual_Close_Date__c,...,total_contacts,days_till_first_contact,contacts_first_month,contacts_three_month,CloseYear,CloseMonth,CreatedYear,CreatedMonth,year_length,days
5807,0062e000002JFzFAAW,Closed Lost,Academic Capacity (Lost),Consultancy (Non-research),Consultancy (Non-research),RIC-BD&I,2019-12-10,2019-12-10T03:10:48.000Z,2020-01-13,13/1/20,...,,,,,2020,1,2019,12,1,34
7459,0062e000002XzU5AAK,Closed Won,Won,Research Contract,Research Contract,RIC-BD&I,2020-07-16,2020-07-16T06:44:43.000Z,2020-07-31,24/7/20,...,,,,,2020,7,2020,7,0,15


In [80]:
# since "Services" has the major chunk of data, 
# we can impute the null values with Services
lead['Industry'] = lead['Industry'].replace(np.nan, "UNKNOWN")

### (6) Industry_Sub_Type__c

In [81]:
#for row in lead[lead['Industry'].isin(["Public Administration"])]:
    #lead['Industry_Sub_Type__c'] = lead['Industry_Sub_Type__c'].replace(np.nan, "Public Administration")
    
lead["Industry_Sub_Type__c"].fillna(lead["Industry"], inplace=True)

### (7) Status_Reason__c

In [82]:
#lead['Status_Reason__c'] = lead['Status_Reason__c'].replace(np.nan, "Other (Lost)")
lead[lead['Status_Reason__c'].isin([np.nan])]

Unnamed: 0,Id,StageName,Status_Reason__c,RecordType.Name,Final_Record_Type__c,RICE_Supported__c,CreatedDate1,CreatedDate,CloseDate,Actual_Close_Date__c,...,total_contacts,days_till_first_contact,contacts_first_month,contacts_three_month,CloseYear,CloseMonth,CreatedYear,CreatedMonth,year_length,days
1,0062e000002HFaaAAG,Post Award,,Parent Grant,Parent Grant,RIC-RE&D,2018-09-19,2018-09-19T04:32:55.000Z,2049-12-31,,...,,,,,2049,12,2018,9,31,11426
2,0062e000002HFabAAG,Post Award,,Parent Grant,Parent Grant,RIC-RE&D,2018-09-19,2018-09-19T04:32:19.000Z,2020-06-22,22/6/20,...,,,,,2020,6,2018,9,2,642
3,0062e000002HFacAAG,Closed,,Parent Grant,Parent Grant,,2016-07-06,2016-07-06T05:03:00.000Z,2049-12-31,,...,,,,,2049,12,2016,7,33,12231
4,0062e000002HFadAAG,Closed,,Parent Grant,Parent Grant,,2016-07-21,2016-07-21T23:56:54.000Z,2049-12-31,,...,,,,,2049,12,2016,7,33,12216
5,0062e000002HFaeAAG,Closed,,Parent Grant,Parent Grant,,2017-04-07,2017-04-07T01:30:10.000Z,2049-12-31,,...,,,,,2049,12,2017,4,32,11956
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7470,0062e000002YDxYAAW,Pre-submission,,Parent Grant,Parent Grant,RIC-RE&D,2020-07-20,2020-07-20T03:45:24.000Z,2020-07-31,,...,,,,,2020,7,2020,7,0,11
7471,0062e000002YE2DAAW,Pre-submission,,Parent Grant,Parent Grant,RIC-RE&D,2020-07-20,2020-07-20T03:58:43.000Z,2020-12-31,,...,,,,,2020,12,2020,7,0,164
7491,0062e000002YKGJAA4,Pre-submission,,Parent Grant,Parent Grant,RIC-BD&I,2020-07-21,2020-07-21T06:41:52.000Z,2020-10-31,,...,,,,,2020,10,2020,7,0,102
7514,0062e000002YrlxAAC,Awaiting Results,,Parent Grant,Parent Grant,RIC-BD&I,2020-07-29,2020-07-29T00:25:39.000Z,2020-11-30,,...,,,,,2020,11,2020,7,0,124


In [83]:
lead["Status_Reason__c"].fillna(lead["StageName"], inplace=True)

### (8) 'RICE_Supported__c'

In [84]:
lead['RICE_Supported__c'].value_counts()

RIC-BD&I             3027
RIC RE&D and BD&I    2702
Not supported         552
RIC-RE&D              531
Name: RICE_Supported__c, dtype: int64

In [85]:
lead[lead['RICE_Supported__c'].isin([np.nan])]

Unnamed: 0,Id,StageName,Status_Reason__c,RecordType.Name,Final_Record_Type__c,RICE_Supported__c,CreatedDate1,CreatedDate,CloseDate,Actual_Close_Date__c,...,total_contacts,days_till_first_contact,contacts_first_month,contacts_three_month,CloseYear,CloseMonth,CreatedYear,CreatedMonth,year_length,days
3,0062e000002HFacAAG,Closed,Closed,Parent Grant,Parent Grant,,2016-07-06,2016-07-06T05:03:00.000Z,2049-12-31,,...,,,,,2049,12,2016,7,33,12231
4,0062e000002HFadAAG,Closed,Closed,Parent Grant,Parent Grant,,2016-07-21,2016-07-21T23:56:54.000Z,2049-12-31,,...,,,,,2049,12,2016,7,33,12216
5,0062e000002HFaeAAG,Closed,Closed,Parent Grant,Parent Grant,,2017-04-07,2017-04-07T01:30:10.000Z,2049-12-31,,...,,,,,2049,12,2017,4,32,11956
6,0062e000002HFafAAG,Closed,Closed,Parent Grant,Parent Grant,,2017-06-27,2017-06-27T01:24:02.000Z,2049-12-31,,...,,,,,2049,12,2017,6,32,11875
7,0062e000002HFagAAG,Closed,Closed,Parent Grant,Parent Grant,,2017-11-27,2017-11-27T22:16:34.000Z,2049-12-31,,...,,,,,2049,12,2017,11,32,11722
8,0062e000002HFahAAG,Closed,Closed,Parent Grant,Parent Grant,,2016-08-22,2016-08-22T01:40:33.000Z,2049-12-31,,...,,,,,2049,12,2016,8,33,12184
10,0062e000002HFajAAG,Closed,Closed,Parent Grant,Parent Grant,,2017-12-15,2017-12-15T01:48:39.000Z,2049-12-31,,...,,,,,2049,12,2017,12,32,11704
11,0062e000002HFakAAG,Closed,Closed,Parent Grant,Parent Grant,,2017-02-14,2017-02-14T00:14:42.000Z,2049-12-31,,...,,,,,2049,12,2017,2,32,12008
13,0062e000002HFamAAG,Closed,Closed,Parent Grant,Parent Grant,,2017-04-07,2017-04-07T04:48:26.000Z,2049-12-31,,...,,,,,2049,12,2017,4,32,11956
14,0062e000002HFanAAG,Closed,Closed,Parent Grant,Parent Grant,,2016-09-20,2016-09-20T00:32:03.000Z,2049-12-31,,...,,,,,2049,12,2016,9,33,12155


In [86]:
lead['RICE_Supported__c'] = lead['RICE_Supported__c'].replace(np.nan, "NotGiven")

### (9) RecordType.Name.1

In [87]:
lead['RecordType.Name.1'].value_counts()


Business Organization    6796
University Department      43
Administrative              1
Name: RecordType.Name.1, dtype: int64

In [88]:
lead[lead['RecordType.Name.1'].isin([np.nan])]

Unnamed: 0,Id,StageName,Status_Reason__c,RecordType.Name,Final_Record_Type__c,RICE_Supported__c,CreatedDate1,CreatedDate,CloseDate,Actual_Close_Date__c,...,total_contacts,days_till_first_contact,contacts_first_month,contacts_three_month,CloseYear,CloseMonth,CreatedYear,CreatedMonth,year_length,days


### (10) 'Business_Type__c'

In [89]:
lead['Business_Type__c'].value_counts()

Government: Australia: Federal                       1419
Government: Australia: Local & State                 1288
SME (small to medium enterprise)                      914
Multinational / Other Large Corporate                 853
Not for profit                                        823
University                                            535
Large Australian Corporate                            440
Government: International                             321
PFRO (Publicly-Funded Research Organisation)          101
RDC (Rural Research and Development Corporations)      53
CRC (Co-operative Research Centre)                     48
Name: Business_Type__c, dtype: int64

In [90]:
lead[lead['Business_Type__c'].isin([np.nan])]

Unnamed: 0,Id,StageName,Status_Reason__c,RecordType.Name,Final_Record_Type__c,RICE_Supported__c,CreatedDate1,CreatedDate,CloseDate,Actual_Close_Date__c,...,total_contacts,days_till_first_contact,contacts_first_month,contacts_three_month,CloseYear,CloseMonth,CreatedYear,CreatedMonth,year_length,days
82,0062e000002HFtrAAG,Closed Lost,Other (Lost),Custom Education (MSPACE Included),Custom Education (MSPACE Included),RIC RE&D and BD&I,2016-06-01,2016-06-01T06:31:24.000Z,2019-11-01,11/12/18,...,,,,,2019,11,2016,6,3,1248
84,0062e000002HFtsAAG,Closed Lost,Other (Lost),Custom Education (MSPACE Included),Custom Education (MSPACE Included),RIC RE&D and BD&I,2017-05-09,2017-05-09T06:26:42.000Z,2019-11-01,11/12/18,...,,,,,2019,11,2017,5,2,906
86,0062e000002HFttAAG,Closed Lost,Other (Lost),Research Contract,Research Contract,RIC RE&D and BD&I,2016-06-01,2016-06-01T06:17:28.000Z,2019-11-01,11/12/18,...,,,,,2019,11,2016,6,3,1248
88,0062e000002HFtuAAG,Closed Lost,Other (Lost),Custom Education (MSPACE Included),Custom Education (MSPACE Included),Not supported,2018-01-22,2018-01-22T00:12:49.000Z,2019-11-01,11/12/18,...,,,,,2019,11,2018,1,1,648
90,0062e000002HFtvAAG,Closed Lost,Other (Lost),Custom Education (MSPACE Included),Custom Education (MSPACE Included),RIC RE&D and BD&I,2017-04-28,2017-04-28T04:09:27.000Z,2019-11-01,27/11/18,...,,,,,2019,11,2017,4,2,917
92,0062e000002HFtwAAG,Closed Lost,Other (Lost),Custom Education (MSPACE Included),Custom Education (MSPACE Included),Not supported,2018-01-22,2018-01-22T00:11:00.000Z,2019-11-01,11/12/18,...,,,,,2019,11,2018,1,1,648
94,0062e000002HFtxAAG,Closed Lost,Other (Lost),Custom Education (MSPACE Included),Custom Education (MSPACE Included),RIC RE&D and BD&I,2017-08-27,2017-08-27T10:47:44.000Z,2019-11-01,27/11/18,...,,,,,2019,11,2017,8,2,796
96,0062e000002HFtyAAG,Closed Lost,Other (Lost),Custom Education (MSPACE Included),Custom Education (MSPACE Included),RIC RE&D and BD&I,2016-02-03,2016-02-03T22:45:38.000Z,2019-11-01,11/12/18,...,,,,,2019,11,2016,2,3,1367
98,0062e000002HFtzAAG,Closed Lost,Other (Lost),Custom Education (MSPACE Included),Custom Education (MSPACE Included),RIC RE&D and BD&I,2017-10-23,2017-10-23T22:54:26.000Z,2019-11-01,27/11/18,...,,,,,2019,11,2017,10,2,739
100,0062e000002HFu0AAG,Closed Lost,Other (Lost),Internship,Internship,RIC RE&D and BD&I,2016-06-01,2016-06-01T11:24:39.000Z,2019-11-01,11/12/18,...,,,,,2019,11,2016,6,3,1248


In [91]:
lead['Business_Type__c'] = lead['Business_Type__c'].replace(np.nan, "Not for profit")

### (11) 'Lead_Faculty__c','Lead_School__c'

In [92]:
lead['Lead_Faculty__c'] = lead['Lead_Faculty__c'].replace(np.nan, 'NotGiven')
lead['Lead_School__c'] = lead['Lead_School__c'].replace(np.nan, 'NotGiven')
#lead['Lead_Faculty__c'].value_counts()
round(100*(lead.isnull().sum().sort_values(ascending=False)/len(lead.index)), 2)
                                                      

UoM_Organisation_Level__c           100.00
Supporting_Faculty_2__c              99.02
Supporting_Faculty_1__c              95.47
CE_Course_Audience_Type__c           94.52
CE_Course_Type__c                    93.61
emails                               91.61
contacts_three_month                 91.61
contacts_first_month                 91.61
days_till_first_contact              91.61
total_contacts                       91.61
other                                91.61
events                               91.61
tasks                                91.61
meetings                             91.61
calls                                91.61
Customer_Contact__c                  86.75
Lead_Department__c                   75.58
Booked_Revenue__c                    55.94
Actual_Project_Total_Value__c        55.07
BD_Division__c                       39.31
BD_Cluster__c                        34.18
Estimated_Project_Total_Value__c     31.74
Lead_Academic_contact__c             25.26
Actual_Clos

### (12) StageName 

In [93]:
lead['Status_Reason__c'].value_counts()

Other (Lost)                         1985
Won                                  1984
On Track                              802
Customer No Longer Interested         607
Proposal/Application Unsuccessful     434
Academic Capacity (Lost)              198
Out-Sold                              133
Pre-submission                        100
Awaiting Customer Action               90
Customer Not Responding                87
Academic No Longer Interested          70
Awaiting Results                       63
Closed                                 56
Not Ready for Market                   40
Submitting                             32
Post Award                             27
Closed Lost                            20
Failure to Agree - Scope of Work       13
BD Capacity (Lost)                     13
Failure to Agree - Price               13
Academic Capacity (Open)               10
Unacceptable Risk                       9
Other (Open)                            8
Price                             

In [94]:
lead['StageName'].value_counts()

Closed Lost                                  3389
Closed Won                                   1706
Identifying                                   404
Developing                                    286
Awaiting Results                              239
Proposing                                     226
Closing                                       148
Application                                   115
Pre-submission                                104
Closed                                         56
Execution / Closing                            33
Submitting                                     32
Execute Contract                               30
Post Award                                     27
Develop Relationship/ Qualify opportunity      18
Notice                                         14
Negotiation / Propose                           7
Triage / Develop Opportunity                    6
Name: StageName, dtype: int64

In [95]:
lead[lead['StageName'].isin(["Closed Won"])].count()/(len(lead.index))

Id                                  0.249415
StageName                           0.249415
Status_Reason__c                    0.249415
RecordType.Name                     0.249415
Final_Record_Type__c                0.249415
RICE_Supported__c                   0.249415
CreatedDate1                        0.249415
CreatedDate                         0.249415
CloseDate                           0.249415
Actual_Close_Date__c                0.249415
Amount                              0.249415
Estimated_Project_Total_Value__c    0.248392
Booked_Revenue__c                   0.247368
Actual_Project_Total_Value__c       0.247368
BD_Cluster__c                       0.177924
BD_Division__c                      0.169006
CE_Course_Audience_Type__c          0.013304
CE_Course_Type__c                   0.016813
AccountId                           0.249415
Customer_Contact__c                 0.014912
Lead_Academic_contact__c            0.239181
Lead_Faculty__c                     0.249415
Lead_Schoo

Converted rate was 0.25

In [96]:
lead['StageName'] = lead['StageName'].replace('Post Award', "Closed Won")

In [97]:
lead.loc[(lead['Status_Reason__c'] == 'Won')|(lead['Status_Reason__c'] == 'Post Award')|(lead['Status_Reason__c'] == 'Closed Won'), 'StageName'] = 'Closed Won'

In [98]:
lead[lead['StageName'].isin(["Closed Won"])].count()/(len(lead.index))

Id                                  0.295468
StageName                           0.295468
Status_Reason__c                    0.295468
RecordType.Name                     0.295468
Final_Record_Type__c                0.295468
RICE_Supported__c                   0.295468
CreatedDate1                        0.295468
CreatedDate                         0.295468
CloseDate                           0.295468
Actual_Close_Date__c                0.294444
Amount                              0.294006
Estimated_Project_Total_Value__c    0.290351
Booked_Revenue__c                   0.289035
Actual_Project_Total_Value__c       0.289035
BD_Cluster__c                       0.215058
BD_Division__c                      0.205117
CE_Course_Audience_Type__c          0.017105
CE_Course_Type__c                   0.020760
AccountId                           0.295468
Customer_Contact__c                 0.042690
Lead_Academic_contact__c            0.281140
Lead_Faculty__c                     0.295468
Lead_Schoo

converted 0.31

In [99]:
#lead = lead.drop(["Id","Status_Reason__c","Actual_Close_Date__c","AccountId","CreatedDate","RecordType.Name.1","CloseDate","CloseMonth","CloseYear"],axis=1)

In [100]:
## drop irrelevant rows from "StageName"

#lead = lead.drop[lead.StageName != 'Closed Lost' or lead.StageName != 'Closed Won' ]
indexNames1 = lead[lead['StageName'] == 'Identifying'].index
lead.drop(indexNames1 , inplace=True)
indexNames2 = lead[lead['StageName'] == 'Awaiting Results'].index
lead.drop(indexNames2, inplace=True)
indexNames3 = lead[lead['StageName'] == 'Developing'].index
lead.drop(indexNames3 , inplace=True)
indexNames4 = lead[lead['StageName'] == 'Closing'].index
lead.drop(indexNames4 , inplace=True)
indexNames5 = lead[lead['StageName'] == 'Proposing'].index
lead.drop(indexNames5 , inplace=True)
indexNames6 = lead[lead['StageName'] == 'Application'].index
lead.drop(indexNames6, inplace=True)
indexNames7 = lead[lead['StageName'] == 'Pre-submission'].index
lead.drop(indexNames7, inplace=True)
indexNames8 = lead[lead['StageName'] == 'Closing'].index
lead.drop(indexNames8, inplace=True)
indexNames9 = lead[lead['StageName'] == 'Application'].index
lead.drop(indexNames9, inplace=True)
indexNames10 = lead[lead['StageName'] == 'Pre-submission'].index
lead.drop(indexNames10, inplace=True)
indexNames11 = lead[lead['StageName'] == 'Closed'].index
lead.drop(indexNames11, inplace=True)
indexNames12 = lead[lead['StageName'] == 'Execution / Closing'].index
lead.drop(indexNames12, inplace=True)
indexNames13 = lead[lead['StageName'] == 'Submitting'].index
lead.drop(indexNames13, inplace=True)
indexNames14 = lead[lead['StageName'] == 'Execute Contract'].index
lead.drop(indexNames14 , inplace=True)
indexNames15 = lead[lead['StageName'] == 'Develop Relationship/ Qualify opportunity'].index
lead.drop(indexNames15 , inplace=True)
indexNames16 = lead[lead['StageName'] == 'Notice'].index
lead.drop(indexNames16 , inplace=True)
indexNames17 = lead[lead['StageName'] == 'Negotiation / Propose'].index
lead.drop(indexNames17 , inplace=True)
indexNames18 = lead[lead['StageName'] == 'Triage / Develop Opportunity'].index
lead.drop(indexNames18 , inplace=True)

lead['StageName'].value_counts()


#lead.drop(lead.index[lead['StageName'] != 'Closed Lost' or lead['StageName'] != 'Closed Won' ], inplace = True)
#df.drop(df.index[df['line_race'] == 0], inplace = True)

Closed Lost    3387
Closed Won     2021
Name: StageName, dtype: int64

In [101]:
lead['StageName']=lead['StageName'].replace("Closed Won", 1)
lead['StageName']=lead['StageName'].replace("Closed Lost", 0)
lead['StageName'].value_counts()

0    3387
1    2021
Name: StageName, dtype: int64

### (13) External(1)

In [102]:
lead['Is_External__c']=lead['Is_External__c'].replace("External", 1)
lead['Is_External__c']=lead['Is_External__c'].replace("Internal", 0)
lead['StageName'].value_counts()

0    3387
1    2021
Name: StageName, dtype: int64

### (14) Combine two columns

In [103]:
lead["oppo_busi"]=lead['RecordType.Name']+lead['Business_Type__c']
lead["fac_oppo"]=lead['Lead_Faculty__c']+lead['RecordType.Name']
lead["oppo_Industry"]=lead['RecordType.Name']+lead['Industry']

In [104]:
#only closed won closed loss is remained

#lead['StageName'] = lead['StageName'].replace('Post Award', "Closed Won")



#data = lead.values
#print(len(data))
#new_data=[]

#for row in lead.values:
#    if row[2] == 'Won' or row[1]=='Post Award':#closing,developing, waiting result, proposing, iden
#        row[1] = 'Closed Won'
        
#    elif row[2] == 'Academic Capacity \(Lost\)' or row[2]=='Other \(Lost\)' or row[2] == "Customer No Longer Interested":
#        row[1] = 'Closed Lost'


#for row in data:
#    if row[1] == "Closed Lost" or row[1] != "Closed Won":
#        new_data.append(row)

#print(len(new_data))

#lead.values = new_data

#print(lead)
#print(lead.keys)


In [105]:
lead.to_csv("cleaned-7-Oct.csv",index=False)

In [106]:
#delete acct id, status reason, created date, close date, oppo id

# Adding Task/Events column

In [107]:
lead.columns

Index(['Id', 'StageName', 'Status_Reason__c', 'RecordType.Name',
       'Final_Record_Type__c', 'RICE_Supported__c', 'CreatedDate1',
       'CreatedDate', 'CloseDate', 'Actual_Close_Date__c', 'Amount',
       'Estimated_Project_Total_Value__c', 'Booked_Revenue__c',
       'Actual_Project_Total_Value__c', 'BD_Cluster__c', 'BD_Division__c',
       'CE_Course_Audience_Type__c', 'CE_Course_Type__c', 'AccountId',
       'Customer_Contact__c', 'Lead_Academic_contact__c', 'Lead_Faculty__c',
       'Lead_School__c', 'Lead_Department__c', 'Supporting_Faculty_1__c',
       'Supporting_Faculty_2__c', 'OwnerId', 'Parent_Opportunity__c',
       'RecordType.Name.1', 'Industry', 'Industry_Sub_Type__c',
       'Business_Type__c', 'Country__c', 'Is_External__c', 'ParentId',
       'UoM_Organisation_Level__c', 'emails', 'calls', 'meetings', 'tasks',
       'events', 'other', 'total_contacts', 'days_till_first_contact',
       'contacts_first_month', 'contacts_three_month', 'CloseYear',
       'CloseMont

In [108]:
tasks_events = []
for each in lead.emails.isna():
    if each==True:
        tasks_events.append(0)
    else:
        tasks_events.append(1)

lead["tasks_events"] = tasks_events

-----------

# Step 3: Bing's f_won_before

In [109]:
lead.columns

Index(['Id', 'StageName', 'Status_Reason__c', 'RecordType.Name',
       'Final_Record_Type__c', 'RICE_Supported__c', 'CreatedDate1',
       'CreatedDate', 'CloseDate', 'Actual_Close_Date__c', 'Amount',
       'Estimated_Project_Total_Value__c', 'Booked_Revenue__c',
       'Actual_Project_Total_Value__c', 'BD_Cluster__c', 'BD_Division__c',
       'CE_Course_Audience_Type__c', 'CE_Course_Type__c', 'AccountId',
       'Customer_Contact__c', 'Lead_Academic_contact__c', 'Lead_Faculty__c',
       'Lead_School__c', 'Lead_Department__c', 'Supporting_Faculty_1__c',
       'Supporting_Faculty_2__c', 'OwnerId', 'Parent_Opportunity__c',
       'RecordType.Name.1', 'Industry', 'Industry_Sub_Type__c',
       'Business_Type__c', 'Country__c', 'Is_External__c', 'ParentId',
       'UoM_Organisation_Level__c', 'emails', 'calls', 'meetings', 'tasks',
       'events', 'other', 'total_contacts', 'days_till_first_contact',
       'contacts_first_month', 'contacts_three_month', 'CloseYear',
       'CloseMont

In [120]:
Opportunity_raw = lead[['Id','AccountId','Actual_Close_Date__c','CreatedDate',
        'StageName', 'RecordType.Name', 'RICE_Supported__c', 'Lead_Faculty__c',
       'Lead_School__c', 'Parent_Opportunity__c', 'Industry',
       'Industry_Sub_Type__c', 'Business_Type__c', 'Is_External__c',
       'ParentId', 'tasks_events','CreatedYear', 'CreatedMonth', 'year_length', "days",'oppo_busi',
       'fac_oppo', 'oppo_Industry']]

In [121]:
Opportunity_raw.head()

Unnamed: 0,Id,AccountId,Actual_Close_Date__c,CreatedDate,StageName,RecordType.Name,RICE_Supported__c,Lead_Faculty__c,Lead_School__c,Parent_Opportunity__c,...,Is_External__c,ParentId,tasks_events,CreatedYear,CreatedMonth,year_length,days,oppo_busi,fac_oppo,oppo_Industry
0,0062e000002Hc2PAAS,0012e000003AqeVAAS,30/4/20,2019-11-07T00:59:46.000Z,0,Custom Education (MSPACE Included),RIC-BD&I,0012e000002ZGfbAAG,0012e000002Zt0mAAC,0,...,1,0,0,2019,11,1,298,Custom Education (MSPACE Included)Multinationa...,0012e000002ZGfbAAGCustom Education (MSPACE Inc...,Custom Education (MSPACE Included)Health
1,0062e000002HFaaAAG,0012e000003A6ElAAK,,2018-09-19T04:32:55.000Z,1,Parent Grant,RIC-RE&D,NotGiven,NotGiven,0,...,1,1,0,2018,9,31,11426,Parent GrantGovernment: Australia: Federal,NotGivenParent Grant,Parent GrantPublic Administration
2,0062e000002HFabAAG,0012e000003A6ElAAK,22/6/20,2018-09-19T04:32:19.000Z,1,Parent Grant,RIC-RE&D,NotGiven,NotGiven,0,...,1,1,0,2018,9,2,642,Parent GrantGovernment: Australia: Federal,NotGivenParent Grant,Parent GrantPublic Administration
9,0062e000002HFaiAAG,0012e000003A6bnAAC,16/6/20,2019-02-13T19:41:22.000Z,1,Parent Grant,Not supported,NotGiven,NotGiven,0,...,1,0,0,2019,2,1,489,Parent GrantGovernment: Australia: Federal,NotGivenParent Grant,Parent GrantHealth
12,0062e000002HFalAAG,0012e000003A6osAAC,22/6/20,2017-04-07T01:27:08.000Z,1,Parent Grant,RIC-RE&D,NotGiven,NotGiven,0,...,1,1,0,2017,4,3,1172,Parent GrantGovernment: Australia: Federal,NotGivenParent Grant,Parent GrantPublic Administration


In [122]:
Opportunity_raw['CreatedDate'] = Opportunity_raw['CreatedDate'].str[:10]
Opportunity_raw['CreatedDate'] = pd.to_datetime(Opportunity_raw['CreatedDate'])
Opportunity_raw['Actual_Close_Date__c'] = pd.to_datetime(Opportunity_raw['Actual_Close_Date__c'])

In [134]:
Opportunity_raw['RN'] = Opportunity_raw.sort_values(['CreatedDate','Actual_Close_Date__c'], ascending=[True,True]).groupby(['AccountId']).cumcount() + 1
Opportunity_raw.loc[Opportunity_raw["AccountId"].isnull(),"RN"] = 0
Opportunity_raw_after=Opportunity_raw[['Id','AccountId','Actual_Close_Date__c','CreatedDate',
                            'StageName', 'RecordType.Name', 'RICE_Supported__c', 'Lead_Faculty__c',
                           'Lead_School__c', 'Parent_Opportunity__c', 'Industry',
                           'Industry_Sub_Type__c', 'Business_Type__c', 'Is_External__c',
                           'ParentId', 'tasks_events','CreatedYear', 'CreatedMonth', 'year_length',"days",'oppo_busi',
                           'fac_oppo', 'oppo_Industry','RN']]

#Opportunity_raw_after=Opportunity_raw[['outcome','Opptype','Supppoted', 
#                          'days_diff','createDate','act_closeDate','closeDate_month','createDate_month',
#                            'closeDate_year','createDate_year','act_amt','act_proj','accountId',
#                            'lead_faculty','f_parent_opp','RN']]

In [135]:
Opportunity_raw_won=Opportunity_raw_after[Opportunity_raw_after.StageName==1]
Opportunity_raw_won['RN_2'] = Opportunity_raw_won.sort_values(['CreatedDate','Actual_Close_Date__c'], ascending=[True,True]).groupby(['AccountId']).cumcount() + 1
Opportunity_raw_won=Opportunity_raw_won[Opportunity_raw_won.RN_2==1]

Opportunity_raw_won=Opportunity_raw_won[['AccountId','CreatedDate']]
Opportunity_raw_won.columns=['accountID_match','firstwon_createdate']
Opportunity_raw_after=Opportunity_raw_after.merge(Opportunity_raw_won, left_on='AccountId', right_on='accountID_match',how='left')
Opportunity_raw_after['f_won_before']=np.where((Opportunity_raw_after['CreatedDate'] > Opportunity_raw_after['firstwon_createdate']), 1, 0)
Opportunity_raw_after.loc[Opportunity_raw_after["AccountId"].isnull(),"f_won_before"] = 0


In [136]:
# for modelling
Opportunity_raw_after = Opportunity_raw_after.drop(['Id', 'AccountId', 'Actual_Close_Date__c', 'CreatedDate',
                                                    'accountID_match', 'firstwon_createdate',"year_length"],axis=1)
# for visual

    

In [137]:
Opportunity_raw_after

Unnamed: 0,StageName,RecordType.Name,RICE_Supported__c,Lead_Faculty__c,Lead_School__c,Parent_Opportunity__c,Industry,Industry_Sub_Type__c,Business_Type__c,Is_External__c,ParentId,tasks_events,CreatedYear,CreatedMonth,days,oppo_busi,fac_oppo,oppo_Industry,RN,f_won_before
0,0,Custom Education (MSPACE Included),RIC-BD&I,0012e000002ZGfbAAG,0012e000002Zt0mAAC,0,Health,Health Care & Healthy Aging,Multinational / Other Large Corporate,1,0,0,2019,11,298,Custom Education (MSPACE Included)Multinationa...,0012e000002ZGfbAAGCustom Education (MSPACE Inc...,Custom Education (MSPACE Included)Health,1,0
1,1,Parent Grant,RIC-RE&D,NotGiven,NotGiven,0,Public Administration,Public Administration,Government: Australia: Federal,1,1,0,2018,9,11426,Parent GrantGovernment: Australia: Federal,NotGivenParent Grant,Parent GrantPublic Administration,10,1
2,1,Parent Grant,RIC-RE&D,NotGiven,NotGiven,0,Public Administration,Public Administration,Government: Australia: Federal,1,1,0,2018,9,642,Parent GrantGovernment: Australia: Federal,NotGivenParent Grant,Parent GrantPublic Administration,9,1
3,1,Parent Grant,Not supported,NotGiven,NotGiven,0,Health,Health Care & Healthy Aging,Government: Australia: Federal,1,0,0,2019,2,489,Parent GrantGovernment: Australia: Federal,NotGivenParent Grant,Parent GrantHealth,67,1
4,1,Parent Grant,RIC-RE&D,NotGiven,NotGiven,0,Public Administration,Public Administration,Government: Australia: Federal,1,1,0,2017,4,1172,Parent GrantGovernment: Australia: Federal,NotGivenParent Grant,Parent GrantPublic Administration,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5403,1,Research Contract,RIC-BD&I,0012e000002ZGfbAAG,0012e000002Zt10AAC,1,Health,Biotechnology (non-Pharma),SME (small to medium enterprise),1,0,0,2020,7,0,Research ContractSME (small to medium enterprise),0012e000002ZGfbAAGResearch Contract,Research ContractHealth,1,0
5404,0,Competitive Bid,RIC-BD&I,NotGiven,NotGiven,0,Construction & Construction Materials,Construction & Construction Materials,Not for profit,1,0,1,2020,7,74,Competitive BidNot for profit,NotGivenCompetitive Bid,Competitive BidConstruction & Construction Mat...,1,0
5405,1,Custom Education (MSPACE Included),RIC-BD&I,0012e000002ZmnzAAC,0012e000002Z3p1AAC,0,Services,Education,University,1,0,0,2020,7,0,Custom Education (MSPACE Included)University,0012e000002ZmnzAACCustom Education (MSPACE Inc...,Custom Education (MSPACE Included)Services,278,1
5406,1,Custom Education (MSPACE Included),RIC-BD&I,0012e000002ZmnzAAC,0012e000002Z3p1AAC,0,Services,Education,University,1,0,0,2020,7,0,Custom Education (MSPACE Included)University,0012e000002ZmnzAACCustom Education (MSPACE Inc...,Custom Education (MSPACE Included)Services,279,1


In [138]:
Opportunity_raw_after.to_csv("cleaned-16-Oct.csv",index=False)

In [139]:
sum(tasks_events)

344