### Step 1: Reading and Understanding the Data

In [179]:
# Supress Warnings

import warnings
warnings.filterwarnings('ignore')

In [180]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from sklearn.impute import SimpleImputer



In [181]:
lead = pd.read_csv('research_joint_data.csv', encoding = 'latin', error_bad_lines=False)
lead.head()

Unnamed: 0,Id,StageName,Status_Reason__c,RecordType.Name,RICE_Supported__c,CreatedDate,CloseDate,Actual_Close_Date__c,AccountId,Lead_Faculty__c,Lead_School__c,Parent_Opportunity__c,RecordType.Name.1,Industry,Industry_Sub_Type__c,Business_Type__c,Is_External__c,ParentId
0,0062e000002Hc2PAAS,Closed Lost,Customer No Longer Interested,Custom Education (MSPACE Included),RIC-BD&I,2019-11-07T00:59:46.000Z,31/8/20,30/4/20,0012e000003AqeVAAS,0012e000002ZGfbAAG,0012e000002Zt0mAAC,,Business Organization,Health,Health Care & Healthy Aging,Multinational / Other Large Corporate,External,
1,0062e000002HFaaAAG,Post Award,,Parent Grant,RIC-RE&D,2018-09-19T04:32:55.000Z,31/12/49,,0012e000003A6ElAAK,,,,Business Organization,Public Administration,,Government: Australia: Federal,External,0012e000003A6osAAC
2,0062e000002HFabAAG,Post Award,,Parent Grant,RIC-RE&D,2018-09-19T04:32:19.000Z,22/6/20,22/6/20,0012e000003A6ElAAK,,,,Business Organization,Public Administration,,Government: Australia: Federal,External,0012e000003A6osAAC
3,0062e000002HFacAAG,Closed,,Parent Grant,,2016-07-06T05:03:00.000Z,31/12/49,,0012e000003A6osAAC,,,,Business Organization,Public Administration,,Government: Australia: Federal,External,0012e000003A6FfAAK
4,0062e000002HFadAAG,Closed,,Parent Grant,,2016-07-21T23:56:54.000Z,31/12/49,,0012e000003A79XAAS,,,,Business Organization,Public Administration,,Government: Australia: Federal,External,


In [182]:
# inspect lead dataframe

print("*********************************  Info *******************************************") 
print(lead.info())
print("*********************************  Shape ******************************************") 
print(lead.shape)
print("**************************** Columns having null values *****************************")
print(lead.isnull().any())
print("**************************** Describe *****************************")
lead.describe()

*********************************  Info *******************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7597 entries, 0 to 7596
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Id                     7597 non-null   object
 1   StageName              7597 non-null   object
 2   Status_Reason__c       7288 non-null   object
 3   RecordType.Name        7597 non-null   object
 4   RICE_Supported__c      7569 non-null   object
 5   CreatedDate            7597 non-null   object
 6   CloseDate              7597 non-null   object
 7   Actual_Close_Date__c   6293 non-null   object
 8   AccountId              6840 non-null   object
 9   Lead_Faculty__c        6325 non-null   object
 10  Lead_School__c         4272 non-null   object
 11  Parent_Opportunity__c  982 non-null    object
 12  RecordType.Name.1      6840 non-null   object
 13  Industry               6839 non-null   

Unnamed: 0,Id,StageName,Status_Reason__c,RecordType.Name,RICE_Supported__c,CreatedDate,CloseDate,Actual_Close_Date__c,AccountId,Lead_Faculty__c,Lead_School__c,Parent_Opportunity__c,RecordType.Name.1,Industry,Industry_Sub_Type__c,Business_Type__c,Is_External__c,ParentId
count,7597,7597,7288,7597,7569,7597,7597,6293,6840,6325,4272,982,6840,6839,4199,6795,6840,1140
unique,7597,18,25,12,4,7498,885,1072,2408,16,46,315,3,19,19,11,2,105
top,0062e000002HHOqAAO,Closed Lost,Other (Lost),Research Contract,RIC-BD&I,2016-08-08T06:48:56.000Z,1/11/19,11/5/2017,0012e000003A6qHAAS,0012e000002ZGfbAAG,0012e000002Zt0cAAC,0062e000002HFbAAAW,Business Organization,Services,Education,Government: Australia: Federal,External,0012e000003A78yAAC
freq,1,4003,2398,3453,3289,14,2726,103,324,1514,526,50,6796,2457,1366,1419,6796,319


# Step 2: Data Cleaning

In [183]:
round(100*(lead.isnull().sum().sort_values(ascending=False)/len(lead.index)), 2)

Parent_Opportunity__c    87.07
ParentId                 84.99
Industry_Sub_Type__c     44.73
Lead_School__c           43.77
Actual_Close_Date__c     17.16
Lead_Faculty__c          16.74
Business_Type__c         10.56
Industry                  9.98
Is_External__c            9.96
AccountId                 9.96
RecordType.Name.1         9.96
Status_Reason__c          4.07
RICE_Supported__c         0.37
CloseDate                 0.00
CreatedDate               0.00
RecordType.Name           0.00
StageName                 0.00
Id                        0.00
dtype: float64

### Check if there are any duplicate values in the dataset

In [184]:
lead[lead.duplicated(keep=False)]

Unnamed: 0,Id,StageName,Status_Reason__c,RecordType.Name,RICE_Supported__c,CreatedDate,CloseDate,Actual_Close_Date__c,AccountId,Lead_Faculty__c,Lead_School__c,Parent_Opportunity__c,RecordType.Name.1,Industry,Industry_Sub_Type__c,Business_Type__c,Is_External__c,ParentId


There are no duplicate values in the data.

### (1) Transform to 1yes/0no

In [185]:
lead[["ParentId","Parent_Opportunity__c"]]=lead[["ParentId","Parent_Opportunity__c"]].notnull().astype(int)

In [186]:
lead[["Actual_Close_Date__c"]]=lead[["Actual_Close_Date__c"]].notnull().astype(int)

### (2) Drop 3 columns

In [187]:
#lead = lead.drop(["Lead_School__c","Industry_Sub_Type__c"],axis=1)

In [188]:
#check null values
round(100*(lead.isnull().sum().sort_values(ascending=False)/len(lead.index)), 2)

Industry_Sub_Type__c     44.73
Lead_School__c           43.77
Lead_Faculty__c          16.74
Business_Type__c         10.56
Industry                  9.98
AccountId                 9.96
RecordType.Name.1         9.96
Is_External__c            9.96
Status_Reason__c          4.07
RICE_Supported__c         0.37
StageName                 0.00
RecordType.Name           0.00
ParentId                  0.00
CreatedDate               0.00
CloseDate                 0.00
Actual_Close_Date__c      0.00
Parent_Opportunity__c     0.00
Id                        0.00
dtype: float64

### (3) Delete opportunities without account id

In [189]:
lead.dropna(subset=['AccountId'],inplace=True)

In [190]:
round(100*(lead.isnull().sum().sort_values(ascending=False)/len(lead.index)), 2)

Lead_School__c           43.30
Industry_Sub_Type__c     38.61
Lead_Faculty__c          17.28
Status_Reason__c          4.47
Business_Type__c          0.66
RICE_Supported__c         0.41
Industry                  0.01
ParentId                  0.00
CreatedDate               0.00
StageName                 0.00
RecordType.Name           0.00
AccountId                 0.00
CloseDate                 0.00
Actual_Close_Date__c      0.00
Is_External__c            0.00
Parent_Opportunity__c     0.00
RecordType.Name.1         0.00
Id                        0.00
dtype: float64

### (4) CreatedDate, CloseDate change string to time, seperate year, month

In [191]:
lead.loc[:,"CloseDate"]

0        31/8/20
1       31/12/49
2        22/6/20
3       31/12/49
4       31/12/49
          ...   
7591    30/10/20
7592     30/9/20
7593     30/9/20
7595    31/12/20
7596    30/11/20
Name: CloseDate, Length: 6840, dtype: object

In [192]:
lead.loc[:,'CloseDate']=pd.to_datetime(lead.loc[:,'CloseDate'],
                                           format='%d/%m/%y', 
                                           errors='coerce')


In [193]:
lead['CloseYear'] = pd.DatetimeIndex(lead['CloseDate']).year  
lead.loc[:,"CloseYear"]

0       2020
1       2049
2       2020
3       2049
4       2049
        ... 
7591    2020
7592    2020
7593    2020
7595    2020
7596    2020
Name: CloseYear, Length: 6840, dtype: int64

In [194]:
lead['CloseMonth'] = pd.DatetimeIndex(lead['CloseDate']).month                                          

In [195]:
lead.loc[:,"CreatedDate"]

0       2019-11-07T00:59:46.000Z
1       2018-09-19T04:32:55.000Z
2       2018-09-19T04:32:19.000Z
3       2016-07-06T05:03:00.000Z
4       2016-07-21T23:56:54.000Z
                  ...           
7591    2020-08-06T00:24:37.000Z
7592    2020-08-06T01:33:22.000Z
7593    2020-08-06T01:40:55.000Z
7595    2020-08-06T11:13:34.000Z
7596    2020-08-06T14:36:01.000Z
Name: CreatedDate, Length: 6840, dtype: object

In [196]:
lead['CreatedYear'] = pd.DatetimeIndex(lead['CreatedDate']).year                                          

In [197]:
lead.loc[:,"CreatedYear"]

0       2019
1       2018
2       2018
3       2016
4       2016
        ... 
7591    2020
7592    2020
7593    2020
7595    2020
7596    2020
Name: CreatedYear, Length: 6840, dtype: int64

In [198]:
lead['CreatedMonth'] = pd.DatetimeIndex(lead['CreatedDate']).month 

In [199]:
lead.loc[:,"CreatedMonth"]

0       11
1        9
2        9
3        7
4        7
        ..
7591     8
7592     8
7593     8
7595     8
7596     8
Name: CreatedMonth, Length: 6840, dtype: int64

In [200]:
lead['year_length']=lead['CloseYear']-lead['CreatedYear']

### (5) Industry

In [201]:
lead['Industry'].value_counts()

Services                                    2457
Health                                      1530
Public Administration                        963
Defence                                      477
Technology: IT & Telephone Services          263
Agriculture & Horticulture                   259
Manufacturing & R&D                          212
Energy & Utilities                           191
Construction & Construction Materials        137
Transportation & Logistics                   108
Mining & Refining                             73
Materials: Chemicals & Industrial Metals      50
Education                                     44
Veterinary                                    35
Mining Equipment and Technology Services      17
Transportation                                 8
Forestry & Paper                               6
Aquaculture & Fisheries                        6
Advertising, Marketing and PR                  3
Name: Industry, dtype: int64

In [202]:
lead['Industry'].isnull().sum()

1

In [203]:
lead[lead['Industry'].isin([np.nan])]

Unnamed: 0,Id,StageName,Status_Reason__c,RecordType.Name,RICE_Supported__c,CreatedDate,CloseDate,Actual_Close_Date__c,AccountId,Lead_Faculty__c,...,Industry,Industry_Sub_Type__c,Business_Type__c,Is_External__c,ParentId,CloseYear,CloseMonth,CreatedYear,CreatedMonth,year_length
5807,0062e000002JFzFAAW,Closed Lost,Academic Capacity (Lost),Consultancy (Non-research),RIC-BD&I,2019-12-10T03:10:48.000Z,2020-01-13,1,0012e000003AAqGAAW,,...,,,,External,0,2020,1,2019,12,1


In [204]:
# since "Services" has the major chunk of data, 
# we can impute the null values with Services
lead['Industry'] = lead['Industry'].replace(np.nan, "UNKNOWN")

### (6) Industry_Sub_Type__c

In [205]:
#for row in lead[lead['Industry'].isin(["Public Administration"])]:
    #lead['Industry_Sub_Type__c'] = lead['Industry_Sub_Type__c'].replace(np.nan, "Public Administration")
    
lead["Industry_Sub_Type__c"].fillna(lead["Industry"], inplace=True)

### (7) Status_Reason__c

In [206]:
#lead['Status_Reason__c'] = lead['Status_Reason__c'].replace(np.nan, "Other (Lost)")
lead[lead['Status_Reason__c'].isin([np.nan])]

Unnamed: 0,Id,StageName,Status_Reason__c,RecordType.Name,RICE_Supported__c,CreatedDate,CloseDate,Actual_Close_Date__c,AccountId,Lead_Faculty__c,...,Industry,Industry_Sub_Type__c,Business_Type__c,Is_External__c,ParentId,CloseYear,CloseMonth,CreatedYear,CreatedMonth,year_length
1,0062e000002HFaaAAG,Post Award,,Parent Grant,RIC-RE&D,2018-09-19T04:32:55.000Z,2049-12-31,0,0012e000003A6ElAAK,,...,Public Administration,Public Administration,Government: Australia: Federal,External,1,2049,12,2018,9,31
2,0062e000002HFabAAG,Post Award,,Parent Grant,RIC-RE&D,2018-09-19T04:32:19.000Z,2020-06-22,1,0012e000003A6ElAAK,,...,Public Administration,Public Administration,Government: Australia: Federal,External,1,2020,6,2018,9,2
3,0062e000002HFacAAG,Closed,,Parent Grant,,2016-07-06T05:03:00.000Z,2049-12-31,0,0012e000003A6osAAC,,...,Public Administration,Public Administration,Government: Australia: Federal,External,1,2049,12,2016,7,33
4,0062e000002HFadAAG,Closed,,Parent Grant,,2016-07-21T23:56:54.000Z,2049-12-31,0,0012e000003A79XAAS,,...,Public Administration,Public Administration,Government: Australia: Federal,External,0,2049,12,2016,7,33
5,0062e000002HFaeAAG,Closed,,Parent Grant,,2017-04-07T01:30:10.000Z,2049-12-31,0,0012e000003A79XAAS,,...,Public Administration,Public Administration,Government: Australia: Federal,External,0,2049,12,2017,4,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7470,0062e000002YDxYAAW,Pre-submission,,Parent Grant,RIC-RE&D,2020-07-20T03:45:24.000Z,2020-07-31,0,0012e000003A6bnAAC,,...,Health,Health Care & Healthy Aging,Government: Australia: Federal,External,0,2020,7,2020,7,0
7471,0062e000002YE2DAAW,Pre-submission,,Parent Grant,RIC-RE&D,2020-07-20T03:58:43.000Z,2020-12-31,0,0012e000003A6ElAAK,,...,Public Administration,Public Administration,Government: Australia: Federal,External,1,2020,12,2020,7,0
7491,0062e000002YKGJAA4,Pre-submission,,Parent Grant,RIC-BD&I,2020-07-21T06:41:52.000Z,2020-10-31,0,0012e000003A6EjAAK,,...,Public Administration,Public Administration,Government: Australia: Federal,External,1,2020,10,2020,7,0
7514,0062e000002YrlxAAC,Awaiting Results,,Parent Grant,RIC-BD&I,2020-07-29T00:25:39.000Z,2020-11-30,0,0012e000003A716AAC,,...,Defence,Defence,Government: Australia: Federal,External,0,2020,11,2020,7,0


In [207]:
lead["Status_Reason__c"].fillna(lead["StageName"], inplace=True)

### (8) 'RICE_Supported__c'

In [208]:
lead['RICE_Supported__c'].value_counts()

RIC-BD&I             3027
RIC RE&D and BD&I    2702
Not supported         552
RIC-RE&D              531
Name: RICE_Supported__c, dtype: int64

In [209]:
lead[lead['RICE_Supported__c'].isin([np.nan])]

Unnamed: 0,Id,StageName,Status_Reason__c,RecordType.Name,RICE_Supported__c,CreatedDate,CloseDate,Actual_Close_Date__c,AccountId,Lead_Faculty__c,...,Industry,Industry_Sub_Type__c,Business_Type__c,Is_External__c,ParentId,CloseYear,CloseMonth,CreatedYear,CreatedMonth,year_length
3,0062e000002HFacAAG,Closed,Closed,Parent Grant,,2016-07-06T05:03:00.000Z,2049-12-31,0,0012e000003A6osAAC,,...,Public Administration,Public Administration,Government: Australia: Federal,External,1,2049,12,2016,7,33
4,0062e000002HFadAAG,Closed,Closed,Parent Grant,,2016-07-21T23:56:54.000Z,2049-12-31,0,0012e000003A79XAAS,,...,Public Administration,Public Administration,Government: Australia: Federal,External,0,2049,12,2016,7,33
5,0062e000002HFaeAAG,Closed,Closed,Parent Grant,,2017-04-07T01:30:10.000Z,2049-12-31,0,0012e000003A79XAAS,,...,Public Administration,Public Administration,Government: Australia: Federal,External,0,2049,12,2017,4,32
6,0062e000002HFafAAG,Closed,Closed,Parent Grant,,2017-06-27T01:24:02.000Z,2049-12-31,0,0012e000003A6hbAAC,,...,Energy & Utilities,Energy & Utilities,Government: Australia: Federal,External,1,2049,12,2017,6,32
7,0062e000002HFagAAG,Closed,Closed,Parent Grant,,2017-11-27T22:16:34.000Z,2049-12-31,0,0012e000003A6jLAAS,,...,Health,Pharmaceuticals,Multinational / Other Large Corporate,External,0,2049,12,2017,11,32
8,0062e000002HFahAAG,Closed,Closed,Parent Grant,,2016-08-22T01:40:33.000Z,2049-12-31,0,0012e000003A6MsAAK,,...,Agriculture & Horticulture,Agriculture & Horticulture,Government: Australia: Local & State,External,0,2049,12,2016,8,33
10,0062e000002HFajAAG,Closed,Closed,Parent Grant,,2017-12-15T01:48:39.000Z,2049-12-31,0,0012e000003A6bnAAC,,...,Health,Health Care & Healthy Aging,Government: Australia: Federal,External,0,2049,12,2017,12,32
11,0062e000002HFakAAG,Closed,Closed,Parent Grant,,2017-02-14T00:14:42.000Z,2049-12-31,0,0012e000003A6osAAC,,...,Public Administration,Public Administration,Government: Australia: Federal,External,1,2049,12,2017,2,32
13,0062e000002HFamAAG,Closed,Closed,Parent Grant,,2017-04-07T04:48:26.000Z,2049-12-31,0,0012e000003A6osAAC,,...,Public Administration,Public Administration,Government: Australia: Federal,External,1,2049,12,2017,4,32
14,0062e000002HFanAAG,Closed,Closed,Parent Grant,,2016-09-20T00:32:03.000Z,2049-12-31,0,0012e000003A6osAAC,,...,Public Administration,Public Administration,Government: Australia: Federal,External,1,2049,12,2016,9,33


In [210]:
lead['RICE_Supported__c'] = lead['RICE_Supported__c'].replace(np.nan, "NotGiven")

### (9) RecordType.Name.1

In [211]:
lead['RecordType.Name.1'].value_counts()


Business Organization    6796
University Department      43
Administrative              1
Name: RecordType.Name.1, dtype: int64

In [212]:
lead[lead['RecordType.Name.1'].isin([np.nan])]

Unnamed: 0,Id,StageName,Status_Reason__c,RecordType.Name,RICE_Supported__c,CreatedDate,CloseDate,Actual_Close_Date__c,AccountId,Lead_Faculty__c,...,Industry,Industry_Sub_Type__c,Business_Type__c,Is_External__c,ParentId,CloseYear,CloseMonth,CreatedYear,CreatedMonth,year_length


### (10) 'Business_Type__c'

In [213]:
lead['Business_Type__c'].value_counts()

Government: Australia: Federal                       1419
Government: Australia: Local & State                 1288
SME (small to medium enterprise)                      914
Multinational / Other Large Corporate                 853
Not for profit                                        823
University                                            535
Large Australian Corporate                            440
Government: International                             321
PFRO (Publicly-Funded Research Organisation)          101
RDC (Rural Research and Development Corporations)      53
CRC (Co-operative Research Centre)                     48
Name: Business_Type__c, dtype: int64

In [214]:
lead[lead['Business_Type__c'].isin([np.nan])]

Unnamed: 0,Id,StageName,Status_Reason__c,RecordType.Name,RICE_Supported__c,CreatedDate,CloseDate,Actual_Close_Date__c,AccountId,Lead_Faculty__c,...,Industry,Industry_Sub_Type__c,Business_Type__c,Is_External__c,ParentId,CloseYear,CloseMonth,CreatedYear,CreatedMonth,year_length
82,0062e000002HFtrAAG,Closed Lost,Other (Lost),Custom Education (MSPACE Included),RIC RE&D and BD&I,2016-06-01T06:31:24.000Z,2019-11-01,1,0012e000002YmNRAA0,0012e000002ZNYOAA4,...,Education,Education,,Internal,1,2019,11,2016,6,3
84,0062e000002HFtsAAG,Closed Lost,Other (Lost),Custom Education (MSPACE Included),RIC RE&D and BD&I,2017-05-09T06:26:42.000Z,2019-11-01,1,0012e000002YmNRAA0,0012e000002ZNYOAA4,...,Education,Education,,Internal,1,2019,11,2017,5,2
86,0062e000002HFttAAG,Closed Lost,Other (Lost),Research Contract,RIC RE&D and BD&I,2016-06-01T06:17:28.000Z,2019-11-01,1,0012e000002Z4IxAAK,0012e000002ZNYOAA4,...,Education,Education,,Internal,1,2019,11,2016,6,3
88,0062e000002HFtuAAG,Closed Lost,Other (Lost),Custom Education (MSPACE Included),Not supported,2018-01-22T00:12:49.000Z,2019-11-01,1,0012e000002Z3OuAAK,0012e000002ZNYOAA4,...,Education,Education,,Internal,1,2019,11,2018,1,1
90,0062e000002HFtvAAG,Closed Lost,Other (Lost),Custom Education (MSPACE Included),RIC RE&D and BD&I,2017-04-28T04:09:27.000Z,2019-11-01,1,0012e000002YlvyAAC,0012e000002ZNYOAA4,...,Education,Education,,Internal,1,2019,11,2017,4,2
92,0062e000002HFtwAAG,Closed Lost,Other (Lost),Custom Education (MSPACE Included),Not supported,2018-01-22T00:11:00.000Z,2019-11-01,1,0012e000002YlvyAAC,0012e000002ZNYOAA4,...,Education,Education,,Internal,1,2019,11,2018,1,1
94,0062e000002HFtxAAG,Closed Lost,Other (Lost),Custom Education (MSPACE Included),RIC RE&D and BD&I,2017-08-27T10:47:44.000Z,2019-11-01,1,0012e000002Z4IxAAK,0012e000002ZNYOAA4,...,Education,Education,,Internal,1,2019,11,2017,8,2
96,0062e000002HFtyAAG,Closed Lost,Other (Lost),Custom Education (MSPACE Included),RIC RE&D and BD&I,2016-02-03T22:45:38.000Z,2019-11-01,1,0012e000002Z3OuAAK,0012e000002ZNYOAA4,...,Education,Education,,Internal,1,2019,11,2016,2,3
98,0062e000002HFtzAAG,Closed Lost,Other (Lost),Custom Education (MSPACE Included),RIC RE&D and BD&I,2017-10-23T22:54:26.000Z,2019-11-01,1,0012e000002ZNYOAA4,0012e000002ZNYOAA4,...,Education,Education,,Internal,1,2019,11,2017,10,2
100,0062e000002HFu0AAG,Closed Lost,Other (Lost),Internship,RIC RE&D and BD&I,2016-06-01T11:24:39.000Z,2019-11-01,1,0012e000002YlvyAAC,0012e000002ZNYOAA4,...,Education,Education,,Internal,1,2019,11,2016,6,3


In [215]:
lead['Business_Type__c'] = lead['Business_Type__c'].replace(np.nan, "Not for profit")

### (11) 'Lead_Faculty__c','Lead_School__c'

In [216]:
lead['Lead_Faculty__c'] = lead['Lead_Faculty__c'].replace(np.nan, 'NotGiven')
lead['Lead_School__c'] = lead['Lead_School__c'].replace(np.nan, 'NotGiven')
#lead['Lead_Faculty__c'].value_counts()
round(100*(lead.isnull().sum().sort_values(ascending=False)/len(lead.index)), 2)
                                                      

year_length              0.0
Lead_School__c           0.0
StageName                0.0
Status_Reason__c         0.0
RecordType.Name          0.0
RICE_Supported__c        0.0
CreatedDate              0.0
CloseDate                0.0
Actual_Close_Date__c     0.0
AccountId                0.0
Lead_Faculty__c          0.0
Parent_Opportunity__c    0.0
CreatedMonth             0.0
RecordType.Name.1        0.0
Industry                 0.0
Industry_Sub_Type__c     0.0
Business_Type__c         0.0
Is_External__c           0.0
ParentId                 0.0
CloseYear                0.0
CloseMonth               0.0
CreatedYear              0.0
Id                       0.0
dtype: float64

### (12) StageName 

In [217]:
lead['Status_Reason__c'].value_counts()

Other (Lost)                         1985
Won                                  1984
On Track                              802
Customer No Longer Interested         607
Proposal/Application Unsuccessful     434
Academic Capacity (Lost)              198
Out-Sold                              133
Pre-submission                        100
Awaiting Customer Action               90
Customer Not Responding                87
Academic No Longer Interested          70
Awaiting Results                       63
Closed                                 56
Not Ready for Market                   40
Submitting                             32
Post Award                             27
Closed Lost                            20
Failure to Agree - Price               13
Failure to Agree - Scope of Work       13
BD Capacity (Lost)                     13
Academic Capacity (Open)               10
Unacceptable Risk                       9
Other (Open)                            8
Price                             

In [218]:
lead['StageName'].value_counts()

Closed Lost                                  3389
Closed Won                                   1706
Identifying                                   404
Developing                                    286
Awaiting Results                              239
Proposing                                     226
Closing                                       148
Application                                   115
Pre-submission                                104
Closed                                         56
Execution / Closing                            33
Submitting                                     32
Execute Contract                               30
Post Award                                     27
Develop Relationship/ Qualify opportunity      18
Notice                                         14
Negotiation / Propose                           7
Triage / Develop Opportunity                    6
Name: StageName, dtype: int64

In [219]:
lead[lead['StageName'].isin(["Closed Won"])].count()/(len(lead.index))

Id                       0.249415
StageName                0.249415
Status_Reason__c         0.249415
RecordType.Name          0.249415
RICE_Supported__c        0.249415
CreatedDate              0.249415
CloseDate                0.249415
Actual_Close_Date__c     0.249415
AccountId                0.249415
Lead_Faculty__c          0.249415
Lead_School__c           0.249415
Parent_Opportunity__c    0.249415
RecordType.Name.1        0.249415
Industry                 0.249415
Industry_Sub_Type__c     0.249415
Business_Type__c         0.249415
Is_External__c           0.249415
ParentId                 0.249415
CloseYear                0.249415
CloseMonth               0.249415
CreatedYear              0.249415
CreatedMonth             0.249415
year_length              0.249415
dtype: float64

Converted rate was 0.25

In [220]:
lead['StageName'] = lead['StageName'].replace('Post Award', "Closed Won")

In [221]:
lead.loc[(lead['Status_Reason__c'] == 'Won')|(lead['Status_Reason__c'] == 'Post Award')|(lead['Status_Reason__c'] == 'Closed Won'), 'StageName'] = 'Closed Won'

In [222]:
lead[lead['StageName'].isin(["Closed Won"])].count()/(len(lead.index))

Id                       0.295468
StageName                0.295468
Status_Reason__c         0.295468
RecordType.Name          0.295468
RICE_Supported__c        0.295468
CreatedDate              0.295468
CloseDate                0.295468
Actual_Close_Date__c     0.295468
AccountId                0.295468
Lead_Faculty__c          0.295468
Lead_School__c           0.295468
Parent_Opportunity__c    0.295468
RecordType.Name.1        0.295468
Industry                 0.295468
Industry_Sub_Type__c     0.295468
Business_Type__c         0.295468
Is_External__c           0.295468
ParentId                 0.295468
CloseYear                0.295468
CloseMonth               0.295468
CreatedYear              0.295468
CreatedMonth             0.295468
year_length              0.295468
dtype: float64

converted 0.31

In [223]:
lead = lead.drop(["Id","Status_Reason__c","Actual_Close_Date__c","AccountId","CreatedDate","RecordType.Name.1","CloseDate","CloseMonth","CloseYear"],axis=1)

In [224]:
#lead = lead.drop[lead.StageName != 'Closed Lost' or lead.StageName != 'Closed Won' ]
indexNames1 = lead[lead['StageName'] == 'Identifying'].index
lead.drop(indexNames1 , inplace=True)
indexNames2 = lead[lead['StageName'] == 'Awaiting Results'].index
lead.drop(indexNames2, inplace=True)
indexNames3 = lead[lead['StageName'] == 'Developing'].index
lead.drop(indexNames3 , inplace=True)
indexNames4 = lead[lead['StageName'] == 'Closing'].index
lead.drop(indexNames4 , inplace=True)
indexNames5 = lead[lead['StageName'] == 'Proposing'].index
lead.drop(indexNames5 , inplace=True)
indexNames6 = lead[lead['StageName'] == 'Application'].index
lead.drop(indexNames6, inplace=True)
indexNames7 = lead[lead['StageName'] == 'Pre-submission'].index
lead.drop(indexNames7, inplace=True)
indexNames8 = lead[lead['StageName'] == 'Closing'].index
lead.drop(indexNames8, inplace=True)
indexNames9 = lead[lead['StageName'] == 'Application'].index
lead.drop(indexNames9, inplace=True)
indexNames10 = lead[lead['StageName'] == 'Pre-submission'].index
lead.drop(indexNames10, inplace=True)
indexNames11 = lead[lead['StageName'] == 'Closed'].index
lead.drop(indexNames11, inplace=True)
indexNames12 = lead[lead['StageName'] == 'Execution / Closing'].index
lead.drop(indexNames12, inplace=True)
indexNames13 = lead[lead['StageName'] == 'Submitting'].index
lead.drop(indexNames13, inplace=True)
indexNames14 = lead[lead['StageName'] == 'Execute Contract'].index
lead.drop(indexNames14 , inplace=True)
indexNames15 = lead[lead['StageName'] == 'Develop Relationship/ Qualify opportunity'].index
lead.drop(indexNames15 , inplace=True)
indexNames16 = lead[lead['StageName'] == 'Notice'].index
lead.drop(indexNames16 , inplace=True)
indexNames17 = lead[lead['StageName'] == 'Negotiation / Propose'].index
lead.drop(indexNames17 , inplace=True)
indexNames18 = lead[lead['StageName'] == 'Triage / Develop Opportunity'].index
lead.drop(indexNames18 , inplace=True)

lead['StageName'].value_counts()


#lead.drop(lead.index[lead['StageName'] != 'Closed Lost' or lead['StageName'] != 'Closed Won' ], inplace = True)
#df.drop(df.index[df['line_race'] == 0], inplace = True)

Closed Lost    3387
Closed Won     2021
Name: StageName, dtype: int64

In [225]:
lead['StageName']=lead['StageName'].replace("Closed Won", 1)
lead['StageName']=lead['StageName'].replace("Closed Lost", 0)
lead['StageName'].value_counts()

0    3387
1    2021
Name: StageName, dtype: int64

### (13) External(1)

In [226]:
lead['Is_External__c']=lead['Is_External__c'].replace("External", 1)
lead['Is_External__c']=lead['Is_External__c'].replace("Internal", 0)
lead['StageName'].value_counts()

0    3387
1    2021
Name: StageName, dtype: int64

### (14) Combine two columns

In [227]:
lead["oppo_busi"]=lead['RecordType.Name']+lead['Business_Type__c']
lead["fac_oppo"]=lead['Lead_Faculty__c']+lead['RecordType.Name']
lead["oppo_Industry"]=lead['RecordType.Name']+lead['Industry']

In [228]:
#only closed won closed loss is remained

#lead['StageName'] = lead['StageName'].replace('Post Award', "Closed Won")



#data = lead.values
#print(len(data))
#new_data=[]

#for row in lead.values:
#    if row[2] == 'Won' or row[1]=='Post Award':#closing,developing, waiting result, proposing, iden
#        row[1] = 'Closed Won'
        
#    elif row[2] == 'Academic Capacity \(Lost\)' or row[2]=='Other \(Lost\)' or row[2] == "Customer No Longer Interested":
#        row[1] = 'Closed Lost'


#for row in data:
#    if row[1] == "Closed Lost" or row[1] != "Closed Won":
#        new_data.append(row)

#print(len(new_data))

#lead.values = new_data

#print(lead)
#print(lead.keys)


       



In [229]:
lead.to_csv("cleaned-7-Oct.csv",index=False)

In [230]:
#delete acct id, status reason, created date, close date, oppo id