# Lead Scoring Exploration  
Ankur Vishwakarma  
November 25, 2020

In [2]:
# import everything 

import os
import pandas as pd
import numpy as np
import snowflake.connector as snow

In [97]:
# write function to get data from snowflake

def query_snowflake(sql_query):
    '''Runs the supplied query and returns a Pandas dataframe.

    Connection settings are read from the environment variables
    ``snowflake_account``, ``snowflake_username``, ``snowflake_password``,
    ``snowflake_warehouse`` and ``snowflake_role``.

    Parameters
    ----------
    sql_query : str
        SQL text handed unchanged to the cursor's ``execute``.

    Returns
    -------
    pandas.DataFrame
        One row per fetched record, with columns named after the first
        field of each entry in the cursor description. An empty result
        set yields an empty frame that still carries those column names.

    Any exception raised while executing or fetching propagates to the
    caller; the cursor and the connection are closed before it does.
    '''

    ctx = snow.connect(
        account   = os.getenv('snowflake_account'),
        user      = os.getenv('snowflake_username'),
        password  = os.getenv('snowflake_password'),
        warehouse = os.getenv('snowflake_warehouse'),
        role      = os.getenv('snowflake_role')
    )

    # The outer try guarantees the connection is released even when the
    # query fails; previously only the cursor was closed on error.
    try:
        cs = ctx.cursor()
        try:
            cs.execute(sql_query)
            column_names = [x[0] for x in cs.description]
            # Passing the names to the constructor (instead of assigning
            # them afterwards) keeps zero-row results from raising a
            # length-mismatch error.
            data = pd.DataFrame(cs.fetchall(), columns=column_names)
        finally:
            cs.close()
    finally:
        ctx.close()

    print('Snowflake: {:,} rows x {:,} columns returned.'.format(data.shape[0], data.shape[1]))
    return data

In [104]:
# SQL for the first exploration: contacts created on or after 2020-02-01
# whose created_via_conversion_c equals 'TRUE', left-joined to their account
# to pick up firm type and subspecialty (contacts without a matching account
# keep NULLs in the account columns).

sql_query = """
    select 
        c.id as contact_id,
        c.account_id, 
        c.title,
        c.mailing_city,
        c.user_city_c,
        a.firm_type_c,
        a.firm_subspecialty_c
    from pg_prod_db.salesforce.contact c
    left join pg_prod_db.salesforce.account a on c.account_id = a.id
    where c.created_date >= '2020-02-01'
    and c.created_via_conversion_c = 'TRUE';
"""

In [105]:
# run the contact query above and keep the result as `data`

data = query_snowflake(sql_query)

Snowflake: 197,582 rows x 7 columns returned.


In [106]:
# (rows, columns) of the returned frame
data.shape

(197582, 7)

In [14]:
# preview the first rows of the contact data
data.head()

Unnamed: 0,CONTACT_ID,ACCOUNT_ID,TITLE,MAILING_CITY,USER_CITY_C,FIRM_TYPE_C,FIRM_SUBSPECIALTY_C
0,0030W00003k719SQAQ,0010W00002LUOvBQAX,Site Manager,Brisbane,Brisbane,General Contractor,
1,0030W00003rTDFeQAO,0010W00002Da5ShQAJ,,Calgary,,,
2,0030W00003sSKjCQAW,0010W00002bimZQQAY,,Tulsa,,,
3,0030W00003sTqWeQAK,001d0000028bI8bAAE,,Anchorage,,Design,Architect​
4,0030W00003m0hLVQAY,001d0000025HEdIAAW,,Greensboro,,General Contractor,


In [15]:
# Report the share of missing values in each column of `data`.
for c in data.columns:
    print(c)
    # The mean of the boolean null mask is the null fraction; this stays
    # vectorised instead of summing element by element in Python, and it
    # yields NaN rather than a ZeroDivisionError on an empty frame.
    null_fraction = data[c].isna().mean()
    print("{:.0%} null".format(null_fraction))
    print()

CONTACT_ID
0% null

ACCOUNT_ID
0% null

TITLE
65% null

MAILING_CITY
9% null

USER_CITY_C
76% null

FIRM_TYPE_C
19% null

FIRM_SUBSPECIALTY_C
84% null



In [17]:
# Inspect a few contacts that have no mailing city. The fixed random_state
# makes the same five rows appear on every run.
data[data['MAILING_CITY'].isna()].sample(5, random_state=42)

Unnamed: 0,CONTACT_ID,ACCOUNT_ID,TITLE,MAILING_CITY,USER_CITY_C,FIRM_TYPE_C,FIRM_SUBSPECIALTY_C
164744,0030W00003lzY7vQAE,001d000001kzLbZAAU,Superintendent,,,General Contractor,
179696,0030W00003qZP5ZQAW,0010W00002XaVy2QAF,,,,General Contractor,
45652,0030W00003sTWXRQA4,0010W00002Xap9fQAB,,,,Subcontractor,
80251,0030W00003rUYWfQAO,0010W00002XcKClQAN,,,,General Contractor,
136941,0030W00003k7W6LQAU,0010W00002XafewQAB,,,,Subcontractor,


## Looking at Actual Leads

In [20]:
# load the firmographic leads list provided by Michael Dobos
# (the file name has no directory part, so it is resolved against the
# notebook's working directory)

fname = 'List_of_Firmographic_Leads.csv'
leads = pd.read_csv(fname)

In [36]:
# check the share of null values in each column of the leads file

for c in leads.columns:
    # mean of the boolean mask = fraction of nulls (vectorised, and NaN
    # instead of a ZeroDivisionError when the frame is empty)
    texty = '{} - {:.1%} null'.format(c, leads[c].isnull().mean())
    print(texty)

Email Address - 14.6% null
SFDC Type - 0.3% null
Marketo SFDC ID - 0.3% null


In [113]:
# drop rows without SFDC IDs and keep only contacts
# (chained, so a new frame is built instead of mutating the loaded one
# in place)

leads = (
    leads
    .dropna(subset=['Marketo SFDC ID'])
    .loc[lambda frame: frame['SFDC Type'] == 'Contact']
)
leads.head(5)

Unnamed: 0,Email Address,SFDC Type,Marketo SFDC ID
0,gb112@walkertx.com,Contact,0030W00003Pkfn3QAB
1,dlockhart@mbkahn.com,Contact,0030W00003PkM9iQAF
2,mattj@dpr.com,Contact,0030W00003Pk1rcQAB
3,awoo@accoes.com,Contact,0030W00003PkXkmQAF
4,rgiebels@vccusa.com,Contact,0030W00003Pkh58QAB


In [163]:
# get all contact info
#
# Rebinds `sql_query` (the earlier contact query is replaced).
# The CTE `relevant_opps` keeps opportunities whose type is not in the listed
# exclusions; because of SQL NULL semantics, `type not in (...)` also drops
# rows whose type is NULL.
# The main select returns distinct contact rows with account attributes and a
# `has_opportunity` flag: 1 when the contact's account has a relevant
# opportunity created on or after the contact's LEAD_CREATE_DATE_C, else 0.
# NOTE(review): contacts with a NULL LEAD_CREATE_DATE_C can never satisfy the
# date condition and therefore get 0 - confirm this is intended.

sql_query =  """
    with
    relevant_opps as (
        select *
        from pg_prod_db.salesforce.opportunity
        where type not in (
            'Pilot - Paid', 
            'Pilot - Unpaid', 
            'BETA', 
            'Advisor', 
            'Educational', 
            'Partner', 
            'Console Add-On', 
            'Consulting Services', 
            'Renewal')
    )

    select distinct
        c.id as contact_id,
        c.account_id, 
        c.title,
        c.mailing_city,
        c.user_city_c,
        a.firm_type_c,
        a.firm_subspecialty_c,
        a.SECTOR_C,
        a.INDUSTRY,
        a.PREVIOUS_MONTH_END_ARR_C,
        a.ASSEMBLE_PRODUCT_ACV_C,
        a.BC_PRODUCT_ACV_C,
        a.BIM_360_PRODUCT_ACV_C,
        a.PG_PRODUCT_ACV_C,
        a.PYPE_PRODUCT_ACV_C,
        a.NUMBER_OF_EMPLOYEES,
        a.EMPLOYEE_COUNT_ZOOM_INFO_C,
        case 
            when o.id is null then 0
            else 1 
            end as has_opportunity
    from pg_prod_db.salesforce.contact c
    left join pg_prod_db.salesforce.account a on c.account_id = a.id
    left join relevant_opps o 
        on a.id = o.account_id
        and o.created_date >= c.LEAD_CREATE_DATE_C;
"""

In [164]:
# query snowflake: run the contact/opportunity query and keep the result
# as `all_contacts`

all_contacts = query_snowflake(sql_query)

Snowflake: 2,221,324 rows x 18 columns returned.


In [165]:
# inner-join the provided leads onto the Snowflake contacts, matching the
# Marketo SFDC ID from the leads file to CONTACT_ID from the query

df = leads.merge(
    all_contacts,
    how='inner',
    left_on='Marketo SFDC ID',
    right_on='CONTACT_ID',
)

df.head()

Unnamed: 0,Email Address,SFDC Type,Marketo SFDC ID,CONTACT_ID,ACCOUNT_ID,TITLE,MAILING_CITY,USER_CITY_C,FIRM_TYPE_C,FIRM_SUBSPECIALTY_C,...,INDUSTRY,PREVIOUS_MONTH_END_ARR_C,ASSEMBLE_PRODUCT_ACV_C,BC_PRODUCT_ACV_C,BIM_360_PRODUCT_ACV_C,PG_PRODUCT_ACV_C,PYPE_PRODUCT_ACV_C,NUMBER_OF_EMPLOYEES,EMPLOYEE_COUNT_ZOOM_INFO_C,HAS_OPPORTUNITY
0,gb112@walkertx.com,Contact,0030W00003Pkfn3QAB,0030W00003Pkfn3QAB,001d000001nS3L3AAK,,Beaumont,Houston,Design,Engineer​,...,Subcontractor,0.0,,,,,,6.0,338.0,0
1,dlockhart@mbkahn.com,Contact,0030W00003PkM9iQAF,0030W00003PkM9iQAF,001d000001h3FFzAAM,PM,Columbia,Columbia,Subcontractor,,...,General Contractor,92631.2,0.0,0.0,0.0,92631.2,,513.0,513.0,1
2,mattj@dpr.com,Contact,0030W00003Pk1rcQAB,0030W00003Pk1rcQAB,001d000001gAlbYAAS,Project Engineer,San Jose,San Francisco,General Contractor,,...,General Contractor,1345909.38,0.0,0.0,0.0,1336909.38,9000.0,6500.0,2600.0,1
3,awoo@accoes.com,Contact,0030W00003PkXkmQAF,0030W00003PkXkmQAF,001d000001nS2VtAAK,Engineer,Lafayette,Lafayette,Subcontractor,Plumbing;Mechanical,...,Manufacturer,0.0,0.0,0.0,,0.0,,774.0,4000.0,1
4,rgiebels@vccusa.com,Contact,0030W00003Pkh58QAB,0030W00003Pkh58QAB,001d000001nS2ScAAK,Project Manager,La Mirada,,General Contractor,,...,General Contractor,0.0,,,,,,25.0,280.0,0


In [166]:
# number of joined leads per HAS_OPPORTUNITY value (the query sets it to
# 0 when no opportunity row matched and 1 otherwise)
df['HAS_OPPORTUNITY'].value_counts()

0    189060
1     56790
Name: HAS_OPPORTUNITY, dtype: int64

In [162]:
# export for Michael Dobos
# index=False: the row index left over from the merge carries no
# information and would otherwise be written as an unnamed first column.

df.to_csv('firmographic_lead_score_and_opps.csv', index=False)

In [119]:
# delete all_contacts

# del all_contacts

In [156]:
# check coverage of each field

for c in df.columns:
    # mean of the boolean mask = fraction of nulls (vectorised, and NaN
    # instead of a ZeroDivisionError when the frame is empty)
    texty = '{:_<30s}{:>05.1%} null'.format(c, df[c].isnull().mean())
    print(texty)

Email Address_________________16.5% null
SFDC Type_____________________00.0% null
Marketo SFDC ID_______________00.0% null
CONTACT_ID____________________00.0% null
ACCOUNT_ID____________________00.0% null
TITLE_________________________07.4% null
MAILING_CITY__________________16.4% null
USER_CITY_C___________________80.5% null
FIRM_TYPE_C___________________10.5% null
FIRM_SUBSPECIALTY_C___________81.3% null
SECTOR_C______________________65.1% null
INDUSTRY______________________14.3% null
PREVIOUS_MONTH_END_ARR_C______00.3% null
ASSEMBLE_PRODUCT_ACV_C________59.9% null
BC_PRODUCT_ACV_C______________59.9% null
BIM_360_PRODUCT_ACV_C_________81.8% null
PG_PRODUCT_ACV_C______________59.8% null
PYPE_PRODUCT_ACV_C____________93.7% null
NUMBER_OF_EMPLOYEES___________01.3% null
EMPLOYEE_COUNT_ZOOM_INFO_C____09.8% null
HAS_OPPORTUNITY_______________00.0% null
