# Lead Scoring Exploration  
Ankur Vishwakarma  
November 25, 2020

In [2]:
# import everything 

import os
import pandas as pd
import numpy as np
import snowflake.connector as snow

In [10]:
# write function to get data from snowflake

def query_snowflake(sql_query):
    '''Run a SQL query against Snowflake and return the result as a DataFrame.

    Connection settings are read from the environment variables
    ``snowflake_account``, ``snowflake_username``, ``snowflake_password``,
    ``snowflake_warehouse`` and ``snowflake_role``.

    Parameters
    ----------
    sql_query : str
        SQL text handed unchanged to the cursor's ``execute``.

    Returns
    -------
    pandas.DataFrame
        One row per fetched record. Column labels are the first field of each
        entry in the cursor's ``description``. A query that matches no rows
        yields an empty frame that still carries those column labels.

    Raises
    ------
    Exception
        Anything raised while connecting, executing or fetching propagates to
        the caller; the cursor and the connection are closed first.
    '''

    ctx = snow.connect(
        account   = os.getenv('snowflake_account'),
        user      = os.getenv('snowflake_username'),
        password  = os.getenv('snowflake_password'),
        warehouse = os.getenv('snowflake_warehouse'),
        role      = os.getenv('snowflake_role')
    )

    # Outer try/finally: the connection must be released even if the query fails.
    try:
        cs = ctx.cursor()
        try:
            cs.execute(sql_query)
            column_names = [col[0] for col in cs.description]
            # Passing the labels to the constructor (instead of assigning
            # `data.columns` afterwards) also works for a zero-row result.
            data = pd.DataFrame(cs.fetchall(), columns=column_names)
        finally:
            cs.close()
    finally:
        ctx.close()

    return data

In [11]:
# SQL for the contact-level dataset.
# Selects id, account id, title and the two city fields from the Salesforce
# contact table, left-joined to the account table for firm type/subspecialty.
# Only contacts created on or after 2020-02-01 whose
# created_via_conversion_c equals 'TRUE' are kept.

sql_query = """
    select 
        c.id as contact_id,
        c.account_id, 
        c.title,
        c.mailing_city,
        c.user_city_c,
        a.firm_type_c,
        a.firm_subspecialty_c
    from pg_prod_db.salesforce.contact c
    left join pg_prod_db.salesforce.account a on c.account_id = a.id
    where c.created_date >= '2020-02-01'
    and c.created_via_conversion_c = 'TRUE';
"""

In [12]:
# Run the query against Snowflake and keep the result as a DataFrame.
# query_snowflake reads its credentials from the snowflake_* environment
# variables, so those must be set before this cell is executed.

data = query_snowflake(sql_query)

In [13]:
# (rows, columns) of the query result
data.shape

(197582, 7)

In [14]:
# Preview the first rows of the query result
data.head()

Unnamed: 0,CONTACT_ID,ACCOUNT_ID,TITLE,MAILING_CITY,USER_CITY_C,FIRM_TYPE_C,FIRM_SUBSPECIALTY_C
0,0030W00003k719SQAQ,0010W00002LUOvBQAX,Site Manager,Brisbane,Brisbane,General Contractor,
1,0030W00003rTDFeQAO,0010W00002Da5ShQAJ,,Calgary,,,
2,0030W00003sSKjCQAW,0010W00002bimZQQAY,,Tulsa,,,
3,0030W00003sTqWeQAK,001d0000028bI8bAAE,,Anchorage,,Design,Architect​
4,0030W00003m0hLVQAY,001d0000025HEdIAAW,,Greensboro,,General Contractor,


In [15]:
# Share of missing values in each column.
# The mean of a boolean mask is the fraction of True values, so
# `.isna().mean()` gives the null share in one vectorized pass instead of
# iterating every row with the builtin `sum`. On an empty frame this prints
# "nan% null" rather than raising ZeroDivisionError.
for c in data.columns:
    null_fraction = data[c].isna().mean()
    print(c)
    print("{:.0%} null".format(null_fraction))
    print()

CONTACT_ID
0% null

ACCOUNT_ID
0% null

TITLE
65% null

MAILING_CITY
9% null

USER_CITY_C
76% null

FIRM_TYPE_C
19% null

FIRM_SUBSPECIALTY_C
84% null



In [17]:
# Look at a few contacts that have no mailing city.
# random_state pins the draw so the same rows are shown on every re-run.
data[data['MAILING_CITY'].isna()].sample(5, random_state=0)

Unnamed: 0,CONTACT_ID,ACCOUNT_ID,TITLE,MAILING_CITY,USER_CITY_C,FIRM_TYPE_C,FIRM_SUBSPECIALTY_C
164744,0030W00003lzY7vQAE,001d000001kzLbZAAU,Superintendent,,,General Contractor,
179696,0030W00003qZP5ZQAW,0010W00002XaVy2QAF,,,,General Contractor,
45652,0030W00003sTWXRQA4,0010W00002Xap9fQAB,,,,Subcontractor,
80251,0030W00003rUYWfQAO,0010W00002XcKClQAN,,,,General Contractor,
136941,0030W00003k7W6LQAU,0010W00002XafewQAB,,,,Subcontractor,


## Looking at Actual Leads

In [18]:
# Load the firmographic leads spreadsheet. The bare file name is resolved
# against the notebook's current working directory.
# NOTE(review): the recorded run of this cell ended in
# "IndexError: list index out of range", so `leads` was not created.
# The cause is not visible here — possibly the Excel reader engine or the
# file's format; TODO confirm and fix before relying on `leads`.
fname = 'List_of_Firmographic_Leads.xlsx'
leads = pd.read_excel(fname)

IndexError: list index out of range

In [None]:
# Preview the first rows of the leads sheet.
# NOTE(review): this cell has no execution count, i.e. it was not run;
# it needs `leads` from the read_excel cell, which failed in the recorded run.
leads.head()