In [3]:
from fuzzywuzzy import fuzz

import os
import sys

import pandas as pd
pd.options.display.max_columns = 999
import sqlalchemy as sqla
from sqlalchemy import create_engine
%matplotlib inline  

# The warehouse connection string is read from the CD_DWH environment variable.
DB_URI = os.getenv('CD_DWH')
if not DB_URI:
    # os.getenv returns None for an unset variable; stop here with an actionable
    # message instead of letting create_engine fail on a None/empty URL.
    raise RuntimeError(
        "Environment variable CD_DWH is not set; it must hold the SQLAlchemy database URI."
    )
engine = create_engine(DB_URI)

In [64]:
# Load the recipient candidate names found in the contributions table, one row
# per name, with count() giving the number of non-NULL occurrences of that name
# (so the NULL-name group reports 0).
QUERY = """
select
    recipient_candidate_name as dbname,
    count(recipient_candidate_name)
from trg_analytics.candidate_contributions
group by recipient_candidate_name
"""
with engine.begin() as connection:
    sql_results = pd.read_sql(sql=QUERY, con=connection)

# Rows with a NULL name are not dropped here.
print("Size:", sql_results.shape[0])
sql_results.head(5)


Size: 1564


Unnamed: 0,dbname,count
0,,0
1,"MC HUGH, PETER A.",33
2,"ALI, ANILA",163
3,"KNIGHT, WILLIAM J. \PETE\""""",195
4,"HILL, JERRY A.",5404


In [14]:
# Load the candidate names found in the statewide election results table, one
# row per name, with count() giving the number of non-NULL occurrences.
QUERY = """
select
    candidate_name as dbname,
    count(candidate_name)
from data_ingest.casos__california_candidate_statewide_election_results
group by candidate_name
"""
with engine.begin() as connection:
    sql_results2 = pd.read_sql(sql=QUERY, con=connection)

print("Size:", sql_results2.shape[0])
sql_results2.head(5)


Size: 1011


Unnamed: 0,dbname,count
0,,0
1,Adam B. Schiff*,4
2,G. Burt Lancaster,4
3,Baron Bruno,2
4,Zoe Lofgren,2


In [17]:
# Concat loaded data: contribution names first, then election-result names,
# renumbered 0..n-1.
# pd.concat returns a new frame, so sql_results itself is never re-indexed or
# otherwise modified by this cell, and re-running the cell gives the same result.
# DataFrame.append is avoided because pandas 2.0 removed it.
results = pd.concat([sql_results, sql_results2], ignore_index=True)

print("Size:", len(results))
results.head(5)

Size: 2575


Unnamed: 0,dbname,count
0,,0
1,"MC HUGH, PETER A.",33
2,"ALI, ANILA",163
3,"KNIGHT, WILLIAM J. \PETE\""""",195
4,"HILL, JERRY A.",5404


In [None]:
# Perform matching

# Wall-clock start; the elapsed time is printed at the end of this cell.
import time
time_start = time.time()

# Score thresholds (0-100) applied to the best fuzzy match found for each name.
# The matching loop tests ">= UPPER_LIMIT" first, then "<= LOWER_LIMIT", and only
# asks the user about scores that fall strictly between the two limits.
# NOTE(review): with both limits set to 85 every score satisfies one of the two
# tests (a score of exactly 85 counts as a match), so the interactive prompt is
# never reached. Lower LOWER_LIMIT to re-enable manual review.
UPPER_LIMIT = 85 # Anything >= this score will be considered "a definite match"
LOWER_LIMIT = 85 # Anything <= this score will be considered "a definite miss"

def fuzzymatch(name1, name2):
    """Score how alike two names are, as a percentage from 0 to 100.

    Both names are normalized with clean() and then compared as whole strings
    with fuzz.ratio().
    """
    # Idea that is not implemented: also weight the score by a ratio computed
    # on the last names only.
    return fuzz.ratio(clean(name1), clean(name2))

import re

# Matches every character that is neither an ASCII letter nor a space.
regex = re.compile(r'[^a-zA-Z ]')


def clean(name):
    """Normalize a name for comparison.

    A name containing exactly one comma is read as "LAST, FIRST" and reordered
    to "FIRST LAST". Names with no comma, or with more than one, are left in
    their original order because their layout is unknown. Every character
    other than ASCII letters and spaces is then removed and the result is
    lower-cased.
    """
    pieces = name.split(",")
    if len(pieces) == 2:
        surname, given = (piece.strip() for piece in pieces)
        name = given + " " + surname
    letters_only = regex.sub("", name)
    return letters_only.lower()

# Assign an integer id to every distinct name in `results`.
# Each new name is scored with fuzzymatch() against every name registered so
# far. If the best score reaches UPPER_LIMIT the name shares that match's id;
# if it is at or below LOWER_LIMIT the name gets a fresh id; scores strictly
# between the two limits are decided by the user.

sql_names = results

# Static placeholder entry: the name "None" always maps to id -1.
NONE_NAME, NONE_ID = "None", -1

# Rows are collected in plain Python containers and converted to DataFrames once
# after the loop. DataFrame.append was removed in pandas 2.0 and copied the whole
# frame on every call.
known_pairs = []                     # (dbname, id) in registration order, placeholder excluded
known_ids = {NONE_NAME: NONE_ID}     # dbname -> id, for O(1) "already processed" checks
showname_pairs = [(NONE_ID, NONE_NAME)]  # (id, showname): first spelling seen for each id

next_id = 1 # At some point this should be set equal to "retrieve last id in db"
count_hits = 0
count_misses = 0
count_skips = 0
count_news = 0
count_times = 0
count_nones = 0
total = len(sql_names)

hits = []
misses = []
skips = []
news = []
nones = []

for index, dbname in sql_names["dbname"].items():
    count_times += 1
    # \r makes each progress line overwrite the previous one (a ticker, not a wall of text)
    print(str(count_times) + " " + str(count_times / total * 100) + "%", end="\r")

    # Missing names (None from SQL NULL, or NaN) cannot be cleaned or matched.
    if pd.isnull(dbname):
        nones.append(index)
        count_nones += 1
        continue

    if dbname in known_ids: # If the name is already processed
        skips.append(str(count_skips) + " Skip: " + str(dbname))
        count_skips += 1
        continue # Skip it

    # The placeholder is not a candidate, so it is never fuzzy-matched: the very
    # first real name is therefore reported as "New" and no name can be merged
    # into id -1.
    if not known_pairs:
        news.append(str(count_news) + " New: " + str(dbname))
        count_news += 1
        known_pairs.append((dbname, next_id))
        known_ids[dbname] = next_id
        showname_pairs.append((next_id, dbname))
        next_id += 1
        continue # NEXT!

    # Best-scoring registered name; on ties max() keeps the earliest registered one.
    h_match_score, h_match_id, h_match_name = max(
        ((fuzzymatch(known_name, dbname), known_id, known_name)
         for known_name, known_id in known_pairs),
        key=lambda scored: scored[0],
    )

    if h_match_score >= UPPER_LIMIT:
        same_person = True
    elif h_match_score <= LOWER_LIMIT:
        same_person = False
    else:
        # Show the best match itself, not whichever name happened to be scored last.
        print(h_match_score, "% match: ", dbname, " vs ", h_match_name, " ")
        print("Same person (y/N)? ")
        # Anything other than "y" means "no" (the prompt's default), so a typo
        # can never drop the name from the result.
        same_person = input().strip().lower() == "y"

    if same_person:
        known_pairs.append((dbname, h_match_id))
        known_ids[dbname] = h_match_id
        hits.append(str(h_match_score) + "% " + str(count_hits) + " Match: " + str(dbname) + " vs " + str(h_match_name))
        count_hits += 1
    else:
        known_pairs.append((dbname, next_id))
        known_ids[dbname] = next_id
        showname_pairs.append((next_id, dbname))
        next_id += 1
        misses.append(str(h_match_score) + "% " + str(count_misses) + " Miss: " + str(dbname) + " vs " + str(h_match_name))
        count_misses += 1

# Final lookup tables, placeholder row first.
db_dbname_2_id = pd.DataFrame([(NONE_NAME, NONE_ID)] + known_pairs, columns=["dbname", "id"])
db_id_2_showname = pd.DataFrame(showname_pairs, columns=["id", "showname"])

print("DONE!" + " "*50)

# Each entry starts with "<score>% ", so sort by that leading score, best first.
misses.sort(reverse=True, key=lambda x: int(x.split("%")[0]))
hits.sort(reverse=True, key=lambda x: int(x.split("%")[0]))

print("Misses:")
print("\n".join(misses))
print("Skips:")  # header only: the skip list itself is not printed
print("New:")
print("\n".join(news))
print("Hits:")
print("\n".join(hits))
print("Nones indexes:")  # header only: the index list itself is not printed

print(db_dbname_2_id)
print(db_id_2_showname)
print("News:", count_news)
print("Hits:", count_hits)
print("Misses:", count_misses)
print("Skips:", count_skips)
print("Nones:", count_nones)
# Every processed row lands in exactly one category, so this equals "times".
print("Total:", count_hits + count_misses + count_skips + count_news + count_nones)
print("times:", count_times)
print("Dataset len:", total)

time_elapsed = time.time() - time_start
print("Took:", time_elapsed, " seconds")


In [70]:
# Save pairings (uncomment the next line to write ./dbname_2_id.csv, the file that the "Load pairings" cell reads)
# db_dbname_2_id.to_csv("./dbname_2_id.csv", index=False)
#db_dbname_2_id

In [68]:
# Load pairings
# keep_default_na=False keeps every dbname exactly as the text stored in the
# file. The placeholder row produced by the matching cell is the literal string
# "None", which newer pandas versions would otherwise parse as NaN, and
# dbname_to_id(None) looks that row up under the key "None".
csv_data = pd.read_csv("./dbname_2_id.csv", keep_default_na=False)

# Index by dbname so an id can be read with csv_data.loc[str(x)].id, which is much
# faster than filtering with csv_data.dbname == str(x). verify_integrity makes a
# duplicated dbname fail here rather than turn later lookups into multi-row results.
csv_data = csv_data.set_index("dbname", verify_integrity=True)


def dbname_to_id(name):
    """Return the id stored in csv_data for the given name.

    The name is converted with str() before the lookup, so None is looked up
    under the key "None". A name that is not in csv_data's index raises
    KeyError.
    """
    key = str(name)
    return csv_data.loc[key, "id"]


csv_data.head(5)

Unnamed: 0_level_0,id
dbname,Unnamed: 1_level_1
,-1
"MC HUGH, PETER A.",1
"ALI, ANILA",2
"KNIGHT, WILLIAM J. \PETE\""""",3
"HILL, JERRY A.",4


In [5]:
# Example: load the contribution rows (election cycle 2009 onwards) that will be
# augmented with a candidate ID. Each row carries the candidate name twice: as
# `dbname` and, through `*`, as recipient_candidate_name.
# NOTE(review): `distinct` de-duplicates whole result rows (the name together
# with every column pulled in by `*`), not names on their own.
QUERY = """
select
    distinct(recipient_candidate_name) as dbname,
    *
from trg_analytics.candidate_contributions
where cast(election_cycle as int) >= 2009
"""
with engine.begin() as connection:
    sql_results = pd.read_sql(sql=QUERY, con=connection)

# Rows with a NULL name are not dropped here.
print("Size:", sql_results.shape[0])
sql_results.head(5)



Size: 720630


Unnamed: 0,dbname,transaction_id,transaction_type,election_cycle,election,primary_general_indicator,transaction_date,transaction_amount,filed_date,recipient_committee_name,recipient_candidate_name,recipient_candidate_party,recipient_candidate_ico,recipient_candidate_status,recipient_candidate_office,recipient_candidate_district,donor_name,donor_city,donor_state,donor_zip_code,donor_employer,donor_occupation,donor_organization,donor_industry,donor_entity_type,donor_committee_id,donor_committee_name,donor_committee_type,donor_committee_party
0,"AANESTAD, SAMUEL",1602987 - PAY422,Loan,2011,2010-06-08,0,2009-12-29,0.0,2011-07-25,FRIENDS OF SAM AANESTAD FOR LT. GOVERNOR 2010,"AANESTAD, SAMUEL",NOT CURRENTLY SUPPORTED,,NOT CURRENTLY SUPPORTED,Lieutenant Governor,,"AANESTAD, SAM",PENN VALLEY,CA,95946,ST. OF CALIF.,NOT CURRENTLY SUPPORTED,IND,0,NOT CURRENTLY SUPPORTED,NOT CURRENTLY SUPPORTED,NOT CURRENTLY SUPPORTED,,
1,"AANESTAD, SAMUEL",1602987 - PAY785,Loan,2011,2010-06-08,0,2011-05-19,4000.0,2011-07-25,FRIENDS OF SAM AANESTAD FOR LT. GOVERNOR 2010,"AANESTAD, SAMUEL",NOT CURRENTLY SUPPORTED,,NOT CURRENTLY SUPPORTED,Lieutenant Governor,,"AANESTAD, SAM",PENN VALLEY,CA,95946,ST. OF CALIF.,NOT CURRENTLY SUPPORTED,IND,0,NOT CURRENTLY SUPPORTED,NOT CURRENTLY SUPPORTED,NOT CURRENTLY SUPPORTED,,
2,"AANESTAD, SAMUEL",1630984 - PAY422,Loan,2011,2010-06-08,0,2009-12-29,0.0,2012-01-24,FRIENDS OF SAM AANESTAD FOR LT. GOVERNOR 2010,"AANESTAD, SAMUEL",NOT CURRENTLY SUPPORTED,,NOT CURRENTLY SUPPORTED,Lieutenant Governor,,"AANESTAD, SAM",PENN VALLEY,CA,95946,ST. OF CALIF.,NOT CURRENTLY SUPPORTED,IND,0,NOT CURRENTLY SUPPORTED,NOT CURRENTLY SUPPORTED,NOT CURRENTLY SUPPORTED,,
3,"AANESTAD, SAMUEL",1630984 - PAY785,Loan,2011,2010-06-08,0,2011-05-19,0.0,2012-01-24,FRIENDS OF SAM AANESTAD FOR LT. GOVERNOR 2010,"AANESTAD, SAMUEL",NOT CURRENTLY SUPPORTED,,NOT CURRENTLY SUPPORTED,Lieutenant Governor,,"AANESTAD, SAM",PENN VALLEY,CA,95946,ST. OF CALIF.,NOT CURRENTLY SUPPORTED,IND,0,NOT CURRENTLY SUPPORTED,NOT CURRENTLY SUPPORTED,NOT CURRENTLY SUPPORTED,,
4,"AANESTAD, SAMUEL",1650089 - PAY422,Loan,2011,2010-06-08,0,2009-12-29,0.0,2012-04-02,FRIENDS OF SAM AANESTAD FOR LT. GOVERNOR 2010,"AANESTAD, SAMUEL",NOT CURRENTLY SUPPORTED,,NOT CURRENTLY SUPPORTED,Lieutenant Governor,,"AANESTAD, SAM",PENN VALLEY,CA,95946,ST. OF CALIF.,NOT CURRENTLY SUPPORTED,IND,0,NOT CURRENTLY SUPPORTED,NOT CURRENTLY SUPPORTED,NOT CURRENTLY SUPPORTED,,


In [69]:
# Create an ID column whose value is the id that csv_data stores for the row's dbname.
# The lookup is done for the whole column at once instead of one .loc call per row.
# map(str) applies the same str(name) conversion as dbname_to_id, so a missing
# name (None) is looked up under the key "None".
name_keys = sql_results["dbname"].map(str)
mapped_ids = name_keys.map(csv_data["id"])

# Keep the fail-fast behaviour of a per-name lookup: a name without a pairing is
# an error, not a silent NaN id.
unmatched = mapped_ids.isnull()
if unmatched.any():
    examples = sorted(set(name_keys[unmatched]))[:10]
    raise KeyError("No id in csv_data for dbname(s): " + ", ".join(examples))

sql_results["ID"] = mapped_ids.astype(csv_data["id"].dtype)

# Show a sample rather than the whole frame.
sql_results.head(10)

Unnamed: 0,dbname,count,ID
0,,0,-1
1,"MC HUGH, PETER A.",33,1
2,"ALI, ANILA",163,2
3,"KNIGHT, WILLIAM J. \PETE\""""",195,3
4,"HILL, JERRY A.",5404,4
5,"ACOSTA, GEORGIA L.",26,5
6,"HARRIS, ELIHU",22,6
7,"GUILLEN, ABEL",1300,7
8,"GERBER, DONNA C.",1026,8
9,"MC IVER, BARBARA G.",259,9
