In [1]:
from ethnicolr import census_ln, pred_fl_reg_ln, pred_fl_reg_name
import pandas as pd
import numpy as np

Using TensorFlow backend.


In [2]:
def fetch_csv_file(selected_csv):
    """Specify the csv file being used."""
    dime_database_file = selected_csv 
    df=pd.read_csv(dime_database_file, low_memory=False)
    return df

In [3]:
def make_contributor_subset(df):
    """Subset the database to keep: 
    a. uniqueid (there are multiple IDs. Use one that gives a unique id for each contribution.) 
    b. columns related to name, and 
    c. year in which the contribution was made."""

    selected_cols = ['cycle', 'transaction.id',
     'date', 'contributor.name',
     'contributor.lname', 'contributor.fname',
     'contributor.mname', 'contributor.suffix',
     'contributor.title', 'contributor.ffname'
    ]
    subset_df = df[selected_cols].copy()
    new_col_names = [w.replace('.', '_') for w in selected_cols]
    new_cols_dict = dict(zip(selected_cols, new_col_names))
    subset_df.rename(columns=new_cols_dict, inplace=True)
    subset_df.rename({'date':'contribution_year'}, inplace=True)
    return subset_df

In [4]:
def deduplicate_contributors(subset_df):
    """Build a primary key, where key = concatenation of name + year of contribution. 
       De-duplicate based on the key. """
    subset_df['annual_contrib_key'] = subset_df['contributor_name'] + str(subset_df['contributor_name'])
    subset_df.drop_duplicates(subset=['annual_contrib_key'], keep=False, inplace=True)
    subset_df.drop(columns=['annual_contrib_key'], inplace=True)
    return subset_df

In [5]:
def run_census_ln (subset_df, census_year):
    """Run the Census Ln Function."""
    has_last_name_df = subset_df[subset_df.contributor_lname.notnull()].copy()
    return census_ln(has_last_name_df, 'contributor_lname', census_year)


In [6]:
def run_pred_fl_reg_ln(subset_df):
    """Run the pred_fl_reg_ln Function."""
    has_last_name_df = subset_df[subset_df.contributor_lname.notnull()].copy()
    return pred_fl_reg_ln(has_last_name_df , 'contributor_lname')    
    

In [7]:
def run_pred_fl_reg_name(subset_df):
    """Run Florida Data by Name"""
    has_last_name_df = subset_df[subset_df.contributor_lname.notnull()].copy()
    also_has_first_name_df = has_last_name_df[has_last_name_df.contributor_fname.notnull()].copy()
    return pred_fl_reg_name(also_has_first_name_df, 'contributor_lname', 'contributor_fname')

In [None]:
def export_generated_df_csv(df, file_name):
    """Generate new csv file of the prepared dataset."""
    df.to_csv(file_name, encoding='utf-8', index=False, header=True)

In [19]:
def main_process(selected_csv):
    """Consolidate the steps in the data collation and sanitization."""
    results = {}
    data_df = fetch_csv_file(selected_csv)
    contributors_df = make_contributor_subset(data_df)
    unique_contributors = deduplicate_contributors(contributors_df)
    census_ln_2000_results = run_census_ln(unique_contributors, 2000)
    census_ln_2010_results = run_census_ln(unique_contributors, 2010)
    pred_fl_reg_ln_results = run_pred_fl_reg_ln(unique_contributors)
    pred_fl_reg_name_results = run_pred_fl_reg_name(unique_contributors)
    results['census_ln_2000_results'] = census_ln_2000_results
    results['census_ln_2010_results'] = census_ln_2010_results
    results['pred_fl_reg_ln_results'] = pred_fl_reg_ln_results
    results['pred_fl_reg_name_results'] = pred_fl_reg_name_results
    return results

In [20]:
"""Execute a single run of data fro dataset."""
selected_csv = 'contribDB_1980.csv' # Example DB from DIME Dataset, csv file in same folder
results = main_process(selected_csv)
for key in results.keys():
    export_generated_df_csv(results[key] , key+'.csv')
    print('{0} successfully exported'.format(key))

In [22]:
results['census_ln_2000_results'] 

Unnamed: 0,cycle,transaction_id,date,contributor_name,contributor_lname,contributor_fname,contributor_mname,contributor_suffix,contributor_title,contributor_ffname,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
0,1980,comm:1980:126045,1980-01-19,"GROVER, HENRY C",grover,henry,c,,,,85.90,6.10,4.05,0.90,1.58,1.47
1,1980,comm:1980:126050,1979-03-19,"GABRIELE, PITCAIRN PENDLETON",gabriele,pitcairn,pendleton,,,,94.22,1.15,(S),(S),1.05,3.01
2,1980,comm:1980:126121,1980-03-19,"ZIEN, BURT",zien,burt,,,,,78.53,3.39,16.38,(S),(S),0.00
3,1980,comm:1980:126147,1980-11-19,"WYLY, SAM",wyly,sam,,,,,87.87,5.28,(S),4.11,2.15,(S)
4,1980,comm:1980:126150,1980-10-19,"NUSSBAUM, BERNARD W",nussbaum,bernard,w,,,,97.54,0.29,0.41,0.00,1.04,0.72
5,1980,comm:1980:126151,1980-11-19,"HALLORAN, JOHN E",halloran,john,e,,,,96.59,0.23,0.71,0.10,0.96,1.42
6,1980,comm:1980:75485,1980-10-19,"WALD, JEFFREY LEE",wald,jeffrey,lee,,,,92.38,3.71,0.45,0.47,1.38,1.61
7,1980,comm:1980:75490,1980-10-19,"KALISH, HARRY A",kalish,harry,a,,,,96.80,0.49,0.37,0.00,1.22,1.14
8,1980,comm:1980:75512,1980-11-19,"BERRY, B J FOR REAGAN",berry,b j for,,,,,73.27,22.26,0.44,0.68,1.75,1.60
9,1980,comm:1980:75513,1980-11-19,"LASKER, MARY W",lasker,mary,w,,,,74.18,16.44,6.77,0.00,1.74,0.87


In [23]:
 results['census_ln_2010_results'] 

Unnamed: 0,cycle,transaction_id,date,contributor_name,contributor_lname,contributor_fname,contributor_mname,contributor_suffix,contributor_title,contributor_ffname,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
0,1980,comm:1980:126045,1980-01-19,"GROVER, HENRY C",grover,henry,c,,,,82.01,6.13,6.63,0.92,1.95,2.37
1,1980,comm:1980:126050,1979-03-19,"GABRIELE, PITCAIRN PENDLETON",gabriele,pitcairn,pendleton,,,,92.62,0.96,0.87,0.27,0.82,4.45
2,1980,comm:1980:126121,1980-03-19,"ZIEN, BURT",zien,burt,,,,,74.86,5.14,16,(S),2.86,(S)
3,1980,comm:1980:126147,1980-11-19,"WYLY, SAM",wyly,sam,,,,,83.69,7.29,0,2.69,4.61,1.73
4,1980,comm:1980:126150,1980-10-19,"NUSSBAUM, BERNARD W",nussbaum,bernard,w,,,,96.81,0.28,0.62,0.16,1.13,1
5,1980,comm:1980:126151,1980-11-19,"HALLORAN, JOHN E",halloran,john,e,,,,95.26,0.38,0.84,0.24,1.23,2.05
6,1980,comm:1980:75485,1980-10-19,"WALD, JEFFREY LEE",wald,jeffrey,lee,,,,91.74,3.25,1.08,0.52,1.36,2.05
7,1980,comm:1980:75490,1980-10-19,"KALISH, HARRY A",kalish,harry,a,,,,95.38,0.45,0.72,0.17,1.62,1.66
8,1980,comm:1980:75512,1980-11-19,"BERRY, B J FOR REAGAN",berry,b j for,,,,,70.66,23.11,0.56,0.8,2.34,2.53
9,1980,comm:1980:75513,1980-11-19,"LASKER, MARY W",lasker,mary,w,,,,71.22,17.16,8.86,(S),(S),2.03


In [24]:
results['pred_fl_reg_ln_results']

Unnamed: 0,cycle,transaction_id,date,contributor_name,contributor_lname,contributor_fname,contributor_mname,contributor_suffix,contributor_title,contributor_ffname,race,asian,hispanic,nh_black,nh_white
28918,1980,comm:1980:126045,1980-01-19,"GROVER, HENRY C",grover,henry,c,,,,nh_white,0.006012,0.020842,0.081621,0.977151
28924,1980,comm:1980:126050,1979-03-19,"GABRIELE, PITCAIRN PENDLETON",gabriele,pitcairn,pendleton,,,,nh_white,0.004202,0.051441,0.138404,0.815632
29003,1980,comm:1980:126121,1980-03-19,"ZIEN, BURT",zien,burt,,,,,nh_white,0.010336,0.014242,0.015116,0.878859
29031,1980,comm:1980:126147,1980-11-19,"WYLY, SAM",wyly,sam,,,,,nh_white,0.006747,0.018699,0.258969,0.976818
29035,1980,comm:1980:126150,1980-10-19,"NUSSBAUM, BERNARD W",nussbaum,bernard,w,,,,nh_white,0.008804,0.039631,0.019481,0.869828
29036,1980,comm:1980:126151,1980-11-19,"HALLORAN, JOHN E",halloran,john,e,,,,nh_white,0.013417,0.025171,0.045281,0.750207
99014,1980,comm:1980:75485,1980-10-19,"WALD, JEFFREY LEE",wald,jeffrey,lee,,,,nh_white,0.003927,0.015393,0.401022,0.993561
99020,1980,comm:1980:75490,1980-10-19,"KALISH, HARRY A",kalish,harry,a,,,,nh_white,0.020515,0.019710,0.024152,0.956055
99045,1980,comm:1980:75512,1980-11-19,"BERRY, B J FOR REAGAN",berry,b j for,,,,,nh_white,0.005566,0.032074,0.290651,0.785595
99046,1980,comm:1980:75513,1980-11-19,"LASKER, MARY W",lasker,mary,w,,,,nh_white,0.005754,0.017943,0.021452,0.917368


In [25]:
results['pred_fl_reg_name_results']

Unnamed: 0,cycle,transaction_id,date,contributor_name,contributor_lname,contributor_fname,contributor_mname,contributor_suffix,contributor_title,contributor_ffname,race,asian,hispanic,nh_black,nh_white
28918,1980,comm:1980:126045,1980-01-19,"GROVER, HENRY C",grover,henry,c,,,,nh_white,0.002552,0.012257,0.095512,0.257572
28924,1980,comm:1980:126050,1979-03-19,"GABRIELE, PITCAIRN PENDLETON",gabriele,pitcairn,pendleton,,,,nh_black,0.032187,0.011376,0.106356,0.015542
29003,1980,comm:1980:126121,1980-03-19,"ZIEN, BURT",zien,burt,,,,,nh_white,0.000832,0.002869,0.002841,0.821611
29031,1980,comm:1980:126147,1980-11-19,"WYLY, SAM",wyly,sam,,,,,nh_white,0.004217,0.002675,0.076634,0.156450
29035,1980,comm:1980:126150,1980-10-19,"NUSSBAUM, BERNARD W",nussbaum,bernard,w,,,,nh_white,0.001623,0.003258,0.015580,0.532584
29036,1980,comm:1980:126151,1980-11-19,"HALLORAN, JOHN E",halloran,john,e,,,,nh_white,0.002015,0.004242,0.049680,0.798114
99014,1980,comm:1980:75485,1980-10-19,"WALD, JEFFREY LEE",wald,jeffrey,lee,,,,nh_white,0.002330,0.009052,0.013420,0.891666
99020,1980,comm:1980:75490,1980-10-19,"KALISH, HARRY A",kalish,harry,a,,,,nh_white,0.002213,0.004046,0.012709,0.829020
99045,1980,comm:1980:75512,1980-11-19,"BERRY, B J FOR REAGAN",berry,b j for,,,,,nh_white,0.002360,0.014807,0.070930,0.684374
99046,1980,comm:1980:75513,1980-11-19,"LASKER, MARY W",lasker,mary,w,,,,nh_white,0.002277,0.005412,0.048142,0.751135
