## Read in data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
import datetime


In [10]:
# Load the raw contributions export. index_col=False keeps pandas from using
# the first column as the index; the value 99999 is treated as missing.
df = pd.read_csv(
    'raw_data/2019-10-18-DC.csv',
    index_col=False,
    na_values=[99999],
)

# Distinct-value counts of the three candidate identifier columns, printed
# side by side for comparison.
unique_counts = {col: len(df[col].unique()) for col in ('cand_id', 'cmte_id', 'cand_nm')}
print(
    'Num of cand_id:', unique_counts['cand_id'],
    'Num of cmte_id:', unique_counts['cmte_id'],
    'Num of cand_nm:', unique_counts['cand_nm'],
)
print('Shape:', df.shape)

Num of cand_id: 28 Num of cmte_id: 28 Num of cand_nm: 28
Shape: (19875, 18)


## Remove all candidates except top 5 in national polls

In [11]:
# Number of distinct candidate names before any filtering.
n_candidates_before = len(df['cand_nm'].unique())
print(n_candidates_before)

28


In [12]:
# Candidates to keep. isin() matches strings exactly, so the trailing space in
# 'Warren, Elizabeth ' is significant and must not be trimmed.
cand_list = [
    'Warren, Elizabeth ',
    'Trump, Donald J.',
    'Sanders, Bernard',
    'Buttigieg, Pete',
    'Biden, Joseph R Jr',
]
is_selected_candidate = df['cand_nm'].isin(cand_list)
df = df.loc[is_selected_candidate]
print('Num of candidates:', len(df['cand_nm'].unique()))


Num of candidates: 5


## Clean form type

In [13]:
# Row count per form type before cleaning.
form_type_counts = df['form_tp'].value_counts()
print(form_type_counts)

SA17A    12029
SA18       903
SB28A      206
Name: form_tp, dtype: int64


In [14]:
# Per-candidate row counts for form type SB28A (refunds).
# Buttigieg (88), Biden (62), Sanders (38), Warren (18), Trump (0)
# The trailing semicolon suppresses the cell's rich output.
refund_rows = df[df['form_tp'] == 'SB28A']
refund_rows.groupby('cand_nm').count();

In [15]:
# Remove refunds (SB28A) and contributions from committees (SA18) in one pass.
excluded_form_types = ['SB28A', 'SA18']
df = df[~df['form_tp'].isin(excluded_form_types)]

In [16]:
# Confirm which form types remain after the drop.
remaining_form_types = df['form_tp'].value_counts()
print(remaining_form_types)

SA17A    12029
Name: form_tp, dtype: int64


## Clean city names 

In [17]:
# Row count per contributor city, to spot misspellings and outliers.
city_counts = df['contbr_city'].value_counts()
print(city_counts)

WASHINGTON     12011
BOLLING AFB       11
BEAR               5
WASHINGT           1
GREENVILLE         1
Name: contbr_city, dtype: int64


In [18]:
# Map 'WASHINGT' and 'BOLLING AFB' to 'WASHINGTON'.
# The whole column is replaced in one step instead of writing element by
# element through df.contbr_city[ind] = ...: that chained assignment raises
# SettingWithCopyWarning and is not guaranteed to write through to df.
city_fixes = {'WASHINGT': 'WASHINGTON', 'BOLLING AFB': 'WASHINGTON'}
df = df.assign(contbr_city=df['contbr_city'].replace(city_fixes))

# GREENVILLE and BEAR are both in Delaware; remove them by keeping only
# the WASHINGTON rows.
df = df.loc[df['contbr_city'] == 'WASHINGTON']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [19]:
# Confirm that only WASHINGTON remains.
cleaned_city_counts = df['contbr_city'].value_counts()
print(cleaned_city_counts)

WASHINGTON    12023
Name: contbr_city, dtype: int64


## Clean zip codes 

In [20]:
# Select the rows whose zip code is missing. The result is discarded because
# it is not the last expression in the cell.
df.loc[df['contbr_zip'].isna()]
# Set the zip code for this contributor by hand.
df.loc[df['contbr_nm'] == 'THOMPSON, KENNETH D JR', 'contbr_zip'] = 20001

In [21]:
# Convert zip codes to text and keep only the first five characters.
zip_as_text = df['contbr_zip'].astype('str')
df['contbr_zip'] = zip_as_text.str[:5]
print(len(df['contbr_zip'].unique()))

50


In [22]:
# Count zip codes that are still missing.
null_zip_count = df['contbr_zip'].isna().sum()
print('Null zip codes:', null_zip_count)

Null zip codes: 0


In [23]:
# List every distinct zip code so unexpected values can be spotted.
print(df.contbr_zip.unique())
len(df.contbr_zip.unique()) # 50 unique zip codes at this point; 48 remain once 10036 and 11217 are removed in the next cell

['20008' '20007' '20037' '20003' '20016' '20001' '20009' '20006' '20013'
 '20018' '20010' '20002' '20024' '20011' '20005' '20015' '20012' '20552'
 '20212' '20004' '20036' '20019' '20526' '20017' '20301' '20540' '20032'
 '20020' '22205' '25413' '20585' '20057' '20375' '20472' '20270' '20026'
 '20229' '20420' '20591' '20500' '20210' '20038' '20044' '10036' '20014'
 '20027' '20577' '11217' '20052' '20035']


50

In [7]:
# Drop the rows whose zip code is 10036 or 11217.
zips_to_drop = ['10036', '11217']
df = df[~df['contbr_zip'].isin(zips_to_drop)]


NameError: name 'df' is not defined

In [8]:
# Number of distinct zip codes left after the drop.
len(df['contbr_zip'].unique())

NameError: name 'df' is not defined

## Clean contribution amounts

In [None]:
# Keep strictly positive contribution amounts (drops negatives and zeros).
is_positive_amount = df['contb_receipt_amt'] > 0
df = df.loc[is_positive_amount]
print('Min contribution: $', df['contb_receipt_amt'].min())

In [None]:
# Keep contributions at or below the legal limit (2800).
within_limit = df['contb_receipt_amt'] <= 2800
df = df.loc[within_limit]
print('Max contribution: $', df['contb_receipt_amt'].max())

In [None]:
# Summary statistics of the filtered contribution amounts.
df['contb_receipt_amt'].describe()

## Clean occupations

In [None]:
# Quick look at the raw occupation column: cardinality and dtype.
occupations = df['contbr_occupation']
print(len(occupations.unique()))
print(occupations.dtype)
# Not the last expression in the cell, so this sample is not displayed.
occupations.unique()[0:30]
len(occupations.unique())

In [None]:
# Ordered (category, keywords) table used by std_occ. Order matters: every
# group is checked in turn and the LAST matching group wins, e.g.
# "retired teacher" ends up as Education rather than Retired.
_OCC_CATEGORIES = [
    ('Retired', ['retired']),
    ('Law', ['attorney', 'lawyer', 'general counsel', 'judge', 'clerk',
             'investigator', 'officer', 'paralegal']),
    ('Education', ['teacher', 'professor', 'educator', 'student', 'instructor',
                   'prof', 'librarian', 'education', 'tutor']),
    ('Healthcare', ['physician', 'doc', 'r.n.', 'surgeon', 'nurse',
                    'psychologist', 'therapist']),
    ('Gov', ['trade', 'government', 'federal', 'policy', 'consultant',
             'consulting', 'program', 'manager', 'analyst', 'diplomat',
             'organizer', 'public']),
    ('Leadership', ['executive', 'ceo', 'chief', 'director', 'president',
                    'vice', 'owner', 'partner', 'management', 'vp', 'finance',
                    'senior']),
    ('Science', ['economics', 'economic', 'economist', 'researcher',
                 'research', 'scientist']),
    ('Not Employed or Unknown', ['not', 'information', 'requested', 'homemaker']),
    ('IT', ['cybersecurity', 'software', 'engineer', 'it', 'technology', 'web',
            'technical']),
    ('Artist', ['writer', 'communications', 'editor', 'author', 'musician',
                'designer', 'artist', 'art']),
    ('Real Estate/Construction', ['realtor', 'estate', 'architect', 'chst']),
    ('Sales', ['sales', 'retail', 'supervisor']),
]


def _has_keyword(tokens, keyword):
    """Return True if the word(s) of `keyword` occur consecutively in `tokens`."""
    parts = keyword.split()
    width = len(parts)
    return any(tokens[i:i + width] == parts for i in range(len(tokens) - width + 1))


# standardize occupation function
def std_occ(title):
    """Map a free-text occupation to a lower-cased category label.

    The title is lower-cased and split on whitespace, then compared against
    each keyword group in _OCC_CATEGORIES. Matching is on whole words, not
    substrings. Multi-word keywords such as 'general counsel' match when
    their words appear consecutively (previously they could never match,
    because the title was compared one word at a time).

    Parameters
    ----------
    title : str or missing value
        Raw occupation text. Any non-string value (NaN, None, ...) is treated
        as missing.

    Returns
    -------
    str or None
        The lower-cased category name of the last matching group; the
        lower-cased original title if no group matches; None for non-string
        input.
    """
    if not isinstance(title, str):
        return None

    tokens = title.lower().strip().split()

    # Start from the raw title so unmatched occupations pass through unchanged
    # (apart from lower-casing).
    category = title
    for label, keywords in _OCC_CATEGORIES:
        if any(_has_keyword(tokens, keyword) for keyword in keywords):
            category = label

    return category.lower()

In [None]:
# Derive the standardized occupation category for every row.
occupation_categories = df['contbr_occupation'].apply(std_occ)
df['occ_cat'] = occupation_categories
# Side-by-side view of category vs. raw value (not displayed: not the last expression).
df[['occ_cat', 'contbr_occupation']]
# Category frequencies; the trailing semicolon suppresses the output.
df['occ_cat'].value_counts();

In [None]:
# Label missing employers explicitly. The filled column is assigned back to
# df rather than calling fillna(inplace=True) on df.contbr_employer: the
# in-place form operates on an intermediate Series and is not guaranteed to
# update df (it is a no-op under pandas copy-on-write).
df['contbr_employer'] = df['contbr_employer'].fillna('none-listed')

## Convert dates to Month and Year

In [None]:
# Parse contb_receipt_dt into pandas datetimes, overwriting the column.
# The derived converted_date column is added in a later cell.
parsed_receipt_dates = pd.to_datetime(df['contb_receipt_dt'])
df['contb_receipt_dt'] = parsed_receipt_dates

In [None]:
# Summary of the parsed receipt dates.
df['contb_receipt_dt'].describe()

In [None]:
# converted_date is a YYYYMM integer key (year * 100 + month);
# payment_yr is the calendar year of the receipt.
receipt_dates = df['contb_receipt_dt']
df['converted_date'] = receipt_dates.map(lambda ts: ts.year * 100 + ts.month)
df['payment_yr'] = receipt_dates.map(lambda ts: ts.year)

## Drop unneeded features 

In [None]:
# tran_id is not a unique value: this selects every row that shares its
# tran_id with another row (not displayed: not the last expression).
df[df.duplicated(subset='tran_id', keep=False)]

# Columns that are not needed downstream.
unneeded_columns = [
    'cand_id', 'file_num', 'contbr_nm', 'contb_receipt_dt', 'form_tp',
    'election_tp', 'contbr_city', 'memo_cd', 'receipt_desc', 'contbr_st',
    'memo_text', 'cmte_id', 'tran_id',
]
df = df.drop(columns=unneeded_columns)



## Look at different years 

In [None]:
# Sorted view by payment year. The result is neither assigned nor displayed
# (trailing semicolon), so df itself is left unchanged.
df.sort_values('payment_yr');

In [None]:
# Mean of the numeric columns per candidate and payment year.
# numeric_only=True is required on pandas >= 2.0, where mean() raises a
# TypeError on the remaining text columns (e.g. contbr_zip, occ_cat) instead
# of silently dropping them.
df_grouped_yr = df.groupby(['cand_nm', 'payment_yr']).mean(numeric_only=True)
df_grouped_yr[['contb_receipt_amt']]

In [None]:
# Keep contributions from 2019 onward (drops 2016, 2017, and 2018).
is_2019_or_later = df['payment_yr'] >= 2019
df = df[is_2019_or_later]
len(df)

In [None]:
# One point per contribution: amount on the x-axis, candidate on the y-axis.
fig, ax = plt.subplots()
ax.scatter(df['contb_receipt_amt'], df['cand_nm'])
ax.set_title('2019 Individual Contributions')
ax.set_xlabel('Dollars')
plt.show()

In [None]:
# Mean of the numeric columns per candidate.
# numeric_only=True is required on pandas >= 2.0, where mean() raises a
# TypeError on the remaining text columns instead of silently dropping them.
df_grouped_cand = df.groupby('cand_nm').mean(numeric_only=True)
df_grouped_cand['contb_receipt_amt']

In [None]:
# Number of remaining contributions per candidate.
df['cand_nm'].value_counts()

## Save as CSV file 

In [None]:
# Write the cleaned data for all remaining candidates (the index is written too).
all_contributions_path = 'clean_data/2019-all-contributions.csv'
df.to_csv(all_contributions_path)

In [None]:
# Every remaining candidate except 'Trump, Donald J.'.
is_not_trump = df['cand_nm'] != 'Trump, Donald J.'
df_dems = df.loc[is_not_trump]

In [None]:
# Write the subset held in df_dems (the index is written too).
dems_contributions_path = 'clean_data/2019-dems-contributions.csv'
df_dems.to_csv(dems_contributions_path)