In [24]:
import pandas as pd

## UPDATE

In [25]:

contrib_df = pd.read_csv('./tabula_generated_csvs/tabula-wengay_newton_filing_05_21.csv')

In [26]:
contrib_df = contrib_df[contrib_df['Unnamed: 7'].notna()]


In [27]:
contrib_df = contrib_df.drop(axis=1, labels=['(11)\rAmendment', '(12)\rAmount'])


In [28]:
contrib_df = contrib_df.rename(mapper={'(5)\rDate': 'Date', '(8)\rContributor\rTypeOccupation': 'Contributor Type', '(9)\rContribution\rType': 'Occupation', 
                         '(10)\rIn-kind\rDescription': 'Payment Type', 'Unnamed: 7':'Amount', '(7)\rFull Name\r(Last, Suffix, First, Middle)\rStreet Address &\rCity, State, Zip Code':"Name"}, axis=1)

In [29]:
def parse_date(date_str):
    slashes = [2, 5, 6]
    date = ""
    while len(date) < 8:
        for char in str(date_str):
            if char.isnumeric():
                date = date + char
                if len(date) in slashes:
                    date = date + '/'
    return date

In [30]:
parse_date('01\r1921\r//	')

'01/19/21'

In [31]:
contrib_df['Date'] = contrib_df['Date'].apply(lambda x: parse_date(x))


In [32]:
contrib_df['Name'] = contrib_df['Name'].apply(lambda x: x.split('\r'))

In [33]:
contrib_df['Full Name'] = contrib_df['Name'].apply(lambda x: x[0])

In [34]:
def extract_street_address(address_list):
    if len(address_list) > 3:
        del address_list[0]
        address = " ".join(address_list[:-1])
        return address
    else:
        return address_list[1]

In [35]:
contrib_df['Address'] = contrib_df['Name'].apply(lambda x: extract_street_address(x))


In [36]:
def split_city_state_zip(name_list):
    print(name_list)

    val = name_list[-1]
    city = None
    state = None
    zipcode = None
    if val.isnumeric():
        zipcode = val
    else:
        city = val.split(',')[0]
        if len(val.split(',')) > 1:
            state_zip = val.split(',')[1].strip().split(' ')
            if len(state_zip) > 1:
                state = state_zip[0]
                zipcode = state_zip[1]
            else:
                state = state_zip[0]
    return city, state, zipcode

In [37]:
contrib_df['City'] = contrib_df['Name'].apply(lambda x: split_city_state_zip(x)[0])
contrib_df['State'] = contrib_df['Name'].apply(lambda x: split_city_state_zip(x)[1])
contrib_df['Zipcode'] = contrib_df['Name'].apply(lambda x: split_city_state_zip(x)[2])


['DANIEL COUGHLIN', '1636 LAUREL', 'SARASOTA, FL 34235']
['APRIL DALLAS', '633 59TH STREET SO', 'ST.PETERSBURG, FL']
['SUSIE COPELAND', '1010 27TH STREET', 'EAST']
['STEVE WESTPHAL', '1770 BRIGHTWATER', 'ST. PETERSBURG, FL']
['DONALD MORGAN', '14351 CHAMBERLAIN', 'AVE']
['MELISSA HARGROVE', '5829 SHARP DR.', 'MABLENTON, GA']
['DARLINE SLY', '325 YOUTH CAMP RD', 'GROVELAND, FL 34736']
['DEBORAH SCANLAN', '8018 35TH AVE NORTH', 'ST. PETERSBURG, FL']
['SHANNON HISER', '1914 10TH STREET', 'ST. PETERSBURG, FL']
['YOLANDA JACKSON', '8339 N.W. 195TH', 'TERRACE']
['MYRTHA FETGUSON', '18800 NW 29 AVE', 'MIAMI GARDENS, FL']
['BERNICE DARLING', '1423 23RD AVENUE', 'ST. PETERSBURG, FL']
['FLORENCE SHELTON', '1227 GRAND TRAIL', 'BRADENTON, FL 34121']
['CUTHBERTSON', 'P.O.  BOX 13177', 'ST. PETERSBURG, FL']
['DONNIE MCMILLAN', '15908 NORTHLAKE', 'VILLAGE DR']
['CONOR DARKEN', '27215 FORDHAM', 'DRIVE']
['CUTHBERTSON', '4534 21ST AVE SOUTH', 'ST. PETERSBURG, FL']
['CHERLYN FLOUNARY', '737 60TH AVE SOU

In [38]:
def parse_first_name(full_name):
    names = full_name.split(' ')
    titles = ['Mr', 'Mrs', 'Miss', 'Ms', 'Dr']
    if names[0] not in titles:
        first = names[0]
    else:
        first = names[1]

    res = ''.join([i for i in first if not i.isdigit()])
    return res

In [39]:
contrib_df['First'] = contrib_df['Full Name'].apply(lambda x: parse_first_name(x))
contrib_df['Last'] = contrib_df['Full Name'].apply(lambda x: x.split(' ')[-1].strip())

In [40]:
contrib_df = contrib_df.drop(columns = ['Name', 'Full Name'])

In [41]:
contrib_df['Contributor Type'] = contrib_df['Contributor Type'].map({'Busine':'Business', 'Individ': 'Individual'})

In [42]:
def clean_occupation(occupation):
    if '\ral' in occupation:
        occupation = occupation.replace('\ral', '')
    occupation = occupation.replace('\r', ' ')
    if occupation == 'al':
        occupation = ''
    return occupation

In [43]:
contrib_df['Occupation'] = contrib_df['Occupation'].apply(lambda x: clean_occupation(x))

In [44]:
contrib_df = contrib_df[['Date', 'Amount', 'Payment Type', 'First', 'Last', 'Address', 'City', 'State', 'Zipcode', 'Occupation']]

## UPDATE

In [45]:
contrib_df.to_csv('wengay_newton_filing_05_21.csv', index=False)

# TODO
- [ ] Remove salutations from names
- [ ] Weird names with numbers in the middle
- [ ] Accents
- [ ] Permute columns