In [4]:
import requests
import pandas as pd
import os
from bs4 import BeautifulSoup

In [5]:
# scrape org names from Fast Forward directory
npages = 30
org_names_raw = []

for ipage in range(1, npages+1):
    url = 'http://www.ffwd.org/wp-admin/admin-ajax.php?action=get_results&sfid=2724&paged=%d' % ipage
    response = requests.get(url)
    soup = BeautifulSoup(response.text)
    for elt in soup.find_all('h2'):
        org_names_raw.append(elt.contents[0].contents[0])

print 'Got %d org names from %s to %s' % (len(org_names_raw), org_names_raw[0], org_names_raw[-1])

Got 296 org names from #IGottaMakeIt to Zoonk




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [19]:
# QA based on visual examination.
# In most cases, the procedure was:
# look up Fast Forward org name on http://www.guidestar.org/,
# find EIN if it exists,
# then look up corrected org name for EIN in https://s3.amazonaws.com/irs-form-990/index_*.csv.
# Note that EINs in the S3 files have no dashes, while those on GuideStar do.
org_set = set(org_names_raw)

# first is bad, second is good
to_replace = [
    ('AdoptTogether', 'HOPING HEARTS FOUNDATION INC'),
    ('Adopt a Classroom', 'ADOPT-A-CLASSROOM INC'),
    ('Benetech', 'BENEFICENT TECHNOLOGY'),
    ('CareerVillage.org', 'CareerVillage'),
    ('CareMessage', 'ANJNA PATIENT EDUCATION DBA CAREMESSAGE'),
    ('Classroom, Inc.', 'Classroom Inc'),
    ('Code.org', 'CODEORG'),
    ('Coworker.org', 'COWORKERORG'),
    ('Digital Democracy', 'DTWO LTD'),
    ('DonorsChoose.org', 'DONORSCHOOSEORG'),
    ('DoSomething.org', 'DO SOMETHING INC'),
    ('Elephant Action League/WildLeaks', 'ELEPHANT ACTION LEAGUE'),
    ('E The People', 'E-THE PEOPLE'),
    ('EveryoneOn', 'CONNECT TO COMPETE'),
    ('FreeCycle.org', 'Freecycle Network'),
    ('GiveDirectly', 'Give Direct'),
    ('Gooru', 'Ednovo'),
    ('GreatNonprofits', 'Great Nonprofits'),
    ('Hollaback!', 'Hollaback'),
    ('ioby', 'IN OUR BACKYARDS'),
    ('Kiva', 'KIVA FOUNDATION'),
    ('Laborlink by Good World Solutions', 'Good World Solutions'),
    ('Learn Fresh Education Co.', 'Learn Fresh Education Co'),
    ('The Lunchbox Fund', 'Lunchbox Fund'),
    ('Made In a Free World', 'FAIR TRADE FUND INC'),
    ('New Classrooms Innovation Partners for Learning', 'NEW CLASSROOMS INNOVATION PARTNERS'),
    ('Samaschool', 'Samasource'),
    ('Scratch Foundation', 'CODE-TO-LEARN FOUNDATION'),
    ('Social Interest Solutions', 'CENTER TO PROMOTE HEALTHCARE ACCESS INC'),
    ('Stellar.org', 'STELLAR FOUNDATION'),
    ('The Global Lives Project', 'Global Lives Project'),
    ('The Freecycle Network', 'Freecycle Network'),
    ('UNCOMMEN', 'MEN OF COURAGE FOUNDATION DBA UNCOMMEN'),
    ('Wishbone', 'WISHBONEORG'),
    ('WITNESS', 'WITNESS INC'),
    ('Wordnik Society', 'PLANETWORK NGO INC'),
    ('YTH', 'INTERNET SEXUALITY INFORMATION SERVICES'),
]

for bad_name, good_name in to_replace:
    org_set.remove(bad_name)
    org_set.add(good_name)
    
# orgs where the name matches but the match is incorrect and there is no correct match
to_remove = set([
    'Pennies',
    'Quill',
    'UltraViolet',
])
org_set -= to_remove

# to sorted list again
org_names = sorted(list(org_set))

In [20]:
# collect org unique IDs from metadata files
years = [2011, 2012, 2013, 2014, 2015, 2016]
pieces = []
orgs_not_found = set(org_names)

path = '.'  # https://s3.amazonaws.com/

for year in years:
    # pull full metadata file
    # locally have deleted line 39569 of year 2014 because there is an extra comma
    year_df = pd.read_csv(os.path.join(path, 'irs-form-990', 'index_%s.csv' % year),
                          header=0)
    print 'For year %d, got %d lines with columns %s' % (year, len(year_df), year_df.columns)
    
    # store rows matching each org name
    for name in org_names:
        # search for rows where TAXPAYER_NAME starts with the org name
        regex = r'^' + name
        criterion = year_df['TAXPAYER_NAME'].str.contains(regex, case=False)
        org_rows = year_df[criterion]

        # store if found a single org
        if len(org_rows['TAXPAYER_NAME'].unique()) == 1:
            # one or more submissions for a single org, store all
            pieces.append(org_rows)
            orgs_not_found.discard(name)

# collect
df = pd.concat(pieces)

For year 2011, got 203074 lines with columns Index([u'RETURN_ID', u'FILING_TYPE', u'EIN', u'TAX_PERIOD', u'SUB_DATE',
       u'TAXPAYER_NAME', u'RETURN_TYPE', u'DLN', u'OBJECT_ID'],
      dtype='object')
For year 2012, got 261622 lines with columns Index([u'RETURN_ID', u'FILING_TYPE', u'EIN', u'TAX_PERIOD', u'SUB_DATE',
       u'TAXPAYER_NAME', u'RETURN_TYPE', u'DLN', u'OBJECT_ID'],
      dtype='object')
For year 2013, got 261449 lines with columns Index([u'RETURN_ID', u'FILING_TYPE', u'EIN', u'TAX_PERIOD', u'SUB_DATE',
       u'TAXPAYER_NAME', u'RETURN_TYPE', u'DLN', u'OBJECT_ID'],
      dtype='object')
For year 2014, got 387528 lines with columns Index([u'RETURN_ID', u'FILING_TYPE', u'EIN', u'TAX_PERIOD', u'SUB_DATE',
       u'TAXPAYER_NAME', u'RETURN_TYPE', u'DLN', u'OBJECT_ID'],
      dtype='object')
For year 2015, got 261032 lines with columns Index([u'RETURN_ID', u'FILING_TYPE', u'EIN', u'TAX_PERIOD', u'SUB_DATE',
       u'TAXPAYER_NAME', u'RETURN_TYPE', u'DLN', u'OBJECT_ID'],
  



In [21]:
# log found and missing orgs
orgs_found = set(org_names) - orgs_not_found
print 'found %d orgs: %s' % (len(orgs_found), sorted(list(orgs_found)))
print
print 'failed to find %d orgs: %s' % (len(orgs_not_found), sorted(list(orgs_not_found)))

found 104 orgs: [u'1947 Partition Archive', 'ADOPT-A-CLASSROOM INC', 'ANJNA PATIENT EDUCATION DBA CAREMESSAGE', 'BENEFICENT TECHNOLOGY', u'Blue Planet Network', u'Brackets For Good', 'CENTER TO PROMOTE HEALTHCARE ACCESS INC', 'CODE-TO-LEARN FOUNDATION', 'CODEORG', 'CONNECT TO COMPETE', 'COWORKERORG', u'Cancer Commons', 'CareerVillage', u'Case Commons', u'Center for Student Opportunity', 'Classroom Inc', u'Code for America', u'Common Sense Media', u'Crisis Text Line', u'D-Rev', 'DO SOMETHING INC', 'DONORSCHOOSEORG', 'DTWO LTD', u'Democracy Works', u'Design that Matters', u'Digital Green', 'E-THE PEOPLE', 'ELEPHANT ACTION LEAGUE', 'Ednovo', u'Elephant Action League', 'FAIR TRADE FUND INC', u'Families Empowered', 'Freecycle Network', u'Get Schooled', 'Give Direct', u'GiveWell', 'Global Lives Project', 'Good World Solutions', 'Great Nonprofits', 'HOPING HEARTS FOUNDATION INC', u'Harmony Institute', 'Hollaback', u'HopeLab', u'Humanitarian OpenStreetMap Team', 'IN OUR BACKYARDS', 'INTERNET S

In [22]:
# output
df.to_csv('output/metadata.csv', index=False)