# Analyzing local nonprofits with ProPublica's Nonprofit Explorer API

*Note: This is based on version two of ProPublica's API. It will depreciate at the end of 2017. Your mileage may vary. Refer to official docs for specifics: https://projects.propublica.org/nonprofits/api*

**The goal:**
* Pull down the nonprofit listings of a state, via iterating through every page of the query results. 
* Write the listings to a local file for posterity. 
* Filter down the state listings to local ones. 
* Save the filtered down results to a local file for posterity. 
* Pull the EIN codes for local nonprofits. 
* Query the API for detailed IRS nonprofit filings using the list of EIN codes
* Save these results to a local file.
* Analyze the data

In [1]:
import os
import time
import requests
#ProPublica's API only responds to GET requests and we're using Python 3. Use requests lib
import json
#The data is pulled down as JSON. So, we need to properly convert the GET request
import pandas as pd
#We'll need to filter and analyze the data, so use pandas so we can stay in Python
import numpy as np
from pprint import pprint as pp
# This makes printing the JSON clean, save your sanity
%matplotlib inline

In [2]:
locations = ['The Villages', 'Lady Lake', 'Wildwood', 'Busnell', 'Center Hill','Webseter','Coleman', 'Leesburg', 'Fruitland Park', 'Mt. Dora', 'Belleville', 'Summerfield'] 
#To find your specific local towns, Google towns in [YOUR COUNTY/GEOGRAPHIC AREA OF COVERAGE] and create a list from them. We'll use this to filter the results below


# To pull data from ProPublica's API, takes about 10-20 minutes. Commented out so it doesn't constantly rerun.
# You should save the results to a json file for easy reference.
#Uncomment this code to pull data down. 
# baseurl = "https://projects.propublica.org/nonprofits/api/v2/search.json?state%5Bid%5D=FL&page="
# r = requests.get(baseurl)
# rawdata = r.json()
# nonprof = rawdata['organizations']
# df = pd.DataFrame(nonprof)
# for page in range(1, rawdata['num_pages']):
#     print("Fetching " + str(page))
#     r = requests.get(baseurl + str(page))
#     rawdata = r.json()
#     nonprof = rawdata['organizations']
#     dfnew = pd.DataFrame(nonprof)
#     df = df.append(dfnew)
#     time.sleep(2)
# df.shape

#time.sleep(int) is needed because ProPublica's API will block you if you submit too many GET requests at once.
#request Florida nonprofit data from ProPublica's Nonprofit explorer API
#Returns 43,941 Florida nonprofits/charities
#Narrow down to Lady Lake, Wildwood, The Villages, etc. 


In [3]:
# Show you what's in the json you just pulled
# print(r.content)

In [10]:
# Show the dataframe of what you just scrapped so you can reference it and make sure your saved copy saves correctly. 
# df

Unnamed: 0,city,ein,has_subseccd,have_extracts,have_filings,have_pdfs,name,ntee_code,raw_ntee_code,score,state,strein,sub_name,subseccd
0,HURLBURT FLD,562342625,True,False,True,True,1 SOSFS DEFENDERS ASSOCIATION,,,4.283534,FL,56-2342625,1 SOSFS DEFENDERS ASSOCIATION,3
1,PENSACOLA,593068740,True,False,True,True,100 BLACK MEN OF PENSACOLA INC,S82Z,S82Z,4.283534,FL,59-3068740,100 BLACK MEN OF PENSACOLA INC,3
2,DEERFIELD BCH,591756721,True,True,True,True,100 CLUB OF SOUTH PALM BEACH COUNTY INC,,,4.283534,FL,59-1756721,100 CLUB OF SOUTH PALM BEACH COUNTY INC,3
3,PONTE VEDRA,261211203,True,True,True,True,100 TIMES FOUNDATION CORPORATION,T20,T20,4.283534,FL,26-1211203,100 TIMES FOUNDATION CORPORATION,3
4,FORT LAUDERDALE,800364095,True,True,True,True,1000 PLUS CLUB TO BENEFIT CANCER INC,G113,G113,4.283534,FL,80-0364095,1000 PLUS CLUB TO BENEFIT CANCER INC,3
5,WILTON MANORS,455195419,True,True,True,True,100PLUSANIMALRESCUE INC,D20,D20,4.283534,FL,45-5195419,100PLUSANIMALRESCUE INC,3
6,JACKSONVILLE,592888577,True,False,True,True,12 WHO CARE COMMUNITY SERVICE AWARDS INC,,,4.283534,FL,59-2888577,12 WHO CARE COMMUNITY SERVICE AWARDS INC,3
7,ORLANDO,311656521,True,True,True,True,1420 FOUNDATION FOR SUSTAINABLE DEVELOPMENT ED...,B11,B11,4.283534,FL,31-1656521,1420 FOUNDATION FOR SUSTAINABLE DEVELOPMENT ED...,3
8,WINTER HAVEN,270206132,True,True,True,True,1ST VETERANS KIDS CARE INC,P20,P20,4.283534,FL,27-0206132,1ST VETERANS KIDS CARE INC,3
9,SARASOTA,200262358,True,True,True,True,2-1-1 SUNCOAST INC,P20,P20,4.283534,FL,20-0262358,2-1-1 SUNCOAST INC,3


In [17]:
# Send the dataframe to a json file for posterity
#df.to_json('data/florida_nonprofs.json', orient='records')
#You need to tell pandas what to use as an key value for orienting the JSON file. 
# If you do reset_index().to_json('filepath', orient='index')
# The JSON file will have flipped the columns and rows

S/O to Mike Stucka (@MikeStucka) for helping me figure out how to properly parse ProPublica's Nonprofit API

Helpful stackoverflow thread: https://stackoverflow.com/questions/19214588/how-can-i-efficiently-move-from-a-pandas-dataframe-to-json

In [4]:
# Create a dataframe from your saved copy of the scrape
df1 = pd.read_json('data/florida_nonprofs.json')

In [5]:
df1.head()
#Make sure the dataframe maintained shape.

Unnamed: 0,city,ein,has_subseccd,have_extracts,have_filings,have_pdfs,name,ntee_code,raw_ntee_code,score,state,strein,sub_name,subseccd
0,HURLBURT FLD,562342625,True,False,True,True,1 SOSFS DEFENDERS ASSOCIATION,,,4.283534,FL,56-2342625,1 SOSFS DEFENDERS ASSOCIATION,3
1,PENSACOLA,593068740,True,False,True,True,100 BLACK MEN OF PENSACOLA INC,S82Z,S82Z,4.283534,FL,59-3068740,100 BLACK MEN OF PENSACOLA INC,3
2,DEERFIELD BCH,591756721,True,True,True,True,100 CLUB OF SOUTH PALM BEACH COUNTY INC,,,4.283534,FL,59-1756721,100 CLUB OF SOUTH PALM BEACH COUNTY INC,3
3,PONTE VEDRA,261211203,True,True,True,True,100 TIMES FOUNDATION CORPORATION,T20,T20,4.283534,FL,26-1211203,100 TIMES FOUNDATION CORPORATION,3
4,FORT LAUDERDALE,800364095,True,True,True,True,1000 PLUS CLUB TO BENEFIT CANCER INC,G113,G113,4.283534,FL,80-0364095,1000 PLUS CLUB TO BENEFIT CANCER INC,3


In [30]:
# locations = [x.upper() for x in locations]
# filter out non local nonprofits, upper case all city names
# localdf = df1[df1['city'].isin(locations)]

In [33]:
#Show the new dataframe, reset its index
# localdf.reset_index().head()

Unnamed: 0,index,city,ein,has_subseccd,have_extracts,have_filings,have_pdfs,name,ntee_code,raw_ntee_code,score,state,strein,sub_name,subseccd
0,265,THE VILLAGES,300096938,True,False,True,True,AMERICAN BRUSSELS GRIFFON ASSOCIATION,D60,D60,4.283534,FL,30-0096938,AMERICAN BRUSSELS GRIFFON ASSOCIATION,7
1,646,WILDWOOD,50579861,True,False,True,True,BARAK ENTERPRISES INC,X20,X20,4.283534,FL,05-0579861,BARAK ENTERPRISES INC,3
2,649,THE VILLAGES,382964580,True,True,True,True,BARBER FOUNDATION,,,4.283534,FL,38-2964580,BARBER FOUNDATION,3
3,826,THE VILLAGES,341910573,True,True,True,True,BLAIR FAMILY FOUNDATION,T20,T20,4.283534,FL,34-1910573,BLAIR FAMILY FOUNDATION,3
4,1124,LEESBURG,592808772,True,True,True,True,CARE DIVERSIFIED OF LAKE COUNTY INC,,,4.283534,FL,59-2808772,CARE DIVERSIFIED OF LAKE COUNTY INC,3


In [36]:
#Write the new dataframe to a local json file
# localdf.to_json('data/local_nonprofits.json', orient='records')

In [38]:
#Write the local dataframe to a local csv file for reporters
# localdf.to_csv('data/local_nonprofits.csv', sep=',')

In [6]:
localdf = pd.read_json('data/local_nonprofits.json')

In [7]:
localdf.head()

Unnamed: 0,city,ein,has_subseccd,have_extracts,have_filings,have_pdfs,name,ntee_code,raw_ntee_code,score,state,strein,sub_name,subseccd
0,THE VILLAGES,300096938,True,False,True,True,AMERICAN BRUSSELS GRIFFON ASSOCIATION,D60,D60,4.283534,FL,30-0096938,AMERICAN BRUSSELS GRIFFON ASSOCIATION,7
1,WILDWOOD,50579861,True,False,True,True,BARAK ENTERPRISES INC,X20,X20,4.283534,FL,05-0579861,BARAK ENTERPRISES INC,3
2,THE VILLAGES,382964580,True,True,True,True,BARBER FOUNDATION,,,4.283534,FL,38-2964580,BARBER FOUNDATION,3
3,THE VILLAGES,341910573,True,True,True,True,BLAIR FAMILY FOUNDATION,T20,T20,4.283534,FL,34-1910573,BLAIR FAMILY FOUNDATION,3
4,LEESBURG,592808772,True,True,True,True,CARE DIVERSIFIED OF LAKE COUNTY INC,,,4.283534,FL,59-2808772,CARE DIVERSIFIED OF LAKE COUNTY INC,3


In [8]:
#let's pull all of the EIN numbers so we can pull detailed records on our local nonprofits.
#Stackoverflow guidance: https://stackoverflow.com/questions/22341271/get-list-from-pandas-dataframe-column
einlist = localdf['ein'].tolist()


In [9]:
# Check to make sure we have a list
pp(einlist)

[300096938,
 50579861,
 382964580,
 341910573,
 592808772,
 592790823,
 593455505,
 592520097,
 275370261,
 592442549,
 596139800,
 800072799,
 320315315,
 650363634,
 592975723,
 760704071,
 272517148,
 593648363,
 592844668,
 61357703,
 680663759,
 436938878,
 590330175,
 134315485,
 592319673,
 161454827,
 472283160,
 472722928,
 596152302,
 591786646,
 202438907,
 562379882,
 593085768,
 591930274,
 464440493,
 464232908,
 593054643,
 593733183,
 455560376,
 263731546,
 510567649,
 593656796,
 593474454,
 205972206,
 592944839,
 270656070,
 236279703,
 204742893,
 10574719,
 462859665,
 453185222,
 311216257,
 592764174,
 364625733,
 462442032,
 591652673,
 61331583,
 592187338,
 592096407,
 592100791,
 591813434,
 593466574,
 591934800,
 592181152,
 590878982,
 203588786,
 271069304,
 800185498,
 593270444,
 464044482,
 870738252,
 596579131,
 592878329,
 462353037,
 383019445,
 651180390,
 593697805,
 461336140,
 200301785,
 460712125,
 593750165,
 900223420,
 942699567,
 5935673

In [15]:
# Add .json to the end of each list item and convert to string
einlist = list(map(str, einlist))

In [20]:
einlist = [j + '.json' for j in einlist]
#https://stackoverflow.com/questions/2050637/appending-the-same-string-to-a-list-of-strings-in-python

In [19]:
pp(einlist)

['300096938.json',
 '50579861.json',
 '382964580.json',
 '341910573.json',
 '592808772.json',
 '592790823.json',
 '593455505.json',
 '592520097.json',
 '275370261.json',
 '592442549.json',
 '596139800.json',
 '800072799.json',
 '320315315.json',
 '650363634.json',
 '592975723.json',
 '760704071.json',
 '272517148.json',
 '593648363.json',
 '592844668.json',
 '61357703.json',
 '680663759.json',
 '436938878.json',
 '590330175.json',
 '134315485.json',
 '592319673.json',
 '161454827.json',
 '472283160.json',
 '472722928.json',
 '596152302.json',
 '591786646.json',
 '202438907.json',
 '562379882.json',
 '593085768.json',
 '591930274.json',
 '464440493.json',
 '464232908.json',
 '593054643.json',
 '593733183.json',
 '455560376.json',
 '263731546.json',
 '510567649.json',
 '593656796.json',
 '593474454.json',
 '205972206.json',
 '592944839.json',
 '270656070.json',
 '236279703.json',
 '204742893.json',
 '10574719.json',
 '462859665.json',
 '453185222.json',
 '311216257.json',
 '592764174.jso

In [21]:
# With our EIN list in hand, let's reuse the code from the beginning to iterate through the API
# Uncomment this code to use it
#Key url: https://projects.propublica.org/nonprofits/api/v2/organizations/ +einlist
baseurl2 = "https://projects.propublica.org/nonprofits/api/v2/organizations/"
r2 = requests.get(baseurl2)
rawdata2 = r2.json()
orgdf = pd.DataFrame()
for e in einlist:
    print("Fetching")
    r2 = requests.get(baseurl2 + e)
    rawdata2 = r2.json()
    dfnew = pd.DataFrame(rawdata2)
    orgdf = orgdf.append(dfnew)
    time.sleep(2)
orgdf.shape

JSONDecodeError: Expecting value: line 1 column 1 (char 0)