In [10]:
#!/usr/bin/env python
# -*- coding: UTF-8

# Google search using Python
> See the [documentation](http://pythonhosted.org/google/) for usage details and the [PyPI page](https://pypi.python.org/pypi/google) for the package and its source code.

## Initializing the Python environment

In [11]:
# IMPORTING KEY PACKAGES
from google import search
import csv, re, os

## Testing out the search function

In [3]:
# Print the signature and docstring of search() to see which keyword arguments it accepts
help(search)

Help on function search in module google:

search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0, stop=None, pause=2.0, only_standard=False, extra_params={}, tpe='', user_agent=None)
    Search the given query string using Google.
    
    @type  query: str
    @param query: Query string. Must NOT be url-encoded.
    
    @type  tld: str
    @param tld: Top level domain.
    
    @type  lang: str
    @param lang: Languaje.
    
    @type  tbs: str
    @param tbs: Time limits (i.e "qdr:h" => last hour, "qdr:d" => last 24 hours, "qdr:m" => last month).
    
    @type  safe: str
    @param safe: Safe search.
    
    @type  num: int
    @param num: Number of results per page.
    
    @type  start: int
    @param start: First result to retrieve.
    
    @type  stop: int
    @param stop: Last result to retrieve.
        Use C{None} to keep searching forever.
    
    @type  pause: float
    @param pause: Lapse to wait between HTTP requests.
        A lapse too long will

In [4]:
# Example of using the function: print every URL that search() yields
# for one school's name and address.
for url in search(
    'BENJAMIN FRANKLIN CHARTER SCHOOL MESA 2345 NORTH HORNE, MESA, AZ',
    stop=5,
    pause=1,
):
    print(url)

https://www.trulia.com/schools/AZ-Mesa/Benjamin_Franklin_Charter_School_-_Mesa/
http://www.ade.az.gov/edd/NewDetails.asp?EntityID=5536&RefTypeID=1035
http://www.ratemyteachers.com/benjamin-franklin-charter-school-mesa/500425-s
https://www.schooldigger.com/go/AZ/schools/0006500821/school.aspx
https://www.spellingcity.com/benjamin-franklin-charter-school-mesa-mesa-az.html
https://www.mapquest.com/us/arizona/schools-mesa/franklin-benjamin-charter-school-345471093
https://www.yelp.com/biz/franklin-benjamin-charter-school-mesa
https://www.publicschoolreview.com/benjamin-franklin-charter-school-mesa-profile
https://www.noodle.com/schools/az-benjamin-franklin-charter-school_district
https://www.neighborhoodscout.com/az/mesa/schools/040006500821


## Configuring search environment

In [5]:
# Here's a list of sites we DON'T want to spider,
# but that an automated Google search might return...
# and we might thus accidentally spider unless we filter them out (as below)!

# The file holds one site per line. Each line is stripped of surrounding
# whitespace (including the trailing newline), and blank lines are skipped:
# an empty string is a substring of every URL, so a single blank line would
# otherwise cause every search result to be treated as a bad site.
with open("../bad_sites.csv", "r", encoding = "utf-8") as csvfile:
    bad_sites = [row.strip() for row in csvfile if row.strip()]

print(bad_sites)

['high-schools.com', 'yelp.com', 'har.com', 'trulia.com', 'redfin.com', 'practutor.com', 'startclass.com', 'greatschools.org', 'greatschools.com', 'greatschools.net', 'paschoolperformance.org', 'worldcontactinfo.com', 'kula.com', 'mapquest.com', 'maps.net', 'google.com', 'facebook.com', 'zillow.com', 'manta.com', 'yellowpages.com', 'usnews.com', 'publicschoolreview.com', 'publicschoolreview.org', 'schooldigger.com', 'niche.com', 'privateschoolreview.com', 'cappex.com', 'collegeconfidential.com', 'tripsadvisor.com', 'groupon.com', 'school-ratings.com', 'superpages.com', 'onsaleph.com', 'psk12.com', 'schoolmatters.com', 'neighborhoodscout.com', 'localschooldirectory.com', 'publicschoolsk12.com', 'schooldatadirect.org', 'nces.ed.gov', 'cityrating.com', 'blogspot.com', 'public-schools.findthebest.com', 'twitter.com', 'zoominfo.com', 'jigsaw.com', 'hoovers.com', 'corporateinformation.com', 'doe.k12.ga.us', 'gradeschools.net', 'charterschoolratings.net', 'schools.net', 'insiderpages.com', 'p

## Helpful bash-fu

In [3]:
!cat > bad_sites

^C


In [None]:
  114  cat > testlist.txt
  115  cat testlist.txt 
  116  for i in $(cat testlist.txt | head -n 4); do echo $i; done
  117  for i in $(cat testlist.txt | head -n 4); do echo wget --exclude-domains=$(echo $(cat ../Charter-school-identities/bad_sites.txt  ) | tr ' ' ,) $i; done
  118  for i in $(cat testlist.txt | head -n 4); do echo wget --exclude-domains=$(echo $(cat ../Charter-school-identities/bad_sites.txt  ) | tr ' ' ,) $i; echo; echo; done
  119  for i in $(cat testlist.txt | head -n 4); do wget --exclude-domains=$(echo $(cat ../Charter-school-identities/bad_sites.txt  ) | tr ' ' ,) $i; done
  120  ls -la
  121  rm -f 500425-s 55362003.pdf franklin-benjamin-charter-school-mesa index.html 
  122* for i in $(cat testlist.txt | head -n 4); do wget --mirror --exclude-domains=$(echo $(cat ../Charter-school-identities/bad_sites.txt  ) | tr ' ' ,) $i; done

## Reading in data

In [6]:
# Read the schools CSV into `sample`: a list with one dictionary per row,
# keyed by the column names in the file's header line.
# newline='' hands raw line endings to the csv module, as its documentation
# requires, so that quoted fields containing line breaks are parsed correctly.
with open('../charter_URLs_Apr17.csv', 'r', encoding = 'Latin-1', newline = '') as csvfile: # open file
    sample = list(csv.DictReader(csvfile)) # one dict per row

In [7]:
# Take a look at the second entry's contents (index 1) and the variables list in our sample (a list of dictionaries)
print(sample[1]['SEARCH'], "\n", sample[1]["OLD_URL"], "\n")
print(sample[1].keys())

POLK STATE COLLEGE COLLEGIATE HIGH SCHOOL 3425 WINTER LK RD LAC1200, WINTER HAVEN, FL 33881 
 https://www.polk.edu/charter-high-schools/ 

dict_keys(['STABR', 'NCESSCH', 'OLD_URL', 'SCH_NAME', 'SEARCH', 'ADDRESS', 'MANUAL_URL'])


## Getting URLs

In [70]:
def getURL(search_terms, bad_sites_list, manual_url, known_urls):

    '''Find the one best URL for a school using a Google search.

    The first 10 Google results for search_terms (the school's name and address,
    stored in the SEARCH variable) are fetched, and every result whose URL contains
    any entry of bad_sites_list (e.g. trulia.com, greatschools.org, mapquest.com)
    is discarded.

    Parameters:
        search_terms:   query string passed to Google.
        bad_sites_list: iterable of site substrings to exclude; empty entries
                        are ignored because "" is contained in every URL.
        manual_url:     an already-collected URL for the school, or "" if none.
        known_urls:     list that the returned URL is appended to, in place.

    Returns:
        A tuple (k, url). k is the number of search results discarded as bad
        sites. url is manual_url when that is non-empty, otherwise the first
        remaining search result. If manual_url is empty and no search result
        survives the filter, url is "" and nothing is appended to known_urls.
    '''

    print("\nGetting URL for", search_terms)  # show school name & address

    results = list(search(search_terms, num=10, pause=1, stop=10)) # grab first 10 Google results (URLs)

    # Drop empty entries: "" is a substring of every URL and would match everything.
    blocked = [site for site in bad_sites_list if site]

    # Build a new filtered list instead of calling remove() on the list being
    # iterated: removing during iteration skips the element that follows each
    # removed one, so consecutive bad results would slip through uncounted.
    new_urls = [url for url in results
                if not any(site in url for site in blocked)]
    k = len(results) - len(new_urls)  # number of bad results omitted

    if k>1:
        print(str(k) + " bad Google results have been omitted.")

    elif k>0:
        print(str(k) + " bad Google result has been omitted.")

    else:
        print("No bad sites detected.")

    # Guard against an empty list: the search may return nothing, or every
    # result may have been filtered out.
    best_url = new_urls[0] if new_urls else ""

    if manual_url != "":
        if best_url:
            print("VALIDITY CHECK: Is the discovered URL of " + best_url + \
                  " consistent with the known URL of " + manual_url + " ?")
        else:
            print("VALIDITY CHECK skipped: no usable Google result to compare with " + \
                  "the known URL of " + manual_url)
        known_urls.append(manual_url)
        return(k, manual_url)

    if not best_url:
        print("WARNING: no usable URL found for " + search_terms)
        return(k, "")

    known_urls.append(best_url)
    return(k, best_url)

In [71]:
known_URLs = [] # URLs returned so far; passed to getURL on every call
numschools = 0 # school counter; stays 0 if there are no schools to process

# Look up a URL for each of the first 10 schools in the sample,
# counting the schools as we go.
for numschools, school in enumerate(sample[:10], start=1):
    school["BAD_URLS"] = school["URL"] = "" # placeholders, overwritten just below
    school["BAD_URLS"], school["URL"] = getURL(
        school["SEARCH"], bad_sites, school["MANUAL_URL"], known_URLs
    )

print("\n\nURLs discovered for " + str(numschools) + " schools.")

print("\nThe list of known URLs is now: \n" + str(known_URLs))


Getting URL for Richland Two Charter High 750 Old Clemson Road, Columbia, SC 29229
7 bad Google results have been omitted.
VALIDITY CHECK: Is the discovered URL of https://www.richland2.org/aec consistent with the known URL of https://www.richland2.org/charterhigh/ ?

Getting URL for POLK STATE COLLEGE COLLEGIATE HIGH SCHOOL 3425 WINTER LK RD LAC1200, WINTER HAVEN, FL 33881
4 bad Google results have been omitted.
VALIDITY CHECK: Is the discovered URL of http://www.ncsasports.org/football-recruiting/florida/winter-haven/polk-state-college-collegiate-high-school consistent with the known URL of https://www.polk.edu/lakeland-gateway-to-college-high-school/ ?

Getting URL for River City Scholars Charter Academy 944 Evergreen Street, Grand Rapids, MI 49507
7 bad Google results have been omitted.
VALIDITY CHECK: Is the discovered URL of https://www.nhaschools.com/schools/rivercity consistent with the known URL of https://www.nhaschools.com/schools/rivercity/Pages/default.aspx ?

Getting URL

In [None]:
# SAVE IT