In [131]:
# Dependencies
import requests
import json
from pprint import pprint
import pandas as pd
import numpy as np
import pymongo
from bson import json_util
from bson.objectid import ObjectId

# Import API Key
from config import api_key

In [132]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [133]:
# Define database and collection
db = client.schools_db
collection = db.items

In [134]:
# URL for GET requests to retrieve school data
base_url = "https://api.data.gov/ed/collegescorecard/v1/schools?api_key=" + api_key

In [135]:
# Filter by ST and Limit to a Min Student Size
ST = "&school.state=CA"
min_size = "&latest.student.size__range=1000.."
filter_url = base_url + ST + min_size
print(filter_url)

https://api.data.gov/ed/collegescorecard/v1/schools?api_key=rjhyKFWn0g9PsqjQC1dH9vRABMgQOMzdY5msysWA&school.state=CA&latest.student.size__range=1000..


In [136]:
response = requests.get(filter_url).json()

In [137]:
school_responses = response['metadata']['total']
max_pages = 20
pages = int(np.ceil(school_responses / max_pages))
print(pages)

11


In [138]:
def api_call(page):
    
    school_data = []    
    
    url = filter_url + f"&_page={page}"
    data = requests.get(url).json()
    
    
    for i in range(len(data["results"])):
        s_id = data["results"][i]["id"]

        #SCHOOL DATA
        school_name = data["results"][i]["school"]["name"]
        url = data["results"][i]["school"]["school_url"]
        size = data["results"][i]["latest"]["student"]["size"]

        school_data.append({"ID": s_id,
                            "Name": school_name,
                            "Website": url,
                            "Total Students": size,})
    return school_data

In [139]:
# Clear out collection, Paginate API, Insert into MongoDB collection
collection.remove()

for page_iteration in range(pages):
    college_data = api_call(page_iteration)
    print(page_iteration)
    print("next page --------------------------")
    for college in college_data:
        collection.insert_one(college)
        print(college)
print("DONE!")
# colleges;
# college_df = pd.DataFrame(colleges)

#Insert into MongoDB

# collection.insert_many(colleges)

  


0
next page --------------------------
{'ID': 461148, 'Name': 'New York Film Academy', 'Website': 'www.nyfa.edu', 'Total Students': 1080, '_id': ObjectId('5d5d7e07711e479ac4691ebf')}
{'ID': 455512, 'Name': 'Woodland Community College', 'Website': 'https://wcc.yccd.edu', 'Total Students': 2891, '_id': ObjectId('5d5d7e07711e479ac4691ec0')}
{'ID': 445188, 'Name': 'University of California-Merced', 'Website': 'www.ucmerced.edu', 'Total Students': 7375, '_id': ObjectId('5d5d7e07711e479ac4691ec1')}
{'ID': 447768, 'Name': 'American Career College-Ontario', 'Website': 'americancareercollege.edu', 'Total Students': 1275, '_id': ObjectId('5d5d7e07711e479ac4691ec2')}
{'ID': 117724, 'Name': 'Los Angeles Trade Technical College', 'Website': 'www.lattc.edu', 'Total Students': 11194, '_id': ObjectId('5d5d7e07711e479ac4691ec3')}
{'ID': 119067, 'Name': 'Monterey Peninsula College', 'Website': 'www.mpc.edu', 'Total Students': 6856, '_id': ObjectId('5d5d7e07711e479ac4691ec4')}
{'ID': 119164, 'Name': 'Mt 

3
next page --------------------------
{'ID': 117274, 'Name': 'Lassen Community College', 'Website': 'www.lassencollege.edu/', 'Total Students': 2187, '_id': ObjectId('5d5d7e1f711e479ac4691efb')}
{'ID': 115001, 'Name': 'Glendale Community College', 'Website': 'www.glendale.edu', 'Total Students': 12996, '_id': ObjectId('5d5d7e1f711e479ac4691efc')}
{'ID': 117627, 'Name': 'La Sierra University', 'Website': 'https://lasierra.edu', 'Total Students': 1929, '_id': ObjectId('5d5d7e1f711e479ac4691efd')}
{'ID': 113634, 'Name': 'Diablo Valley College', 'Website': 'www.dvc.edu', 'Total Students': 17926, '_id': ObjectId('5d5d7e1f711e479ac4691efe')}
{'ID': 112561, 'Name': 'Columbia College', 'Website': 'www.gocolumbia.edu/', 'Total Students': 2216, '_id': ObjectId('5d5d7e1f711e479ac4691eff')}
{'ID': 399212, 'Name': 'Santiago Canyon College', 'Website': 'www.sccollege.edu/Pages/default.aspx', 'Total Students': 11378, '_id': ObjectId('5d5d7e1f711e479ac4691f00')}
{'ID': 118718, 'Name': 'Merced College

6
next page --------------------------
{'ID': 154022, 'Name': 'Ashford University', 'Website': 'www.ashford.edu', 'Total Students': 31046, '_id': ObjectId('5d5d7e36711e479ac4691f37')}
{'ID': 438504, 'Name': 'Universal Technical Institute of California Inc', 'Website': 'www.uti.edu/', 'Total Students': 1273, '_id': ObjectId('5d5d7e36711e479ac4691f38')}
{'ID': 441937, 'Name': 'California State University-Channel Islands', 'Website': 'www.csuci.edu', 'Total Students': 7047, '_id': ObjectId('5d5d7e36711e479ac4691f39')}
{'ID': 443331, 'Name': 'West Coast University-Los Angeles', 'Website': 'westcoastuniversity.edu/', 'Total Students': 1660, '_id': ObjectId('5d5d7e36711e479ac4691f3a')}
{'ID': 243744, 'Name': 'Stanford University', 'Website': 'www.stanford.edu/', 'Total Students': 7056, '_id': ObjectId('5d5d7e36711e479ac4691f3b')}
{'ID': 366401, 'Name': 'Las Positas College', 'Website': 'www.laspositascollege.edu', 'Total Students': 8077, '_id': ObjectId('5d5d7e36711e479ac4691f3c')}
{'ID': 36

9
next page --------------------------
{'ID': 110653, 'Name': 'University of California-Irvine', 'Website': 'www.uci.edu/', 'Total Students': 29295, '_id': ObjectId('5d5d7e4e711e479ac4691f73')}
{'ID': 110680, 'Name': 'University of California-San Diego', 'Website': 'www.ucsd.edu', 'Total Students': 28577, '_id': ObjectId('5d5d7e4e711e479ac4691f74')}
{'ID': 111896, 'Name': 'Cerro Coso Community College', 'Website': 'cerrocoso.edu', 'Total Students': 4736, '_id': ObjectId('5d5d7e4e711e479ac4691f75')}
{'ID': 112686, 'Name': 'Compton College', 'Website': 'www.compton.edu', 'Total Students': 6212, '_id': ObjectId('5d5d7e4e711e479ac4691f76')}
{'ID': 113856, 'Name': 'East Los Angeles College', 'Website': 'www.elac.edu/', 'Total Students': 29896, '_id': ObjectId('5d5d7e4e711e479ac4691f77')}
{'ID': 113980, 'Name': 'El Camino Community College District', 'Website': 'www.elcamino.edu', 'Total Students': 21663, '_id': ObjectId('5d5d7e4e711e479ac4691f78')}
{'ID': 114789, 'Name': 'Fresno City Colleg

In [140]:
def toJson(data):
    """Convert Mongo object(s) to JSON"""
    return json.dumps(data, default=json_util.default)

In [141]:
# Query MongoDB and return as Json
results = collection.find()
all_data = [result for result in results]
print(toJson(all_data))

[{"_id": {"$oid": "5d5d7e07711e479ac4691ebf"}, "ID": 461148, "Name": "New York Film Academy", "Website": "www.nyfa.edu", "Total Students": 1080}, {"_id": {"$oid": "5d5d7e07711e479ac4691ec0"}, "ID": 455512, "Name": "Woodland Community College", "Website": "https://wcc.yccd.edu", "Total Students": 2891}, {"_id": {"$oid": "5d5d7e07711e479ac4691ec1"}, "ID": 445188, "Name": "University of California-Merced", "Website": "www.ucmerced.edu", "Total Students": 7375}, {"_id": {"$oid": "5d5d7e07711e479ac4691ec2"}, "ID": 447768, "Name": "American Career College-Ontario", "Website": "americancareercollege.edu", "Total Students": 1275}, {"_id": {"$oid": "5d5d7e07711e479ac4691ec3"}, "ID": 117724, "Name": "Los Angeles Trade Technical College", "Website": "www.lattc.edu", "Total Students": 11194}, {"_id": {"$oid": "5d5d7e07711e479ac4691ec4"}, "ID": 119067, "Name": "Monterey Peninsula College", "Website": "www.mpc.edu", "Total Students": 6856}, {"_id": {"$oid": "5d5d7e07711e479ac4691ec5"}, "ID": 119164,

In [146]:
pprint(all_data)

[{'ID': 461148,
  'Name': 'New York Film Academy',
  'Total Students': 1080,
  'Website': 'www.nyfa.edu',
  '_id': ObjectId('5d5d7e07711e479ac4691ebf')},
 {'ID': 455512,
  'Name': 'Woodland Community College',
  'Total Students': 2891,
  'Website': 'https://wcc.yccd.edu',
  '_id': ObjectId('5d5d7e07711e479ac4691ec0')},
 {'ID': 445188,
  'Name': 'University of California-Merced',
  'Total Students': 7375,
  'Website': 'www.ucmerced.edu',
  '_id': ObjectId('5d5d7e07711e479ac4691ec1')},
 {'ID': 447768,
  'Name': 'American Career College-Ontario',
  'Total Students': 1275,
  'Website': 'americancareercollege.edu',
  '_id': ObjectId('5d5d7e07711e479ac4691ec2')},
 {'ID': 117724,
  'Name': 'Los Angeles Trade Technical College',
  'Total Students': 11194,
  'Website': 'www.lattc.edu',
  '_id': ObjectId('5d5d7e07711e479ac4691ec3')},
 {'ID': 119067,
  'Name': 'Monterey Peninsula College',
  'Total Students': 6856,
  'Website': 'www.mpc.edu',
  '_id': ObjectId('5d5d7e07711e479ac4691ec4')},
 {'ID'

  '_id': ObjectId('5d5d7e4a711e479ac4691f6c')},
 {'ID': 125499,
  'Name': 'West Valley College',
  'Total Students': 6799,
  'Website': 'www.westvalley.edu',
  '_id': ObjectId('5d5d7e4a711e479ac4691f6d')},
 {'ID': 400080,
  'Name': 'Summit College',
  'Total Students': 1154,
  'Website': 'www.summitcollege.edu',
  '_id': ObjectId('5d5d7e4a711e479ac4691f6e')},
 {'ID': 112190,
  'Name': 'City College of San Francisco',
  'Total Students': 23574,
  'Website': 'www.ccsf.edu',
  '_id': ObjectId('5d5d7e4a711e479ac4691f6f')},
 {'ID': 110583,
  'Name': 'California State University-Long Beach',
  'Total Students': 31402,
  'Website': 'www.csulb.edu',
  '_id': ObjectId('5d5d7e4a711e479ac4691f70')},
 {'ID': 112260,
  'Name': 'Claremont McKenna College',
  'Total Students': 1334,
  'Website': 'www.cmc.edu',
  '_id': ObjectId('5d5d7e4a711e479ac4691f71')},
 {'ID': 113573,
  'Name': 'College of the Desert',
  'Total Students': 10074,
  'Website': 'collegeofthedesert.edu',
  '_id': ObjectId('5d5d7e4a7

In [148]:
all_df = pd.DataFrame(all_data)
all_df

Unnamed: 0,ID,Name,Total Students,Website,_id
0,461148,New York Film Academy,1080,www.nyfa.edu,5d5d7e07711e479ac4691ebf
1,455512,Woodland Community College,2891,https://wcc.yccd.edu,5d5d7e07711e479ac4691ec0
2,445188,University of California-Merced,7375,www.ucmerced.edu,5d5d7e07711e479ac4691ec1
3,447768,American Career College-Ontario,1275,americancareercollege.edu,5d5d7e07711e479ac4691ec2
4,117724,Los Angeles Trade Technical College,11194,www.lattc.edu,5d5d7e07711e479ac4691ec3
5,119067,Monterey Peninsula College,6856,www.mpc.edu,5d5d7e07711e479ac4691ec4
6,119164,Mt San Antonio College,26865,www.mtsac.edu,5d5d7e07711e479ac4691ec5
7,120254,Occidental College,1958,https://www.oxy.edu,5d5d7e07711e479ac4691ec6
8,119331,Napa Valley College,5237,www.napavalley.edu,5d5d7e07711e479ac4691ec7
9,120421,Oxnard College,6809,www.oxnardcollege.edu,5d5d7e07711e479ac4691ec8


In [142]:
# Sort by School Name
sorted_df = college_df.sort_values("Name")
sorted_df.head()

Unnamed: 0,ID,Name,Total Students,Website
73,108232,Academy of Art University,7520,www.academyart.edu
41,108807,Allan Hancock College,9282,www.hancockcollege.edu/
203,109040,American Career College-Los Angeles,1237,americancareercollege.edu
3,447768,American Career College-Ontario,1275,americancareercollege.edu
105,109208,American River College,29082,www.arc.losrios.edu/


In [35]:
df = pd.DataFrame(list(documents))

In [36]:
df.head()

Unnamed: 0,ID,Name,Total Students,Website,_id
0,108232,Academy of Art University,7520,www.academyart.edu,5d5c752144d7079dbb0645d9
1,108807,Allan Hancock College,9282,www.hancockcollege.edu/,5d5c752144d7079dbb0645da
2,109040,American Career College-Los Angeles,1237,americancareercollege.edu,5d5c752144d7079dbb0645db
3,447768,American Career College-Ontario,1275,americancareercollege.edu,5d5c752144d7079dbb0645dc
4,109208,American River College,29082,www.arc.losrios.edu/,5d5c752144d7079dbb0645dd
