# College Scorecard Cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

# Read the raw Scorecard export (a JSON array of record objects) into a DataFrame.
scorecard = pd.read_json(path_or_buf='scorecard_raw.json', orient='records')

Rename the columns and verify datatypes

In [2]:
# Dotted raw field names -> the short column names used in the rest of the notebook.
column_rename_map = {
    # identifier and coordinates
    "id": "id",
    "location.lat": "lat",
    "location.lon": "lon",

    # school.* fields
    "school.name": "name",
    "school.city": "city",
    "school.state": "state",
    "school.zip": "zip",
    "school.ownership": "ownership",
    "school.region_id": "region",
    "school.locale": "locale",
    "school.carnegie_undergrad": "carnegie_undergrad",
    "school.carnegie_size_setting": "carnegie_size_setting",

    # latest.student.* fields
    "latest.student.size": "size",
    "latest.student.demographics.avg_family_income": "income_avg",
    "latest.student.demographics.median_family_income": "income_med",

    # latest.cost.* fields
    "latest.cost.avg_net_price.private": "net_price_private",
    "latest.cost.avg_net_price.public": "net_price_public",
    "latest.cost.attendance.academic_year": "cost_attendance",

    # latest.admissions.* fields
    "latest.admissions.sat_scores.average.overall": "sat_score",
    "latest.admissions.admission_rate.overall": "admission_rate",
}

# rename() returns a new frame; labels that are not keys of the mapping are kept as-is.
scorecard = scorecard.rename(column_rename_map, axis="columns")

# Print the dtype of every column so the parsed types can be checked by eye.
print(scorecard.dtypes)

id                         int64
admission_rate           float64
sat_score                float64
cost_attendance          float64
net_price_private        float64
net_price_public         float64
income_avg               float64
income_med               float64
size                     float64
lat                      float64
lon                      float64
carnegie_size_setting      int64
carnegie_undergrad         int64
city                      object
locale                     int64
name                      object
ownership                  int64
region                     int64
state                     object
zip                       object
dtype: object


Map the integer codes for ownership and region to category strings, and split the two-digit locale code into its first and second digits, which represent locale type and locale size, respectively

In [3]:
# Lookup tables translating integer codes into readable labels.
# Each table is built by numbering an ordered list of labels.

# Ownership codes run 1..3.
ownership_map = dict(enumerate(
    ["public", "private non-profit", "private for-profit"],
    start=1,
))

# Region codes run 0..9, so a label's list position is its code.
region_map = dict(enumerate([
    "service schools",
    "new england",
    "mid east",
    "great lakes",
    "plains",
    "southeast",
    "southwest",
    "rocky mountains",
    "far west",
    "outlying areas",
]))

# First digit of the locale code (1..4).
locale_type_map = dict(enumerate(["city", "suburb", "town", "rural"], start=1))

# Second digit of the locale code (1..3).
# NOTE(review): the same three size labels are applied to every locale type;
# confirm against the data dictionary that the second digit of town/rural
# codes really denotes a size.
locale_size_map = dict(enumerate(["large", "medium", "small"], start=1))

# Series.map yields NaN for any value that is not a key of the table. That
# includes the already-translated strings, so re-running this cell without
# re-running the cells above turns 'ownership' and 'region' into NaN.
scorecard['ownership'] = scorecard['ownership'].map(ownership_map)
scorecard['region'] = scorecard['region'].map(region_map)

# Split the locale code into its tens digit (type) and ones digit (size).
# The raw 'locale' column itself is left unchanged.
locale_tens, locale_ones = divmod(scorecard['locale'], 10)
scorecard['locale_type'] = locale_tens.map(locale_type_map)
scorecard['locale_size'] = locale_ones.map(locale_size_map)

Filter to schools that we collected data for

In [4]:
# Load the mapping of Instagram usernames : University names.
# JSON text is read as UTF-8 explicitly so that non-ASCII school names load the
# same way regardless of the platform's default encoding.
with open('username_map.json', encoding='utf-8') as username_map_json:
    username_map = json.load(username_map_json)

# Show every row when a frame is displayed. The fully qualified option name is
# required: the short pattern "max_rows" can match more than one registered
# option (e.g. "display.max_rows" and "styler.render.max_rows"), in which case
# pandas raises OptionError instead of setting anything.
pd.set_option("display.max_rows", None)

# Keep only the schools whose name appears as a value in the username mapping.
# .copy() makes the result an independent frame rather than a view-like slice
# of `scorecard`, so later column assignments on it cannot trigger
# SettingWithCopyWarning.
collected_school_names = list(username_map.values())
filtered_scorecard = scorecard[scorecard['name'].isin(collected_school_names)].copy()

# Display the matched schools alphabetically (the sorted frame is not stored).
filtered_scorecard.sort_values(by='name')

Unnamed: 0,id,admission_rate,sat_score,cost_attendance,net_price_private,net_price_public,income_avg,income_med,size,lat,...,carnegie_undergrad,city,locale,name,ownership,region,state,zip,locale_type,locale_size
803,164465,0.1281,1449.0,71300.0,25208.0,,78988.0,42053.0,1855.0,42.372459,...,14,Amherst,21,Amherst College,private non-profit,new england,MA,01002-5000,suburb,large
478,104151,0.8478,1240.0,25621.0,,11644.0,56365.0,35851.0,42529.0,33.417721,...,15,Tempe,12,Arizona State University-Tempe,public,southwest,AZ,85287,city,medium
21,164924,0.2789,1429.0,70588.0,33562.0,,132356.0,103007.0,9639.0,42.336213,...,14,Chestnut Hill,13,Boston College,private non-profit,new england,MA,02467,city,small
810,164988,0.2209,1420.0,70216.0,30729.0,,105821.0,81244.0,17238.0,42.351118,...,14,Boston,11,Boston University,private non-profit,new england,MA,02215,city,large
292,161004,0.1026,,68070.0,21300.0,,96476.0,61622.0,1825.0,43.907134,...,14,Brunswick,31,Bowdoin College,private non-profit,new england,ME,04011,town,large
1494,217156,0.0767,1492.0,71050.0,31685.0,,107609.0,82670.0,6752.0,41.82617,...,14,Providence,12,Brown University,private non-profit,new england,RI,02912,city,medium
120,110404,0.0662,1566.0,68901.0,23820.0,,74238.0,44848.0,948.0,34.137349,...,14,Pasadena,12,California Institute of Technology,private non-profit,far west,CA,91125,city,medium
820,173258,0.1984,1453.0,68835.0,30210.0,,98790.0,78779.0,2046.0,44.462318,...,14,Northfield,32,Carleton College,private non-profit,plains,MN,55057,town,medium
1252,211440,0.1712,1507.0,70060.0,32674.0,,107339.0,91372.0,6483.0,40.44357,...,14,Pittsburgh,11,Carnegie Mellon University,private non-profit,mid east,PA,15213-3890,city,large
448,201645,0.2925,1443.0,67083.0,32938.0,,109857.0,94000.0,5131.0,41.507419,...,14,Cleveland,11,Case Western Reserve University,private non-profit,great lakes,OH,44106,city,large


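Simplify the dataset further by dropping the identifier, net-price, Carnegie, raw locale, and zip columns, and index the result by school name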

In [5]:
# Build the slimmed-down table in one chain:
#   1. drop the identifier, net-price, Carnegie, raw locale and zip columns,
#   2. use the school name as the index,
#   3. let pandas re-infer better dtypes for any object columns.
small_card = (
    filtered_scorecard
    .drop(
        labels=[
            'id',
            'net_price_private',
            'net_price_public',
            'carnegie_undergrad',
            'carnegie_size_setting',
            'locale',
            'zip',
        ],
        axis=1,
    )
    .set_index('name')
    .infer_objects()
)
small_card

Unnamed: 0_level_0,admission_rate,sat_score,cost_attendance,income_avg,income_med,size,lat,lon,city,ownership,region,state,locale_type,locale_size
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Boston College,0.2789,1429.0,70588.0,132356.0,103007.0,9639.0,42.336213,-71.169242,Chestnut Hill,private non-profit,new england,MA,city,small
Washington University in St Louis,0.1503,1506.0,71975.0,119063.0,88696.0,7356.0,38.647929,-90.310604,Saint Louis,private non-profit,plains,MO,suburb,large
University of Colorado Boulder,0.8154,1281.0,30178.0,98821.0,70221.0,29753.0,40.008781,-105.270823,Boulder,public,rocky mountains,CO,city,medium
University of Southern California,0.1296,1445.0,72097.0,80335.0,45149.0,19548.0,34.021281,-118.284169,Los Angeles,private non-profit,far west,CA,city,large
University of Northern Colorado,0.9075,1099.0,22886.0,67385.0,51738.0,8903.0,40.410855,-104.692777,Greeley,public,rocky mountains,CO,city,medium
Yale University,0.0635,1517.0,71290.0,80258.0,44004.0,5963.0,41.311158,-72.926688,New Haven,private non-profit,new england,CT,city,medium
The University of Texas at Austin,0.3852,1367.0,25228.0,75674.0,53018.0,40329.0,30.282825,-97.738273,Austin,public,southwest,TX,city,large
California Institute of Technology,0.0662,1566.0,68901.0,74238.0,44848.0,948.0,34.137349,-118.125878,Pasadena,private non-profit,far west,CA,city,medium
Pitzer College,0.1333,,70500.0,93983.0,71480.0,1072.0,34.104107,-117.706675,Claremont,private non-profit,far west,CA,suburb,large
Grinnell College,0.2438,1450.0,65814.0,90298.0,74915.0,1683.0,41.747952,-92.722094,Grinnell,private non-profit,plains,IA,town,small


In [6]:
# Write the cleaned table to disk; the index (school name) is written as the first column.
small_card.to_csv(path_or_buf='scorecard.csv', index=True)