Attributions:
* [CollegeScorecard Datasets](https://collegescorecard.ed.gov/data/)
* [CollegeAI API - although not used in this project, great for receiving info about a specific college](https://api.collegeai.com/v1/docs)
* [PyScorecard GitHub code - looked at it for a while trying to understand why filtering by state doesn't work, eventually gave up. Still a good source for inspiration :)](https://github.com/deino475/PyScorecard/blob/master/PyScorecard/PyScorecard.py)
* [Detailed CollegeScorecard documentation that expains what each variable means](https://collegescorecard.ed.gov/assets/FullDataDocumentation.pcollegeInfo)
* [Basics of how to use the CollegeScorecard API](https://collegescorecard.ed.gov/data/documentation/). See the Data Dictionary file for the complete list of fields
* [Detailed description of every feature of the CollegeScorecard API](https://github.com/RTICWDT/open-data-maker/blob/master/API.md)
*[IPEDS - used here for some additional features](https://nces.ed.gov/ipeds/datacenter/InstitutionByName.aspx?goToReportId=1)

In [234]:
#get all the data from the CollegeScorecard API
import requests
import math
import numpy as np
from IPython.display import display
import webbrowser
import pyinputplus as pyip
import pandas as pd

key = "nyiNevdtUIEMrkovbB6bYPDsdi8V4rRBSEzXtN9s"
url_base = "https://api.data.gov/ed/collegescorecard/v1/schools/"
fields = ','.join([
    "location.lat","location.lon",
    "school.religious_affiliation","school.name","school.alias","school.city","school.state","school.zip","school.ownership_peps","school.school_url","school.locale","school.institutional_characteristics.level",
    "latest.admissions.admission_rate.overall","latest.admissions.sat_scores.midpoint.critical_reading","latest.admissions.sat_scores.midpoint.writing","latest.admissions.sat_scores.midpoint.math","latest.admissions.act_scores.midpoint.cumulative",
    "latest.academics.program_percentage.computer","latest.academics.program_percentage.mathematics",
    "latest.student.demographics.median_hh_income","latest.student.size","latest.student.demographics.female_share","latest.student.demographics.race_ethnicity.white","latest.student.demographics.race_ethnicity.black","latest.student.demographics.race_ethnicity.hispanic","latest.student.demographics.race_ethnicity.asian",
    "latest.cost.net_price.public.by_income_level.48001-75000","latest.cost.net_price.public.by_income_level.75001-110000","latest.cost.attendance.academic_year","latest.cost.tuition.in_state","latest.cost.tuition.out_of_state",
    "latest.aid.federal_loan_rate",
    "latest.earnings.10_yrs_after_entry.median","latest.earnings.6_yrs_after_entry.median",
])

def get_col_data(page):
    p = {
    "school.men_only":"0", "school.women_only":"0", "school.online_only":"0", "school.operating":"1",
    "school.state":"IL", #can change it to region (3 is midwest) or remove location altogether
    "fields":fields,
    "zip":"60056",
    "distance":"150mi", #only return colleges within 150 miles from my zip
    #"sort":"latest.student.demographics.median_hh_income:desc", #sort the results. Default: asc, add :desc to the field to do descending. EG: "sort":"latest.student.demographics.median_hh_income:desc"
    "page":page, 
    "api_key": key
    }
    resp = requests.get(url=url_base, params=p)
    return resp.json()

try:
    metadata = get_col_data(0)['metadata']
except:
    print(get_col_data(0)) #print the error

all_pages = []
for i in range(math.ceil(metadata['total']/metadata['per_page'])):
    all_pages.extend(get_col_data(i)['results']) #add data from all pages

In [235]:
#Turn it into dataframe, make it look nice
collegeInfo = pd.DataFrame(all_pages).set_index('school.name').fillna(value=np.nan)
collegeInfo = collegeInfo[~collegeInfo.index.duplicated(keep='last')] #remove duplicates

#If there is a year (latest in this case), remove it from column name. Also remove the dev category
collegeInfo = collegeInfo.rename(columns = lambda x: '.'.join(x.split('.')[2:]) if 'latest' in x else x) #turn latest.admissions.admission_rate.overall into admission_rate.overall.
collegeInfo = collegeInfo.reindex(sorted(collegeInfo.columns), axis=1) #sort column names alphabetically to make it easier to rename them

#manually rename the columns
#NOTE: median is used for all the calculations aside from net price
newNames = [
    'Salary 10 years after entry','Salary 6 years after entry',
    'Cumulative ACT score','Admission rate','Overall annual cost of attendance',
    '% female','Household income','% Asian','% Black','% Hispanic','% White','% with a federal loan',
    'Latitude','Longtitude',
    'Net price for $48001-75000 household income','Net price for $75001-110000 household income',
    '% students in computer science','% students in math',
    'SAT Reading','SAT Math','SAT Writing',
    'Aliases',
    'City','Level of institution','Locale',
    'Ownership','Religious affiliation','Website',
    'State','ZIP',
    '# students',
    'In-state tuition','Out-of-state tuition'
]
collegeInfo.columns = newNames

#manually rename the index
collegeInfo.index.names = ['Name'] #instead of school.name

#reoder the column names with a custom order
newOrder = [
    'Religious affiliation',
    'Salary 10 years after entry','Salary 6 years after entry',
    'Admission rate',
    '% female','% Asian','% Black','% Hispanic','% White',
    '% students in computer science','% students in math',
    'Cumulative ACT score','SAT Reading','SAT Math','SAT Writing',
    'Aliases',
    'City',
    'Level of institution',
    'Locale',
    'Ownership',
    'Website',
    'State',
    'ZIP','Latitude','Longtitude',
    '# students',
    'Household income',
    '% with a federal loan',
    'Net price for $48001-75000 household income','Net price for $75001-110000 household income',
    'Overall annual cost of attendance','In-state tuition','Out-of-state tuition'
]
collegeInfo = collegeInfo[newOrder]
#Ownership, level of instituion, and locale columns contain numbers that correspond to certain values - map them
#Religion will be mapped later
localeDic = {
    11:'Large City',12:'Midsize City',13:'Small City',
    21:'Large Suburb',22:'Midsize Suburb',23:'Small Suburb',
    31:'Fringe Town',32:'Distant Town',33:'Remote Town',
    41:'Fringe Rural',42:'Distant Rural',43:'Remote Rural',
}
ownDic = {1:'Public',2:'Private, Nonprofit',3:'Proprietary'}
levelDic = {1:'4-year',2:'2-year',3:'Less-than-2-year'}

collegeInfo['Locale'] = collegeInfo['Locale'].map(localeDic)
collegeInfo['Ownership'] = collegeInfo['Ownership'].map(ownDic)
collegeInfo['Level of institution'] = collegeInfo['Level of institution'].map(levelDic)

states = {"AL":"Alabama","AK":"Alaska","AZ":"Arizona","AR":"Arkansas","CA":"California","CO":"Colorado","CT":"Connecticut","DE":"Delaware","FL":"Florida","GA":"Georgia","HI":"Hawaii","ID":"Idaho","IL":"Illinois","IN":"Indiana","IA":"Iowa","KS":"Kansas","KY":"Kentucky","LA":"Louisiana","ME":"Maine","MD":"Maryland","MA":"Massachusetts","MI":"Michigan","MN":"Minnesota","MS":"Mississippi","MO":"Missouri","MT":"Montana","NE":"Nebraska","NV":"Nevada","NH":"New Hampshire","NJ":"New Jersey","NM":"New Mexico","NY":"New York","NC":"North Carolina","ND":"North Dakota","OH":"Ohio","OK":"Oklahoma","OR":"Oregon","PA":"Pennsylvania","RI":"Rhode Island","SC":"South Carolina","SD":"South Dakota","TN":"Tennessee","TX":"Texas","UT":"Utah","VT":"Vermont","VA":"Virginia","WA":"Washington","WV":"West Virginia","WI":"Wisconsin","WY":"Wyoming"}
collegeInfo['State'] = collegeInfo['State'].map(states) #rename state abbreviations to full names  

In [236]:
#combine it with another dataset that i got straight from the IPEDS. Data is from 2018/19
ipedsCols = [
    'instnm','Address',
    "Number of students receiving a Doctor's degree (DRVC2018)",
    "Number of students receiving a Master's degree (DRVC2018)",
    "Number of students receiving a Bachelor's degree (DRVC2018)",
    "Number of students receiving an Associate's degree (DRVC2018)",
    'Graduation rate','Average amount of aid awarded'
]
ipeds = pd.read_csv(r'C:\Users\timkh\OneDrive\Desktop\Programming\Projects\Colleges\IPEDS.csv',usecols=ipedsCols).set_index('instnm').replace(r'^\s*$', np.nan, regex=True) # replace field that's entirely space (or empty) with NaN
#set the graduate # to the sum of # of PhD, # of Masters, # of Bachleors, and # of Associates graduating. This ensures the percentages are correct
ipeds['Graduate #'] = ipeds["Number of students receiving a Doctor's degree (DRVC2018)"]+ipeds["Number of students receiving a Master's degree (DRVC2018)"]+ipeds["Number of students receiving a Bachelor's degree (DRVC2018)"]+ipeds["Number of students receiving an Associate's degree (DRVC2018)"]
ipeds["% receiving Doctor's"] = ipeds["Number of students receiving a Doctor's degree (DRVC2018)"]/ipeds['Graduate #']
ipeds["% receiving Master's"] = ipeds["Number of students receiving a Master's degree (DRVC2018)"]/ipeds['Graduate #']
ipeds["% receiving Bachelor's"] = ipeds["Number of students receiving a Bachelor's degree (DRVC2018)"]/ipeds['Graduate #']
ipeds["% receiving Associate's"] = ipeds["Number of students receiving an Associate's degree (DRVC2018)"]/ipeds['Graduate #']
#drop the columns that aren't needed anymore
ipeds.drop(["Number of students receiving a Doctor's degree (DRVC2018)",
    "Number of students receiving a Master's degree (DRVC2018)",
    "Number of students receiving a Bachelor's degree (DRVC2018)",
    "Number of students receiving an Associate's degree (DRVC2018)", "Graduate #"],axis=1,inplace=True)

#merge
collegeInfo = pd.merge(collegeInfo,ipeds,how='left',left_index=True,right_index=True)

In [237]:
#multiply all percentages by 100 and round numbers
for col in collegeInfo.columns:
    if '%' in col or col == 'Admission rate':
        collegeInfo[col] = collegeInfo[col].map(lambda x: x*100 if x<=1 else x)
collegeInfo = collegeInfo.round(2)

#remove the stuff after hyphen in zip+4
#collegeInfo['ZIP'] = collegeInfo['ZIP'].apply(lambda x: x[:x.index('-')] if '-' in x else x)

#add a new column with the distance of the college from home
def get_distance(row):
    lat1 = math.radians(row['Latitude'])
    lon1 = math.radians(row['Longtitude'])
    lat2=math.radians(42.04)
    lon2=math.radians(-87.94)
    
    r = 3958.8 #radius of the Earth
    dlon = lon2 - lon1 #change in coordinates
    dlat = lat2 - lat1
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2 #Haversine formula
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = r * c
    return round(distance,2)
collegeInfo['Approximate distance from home (mi)'] = collegeInfo.apply(get_distance,axis=1)
collegeInfo.drop(['Latitude','Longtitude'],axis=1,inplace=True) #delete the columns that aren't needed anymore

In [238]:
#Done! Now just apply the needed filters! Ignore the NaN values
#| math.isnan(collegeInfo[Column]) makes sure that NaN values aren't accidentally filtered out
#can add act, 3 sats
collegeInfo = collegeInfo[(collegeInfo['Religious affiliation']==30) | (collegeInfo['Religious affiliation'].apply(math.isnan))].dropna(thresh=20) #remove religious (non-Catholic universities) and those that have 20 or more NaN values
collegeInfo['Religious affiliation'] = collegeInfo['Religious affiliation'].map({30:'Roman Catholic'})
filtered = collegeInfo[
    ((collegeInfo['Salary 10 years after entry'] >= 40000) | (collegeInfo['Salary 10 years after entry'].apply(math.isnan))) &
    #(collegeInfo['Salary 6 years after entry'] >= 50000) &
    #(collegeInfo['Admission rate'] >= 50) &
    ((collegeInfo['% female'] <= 65) | (collegeInfo['% female'].apply(math.isnan))) &
    #(3 <= collegeInfo['% Asian'] <= 20) &
    (((collegeInfo['% Black'] > 0) & (collegeInfo['% Black'] < 10)) | (collegeInfo['% Black'].apply(math.isnan))) &
    #(1 <= collegeInfo['% Hispanic'] <= 15) &
    (((collegeInfo['% White'] > 27) & (collegeInfo['% White'] < 90)) | (collegeInfo['% White'].apply(math.isnan))) &
    ((collegeInfo['% students in computer science'] > 0) | (collegeInfo['% students in computer science'].apply(math.isnan))) #&
    #(collegeInfo['% students in math'] > 0) & 
    #(collegeInfo['# students'] >= 5000) & 
    #(35000 <= collegeInfo['Household income'] <= 100000) &
    #(collegeInfo['Overall annual cost of attendance'] <= 25000) &
    #(collegeInfo['In-state tuition'] <= 17000) &
    #(collegeInfo['Approximate distance from home (mi)'] <= 60)
]
filtered

Unnamed: 0,Religious affiliation,Salary 10 years after entry,Salary 6 years after entry,Admission rate,% female,% Asian,% Black,% Hispanic,% White,% students in computer science,...,In-state tuition,Out-of-state tuition,Address,Graduation rate,Average amount of aid awarded,% receiving Doctor's,% receiving Master's,% receiving Bachelor's,% receiving Associate's,Approximate distance from home (mi)
Benedictine University,Roman Catholic,49000.0,40000.0,62.83,57.61,14.56,8.58,17.5,41.94,1.83,...,34290.0,34290.0,5700 College Rd,47.0,18891.0,1.54,51.82,46.64,0.0,19.76
Bradley University,,53600.0,42900.0,66.81,52.99,3.46,6.94,10.37,72.73,3.98,...,33760.0,33760.0,1501 W Bradley Ave,78.0,20186.0,2.03,20.65,77.32,0.0,127.12
DePaul University,Roman Catholic,53300.0,42100.0,67.53,55.0,9.84,7.86,18.69,52.44,7.35,...,39975.0,39975.0,1 E Jackson Blvd,72.0,24112.0,4.68,39.54,55.78,0.0,19.39
Illinois Institute of Technology,,69100.0,55800.0,58.18,30.01,15.08,5.16,16.52,36.35,16.99,...,47646.0,47646.0,10 West 35th Street,72.0,34691.0,12.86,65.56,21.58,0.0,21.55
Illinois State University,,47100.0,39600.0,89.18,56.24,2.11,9.31,11.06,73.13,2.79,...,14516.0,26040.0,North and School Streets,69.0,9572.0,0.96,12.96,86.08,0.0,118.94
Illinois Wesleyan University,,59100.0,41600.0,58.55,52.27,5.52,6.17,9.43,69.57,1.98,...,47636.0,47636.0,1312 N Park St,79.0,27418.0,0.0,0.0,100.0,0.0,120.18
Lake Forest College,,50900.0,39100.0,57.92,60.3,5.18,4.98,14.32,57.5,1.31,...,47064.0,47064.0,555 N Sheridan Road,69.0,31131.0,0.0,4.03,95.97,0.0,15.57
Lewis University,Roman Catholic,48800.0,40700.0,57.76,54.92,5.12,5.55,20.75,59.02,8.31,...,32450.0,32450.0,One University Parkway,64.0,20958.0,0.65,32.55,66.63,0.16,30.57
Morrison Institute of Technology,,51400.0,37600.0,,,0.0,0.86,9.48,80.17,24.14,...,16100.0,16100.0,,,,,,,,105.67
Northwestern University,,69000.0,58900.0,8.47,52.43,17.34,5.94,12.52,45.46,5.27,...,54568.0,54568.0,633 Clark St,95.0,45427.0,12.68,59.03,28.29,0.0,13.36


The next cell is a custom scoring system. Explanation (footnotes are in the code below):
1. Values used in the deviation section are of the perfect university - the more a value deviates from these, the worse. Columns are built with ABSOLUTE deviation from the ideal value for those features, e.g. the perfect admission rate would be 65% (not too high, not too low). Absolute deviation = |x-preferred_value|. Later these deviations will be normalized - those that are far away from the ideal/desired value will be high on a 0 to 100 scale, and those that are close to ideal will be low. Thus, we'd like to minimize this deviation, so we subtract it from the overall score. Higher deviations = lower score. Perfectly fits the criteria = deviation of 0, so nothing gets subtracted from the score. NOTE: normally, the optimal or "ideal" values would be computed through extensive research, formulas, and algorithms. Oftentimes optimal values ar either the mean or the median. However, as this project's "optimal values" are just my ideas of what a good college would look like for ME, they are completely subjective. These values are just what I personally would like to see in my university.
2. We now standardize *all* values on a 0-100 scale. To do this, the [min-max normalization/feature scaling technique](https://en.wikipedia.org/wiki/Feature_scaling#Methods) is utilized. It shows how each university compares to other universities - a value closer to 100 means that particular feature is higher than most other counterparts'. We will want some features (like salary) maximized, and some (like cost and deviation from perfection) minimized. The score is calculated by adding all the standardized features that need to be maximized. From this score, we then subtract the sum of all the standardized features that need to be minimized. If a value is NaN, it becomes 0 or 100 depending on whether it needs to be minimized or maximized. NaN + or - any number will be NaN, but by filling NaNs we ensure that no college will receive a score of NaN, however they are still going to have a disadvantage compared to the colleges without any NaNs. While adding and subtracting, we also multiply all the values by a specific weight - a value from 0 to 1 where 0 is "completely unimportant" and 1 is "extremely important."  So, a value that's less important (like the number of students) will have less impact on the score than an extremely important feature such as percentage of students in computer science or the overall cost of attendance. NOTE: again, weights are determined subjectively based on how relatively important certain features are to ME. Normally, the weights would be determined by a consensus of data scientists, often with the use of research and analysis. In this system, an absolutely ideal college that has a 100 in every "good feature" and a 0 in every "bad" feature will get 500 points: 100\*1+100\*0.6+100\*0.8+100\*0.6+100\*0.9+100\*0.4+100\*0.7-0 = 500
3. Instead of absolute deviation, standard deviation/z-score could've been used: 

    `toNorm.apply(lambda x: (x-x.mean())/x.std(ddof=0))`. 
    
    However, in this method, the more the value deviates from the mean, the worse. For most features, that would be a good idea, but that would not work with some of the deviation features. For instance, the mean % female in Illinois colleges is 60%, whereas my ideal value is 50%. With the z-score technique, a college with 50% females would get a lower z-score than a college with 60%. So, logically, this feature needs to be minimized (lower score = better), however, there's another issue - a percentage that's too low is bad as well. So, the mean in a lot of cases doesn't correspond to my preferred values. Another example is % computer science students - the mean in the state is 2, so a college with 20% comp sci students will get a high z score as it is far away from the mean. So, again, if this is the case, then high z score for % comp sci students should be good, right? But a college with 97% comp sci students would not be a good fit either. So z-score is not well suited for this project. The only way to calculate deviations is see how much each value deviates not from the mean, but from my preference.

In [239]:
pd.options.mode.chained_assignment = None #avoid the false warnings
toNorm = collegeInfo.select_dtypes('number')

#standardize directly, maximize: salary 10, salary 6, act, sats, # students, grad rate
#standardize directly, minimize: price, cost, tuition, distance
#standardize deviation, minimize deviation: admission rate, all with % besides federal loan and bachelor's, master's, etc

#1 - calculating deviation
toNorm['Admission rate deviation'] = toNorm['Admission rate'].map(lambda x: abs(x-65))
toNorm['% female deviation'] = toNorm['% female'].map(lambda x: abs(x-50))
toNorm['% Black deviation'] = toNorm['% Black'].map(lambda x: abs(x-5))
toNorm['% White deviation'] = toNorm['% White'].map(lambda x: abs(x-70))
toNorm['% students in computer science deviation'] = toNorm['% students in computer science'].map(lambda x: abs(x-20))
toNorm['% students in math deviation'] = toNorm['% students in math'].map(lambda x: abs(x-10))

#2 - normalizing, calculating score, multipying by weights
normalized=((toNorm-toNorm.min())/(toNorm.max()-toNorm.min()))*100 #Found this method by looking up 'pandas standardize each column', 2nd result
normalized['Score'] = (
            normalized['Salary 10 years after entry'].fillna(0)+ #extremely important - weight is 1
            normalized['Salary 6 years after entry'].fillna(0)*0.6+
            normalized['Cumulative ACT score'].fillna(0)*0.8+
            normalized['SAT Reading'].fillna(0)*0.6+
            normalized['SAT Math'].fillna(0)*0.9+
            normalized['# students'].fillna(0)*0.4+ #many universities have NaN SAT writing, so I removed it
            normalized['Graduation rate'].fillna(0)*0.7-
            
            normalized['Overall annual cost of attendance'].fillna(100)*0.9-
            normalized['In-state tuition'].fillna(100)*0.8-
            normalized['Approximate distance from home (mi)'].fillna(100)*0.6-
            normalized['Admission rate deviation'].fillna(100)*0.5-
            normalized['% Black deviation'].fillna(100)*0.7-
            normalized['% White deviation'].fillna(100)*0.7-
            normalized['% female deviation'].fillna(100)*0.4-
            normalized['% students in computer science deviation'].fillna(100)- #extremely important - weght is 1
            normalized['% students in math deviation'].fillna(100)*0.5
)


#add the word 'normalized' to the columns with 'deviation' in their names. 
#The other columns (aside from Score) will automatically have the word 'normalized' added to them as a suffix while merging, since the orginal collegeInfo df has the same column names. 
normalized.rename(lambda x: x + ' normalized' if 'deviation' in x else x,axis = 1,inplace=True)
normalizedExtended = pd.merge(normalized,collegeInfo,how='left',left_index=True,right_index=True,suffixes=(' normalized','')) 
#could really use any method (not just left) since the goal of this merge is  just to add new columns
with pd.option_context('display.max_columns', None):  #show all columns
    display(normalizedExtended.nlargest(20,columns=['Score']))

Unnamed: 0,Salary 10 years after entry normalized,Salary 6 years after entry normalized,Admission rate normalized,% female normalized,% Asian normalized,% Black normalized,% Hispanic normalized,% White normalized,% students in computer science normalized,% students in math normalized,Cumulative ACT score normalized,SAT Reading normalized,SAT Math normalized,SAT Writing normalized,# students normalized,Household income normalized,% with a federal loan normalized,Net price for $48001-75000 household income normalized,Net price for $75001-110000 household income normalized,Overall annual cost of attendance normalized,In-state tuition normalized,Out-of-state tuition normalized,Graduation rate normalized,Average amount of aid awarded normalized,% receiving Doctor's normalized,% receiving Master's normalized,% receiving Bachelor's normalized,% receiving Associate's normalized,Approximate distance from home (mi) normalized,Admission rate deviation normalized,% female deviation normalized,% Black deviation normalized,% White deviation normalized,% students in computer science deviation normalized,% students in math deviation normalized,Score,Religious affiliation,Salary 10 years after entry,Salary 6 years after entry,Admission rate,% female,% Asian,% Black,% Hispanic,% White,% students in computer science,% students in math,Cumulative ACT score,SAT Reading,SAT Math,SAT Writing,Aliases,City,Level of institution,Locale,Ownership,Website,State,ZIP,# students,Household income,% with a federal loan,Net price for $48001-75000 household income,Net price for $75001-110000 household income,Overall annual cost of attendance,In-state tuition,Out-of-state tuition,Address,Graduation rate,Average amount of aid awarded,% receiving Doctor's,% receiving Master's,% receiving Bachelor's,% receiving Associate's,Approximate distance from home (mi)
University of Illinois at Urbana-Champaign,65.04298,62.825279,65.777778,49.753272,89.386792,7.839066,12.03,51.352921,19.345485,40.300107,68.75,61.089494,73.529412,66.666667,100.0,71.258212,38.196345,69.205411,91.002985,32.226362,21.643567,48.626018,85.526316,25.425639,59.37931,44.282093,51.147666,0.0,91.384102,4.148034,0.0,1.416077,36.43812,76.282377,59.699893,124.583417,,61500.0,47100.0,62.16,49.48,18.95,6.04,12.03,44.22,4.67,3.76,29.0,645.0,685.0,640.0,Illinois|Illinios|Ilinois|Ilinios|Urbana|Champ...,Champaign,4-year,Small City,Public,www.illinois.edu/,Illinois,61820-5711,32974.0,77278.0,34.9,16954.0,23817.0,30082.0,15094.0,31664.0,601 E John Street,84.0,14328.0,8.61,29.7,61.69,0.0,134.89
University of Chicago,74.498567,76.208178,7.68254,52.360009,90.471698,6.722907,13.68,45.639299,19.925435,100.0,100.0,100.0,100.0,100.0,19.984224,65.598979,11.623071,,,100.0,100.0,100.0,98.684211,93.53748,98.482759,87.028478,7.357817,0.0,14.737354,88.804934,2.996981,0.22213,43.510134,75.571356,0.0,96.668499,,68100.0,54300.0,7.26,51.91,19.18,5.18,13.68,39.3,4.81,9.33,34.0,745.0,775.0,740.0,,Chicago,4-year,Large City,"Private, Nonprofit",WWW.UCHICAGO.EDU,Illinois,60637,6600.0,74573.0,10.62,,,75735.0,58230.0,58230.0,5801 S Ellis Ave,94.0,42732.0,14.28,58.37,27.35,0.0,24.58
Northwestern University,75.787966,84.758364,8.962963,52.917829,81.792453,7.70928,12.52,52.792939,21.830986,43.729904,100.0,94.163424,95.588235,95.0,25.599951,80.741872,22.895918,,,95.880406,93.347986,92.918335,100.0,100.0,87.448276,88.012524,8.556491,0.0,6.941356,86.93909,4.118154,1.277246,34.655742,73.235145,56.270096,92.634772,,69000.0,58900.0,8.47,52.43,17.34,5.94,12.52,45.46,5.27,4.08,34.0,730.0,760.0,725.0,,Evanston,4-year,Small City,"Private, Nonprofit",www.northwestern.edu,Illinois,60208,8451.0,81811.0,20.92,,,72960.0,54568.0,54568.0,633 Clark St,95.0,45427.0,12.68,59.03,28.29,0.0,13.36
Illinois Institute of Technology,75.931232,78.996283,61.566138,28.867196,71.132075,6.69695,16.52,42.213448,70.38111,11.468382,68.75,59.143969,69.117647,,8.83165,64.557094,48.473241,,,76.807055,80.774191,79.532401,69.736842,74.255431,88.689655,97.748621,0.0,0.0,12.632018,10.285274,41.979301,0.194363,47.750467,13.712544,88.531618,73.570709,,69100.0,55800.0,58.18,30.01,15.08,5.16,16.52,36.35,16.99,1.07,29.0,640.0,670.0,,IIT,Chicago,4-year,Large City,"Private, Nonprofit",web.iit.edu/,Illinois,60616,2924.0,74075.0,44.29,,,60112.0,47646.0,47646.0,10 West 35th Street,72.0,34691.0,12.86,65.56,21.58,0.0,21.55
University of Illinois at Chicago,54.727794,48.884758,79.94709,53.368376,100.0,10.266061,34.03,32.888166,25.932063,10.503751,37.5,28.015564,36.764706,31.666667,61.958072,60.121762,46.601729,55.439354,78.049936,25.067918,19.227625,39.567984,52.631579,21.881445,100.0,46.68257,41.58378,0.0,10.839355,16.037008,5.023717,4.012217,59.292799,68.207212,89.496249,13.837879,,54300.0,39600.0,75.55,52.85,21.2,7.91,34.03,28.32,6.26,0.98,24.0,560.0,560.0,535.0,UIC|U of I-Chicago|Illinois-Chicago|U of I-Med...,Chicago,4-year,Large City,Public,www.uic.edu,Illinois,60607,20435.0,71955.0,42.58,14532.0,21083.0,25260.0,13764.0,26980.0,601 S Morgan,59.0,12850.0,14.5,31.31,54.19,0.0,18.97
Lewis University,46.848138,50.929368,61.121693,55.588929,24.150943,7.203115,20.75,68.540239,34.424192,11.468382,37.5,28.015564,36.764706,13.333333,12.754467,76.559689,66.575462,,,46.869851,53.170696,50.146004,59.210526,41.324157,4.482759,48.531385,57.44708,0.524762,18.899389,10.932922,9.486848,0.735805,15.164582,57.795835,88.531618,-14.659684,Roman Catholic,48800.0,40700.0,57.76,54.92,5.12,5.55,20.75,59.02,8.31,1.07,24.0,560.0,560.0,480.0,Lewis College Lewis College of Science and Te...,Romeoville,4-year,Large Suburb,"Private, Nonprofit",www.lewisu.edu,Illinois,60446-2200,4217.0,79812.0,60.83,,,39946.0,32450.0,32450.0,One University Parkway,64.0,20958.0,0.65,32.55,66.63,0.16,30.57
Loyola University Chicago,54.441261,54.64684,71.873016,70.188801,59.150943,7.086308,16.48,63.627918,7.580779,11.682744,62.5,49.416342,48.529412,45.0,35.53897,68.699527,61.551932,,,72.628079,74.238433,72.574501,72.368421,45.482231,72.827586,47.860444,42.769702,7.248278,7.830739,4.271396,38.831393,0.610857,21.244789,90.705942,88.317256,-21.857115,Roman Catholic,54100.0,42700.0,67.92,68.53,12.54,5.46,16.48,54.79,1.83,1.09,28.0,615.0,600.0,575.0,,Chicago,4-year,Large City,"Private, Nonprofit",https://www.luc.edu,Illinois,60660,11727.0,76055.0,56.24,,,57297.0,44048.0,44048.0,1032 W. Sheridan Rd,74.0,22692.0,10.56,32.1,55.12,2.21,14.64
Bradley University,53.724928,55.018587,70.698413,53.518558,16.320755,9.007138,10.37,84.461735,16.487158,3.215434,50.0,41.634241,47.058824,,13.910379,64.550818,72.44172,,,56.308546,55.550308,52.679314,77.631579,39.472927,14.0,30.788728,71.078806,0.0,85.98527,2.559753,5.325571,2.665556,3.306023,79.786694,96.784566,-28.502463,,53600.0,42900.0,66.81,52.99,3.46,6.94,10.37,72.73,3.98,0.3,26.0,595.0,595.0,,,Peoria,4-year,Midsize City,"Private, Nonprofit",www.bradley.edu,Illinois,61625-0001,4598.0,74072.0,66.19,,,46304.0,33760.0,33760.0,1501 W Bradley Ave,78.0,20186.0,2.03,20.65,77.32,0.0,127.12
Illinois State University,44.412607,48.884758,94.37037,57.004935,9.95283,12.083063,11.06,84.926257,11.557581,17.470525,31.25,26.070039,32.352941,,54.704044,65.960919,61.25643,90.22394,99.355664,30.51469,20.593631,37.750189,65.789474,14.02091,6.62069,19.323095,82.249426,0.0,80.301556,37.054742,12.332902,5.955852,3.880983,85.830371,82.529475,-38.123102,,47100.0,39600.0,89.18,56.24,2.11,9.31,11.06,73.13,2.79,1.63,23.0,555.0,545.0,,,Normal,4-year,Midsize Suburb,Public,illinoisstate.edu/,Illinois,61790-1000,18044.0,74746.0,55.97,20652.0,25580.0,28929.0,14516.0,26040.0,North and School Streets,69.0,9572.0,0.96,12.96,86.08,0.0,118.94
Illinois Wesleyan University,61.604585,52.60223,61.957672,52.746192,26.037736,8.007787,9.43,80.79201,8.202154,26.580922,56.25,49.416342,50.882353,55.0,5.075696,79.19369,68.961366,,,75.871795,80.756026,79.513063,78.947368,56.815021,0.0,0.0,100.0,0.0,81.163146,9.714726,3.773178,1.596557,0.0,89.944134,73.419078,-44.82194,,59100.0,41600.0,58.55,52.27,5.52,6.17,9.43,69.57,1.98,2.48,27.0,615.0,608.0,605.0,IWU|Wesleyan,Bloomington,4-year,Small City,"Private, Nonprofit",www.iwu.edu,Illinois,61702-2900,1686.0,81071.0,63.01,,,59482.0,47636.0,47636.0,1312 N Park St,79.0,27418.0,0.0,0.0,100.0,0.0,120.18


In [240]:
#another custom subjective scoring system
#This one is more clear-cut (did not meet condition = no points), whereas
#the other one is more fluid - the more you fit the perfect criteria, the more points you get. 
#For instance, if the grad rate of a college is 84 (just one point below the threshold) you get no points for that feature, 
#while in the first one you would get, say, 90/100 points. 
#So, this system is stricter, and overall slightly worse in my opinion (reasons are explained in the next cell)
def score(colSeries):
    score = 0
    if not (isinstance(colSeries['Religious affiliation'],str)): #non-religious is a big plus
        score += 100*0.75
    if (colSeries['Salary 10 years after entry'] >= 60000): #or (math.isnan(colSeries['Salary 10 years after entry'])):
        score += 100*0.4 #add 100 for qualifying, 0.4 is priority on a 0-1 scale
    if (colSeries['Salary 6 years after entry'] >= 40000):
        score += 100*0.6
    if (colSeries['Graduation rate'] >= 85):
        score += 100*0.7
    if ((colSeries['Admission rate'] < 80) and (colSeries['Admission rate'] >= 20)):
        score += 100*0.5
    if (colSeries['SAT Math'] >= 650): #can add other SAT's and ACT
        score += 100*0.7
    if (colSeries['% female'] <= 65):
        score += 100*0.5
    if (colSeries['% Black'] < 10):
        score += 100*0.65
    if ((colSeries['% White'] < 90) and (colSeries['% White'] >= 30)):
        score += 100*0.65
    if (colSeries['% students in computer science'] >= 5):
        score += 100*0.8
    if (colSeries['% students in math'] >= 2.5):
        score += 100*0.5
    if (colSeries['# students'] >= 5000):
        score += 100*0.4
    if (colSeries['Overall annual cost of attendance'] <= 33000):
        score += 100*0.85
    if (colSeries['In-state tuition'] <= 17000):
        score += 100*0.8
    if (colSeries['Approximate distance from home (mi)'] <= 80):
        score += 100*0.6
    return score
collegeInfo['Score'] = collegeInfo.apply(score,axis=1)
bestScore = collegeInfo.nlargest(columns=['Score'],n=20) #out of 940 points
chosen = collegeInfo.loc[['Bradley University','DePaul University','Illinois Institute of Technology',
        'Illinois State University','Lake Forest College','Lewis University','Northwestern University',
        'Rockford University','University of Chicago','University of Illinois at Chicago','University of Illinois at Urbana-Champaign']] #loyolo, ttic?
    #manually chosen from the filtered dataset

#[[]] ensures that no columns get merged, only rows. Columns are the same in both dataframes - no need to duplicate them
chosenAndScore = pd.merge(bestScore, chosen[[]], how='inner', right_index=True,left_index=True) #overlap between chosen and top 20 scores
chosenAndScore #the best of the best. These colleges are also the top 5 according to the first scoring system

Unnamed: 0,Religious affiliation,Salary 10 years after entry,Salary 6 years after entry,Admission rate,% female,% Asian,% Black,% Hispanic,% White,% students in computer science,...,Out-of-state tuition,Address,Graduation rate,Average amount of aid awarded,% receiving Doctor's,% receiving Master's,% receiving Bachelor's,% receiving Associate's,Approximate distance from home (mi),Score
University of Illinois at Urbana-Champaign,,61500.0,47100.0,62.16,49.48,18.95,6.04,12.03,44.22,4.67,...,31664.0,601 E John Street,84.0,14328.0,8.61,29.7,61.69,0.0,134.89,730.0
Northwestern University,,69000.0,58900.0,8.47,52.43,17.34,5.94,12.52,45.46,5.27,...,54568.0,633 Clark St,95.0,45427.0,12.68,59.03,28.29,0.0,13.36,725.0
University of Chicago,,68100.0,54300.0,7.26,51.91,19.18,5.18,13.68,39.3,4.81,...,58230.0,5801 S Ellis Ave,94.0,42732.0,14.28,58.37,27.35,0.0,24.58,645.0
Illinois Institute of Technology,,69100.0,55800.0,58.18,30.01,15.08,5.16,16.52,36.35,16.99,...,47646.0,10 West 35th Street,72.0,34691.0,12.86,65.56,21.58,0.0,21.55,615.0
University of Illinois at Chicago,,54300.0,39600.0,75.55,52.85,21.2,7.91,34.03,28.32,6.26,...,26980.0,601 S Morgan,59.0,12850.0,14.5,31.31,54.19,0.0,18.97,585.0


In [241]:
chosenNotScore = chosen[~chosen.isin(bestScore)].dropna(how='all') #those that I chose, but are not in top 20 scores
#chosenNotScore.iloc[0] #Bradley University - 8th in the first system. Reasons for not being in the top 20: low salary, far away, expensive, too few students in comp sci and math, low math SAT, small student body, somewhat low grad rate
#chosenNotScore.iloc[1] #DePaul - 14th in the first system. Reasons: Catholic, low salary, few students in math, no info about sat, expensive, low grad rate
#chosenNotScore.iloc[2] #Illinois State - 9th. Low salary, high admission rate, few students in comp sci and math, low SAT, low grad rate, far away
#chosenNotScore.iloc[3] #Lake Forest - not in top 20 in the first system. Low salary, few students in comp sci, no SAT info, small student body, expensive, low grad rate
#chosenNotScore.iloc[4] #Lewis - 6th. Catholic, low salary, few students in math, low SAT, small student body, expensive, low grad rate
#chosenNotScore.iloc[5] #Rockford - not in top 20 in the first system. Low salary, few students in comp sci and math, low SAT math, very small student body, expensive

scoreNotChosen = bestScore[~bestScore.isin(chosen)].dropna(how='all') #those that are in top 20, but are not chosen
scoreNotChosen[scoreNotChosen['Ownership'] != 'Public'].iloc[0] #All are public universities, aside from Fox College, which only offers associate's degrees. 
#I guess this scoring system was too focused on cost, and cared less about the quality of education. 
#The first scoring system is better - it provides a more balanced perspective.

Religious affiliation                                          NaN
Salary 10 years after entry                                  33500
Salary 6 years after entry                                   29800
Admission rate                                               69.16
% female                                                     83.05
% Asian                                                       5.64
% Black                                                        8.8
% Hispanic                                                   22.12
% White                                                      59.14
% students in computer science                                   0
% students in math                                               0
Cumulative ACT score                                           NaN
SAT Reading                                                    NaN
SAT Math                                                       NaN
SAT Writing                                                   

IN CONCLUSION:
Top colleges are UIUC, Northwestern, U of C, IIT, UIC. 
The cells below are just exercises or failed pieces of code that I decided to leave just for reference. There is another notebook that uses the same algorithms, but it is not limited to Illinois. There is also another notebook which uses machine learning to predict what state a university is in based on its characteristics.

In [242]:
#search engine
def searchByName(name):
    results = pd.DataFrame(columns=collegeInfo.columns) #empty df
    
    alNoNANs = collegeInfo.dropna(subset=['Aliases'])['Aliases']
    results = results.append(alNoNANs[alNoNANs.str.contains(name,case=False)])
    
    ciNoNANs = collegeInfo.dropna(subset=['City'])['City']
    results = results.append(ciNoNANs[ciNoNANs.str.contains(name,case=False)])
    
    ind = collegeInfo.index
    results = results.append(collegeInfo[ind.str.contains(name,case=False)])

    if results.empty:
        return 'Nothing found, try another search'
    
    return results.drop_duplicates().drop(['Aliases','City'])

results = searchByName('University of Illinois') #can use regex
display(results)


def searchGoogle():
    inp = pyip.inputYesNo('Would you like to search for these colleges on the web?')
    if inp == 'yes':
        if len(results.index) < 6:
            for college in results.index:
                webbrowser.open_new_tab('https://www.google.com/search?q={}'.format(college))
        else:
            print('Sorry, too many colleges. Chrome will crash :)')
    else:
        print('OK')
if not isinstance(results,str):
    searchGoogle()

Unnamed: 0,Religious affiliation,Salary 10 years after entry,Salary 6 years after entry,Admission rate,% female,% Asian,% Black,% Hispanic,% White,% students in computer science,...,Graduation rate,Average amount of aid awarded,% receiving Doctor's,% receiving Master's,% receiving Bachelor's,% receiving Associate's,Approximate distance from home (mi),Score,University of Illinois at Chicago,University of Illinois at Urbana-Champaign
University of Illinois at Chicago,,54300.0,39600.0,75.55,52.85,21.2,7.91,34.03,28.32,6.26,...,59.0,12850.0,14.5,31.31,54.19,0.0,18.97,585.0,,
University of Illinois at Urbana-Champaign,,61500.0,47100.0,62.16,49.48,18.95,6.04,12.03,44.22,4.67,...,84.0,14328.0,8.61,29.7,61.69,0.0,134.89,730.0,,


Would you like to search for these colleges on the web?

 n


OK


In [243]:
#open college's websites after looking them up
def open_web(college):
    if len(searchByName(college).index) == 0:
        return 'No such college, try another search'
    elif 0 < len(searchByName(college).index) < 6:
        for url in searchByName(college)['Website'].values:
            webbrowser.open_new_tab('https://'+url)
        return searchByName(college)
    else:
        return 'College name is too ambiguous, try a narrower search'
#open_web('University of Illinois')

In [None]:
#This was supposed to scrape usnews for compsci rankings, but for some reason it results in a timeout error.

from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
def simple_get(url):
    #Attempts to get the content at url by making an HTTP GET request.
    #If the content-type of response is some kind of HTML/XML, return the
    #text content, otherwise return None.
    try:
        with closing(requests.get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None
    except RequestException as e:
        print('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
def is_good_response(resp):
    #Returns True if the response seems to be HTML, False otherwise.
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 and content_type is not None and content_type.find('html') > -1)
def compSciRank(college):
    #college = '-'.join(college.split())
    raw_html = simple_get('https://www.usnews.com/best-graduate-schools/top-science-schools/computer-science-rankings')
    soup = BeautifulSoup(raw_html,'html.parser')
    return soup.find_all('a')

In [None]:
#USING THE UNOFFICIAL WRAPPER FOR THE COLLEGESCORECARD API - PyScorecard.
#Use this cautiosly - every now and then it randomly results in an error (that goes away without any apparent fix) 
#and some filters (such as by state) are inaccessible
#It is more readable but less reliable than the code above
"""
from PyScorecard import PyScorecard
import pandas as pd

scorecard = PyScorecard()
scorecard.set_api_key('nyiNevdtUIEMrkovbB6bYPDsdi8V4rRBSEzXtN9s')
scorecard.set_year('latest')

scorecard.add_filter("school.men_only","=","0")
scorecard.add_filter("school.women_only","=","0")
scorecard.add_filter("school.online_only","=","0")
scorecard.add_filter("school.operating","=","1")
scorecard.add_filter("school.region_id","=","3") #3 is midwest
#scorecard.add_filter("school.state","=","IL") #doesn't work for some reason - maybe because it's a string

scorecard.add_fields([
    "school.name","school.alias","school.city","school.state","school.zip","school.ownership_peps","school.school_url","school.locale","school.institutional_characteristics.level",
    "admissions.admission_rate.overall","admissions.sat_scores.midpoint.critical_reading","admissions.sat_scores.midpoint.writing","admissions.sat_scores.midpoint.math","admissions.act_scores.midpoint.cumulative",
    "academics.program_percentage.computer","academics.program_percentage.mathematics",
    "student.demographics.median_hh_income","student.size","student.demographics.female_share","student.demographics.race_ethnicity.white","student.demographics.race_ethnicity.black","student.demographics.race_ethnicity.hispanic","student.demographics.race_ethnicity.asian",
    "cost.net_price.public.by_income_level.48001-75000","cost.net_price.public.by_income_level.75001-110000","cost.attendance.academic_year","cost.tuition.in_state","cost.tuition.out_of_state",
    "aid.federal_loan_rate",
    "earnings.10_yrs_after_entry.median","earnings.6_yrs_after_entry.median"
])
collegeInfo = pd.DataFrame(scorecard.fetch_all()).sort_values(by='school.state').set_index(['school.state','school.name']).drop_duplicates(keep='first').fillna(value=np.nan)
collegeInfo"""