In [566]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import NearestNeighbors

## Method to standardize data

In [None]:
def standardize_data(df_sc):
  """Standardize every column of a data frame with a freshly fitted StandardScaler.

  Parameters
  ----------
  df_sc : pandas.DataFrame
      Numeric frame to standardize. It is left untouched.

  Returns
  -------
  sc : StandardScaler
      The scaler fitted on ``df_sc``; reuse it to put new rows on the same scale.
  df_scaled : pandas.DataFrame
      Standardized values, carrying the same index and column labels as ``df_sc``.
  """
  sc = StandardScaler()
  scaled_values = sc.fit_transform(df_sc)
  # Build a new frame rather than assigning back into df_sc: the argument is
  # typically a filtered slice of a larger frame, and writing into it mutates
  # the caller's data (and can raise SettingWithCopyWarning).
  # The original index is kept so callers can still map rows back by label.
  df_scaled = pd.DataFrame(scaled_values, index=df_sc.index, columns=df_sc.columns)
  return sc, df_scaled

## Method to apply PCA

In [568]:
def apply_pca(df_pca, n_components=10):
  """Fit PCA on the data and return the fitted model together with the projection.

  Parameters
  ----------
  df_pca : array-like of shape (n_samples, n_features)
      Data to reduce (expected to be standardized already).
  n_components : int, optional
      Upper bound on the number of components to keep (default 10). Integer
      values are clamped to ``min(n_samples, n_features)``; any other value
      is passed to ``PCA`` unchanged.

  Returns
  -------
  pca : PCA
      The fitted model; reuse it to project new rows into the same space.
  projected : numpy.ndarray
      ``df_pca`` projected onto the retained components.
  """
  if isinstance(n_components, int):
    n_samples, n_features = np.shape(df_pca)
    # PCA raises a ValueError when asked for more components than
    # min(n_samples, n_features), which happens as soon as the user's filters
    # leave fewer than 10 rows. Clamp instead of failing.
    n_components = min(n_components, n_samples, n_features)
  pca = PCA(n_components=n_components)
  projected = pca.fit_transform(df_pca)
  return pca, projected

## KNN implementation

In [569]:
# Load cleaned_data.csv into the base frame. The relative path is resolved against
# the current working directory. df itself is kept unmodified below so the final
# cell can look up the full records of the matches.
df = pd.read_csv("cleaned_data.csv")

In [570]:
# The user's preferred feature values; this becomes the query point of the
# nearest-neighbour search. Keys are used as column names of the data frame:
# only these columns are kept for the search. Set a value to None to mark a
# feature as "don't care" (such keys are removed before the search).
user_input = {'HIGHDEG': 4, 'SAT_AVG': 1500, 'ACTCMMID': 32, 'UGDS_WHITE': 1, 'UGDS_BLACK': 0, 
'UGDS_HISP': 0, 'UGDS_ASIAN': 0, 'UGDS_AIAN': 0, 'UGDS_NHPI': 0, 'UGDS_2MOR': 0, 'UGDS_NRA': 0, 
'UGDS_UNKN': 0, 'UG25ABV': 0, 'PPTUG_EF': 0, 'INC_PCT_LO': 0 , 'INC_PCT_M1': 0, 'INC_PCT_M2': 1, 
'INC_PCT_H1': 0, 'INC_PCT_H2': 0, 'PAR_ED_PCT_1STGEN': 0, 'C150_4': 1, 'PCIP14': 1}

In [571]:
# Hard filters that narrow the candidate rows before the neighbour search.
# Keys are column names of the data frame:
#   - a two-element list is a [low, high] range (applied with Series.between),
#   - any other non-None value is meant as an equality match,
#   - None means the column is not filtered.
# 'STABBR' is the exception: it is a list of state codes, applied with isin()
# in its own cell and removed from this dict before the generic filter loop.
# NOTE(review): confirm that the 0-valued entries are really applied by the
# filtering cell; a truthiness check there would silently skip them.
user_filters = {'ADM_RATE': [0.1,1], 'UGDS': [5000,50000], 'TUITIONFEE_IN': [0,40000], 
                  'TUITIONFEE_OUT': None, 'STABBR': ['NC'], 'MAIN': 1, 'CONTROL': None, 
                  'RELAFFIL': None, 'DISTANCEONLY': 0, 'HBCU': None, 'PBI': 0, 'ANNHI': 0,
                  'HSI': 0, 'NANTI': 0, 'MENONLY': None, 'WOMENONLY': None,
                  'CIP14BACHL': 1, 'GRAD_DEBT_MDN10YR': [0,300]}

In [572]:
# Work on a copy: the row filters and column drops below operate on df_knn,
# while df stays intact for the final lookup of the matching records.
df_knn = df.copy()

In [573]:
#Filtering the Data Frame on State Values
# pop() takes the state entry out of user_filters so the generic filter loop,
# which treats list values as [low, high] ranges, never sees it. The None
# default keeps this cell safe when the key is already gone (cell re-run) and,
# together with the truthiness check, when the user set no state preference
# (None or an empty list).
states = user_filters.pop("STABBR", None)
if isinstance(states, str):
    # A single state code given as a bare string; isin() needs a list-like.
    states = [states]
if states:
    df_knn = df_knn.loc[df_knn['STABBR'].isin(states)]

['NC']

In [574]:
#Dropping input keys which are of no concern to the user
# Collect the keys first and pop afterwards: a dict must not change size while
# it is being iterated. The drop list for columns is not touched here; it is
# built from scratch in the next cell from whatever keys remain in user_input.
input_keys_to_drop = [key for key, value in user_input.items() if value is None]
for key in input_keys_to_drop:
    user_input.pop(key, None)

In [575]:
#Every column that is not one of the user's input features goes on the drop list
cols_to_drop = [
    column
    for column in df_knn.columns
    if column not in user_input
]

In [576]:
#Filtering the data further based on user specified filters
for col, val in user_filters.items():
    # Only None means "no constraint". A plain truthiness test would also skip
    # legitimate filter values such as 0 (e.g. DISTANCEONLY == 0).
    if val is None:
        continue
    if isinstance(val, (list, tuple)):
        if len(val) == 0:
            # An empty range carries no constraint.
            continue
        # Two-element [low, high] range, both ends included.
        mask = df_knn[col].between(val[0], val[1])
    else:
        mask = df_knn[col] == val
    df_knn = df_knn.loc[mask]

In [577]:
#Removing every column on the drop list, leaving only the user's input features
df_knn = df_knn.drop(columns=cols_to_drop)

In [578]:
#Standardizing the data
# sc is the fitted scaler; it is kept so the user input can be transformed
# with the same scaling further down. df_knn_sc is the standardized frame.
sc, df_knn_sc = standardize_data(df_knn) 

In [545]:
#Applying PCA to reduce dimensionality
# pca is the fitted model (reused later to project the user input into the
# same space); df_pca is the projection of the standardized rows.
pca, df_pca = apply_pca(df_knn_sc)

array([1.24957436e+01, 6.61351418e+00, 3.92873487e+00, 1.76694602e+00,
       3.95061287e-01, 2.24491078e-31])

In [579]:
#Putting the User input into a data frame
# Build the one-row frame directly: DataFrame.append was removed in pandas 2.0.
# columns= aligns the row to exactly the columns (and order) the scaler was
# fitted on; a feature missing from user_input becomes NaN instead of shifting
# the remaining values.
df_input = pd.DataFrame([user_input], columns=df_knn_sc.columns)

In [580]:
#Standardizing the user input
# transform (not fit_transform): the query row is scaled with the scaler that
# was fitted on the filtered dataset, so both live on the same scale.
temp1 = sc.transform(df_input)
# Write the scaled values back under the original column labels.
df_input[df_input.columns] = temp1

In [581]:
#Asking for at most 10 results, or for every remaining row
#when the filtered dataset holds fewer than 10
n = min(10, df_knn_sc.shape[0])

In [582]:
#Fitting Nearest Neighbors on the dataset 
# The index is built on the PCA-projected rows, so the query must be projected
# with the same fitted pca before searching.
nbrs = NearestNeighbors(n_neighbors=n, algorithm='kd_tree').fit(df_pca)
# df_input holds a single query row, so row 0 of distances/indices is the answer;
# indices are row positions within df_pca.
distances, indices = nbrs.kneighbors(pca.transform(df_input))

In [584]:
#Returning the full records of the resulting universities/colleges
# indices[0] holds row positions within the filtered, standardized frame,
# so positional .iloc is the right accessor here.
df_result = df_knn_sc.iloc[indices[0]]
# df_result.index holds index labels carried over from df, not positions.
# Look them up by label with .loc; .iloc only happened to work while df had
# a default 0..n-1 index.
df.loc[df_result.index]

Unnamed: 0,OPEID,UNITID,INSTNM,CITY,STABBR,ZIP,MAIN,HIGHDEG,CONTROL,RELAFFIL,...,PCIP44,PCIP45,PCIP46,PCIP47,PCIP48,PCIP49,PCIP50,PCIP51,PCIP52,PCIP54
1106,297200,199193,North Carolina State University at Raleigh,Raleigh,NC,27695-7001,1,4,1,-1.0,...,0.0098,0.051,0.0,0.0,0.0,0.0,0.0232,0.0,0.1381,0.011
1080,292300,198464,East Carolina University,Greenville,NC,27858-4353,1,4,1,-1.0,...,0.0135,0.0527,0.0,0.0,0.0,0.0,0.0436,0.1866,0.1728,0.0072
1125,298100,200004,Western Carolina University,Cullowhee,NC,28723-9646,1,4,1,-1.0,...,0.0578,0.04,0.0,0.0,0.0,0.0,0.0477,0.183,0.1603,0.0183
1102,297500,199139,University of North Carolina at Charlotte,Charlotte,NC,28223-0001,1,4,1,-1.0,...,0.0117,0.0924,0.0,0.0,0.0,0.0,0.025,0.0833,0.1839,0.0326
1099,290500,199102,North Carolina A & T State University,Greensboro,NC,27411,1,4,1,-1.0,...,0.0487,0.058,0.0,0.0,0.0,0.0,0.0363,0.0232,0.1021,0.0054
1082,292700,198516,Elon University,Elon,NC,27244-2010,1,4,2,-1.0,...,0.0555,0.0943,0.0,0.0,0.0,0.0,0.0601,0.0,0.2479,0.0106
