In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import NearestNeighbors

## Method to standardize data

In [None]:
def standardize_data(df_sc):
  """Fit a StandardScaler on ``df_sc`` and overwrite its columns with the scaled values.

  Note that ``df_sc`` is modified in place: the returned frame is the very
  same object that was passed in, not a copy.

  Parameters:
    df_sc: DataFrame whose columns are all passed to the scaler.

  Returns:
    (scaler, df_sc): the fitted StandardScaler and the mutated input frame.
  """
  scaler = StandardScaler()
  df_sc[df_sc.columns] = scaler.fit_transform(df_sc)
  return scaler, df_sc

## Method to apply PCA

In [None]:
def apply_pca(df_pca, n_components=10):
  """Fit a PCA on ``df_pca`` and return the fitted model with the projected data.

  Parameters:
    df_pca: 2-D table (rows = samples, columns = features) to reduce.
    n_components: requested number of components (default 10). An integer
      request is capped at min(n_samples, n_features) so that a heavily
      filtered frame with fewer than 10 rows or columns does not make PCA
      raise a ValueError.

  Returns:
    (pca, reduced): the fitted PCA object and the transformed array.
  """
  if isinstance(n_components, (int, np.integer)):
    n_samples, n_features = np.shape(df_pca)
    n_components = min(n_components, n_samples, n_features)
  pca = PCA(n_components=n_components)
  reduced = pca.fit_transform(df_pca)
  return pca, reduced

## KNN implementation

In [None]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("bestcollegesite/bestcollege/data/cleaned_data.csv")

In [None]:
# Per-feature weights: each standardized feature column is multiplied by its
# weight before PCA / nearest-neighbour search, so larger values count more.
importance = {
    'HIGHDEG': 8,
    'SAT_AVG': 1,
    'ACTCMMID': 1,
    'UGDS_WHITE': 4,
    'UGDS_BLACK': 4,
    'UGDS_HISP': 4,
    'UGDS_ASIAN': 4,
    'UGDS_AIAN': 4,
    'UGDS_NHPI': 4,
    'UGDS_2MOR': 4,
    'UGDS_NRA': 4,
    'UGDS_UNKN': 4,
    'UG25ABV': 3,
    'PPTUG_EF': 4,
    'INC_PCT_LO': 3,
    'INC_PCT_M1': 3,
    'INC_PCT_M2': 3,
    'INC_PCT_H1': 3,
    'INC_PCT_H2': 3,
    'PAR_ED_PCT_1STGEN': 4,
    'C150_4': 1,
    'PCIP14': 1,
    'RPY_7YR_RT': 1,
    'RPY_3YR_RT': 1,
    'RPY_5YR_RT': 1,
    "MD_EARN_WNE_P6": 1,
    "MD_EARN_WNE_P10": 1,
    'ADM_RATE': 1,
    'COSTT4_A': 1,
    'SPRING_TAVG': 2,
    'SUMMER_TAVG': 2,
    'FALL_TAVG': 2,
    'WINTER_TAVG': 2,
}

In [None]:
# The user's preferred value for each feature. A value of None marks a feature
# the user does not care about; such keys are removed further down.
user_input = {
    'HIGHDEG': 4,
    'SAT_AVG': 1600,
    'ACTCMMID': 32,
    'UGDS_WHITE': 0.28307029548989115,
    'UGDS_BLACK': 0.09408304821150856,
    'UGDS_HISP': 0.3764099533437014,
    'UGDS_ASIAN': 0.11225194401244168,
    'UGDS_AIAN': 0.005662363919129083,
    'UGDS_NHPI': 0.009393001555209954,
    'UG25ABV': 0,
    'PPTUG_EF': 0,
    'INC_PCT_LO': 0,
    'INC_PCT_M1': 0,
    'INC_PCT_M2': 0,
    'INC_PCT_H1': 1,
    'INC_PCT_H2': 0,
    'PAR_ED_PCT_1STGEN': 0,
    'C150_4': 1,
    'PCIP14': 1,
    'ADM_RATE': 0.6,
    'MD_EARN_WNE_P10': 60000,
    'RPY_3YR_RT': 1,
    'MD_EARN_WNE_P6': 60000,
    'RPY_7YR_RT': 1,
    'RPY_5YR_RT': 1,
    'SPRING_TAVG': 65.0,
    'SUMMER_TAVG': 85.0,
    'FALL_TAVG': 60.0,
    'WINTER_TAVG': 45.0,
}

In [None]:
# Hard filters applied to the rows before the similarity search.
#   [low, high] -> the column must lie between the two bounds
#   scalar      -> the column must equal the value
#   None        -> the column is not filtered
# 'STABBR' is a list of state codes and is handled separately via isin().
user_filters = {
    'ADM_RATE': [0.6, 1],
    'UGDS': [2500, 10000],
    'TUITIONFEE_IN': [0, 40000],
    'TUITIONFEE_OUT': None,
    'STABBR': ['CA'],
    'MAIN': 1,
    'CONTROL': 1,
    'RELAFFIL': None,
    'DISTANCEONLY': 0,
    'HBCU': 0,
    'PBI': 0,
    'ANNHI': 0,
    'HSI': 0,
    'NANTI': 0,
    'MENONLY': None,
    'WOMENONLY': None,
    'CIP14BACHL': 1,
    'GRAD_DEBT_MDN10YR': [0, 250],
}

In [None]:
# Work on a copy so the filtering and scaling below never touch the loaded frame `df`.
df_knn = df.copy()

In [None]:
#Filtering the Data Frame on State Values
states = user_filters["STABBR"]
# A single state given as a bare string is treated as a one-element list.
if isinstance(states, str):
    states = [states]
# None (the "no preference" marker used by the other filters) and an empty
# list both mean "any state"; len(None) would have raised a TypeError.
if states:
    df_knn = df_knn.loc[df_knn['STABBR'].isin(states)]
# STABBR is removed so the generic range/equality filter loop does not see it.
# NOTE: this makes the cell single-use; re-define user_filters before re-running.
user_filters.pop("STABBR", None)

In [None]:
#Every column that is not one of the user-input features goes on the drop list
cols_to_drop = [name for name in df_knn.columns if name not in user_input]

In [None]:
#Dropping input keys which are of no concern to the user (value is None)
input_keys_to_drop = [key for key, value in user_input.items() if value is None]
# Only existing columns are queued for dropping; an unknown key would make
# the later DataFrame.drop raise a KeyError.
cols_to_drop.extend(key for key in input_keys_to_drop if key in df_knn.columns)
# Removal happens after the scan so the dict is not mutated while iterating.
for key in input_keys_to_drop:
    user_input.pop(key, None)

In [None]:
#Filtering the data further based on user specified filters
for col, val in user_filters.items():
    # Only None means "no preference". A plain truthiness test would also
    # skip explicit 0 filters such as HBCU == 0 or DISTANCEONLY == 0.
    if val is None:
        continue
    if isinstance(val, list):
        # An empty list carries no bounds, so it is treated as "no filter".
        if not val:
            continue
        # [low, high] range filter (Series.between is inclusive by default).
        df_knn = df_knn[df_knn[col].between(val[0], val[1])]
    else:
        # Scalar filter: keep rows whose column equals the requested value.
        df_knn = df_knn.loc[df_knn[col] == val]

In [None]:
#Removing every column on the drop list, leaving only the feature columns
df_knn.drop(columns=cols_to_drop, inplace=True)

In [None]:
#Standardizing the data
# standardize_data overwrites the frame it is given and returns that same
# object, so df_knn_sc and df_knn refer to one and the same DataFrame from here on.
sc, df_knn_sc = standardize_data(df_knn) 

In [None]:
#Assigning weights to features for a weighted KNN
# Vectorized column-by-scalar multiply; same values as a per-element apply.
for col in list(df_knn_sc.columns):
    df_knn_sc[col] = df_knn_sc[col] * importance[col]

In [None]:
#Applying PCA to reduce dimensionality
# `pca` is kept so the user's input row can later be projected with the same fitted model.
pca, df_pca = apply_pca(df_knn_sc)

In [None]:
#Putting the User input into a one-row data frame
# DataFrame.append was removed in pandas 2.0, so the row is built directly.
# `columns=` fixes the column order to match the training frame: features the
# user did not supply become NaN and keys that are not training columns are
# left out. dtype=float keeps every column numeric for the scaler.
df_input = pd.DataFrame([user_input], columns=df_knn_sc.columns, dtype=float)

In [None]:
#Standardizing the user input
# transform (not fit_transform): the input row is scaled with the scaler held
# in `sc`, i.e. the one returned by the most recent standardize_data call.
temp1 = sc.transform(df_input)
df_input[df_input.columns] = temp1

In [None]:
#Assigning weights to input features
# Same per-feature weights as the dataset, applied as a vectorized multiply.
for col in list(df_input.columns):
    df_input[col] = df_input[col] * importance[col]

In [None]:
#Return at most 10 results; fewer when the filtered dataset
#has fewer than 10 rows left
n = min(10, df_knn_sc.shape[0])

In [None]:
# Preview of the feature frame. df_knn is the same object that was standardized
# and weighted above (as df_knn_sc), so these are scaled values, not raw ones.
df_knn.head()

In [None]:
#Fitting Nearest Neighbors on the PCA-reduced dataset
nbrs = NearestNeighbors(n_neighbors=n, algorithm='kd_tree', metric='manhattan')
nbrs.fit(df_pca)
#The user's row is projected with the same PCA before the neighbour query
query_point = pca.transform(df_input)
distances, indices = nbrs.kneighbors(query_point)

In [None]:
#Showing the full rows of the resulting universities/colleges
# `indices` are row positions within the fitted (filtered) frame, hence iloc.
df_result = df_knn_sc.iloc[indices[0]]
# df_result.index holds index *labels* carried over from `df`, so the lookup
# must be label-based (.loc). Using .iloc here only worked while `df` had a
# default 0..n-1 index.
df.loc[df_result.index]

# Experiments

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Experiment frame: all rows of `df`, restricted to the columns that are
# (still) keys of user_input. Note that this rebinds cols_to_drop.
cols_to_drop = [name for name in df.columns if name not in user_input]

df_exp = df.drop(columns=cols_to_drop)

In [None]:
# Standardize, weight and reduce the whole dataset for the experiments below.
# NOTE: this rebinds `sc`, `pca` and `df_pca`, replacing the objects fitted in
# the KNN section above; df_exp is also overwritten in place by standardize_data.
sc, df_exp_sc = standardize_data(df_exp) 
for col in list(df_exp_sc.columns):
    # Vectorized column-by-scalar multiply; same values as a per-element apply.
    df_exp_sc[col] = df_exp_sc[col] * importance[col]

pca, df_pca = apply_pca(df_exp_sc)

## Manhattan Distance

In [None]:
# Look up a latitude/longitude for every row of `df` from the part of its ZIP
# that precedes any "-" suffix.
zips = [code.split("-")[0] for code in df['ZIP'].tolist()]
df2 = pd.read_csv('zip2latlong.csv')
# Lookup keys are zero-padded to five characters so they compare equal to the
# string ZIPs above.
d = {
    format(code, '05'): {'lat': lat, 'lng': lng}
    for code, lat, lng in zip(df2['ZIP'].tolist(), df2['LAT'].tolist(), df2['LNG'].tolist())
}
# ZIPs missing from the lookup table get the (0.0, 0.0) sentinel.
lats = [d[code]['lat'] if code in d else 0.0 for code in zips]
lngs = [d[code]['lng'] if code in d else 0.0 for code in zips]

# A point is left off the map when it carries the sentinel or lies outside the
# plotted bounding box (latitude 24..50, longitude -125..-50).
off_map = [
    (lat == 0.0 and lng == 0.0) or lat < 24.0 or lat > 50.0 or lng < -125.0 or lng > -50.0
    for lat, lng in zip(lats, lngs)
]

# 10-nearest-neighbour graph over the PCA-reduced rows, Manhattan metric.
# Assumes df_pca has one row per row of `df`, in the same order.
nbrs10dkm = NearestNeighbors(n_neighbors=10, algorithm='kd_tree', metric='manhattan').fit(df_pca)
X, Y = nbrs10dkm.kneighbors_graph(df_pca).nonzero()

# Keep only edges that join two distinct points which are both on the map.
edges = [
    (src, dst)
    for src, dst in zip(X.tolist(), Y.tolist())
    if src != dst and not off_map[src] and not off_map[dst]
]
X = [src for src, dst in edges]
Y = [dst for src, dst in edges]

plt.figure(figsize=(40, 20))

# One faint line per neighbour edge (drawn before the coordinate lists are
# filtered, because the edge endpoints index the unfiltered lists).
for src, dst in edges:
    plt.plot([lngs[src], lngs[dst]], [lats[src], lats[dst]], 'k-', alpha=0.25, linewidth=0.2)

# Drop the off-map points, then draw the remaining ones as red dots.
lats = [lat for lat, off in zip(lats, off_map) if not off]
lngs = [lng for lng, off in zip(lngs, off_map) if not off]

plt.plot(lngs, lats, 'r.')
plt.show()

## Euclidean Distance

In [None]:
# Look up a latitude/longitude for every row of `df` from the part of its ZIP
# that precedes any "-" suffix.
zips = [code.split("-")[0] for code in df['ZIP'].tolist()]
df2 = pd.read_csv('zip2latlong.csv')
# Lookup keys are zero-padded to five characters so they compare equal to the
# string ZIPs above.
d = {
    format(code, '05'): {'lat': lat, 'lng': lng}
    for code, lat, lng in zip(df2['ZIP'].tolist(), df2['LAT'].tolist(), df2['LNG'].tolist())
}
# ZIPs missing from the lookup table get the (0.0, 0.0) sentinel.
lats = [d[code]['lat'] if code in d else 0.0 for code in zips]
lngs = [d[code]['lng'] if code in d else 0.0 for code in zips]

# A point is left off the map when it carries the sentinel or lies outside the
# plotted bounding box (latitude 24..50, longitude -125..-50).
off_map = [
    (lat == 0.0 and lng == 0.0) or lat < 24.0 or lat > 50.0 or lng < -125.0 or lng > -50.0
    for lat, lng in zip(lats, lngs)
]

# 10-nearest-neighbour graph over the PCA-reduced rows. No metric is passed,
# so NearestNeighbors uses its default metric.
# Assumes df_pca has one row per row of `df`, in the same order.
nbrs10dke = NearestNeighbors(n_neighbors=10, algorithm='kd_tree').fit(df_pca)
X, Y = nbrs10dke.kneighbors_graph(df_pca).nonzero()

# Keep only edges that join two distinct points which are both on the map.
edges = [
    (src, dst)
    for src, dst in zip(X.tolist(), Y.tolist())
    if src != dst and not off_map[src] and not off_map[dst]
]
X = [src for src, dst in edges]
Y = [dst for src, dst in edges]

plt.figure(figsize=(40, 20))

# One faint line per neighbour edge (drawn before the coordinate lists are
# filtered, because the edge endpoints index the unfiltered lists).
for src, dst in edges:
    plt.plot([lngs[src], lngs[dst]], [lats[src], lats[dst]], 'k-', alpha=0.25, linewidth=0.2)

# Drop the off-map points, then draw the remaining ones as red dots.
lats = [lat for lat, off in zip(lats, off_map) if not off]
lngs = [lng for lng, off in zip(lngs, off_map) if not off]

plt.plot(lngs, lats, 'r.')
plt.show()

## DBSCAN

In [None]:
import matplotlib.colors as clrs
from sklearn.cluster import DBSCAN
from collections import Counter

# Look up a latitude/longitude for every row of `df` from the part of its ZIP
# that precedes any "-" suffix.
zips = [code.split("-")[0] for code in df['ZIP'].tolist()]
df2 = pd.read_csv('zip2latlong.csv')
# Lookup keys are zero-padded to five characters so they compare equal to the
# string ZIPs above.
d = {
    format(code, '05'): {'lat': lat, 'lng': lng}
    for code, lat, lng in zip(df2['ZIP'].tolist(), df2['LAT'].tolist(), df2['LNG'].tolist())
}
# ZIPs missing from the lookup table get the (0.0, 0.0) sentinel.
lats = [d[code]['lat'] if code in d else 0.0 for code in zips]
lngs = [d[code]['lng'] if code in d else 0.0 for code in zips]

# A point is left off the map when it carries the sentinel or lies outside the
# plotted bounding box (latitude 24..50, longitude -125..-50).
off_map = [
    (lat == 0.0 and lng == 0.0) or lat < 24.0 or lat > 50.0 or lng < -125.0 or lng > -50.0
    for lat, lng in zip(lats, lngs)
]

plt.figure(figsize=(40, 20))
# Cluster the PCA-reduced rows. Assumes df_pca has one row per row of `df`,
# in the same order, so labels line up with lats/lngs.
db = DBSCAN(eps=4.0).fit(df_pca)
labels = db.labels_.tolist()
# Number of points per cluster label, counted over all rows (before any
# off-map filtering).
label_counts = dict(Counter(labels))
print(label_counts)

# .get avoids a KeyError when DBSCAN produced no -1 (noise) label at all.
noise_count = label_counts.get(-1, 0)
# Marker size per point: 100 when the point's cluster has as many members as
# label -1, otherwise (size of cluster 1 / size of the point's cluster) ** 1.5.
# NOTE(review): label_counts[1] still raises KeyError if there is no cluster
# labelled 1 - confirm which reference cluster is intended.
s = [
    (label_counts[1] / label_counts[x]) ** 1.5 if (noise_count / label_counts[x]) != 1.0 else 100
    for x in labels
]

# Remove off-map points from every per-point list, including the marker
# sizes. Previously `s` was left unfiltered, so sizes no longer matched the
# points handed to scatter.
lats = [lat for lat, off in zip(lats, off_map) if not off]
lngs = [lng for lng, off in zip(lngs, off_map) if not off]
labels = [label for label, off in zip(labels, off_map) if not off]
s = [size for size, off in zip(s, off_map) if not off]

plt.scatter(lngs, lats, c=labels, s=s, marker='o', cmap=plt.get_cmap('tab20'))
plt.show()