In [52]:
import networkx as nx
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype
import matplotlib.pyplot as plt

In [44]:
def _collect_neighbor_values(df_indexed,nn_prefix,knn_lst,vars):
    """
    Collects, for every row and every k, the values that the row's k nearest neighbors take on each variable
    input:
    - df_indexed (pd.DataFrame): frame indexed by (period, mellon_id) that holds the
      nn_prefix + str(k) neighbor-id columns and the columns named in vars
    - nn_prefix (str): prefix of the neighbor-id columns, e.g. "term_nn_" or "year_nn_"
    - knn_lst (list): a list of ks
    - vars (list): a list of variables to look up for every neighbor
    output:
    - (list of pd.DataFrame): one frame per k, rows in the order of df_indexed, with one
      var + "_" + str(k) + "nn" column per variable holding the list of neighbor values
    raises:
    - ValueError: if a (period, mellon_id) pair occurs more than once
    - KeyError: if a neighbor id has no row in the same period
    """
    # A duplicated key would make "the neighbor's value" ambiguous, so refuse it up front.
    if not df_indexed.index.is_unique:
        raise ValueError("(period, mellon_id) pairs must be unique to look up neighbor values")

    row_keys = list(df_indexed.index)
    # One dictionary lookup per neighbor replaces a label-based .loc call per neighbor and variable.
    row_pos = {key: pos for pos, key in enumerate(row_keys)}
    var_values = {var: df_indexed[var].to_numpy() for var in vars}

    df_values_lst = []
    for k in knn_lst:
        print(k)
        nn_lists = df_indexed[nn_prefix+str(k)].tolist()
        # Neighbor positions depend only on k, so resolve them once and reuse them for every variable.
        # Ids equal to 0 are skipped; neighbors are looked up within the row's own period.
        nn_positions = [
            [row_pos[(period,nn_id)] for nn_id in nn_ids if nn_id!=0]
            for (period,_),nn_ids in zip(row_keys,nn_lists)
        ]
        df_values_lst.append(pd.DataFrame({
            var+"_"+str(k)+"nn": [[var_values[var][pos] for pos in positions] for positions in nn_positions]
            for var in vars
        }))
    return df_values_lst


def compute_knn_vars_term(df_term,df2,demo,knn_lst,vars):
    """
    Creates dataframe with k nearest neighbors by term
    input:
    - df_term (pd.DataFrame): (students x terms) x knns
    - df2 (pd.DataFrame): (student x term) x features for students 
    - demo (pd.DataFrame): background data
    - knn_lst (list): a list of ks
    - vars (list): a list of variables for which knn statistics need to be calculated
    output:
    - (pd.DataFrame): (students x terms) x (knns x vars); every var + "_" + str(k) + "nn"
      cell holds the list of that variable's values over the row's k neighbors
    """
    df_merged = df_term.merge(demo,how="left",on=["mellon_id"]) 
    df_merged = df_merged.merge(df2,how="left",on=["term_code","mellon_id"]) 
    df_merged = df_merged.set_index(["term_code","mellon_id"])

    df_values_lst = _collect_neighbor_values(df_merged,"term_nn_",knn_lst,vars)

    # Both sides carry a fresh 0..n-1 index here, so the column-wise concat lines up row by row.
    df_merged = df_merged.reset_index()    
    df_merged = pd.concat([df_merged]+df_values_lst,axis=1)
    return df_merged
    
def compute_knn_vars_year(df_year,df2_year,demo,knn_lst,vars):
    """
    Creates dataframe with k nearest neighbors by year
    input:
    - df_year (pd.DataFrame): (students x years) x knns
    - df2_year (pd.DataFrame): (student x year) x features for students 
    - demo (pd.DataFrame): background data
    - knn_lst (list): a list of ks
    - vars (list): a list of variables for which knn statistics need to be calculated
    output:
    - (pd.DataFrame): (students x years) x (knns x vars); every var + "_" + str(k) + "nn"
      cell holds the list of that variable's values over the row's k neighbors
    """

    df_merged = df_year.merge(df2_year,how="left",on=["year","mellon_id"])
    df_merged = df_merged.merge(demo,how="left",on=["mellon_id"])
    df_merged = df_merged.set_index(["year","mellon_id"])

    df_values_lst = _collect_neighbor_values(df_merged,"year_nn_",knn_lst,vars)

    # Both sides carry a fresh 0..n-1 index here, so the column-wise concat lines up row by row.
    df_merged = df_merged.reset_index() 
    df_merged = pd.concat([df_merged]+df_values_lst,axis=1)
    return df_merged

In [53]:
# Student-level inputs prepared for the network analysis.
df1, df2, demo = map(pd.read_pickle, (
    "../Data/for_network/df1.pkl",
    "../Data/for_network/df2.pkl",
    "../Data/for_network/demo.pkl",
))

# Nearest-neighbor id tables, one per term and one per year.
df_term, df_year = map(pd.read_pickle, (
    "../Data/KNN/KNN_TERM_201192+.pkl",
    "../Data/KNN/KNN_YEAR_201192+.pkl",
))

# Every demo column except the first is treated as a background variable.
# NOTE(review): presumably the first column is the mellon_id key - confirm.
demo_vars = demo.columns[1:].tolist()

In [32]:
# Collapse the term-level features in df2 to one row per student and year.
# Sorting first makes "last" below mean "latest term_code within the year".
df2 = df2.sort_values(["mellon_id","term_code"],ascending=True)

# For each (student, year), keep the last value of every remaining column;
# groupby(...).last() skips missing entries by default.
df2_year = (
    df2.drop(["freshman_term","year_study","gpa_term","major_change"],axis=1)
    .groupby(["mellon_id","year"]).last().reset_index()
    .drop("term_code",axis=1)
)

# major_change is an event flag: 1 if it was positive in any term of the year.
major_change = df2.loc[:,["mellon_id","year","major_change"]]
# min_count=1 leaves NaN for student-years with no recorded value at all. A plain
# sum() turns those into 0, so the missing-value count below could never be non-zero.
major_change = major_change.groupby(["mellon_id","year"]).sum(min_count=1).reset_index()
# Count missing student-years BEFORE binarizing; the comparison maps NaN to 0.
print(sum(major_change["major_change"].isna()))
major_change["major_change"] = 1*(major_change["major_change"]>0)

df2_year = df2_year.merge(major_change,how="left",on=["year","mellon_id"])
df2_year.head()

0


Unnamed: 0,mellon_id,year,gpa_cumulative,major_stem_1,major_name_1,major_school_name_1,US_citizen,major_change
0,162766,2019,3.53,0.0,Business Economics,School of Social Sciences,0.0,0
1,162766,2020,3.75,0.0,Business Economics,School of Social Sciences,0.0,0
2,162766,2021,3.78,0.0,Business Economics,School of Social Sciences,0.0,0
3,162766,2022,3.78,0.0,Business Economics,School of Social Sciences,0.0,0
4,162767,2018,3.26,0.0,Nursing Science,School of Nursing,1.0,0


In [37]:
# Neighborhood sizes to evaluate.
knn_lst = [2, 4, 8, 16]
# Variables to collect over each student's neighbors. "major_name_1" stays first:
# the aggregation loop further down skips vars[0] when averaging.
vars = [
    "major_name_1",
    "gpa_cumulative",
    "gpa_term",
    "major_change",
    "US_citizen",
    "major_stem_1",
    *demo_vars,
]
# NOTE(review): df_term is rebound to the enriched frame, so this cell should not be
# re-run without reloading the raw kNN table first.
df_term = compute_knn_vars_term(df_term, df2, demo, knn_lst, vars)

2
4
8
16


In [38]:
# Persist the term-level frame produced by the kNN lookup.
df_term.to_pickle(path="../Data/KNN/KNN_TERM_VALUES_201192+.pkl")

In [45]:
# Read the stored term-level frame back from disk and preview its first rows.
df_term = pd.read_pickle("../Data/KNN/KNN_TERM_VALUES_201192+.pkl")
df_term.head(n=5)

Unnamed: 0,term_code,mellon_id,term_nn_2,term_nn_4,term_nn_8,term_nn_16,female,low_income_desc,hs_gpa,uc_total_score,...,major_stem_1_16nn,female_16nn,low_income_desc_16nn,hs_gpa_16nn,uc_total_score_16nn,URM_16nn,same_major_2nn,same_major_4nn,same_major_8nn,same_major_16nn
0,201692,162770,"[219955, 218741]","[203104, 180454, 219955, 218741]","[200563, 190631, 181183, 167962, 203104, 18045...","[168436, 176204, 215227, 190741, 195073, 16832...",0.0,0.0,4.22,221.0,...,0.5625,0.375,0.3125,3.920625,244.866667,0.5,0.5,0.5,0.25,0.1875
1,201692,162771,"[167104, 197331]","[212273, 388654, 167104, 197331]","[195327, 172060, 177431, 197663, 212273, 38865...","[388026, 163755, 164992, 196578, 183103, 22379...",1.0,0.0,4.07,229.0,...,0.0,0.5625,0.625,3.83125,227.071429,0.5625,0.0,0.25,0.25,0.25
2,201692,162781,"[201797, 177306]","[175148, 164727, 201797, 177306]","[175227, 198343, 191654, 389351, 175148, 16472...","[169623, 185018, 170875, 388163, 189235, 20284...",1.0,1.0,4.25,251.0,...,0.5625,0.625,0.25,3.944375,240.1875,0.4,0.5,0.25,0.5,0.5
3,201692,162782,"[179375, 166461]","[164097, 175516, 179375, 166461]","[202752, 192204, 171000, 168683, 164097, 17551...","[169391, 188714, 167979, 188980, 186364, 21658...",1.0,1.0,3.92,219.0,...,0.625,0.5625,0.25,4.075625,241.1875,0.133333,0.5,0.75,0.625,0.5625
4,201692,162784,"[177546, 194149]","[187592, 188594, 177546, 194149]","[206028, 189741, 176640, 168118, 187592, 18859...","[201470, 181228, 215632, 191680, 164533, 21279...",1.0,0.0,4.0,261.0,...,0.4375,0.4375,0.1875,3.85375,242.0625,0.1875,0.0,0.0,0.125,0.25


In [None]:
# Reduce each list of neighbor values to a single statistic per row.
for k in knn_lst:
    suffix = "_" + str(k) + "nn"
    # vars[0] is skipped here; the major-name lists are compared against the
    # student's own major below instead of being averaged.
    for var in vars[1:]:
        nn_col = var + suffix
        df_term[nn_col] = df_term[nn_col].apply(np.nanmean)
    # Share of neighbors whose major equals the student's own major.
    df_term["same_major" + suffix] = [
        np.nanmean([nn_major == own_major for nn_major in nn_majors])
        for own_major, nn_majors in zip(df_term["major_name_1"], df_term["major_name_1" + suffix])
    ]


In [46]:
# Neighborhood sizes to evaluate.
knn_lst = [2, 4, 8, 16]
# Year-level variables to collect over each student's neighbors. "major_name_1" stays
# first: the aggregation loop further down skips vars[0] when averaging.
vars = [
    "major_name_1",
    "gpa_cumulative",
    "major_change",
    "US_citizen",
    "major_stem_1",
    *demo_vars,
]
# NOTE(review): df_year is rebound to the enriched frame, so this cell should not be
# re-run without reloading the raw kNN table first.
df_year = compute_knn_vars_year(df_year, df2_year, demo, knn_lst, vars)

2
4
8
16


In [47]:
# Persist the year-level frame produced by the kNN lookup.
# NOTE(review): the last cell of this notebook writes to this same path, so this
# snapshot is overwritten once that cell runs - confirm that is intended.
df_year.to_pickle(path="../Data/KNN/KNN_YEAR_METRICS_201192+.pkl")

In [48]:
# Preview the first rows of the year-level frame.
df_year.head(n=5)

Unnamed: 0,year,mellon_id,year_nn_2,year_nn_4,year_nn_8,year_nn_16,gpa_cumulative,major_stem_1,major_name_1,major_school_name_1,...,major_name_1_16nn,gpa_cumulative_16nn,major_change_16nn,US_citizen_16nn,major_stem_1_16nn,female_16nn,low_income_desc_16nn,hs_gpa_16nn,uc_total_score_16nn,URM_16nn
0,2016,162770,"[199589, 181183]","[197951, 217162, 199589, 181183]","[199466, 186533, 169591, 174691, 197951, 21716...","[168436, 167990, 196992, 195591, 190741, 17331...",3.23,1.0,Computer Science,School of Info & Computer Sci,...,"[Computer Science, Computer Science, Computer ...","[3.79, 2.91, 3.98, 2.07, 3.6, 3.23, 3.72, 3.33...","[1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4.0, 3.79, 4.3600001, 3.5699999, 3.6600001, 3...","[243.0, 233.0, 290.0, nan, 274.0, 224.0, 241.0...","[1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, ..."
1,2016,162771,"[210732, 223794]","[172060, 207921, 210732, 223794]","[388713, 388237, 164025, 220328, 172060, 20792...","[388026, 389654, 389862, 197863, 378057, 38870...",3.17,0.0,Sociology,School of Social Sciences,...,"[Sociology, Sociology, Sociology, Sociology, S...","[3.17, 3.73, 3.79, 2.68, 2.73, 3.4, 3.28, 2.21...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[3.5799999, 3.46, 3.1900001, 2.98, 3.1900001, ...","[nan, nan, nan, nan, nan, nan, 220.0, nan, nan...","[1.0, 0.0, 1.0, 0.0, 0.0, 1.0, nan, 0.0, 1.0, ..."
2,2016,162781,"[177306, 164996]","[169086, 198458, 177306, 164996]","[175148, 197387, 201396, 186507, 169086, 19845...","[186783, 164337, 183102, 193682, 186396, 18687...",3.24,1.0,Biological Sciences,School of Biological Sciences,...,"[Biological Sciences, Biological Sciences, Bio...","[3.93, 2.61, 3.94, 3.47, 2.97, 3.22, 3.72, 3.7...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4.1300001, 3.8800001, 3.8599999, 4.1399999, 4...","[266.0, 242.0, 290.0, 261.0, 271.0, 249.0, 263...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, nan, 1.0, 1.0, ..."
3,2016,162782,"[167933, 175516]","[170139, 196560, 167933, 175516]","[168683, 197809, 188656, 184954, 170139, 19656...","[197845, 201255, 185087, 188980, 164097, 18983...",2.9,1.0,Computer Science,School of Info & Computer Sci,...,"[Computer Science, Business Information Manage...","[2.56, 3.43, 3.84, 3.57, 1.7, 3.76, 3.67, 3.84...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, ...","[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, ...","[0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, ...","[1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[3.8, 4.1999998, 4.27, 4.1700001, 4.0, 4.0, 4....","[230.0, 280.0, 290.0, 226.0, 193.0, 266.0, 229...","[1.0, 0.0, 0.0, 0.0, nan, 0.0, 0.0, 0.0, 0.0, ..."
4,2016,162784,"[163914, 183173]","[165440, 192569, 163914, 183173]","[201100, 177175, 200767, 185465, 165440, 19256...","[189741, 201470, 168267, 388254, 178477, 20708...",3.48,0.0,undeclared,School of Humanities,...,"[Education Sciences, Psychology and Social Beh...","[3.81, 3.36, 2.13, 3.12, 2.93, 2.53, 3.43, 2.5...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[4.0999999, 3.97, 3.3599999, 3.7, 4.0, 4.11999...","[253.0, 262.0, 236.0, 222.0, 233.0, 255.0, 280...","[1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
# Reduce each list of neighbor values to a single statistic per row.
for k in knn_lst:
    suffix = "_" + str(k) + "nn"
    # vars[0] is skipped here; the major-name lists are compared against the
    # student's own major below instead of being averaged.
    for var in vars[1:]:
        nn_col = var + suffix
        df_year[nn_col] = df_year[nn_col].apply(np.nanmean)
    # Share of neighbors whose major equals the student's own major.
    df_year["same_major" + suffix] = [
        np.nanmean([nn_major == own_major for nn_major in nn_majors])
        for own_major, nn_majors in zip(df_year["major_name_1"], df_year["major_name_1" + suffix])
    ]



In [51]:
# Persist the year-level frame in its current state.
df_year.to_pickle(path="../Data/KNN/KNN_YEAR_METRICS_201192+.pkl")