In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

In [2]:
# Load the pickled input frame. The functions defined below read its
# "mellon_id", "course_id", "year" and "term_code" columns.
df1 =pd.read_pickle("../Data/for_network/df1.pkl")

# 1. K-nearest neighbours identification

In [22]:
def drop_one_student_course(data):
    """
    Removes every course that appears in only a single row of the data.
    Course size is the number of rows per "course_id" (rows are counted,
    not distinct students).
    input:
    - data (pd.DataFrame): (students x courses) x features for courses
    output:
    - data (pd.DataFrame): the rows of data whose course has more than one row
    """
    # Number of (student, course) rows recorded for each course.
    rows_per_course = data.groupby("course_id")["mellon_id"].size()
    multi_student_courses = rows_per_course.index[rows_per_course > 1]
    keep_row = data["course_id"].isin(multi_student_courses)
    return data.loc[keep_row]

def stud_by_course_matrix(df):
    """
    Builds the sparse student by course matrix for the given rows and prints
    two summary lines about it.
    input:
    - df (pd.DataFrame): (students x courses) x features for courses; the
      columns "mellon_id", "course_id" and "total" are used
    output:
    - sparse_matrix (csr_matrix): students x courses, filled with "total"
    - (list) student ids corresponding to rows in sparse_matrix
    - (list) course ids corresponding to columns in sparse_matrix
    """
    # Ordered categoricals over the sorted unique ids give each student / course
    # a stable integer position (its category code).
    student_dtype = CategoricalDtype(sorted(df["mellon_id"].unique()), ordered=True)
    course_dtype = CategoricalDtype(sorted(df["course_id"].unique()), ordered=True)

    row_codes = df["mellon_id"].astype(student_dtype).cat.codes
    col_codes = df["course_id"].astype(course_dtype).cat.codes

    n_students = student_dtype.categories.size
    n_courses = course_dtype.categories.size
    sparse_matrix = csr_matrix(
        (df["total"], (row_codes, col_codes)),
        shape=(n_students, n_courses),
    )

    print(f"Average number of courses: {sparse_matrix.sum(axis=1).mean() }")
    print(f"Shape of Adjacency matrix: {sparse_matrix.shape}")

    student_ids = student_dtype.categories.to_list()
    course_ids = course_dtype.categories.to_list()
    return sparse_matrix, student_ids, course_ids


def _generate_knn(data, knn_lst, periods, period_col, nn_prefix, label=None):
    """
    Shared implementation of generate_knn_year / generate_knn_term.
    input:
    - data (pd.DataFrame): (students x courses) x features for courses
    - knn_lst (list): a list of ks
    - periods (iterable): values of period_col to process, one block of rows each
    - period_col (str): column of data that identifies the period
    - nn_prefix (str): prefix of the neighbour columns; k is appended to it
    - label (callable or None): optional conversion applied to the period value
      before it is stored in the output
    output:
    - (pd.DataFrame): (students x periods) x knns
    """
    # assign() returns a new frame: the caller's data is left untouched and the
    # helper column is not written onto a filtered slice.
    data = drop_one_student_course(data).assign(total=1)

    period_dfs = []
    for period in periods:
        print(period)
        adj_period, stud_ids, _ = stud_by_course_matrix(data.loc[data[period_col] == period])
        stud_ids = np.array(stud_ids)
        # student by student matrix (dense)
        adj_stud = (adj_period @ adj_period.T).toarray()
        del adj_period
        # A student is not their own neighbour.
        np.fill_diagonal(adj_stud, 0)
        # Ascending sort: the last k columns of each row are the k largest entries.
        adj_stud_sorted = adj_stud.argsort(axis=1)

        # The loop saves student ids of k neighbors in dataframe for each period and k
        period_value = period if label is None else label(period)
        df_period = pd.DataFrame({period_col: period_value, "mellon_id": stud_ids})
        k_ids = None
        for k in knn_lst:
            k_ids = adj_stud_sorted[:, -k:]
            k_stud_ids = stud_ids[k_ids]
            k_weights = np.take_along_axis(adj_stud, k_ids, axis=1)
            # Positions whose matrix entry is not positive are stored as 0
            # instead of a student id.
            df_period[nn_prefix + str(k)] = np.where(k_weights > 0, k_stud_ids, 0).tolist()

        period_dfs.append(df_period)
        # Release the dense arrays (and the view into the sorted one) so they
        # are not kept alive while the next period's matrix is built.
        k_ids = None
        del adj_stud, adj_stud_sorted
    return pd.concat(period_dfs, axis=0).reset_index(drop=True)


def generate_knn_year(data,knn_lst,years):
    """
    Creates dataframe with k nearest neighbors by year
    input:
    - data (pd.DataFrame): (students x courses) x features for courses
    - knn_lst (list): a list of ks
    - years (list): a list of years
    output:
    - (pd.DataFrame): (students x years) x knns, with the columns "year",
      "mellon_id" and one "year_nn_<k>" list column per k
    """
    return _generate_knn(data, knn_lst, years, "year", "year_nn_", label=int)


def generate_knn_term(data,knn_lst,terms):
    """
    Creates dataframe with k nearest neighbors by term
    input:
    - data (pd.DataFrame): (students x courses) x features for courses
    - knn_lst (list): a list of ks
    - terms (list): a list of terms
    output:
    - (pd.DataFrame): (students x terms) x knns, with the columns "term_code",
      "mellon_id" and one "term_nn_<k>" list column per k
    """
    return _generate_knn(data, knn_lst, terms, "term_code", "term_nn_")

def remove_empty_lists(df_knn):
    """
    Drops the rows in which no neighbour was found, i.e. every neighbour list
    contains only the placeholder 0.
    input:
    - df_knn (pd.DataFrame): two leading id columns followed by list-valued
      neighbour columns (one list of ids per cell)
    output:
    - (pd.DataFrame): the remaining rows with a fresh 0..n-1 index; df_knn
      itself is not modified
    """
    # Only the columns from position 2 onwards hold neighbour lists.
    neighbour_rows = df_knn.iloc[:, 2:].itertuples(index=False, name=None)
    has_neighbour = np.array(
        [any(nn_id != 0 for nn_ids in row for nn_id in nn_ids) for row in neighbour_rows],
        dtype=bool,
    )
    return df_knn.loc[has_neighbour].reset_index(drop=True)

In [4]:
# Neighbourhood sizes to extract; the years processed are 2016 through 2020
# (np.arange excludes the end point 2021).
knn_lst = [2,4,8,16]
df_year = generate_knn_year(df1,knn_lst=knn_lst,years=np.arange(2016,2021))

2016
Average number of courses: 17.032705039615063
Shape of Adjacency matrix: (27641, 13131)
2017
Average number of courses: 16.940737489025462
Shape of Adjacency matrix: (29614, 13650)
2018
Average number of courses: 11.810823674337868
Shape of Adjacency matrix: (35718, 12577)
2019
Average number of courses: 10.33518618456557
Shape of Adjacency matrix: (37060, 11924)
2020
Average number of courses: 10.135731242753824
Shape of Adjacency matrix: (36226, 11743)


In [7]:
# Drop the rows without any neighbour, then persist the yearly KNN table.
# NOTE(review): the file name says "201192+" while the years processed above
# start at 2016 - confirm the intended name.
df_year = remove_empty_lists(df_year)
df_year.to_pickle("../Data/KNN/KNN_YEAR_201192+.pkl")

In [27]:
# Build the term codes to process: three codes per base year (base + 92,
# base + 103, base + 114), sorted ascending, e.g. 201692, 201703, 201714.
knn_lst = [2,4,8,16]
terms = np.array([201600,201700,201800,201900,202000])
terms = np.sort(np.concatenate([terms+92, terms+103 , terms+114]))
df_term = generate_knn_term(df1,knn_lst,terms)

201692
Average number of courses: 6.046097452606635
Shape of Adjacency matrix: (27008, 4393)
201703
Average number of courses: 5.9737250009451435
Shape of Adjacency matrix: (26451, 4455)
201714
Average number of courses: 5.890578825012806
Shape of Adjacency matrix: (25379, 4283)
201792
Average number of courses: 5.970844677258159
Shape of Adjacency matrix: (29017, 4571)
201803
Average number of courses: 5.952567772947372
Shape of Adjacency matrix: (28293, 4624)
201814
Average number of courses: 5.896197214238337
Shape of Adjacency matrix: (27138, 4455)
201892
Average number of courses: 6.0024809679173465
Shape of Adjacency matrix: (29424, 4484)
201903
Average number of courses: 3.7151805669921187
Shape of Adjacency matrix: (34004, 4076)
201914
Average number of courses: 3.645340282035561
Shape of Adjacency matrix: (32620, 4017)
201992
Average number of courses: 3.67666173886363
Shape of Adjacency matrix: (35851, 3929)
202003
Average number of courses: 3.734336777275736
Shape of Adjacen

In [29]:
# Drop the rows without any neighbour, then persist the per-term KNN table.
# NOTE(review): the file name says "201192+" while the first term processed
# above is 201692 - confirm the intended name.
df_term = remove_empty_lists(df_term)
df_term.to_pickle("../Data/KNN/KNN_TERM_201192+.pkl")