In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import recommendations
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the preprocessed records and keep only the columns this notebook uses.
# Other cells address some of these columns by position, so the order of the
# list below is significant.
df = pd.read_csv('../datasets/processed_dataset.csv').loc[
    :,
    ['Student Number', 'Course Code', 'Letter Grade', 'Semester',
     'Course Credit', 'GPA', 'Completed Credits', 'Department Code'],
]
df

In [None]:
# Swap the categorical 'Department Code' column for one indicator column per
# department, appended after the seven remaining original columns.
df = pd.concat(
    [
        df.drop(columns=['Department Code']),
        pd.get_dummies(df['Department Code'], prefix='Department Code'),
    ],
    axis=1,
)
# Clustering input: GPA, completed credits, and every indicator column
# (everything from position 7 onwards).
cluster_features = df[['GPA', 'Completed Credits', *df.columns[7:]]]

In [None]:
# Letter grade -> grade points used for every numeric computation below.
# NOTE(review): 'A+' is mapped to 4.1, which is not a common scale value;
# confirm it matches the institution's grading policy.
numerical_grades = {
    'A+': 4.1, 'A': 4.0, 'A-': 3.7,
    'B+': 3.3, 'B': 3.0, 'B-': 2.7,
    'C+': 2.3, 'C': 2.0, 'C-': 1.7,
    'D+': 1.3, 'D': 1.0, 'D-': 0.5,
    'F': 0.0,
}

In [None]:
# Index every enrollment row two ways:
#   semester_data:  semester -> student number -> course code -> numeric grade
#   course_credits: course code -> credit value
# If a course (or a student/course/semester combination) appears on several
# rows, the last row wins because later assignments overwrite earlier ones.
#
# Columns are read by name and iterated as whole arrays rather than through
# `df.iloc[label, position]`: feeding index *labels* to the positional `iloc`
# only works while `df` has a default RangeIndex, and one scalar lookup per
# cell is very slow. `to_numpy()` keeps the same numpy scalar types that the
# per-cell lookup produced.
semester_data = {}
course_credits = {}
for student_number, course_code, letter_grade, semester, credit in zip(
    df['Student Number'].to_numpy(),
    df['Course Code'].to_numpy(),
    df['Letter Grade'].to_numpy(),
    df['Semester'].to_numpy(),
    df['Course Credit'].to_numpy(),
):
    course_credits[course_code] = credit
    student_courses = semester_data.setdefault(semester, {}).setdefault(student_number, {})
    # Raises KeyError for a letter grade missing from `numerical_grades`.
    student_courses[course_code] = numerical_grades[letter_grade]

In [None]:
def get_avg_gpa(train_semester, student):
    """Return the credit-weighted mean grade of one student's courses.

    Args:
        train_semester: dict mapping student -> {course code: numeric grade}.
        student: key of the student to average.

    Returns:
        sum(grade * credit) / sum(credit) over the student's courses, with
        credits looked up in the module-level ``course_credits`` mapping.

    Raises:
        KeyError: if the student is not in ``train_semester`` or one of the
            courses has no entry in ``course_credits``.
    """
    grades = train_semester[student]
    credit_sum = sum(course_credits[code] for code in grades)
    weighted_sum = sum(grade * course_credits[code] for code, grade in grades.items())
    return weighted_sum / credit_sum

In [None]:
def get_errors(train_semester, test_semester, sim):
    """Score grade predictions for a group of students against held-out grades.

    For every student present in both mappings, each course in the student's
    test record is predicted with the value that
    ``recommendations.getRecommendations`` reports for that course (it is
    unpacked here as ``(predicted grade, course)`` pairs). When the course is
    not among the recommendations, the student's credit-weighted average from
    ``get_avg_gpa`` is used instead.

    Args:
        train_semester: dict mapping student -> {course code: numeric grade};
            passed whole to ``getRecommendations``.
        test_semester: dict of the same shape holding the grades to predict.
        sim: similarity function forwarded to ``getRecommendations``.

    Returns:
        ``(rmse, r2)`` over all predicted courses. Both values are ``nan``
        when no training student has any test course, because the metrics
        are undefined (and scikit-learn raises) for zero samples.
    """
    y_true = []
    y_pred = []
    for student in train_semester:
        # Students without held-out grades add nothing to the score, so skip
        # them before requesting recommendations or averaging their grades.
        if student not in test_semester:
            continue

        recommended_courses = {}
        for rec_grade, rec_course in recommendations.getRecommendations(train_semester, student, sim):
            # setdefault keeps the first value seen for a course.
            recommended_courses.setdefault(rec_course, rec_grade)
        fallback_grade = get_avg_gpa(train_semester, student)

        for course_code, actual_grade in test_semester[student].items():
            y_true.append(actual_grade)
            y_pred.append(recommended_courses.get(course_code, fallback_grade))

    if not y_true:
        return float('nan'), float('nan')

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2_statistics = r2_score(y_true, y_pred)

    return rmse, r2_statistics

In [None]:
def get_clusters(num_clusters, semester_data, cluster_features, random_state=0, student_numbers=None):
    """Group the students of ``semester_data`` by K-Means cluster label.

    K-Means is fitted on ``cluster_features``; row ``i`` of the features is
    attributed to ``student_numbers[i]``. A student who owns several rows may
    therefore appear under more than one label.

    Args:
        num_clusters: number of clusters to fit.
        semester_data: mapping student number -> record; only students that
            are keys of this mapping are placed in the result.
        cluster_features: feature matrix handed to ``KMeans.fit``.
        random_state: seed forwarded to ``KMeans`` so repeated calls on the
            same features assign the same labels. Pass ``None`` for an
            unseeded fit.
        student_numbers: one student number per feature row. Defaults to the
            first column of the module-level ``df``, which is assumed to be
            row-aligned with ``cluster_features``.

    Returns:
        dict mapping cluster label -> {student number: semester_data[student]}.
        Every label produced by the fit is present, possibly with an empty
        dict.

    Raises:
        ValueError: if ``student_numbers`` and the fitted labels differ in
            length, which would mis-attribute rows to students.
    """
    if student_numbers is None:
        # One bulk read instead of a positional lookup per row.
        student_numbers = df.iloc[:, 0].to_numpy()

    labels = KMeans(n_clusters=num_clusters, random_state=random_state).fit(cluster_features).labels_
    if len(student_numbers) != len(labels):
        raise ValueError(
            'cluster_features has %d rows but %d student numbers were supplied'
            % (len(labels), len(student_numbers))
        )

    kmeans_dict = {}
    for label, student_number in zip(labels, student_numbers):
        members = kmeans_dict.setdefault(label, {})
        if student_number in semester_data:
            members[student_number] = semester_data[student_number]
    return kmeans_dict

In [None]:
def predict(semester_data, sim, cluster_features, cluster_counts=range(2, 8)):
    """Evaluate per-cluster grade prediction semester by semester.

    For each cluster count and each semester index ``i`` (in sorted semester
    order), the training data is every grade recorded up to and including
    semester ``i`` and the test data is semester ``i + 1``. Training students
    are split into clusters and each cluster is scored with ``get_errors``.

    Args:
        semester_data: dict mapping semester -> student -> {course: grade}.
            It is only read; the accumulated training data is kept in a
            private copy so repeated calls and later cluster counts never see
            grades from future semesters.
        sim: similarity function forwarded to ``get_errors``.
        cluster_features: feature matrix forwarded to ``get_clusters``.
        cluster_counts: iterable of cluster counts to evaluate.

    Returns:
        dict ``errors[num_clusters][sem_idx][cluster_label] = [rmse, r2]``.
        A cluster with no student in the test semester gets ``[nan, nan]``.
    """
    semesters = sorted(semester_data)
    errors = {}
    for num_clusters in cluster_counts:
        errors[num_clusters] = {}
        # Fresh accumulator per cluster count; inner dicts are copies, so the
        # caller's semester_data is never modified.
        train_semester = {}
        for sem_idx in range(len(semesters) - 1):
            for student, courses in semester_data[semesters[sem_idx]].items():
                train_semester.setdefault(student, {}).update(courses)
            test_semester = semester_data[semesters[sem_idx + 1]]

            # Cluster once per step, carrying each student's train and test
            # records together, so both sides share the same label assignment.
            # Two separate fits are not guaranteed to number clusters alike.
            paired = {
                student: (courses, test_semester.get(student))
                for student, courses in train_semester.items()
            }
            clusters = get_clusters(num_clusters, paired, cluster_features)

            step_errors = errors[num_clusters].setdefault(sem_idx, {})
            for cluster_label, members in clusters.items():
                cluster_train = {student: pair[0] for student, pair in members.items()}
                cluster_test = {
                    student: pair[1] for student, pair in members.items() if pair[1] is not None
                }
                if not cluster_test:
                    # Nothing to score in this cluster.
                    step_errors[cluster_label] = [np.nan, np.nan]
                    continue
                rmse, r2 = get_errors(cluster_train, cluster_test, sim)
                step_errors[cluster_label] = [rmse, r2]
    return errors

In [None]:
# Run the full evaluation with recommendations.sim_distance as the similarity
# function. Result layout: errors[num_clusters][sem_idx][cluster_label] = [rmse, r2].
errors = predict(semester_data, recommendations.sim_distance, cluster_features)

In [None]:
# Same evaluation with recommendations.sim_jaccard as the similarity function.
# Rebinding `errors` discards whatever an earlier cell stored under that name.
errors = predict(semester_data, recommendations.sim_jaccard, cluster_features)

In [None]:
# Same evaluation with recommendations.sim_pearson as the similarity function.
# Rebinding `errors` discards whatever an earlier cell stored under that name.
errors = predict(semester_data, recommendations.sim_pearson, cluster_features)