In [1]:
import os
import math

import pandas as pd
import numpy as np

from scipy.spatial.distance import pdist,squareform

%matplotlib inline
pd.set_option('display.max_columns', None)

In [2]:
path_file = os.sep.join(['created_data', 'class_matrix_bak.csv'])
class_matrix_bak = pd.read_csv(path_file, sep=",", index_col='studium_id') 

In [3]:
class_matrix_bak

Unnamed: 0_level_0,BI-PA1,BI-PAI,BI-CAO,BI-PS1,BI-MLO,BI-ZMA,BI-PA2,BI-DBS,BI-SAP,BI-LIN,BI-AG1,BI-AAG,BI-ZDM,BI-OSY,BI-PSI,BI-BEZ,BI-PST,BI-DPR,BI-SI1.2,BI-EMP
studium_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
17929506,1.5,2.0,1.5,2.0,2.0,2.5,4.0,2.0,4.0,3.0,4.0,4.0,4.0,,,,,,,
17931206,4.0,2.0,1.5,2.5,4.0,4.0,4.0,4.0,4.0,4.0,,,,,,,,,,
15569706,3.0,1.5,1.5,2.0,1.5,1.5,2.0,1.0,1.0,2.0,2.0,1.5,2.0,1.0,2.0,1.5,4.0,4.0,1.5,4.0
16729706,,,,,2.0,2.0,4.0,,4.0,4.0,4.0,4.0,4.0,,,,,,1.5,
17931606,1.5,1.0,1.0,1.5,2.5,2.5,2.0,1.5,1.0,4.0,4.0,4.0,4.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19351806,4.0,,4.0,4.0,,,,,,,,,,,,,,,,
19351406,4.0,,4.0,4.0,,,,,,,,,,,,,,,,
19351006,4.0,,4.0,4.0,,,,,,,,,,,,,,,,
19350506,4.0,,4.0,4.0,,,,,,,,,,,,,,,,


# Predikce pomocí similarity matrix

In [4]:
def get_grade_predict(student, course, matrix, metrics, nan_filler, s_num):
    '''
    Use similarity matrix to predict student's grade
    Arguments:
        student: student whose grades predict
        course: course whose grade prediction is wonted
        matrix: matrix with students and their grades
        metrics: metrika pro vytvoreni similarity matrix
        nan_filler: value to fill NaN grade values
        s_num: number of most similar students
    Returns:
        Grade prediction based on s_num most similar students
    '''
    
    # predmety ktere zadany student absolvoval + odstraneni hledaneho predmetu
    completed_courses = matrix.loc[student].notna().dot(matrix.columns+',').rstrip(',').split(',')
    if course in completed_courses:
        completed_courses.remove(course)
        
    # jen ti studenti, kteri maji vyplnen predikovany predmet
    #print("pocet stud " + str(len(matrix)))
    matrix_course_notna = matrix[matrix[course].notna()]
    #print("pocet stud potom" + str(len(matrix_course_notna)))
    # zmena nan na nan_filler v matici
    matrix_filled = matrix_course_notna.fillna(nan_filler)
    # submatice s pouze predmety, ktere zadany student absolvoval
    sub_matrix_filled = matrix_filled[completed_courses]
    # similarity matrix
    similarity_matrix = squareform(pdist(sub_matrix_filled, metric=metrics))
    similarity_matrix_sid = []
    
    students_index = 0
    i = 0
    for index, row in sub_matrix_filled.iterrows():
        similarity_matrix_sid.append((index, similarity_matrix[i])) 
        if index == student:
            students_index = i
        i += 1
    #print(similarity_matrix_sid)
     
    # serazeny list tuplu studentu s mirou podobnosti mezi nimi a zadanym studentem
    #similar_students = list(enumerate(similarity_matrix[translator_sid_index[student]]))
    sorted_similar_students = sorted(similarity_matrix_sid, key=lambda x:x[1][students_index], reverse=False)    
    
    # vypocitana predikce znamky z s_num nejpodobnejsich studentu
    stu_pred = np.array([])
    for stu_i in range(min(s_num + 1, len(sorted_similar_students))):
        if sorted_similar_students[stu_i][0] != student:
            stu_pred = np.append(stu_pred, [matrix[course][sorted_similar_students[stu_i][0]]])
            #print()
            #print(sorted_similar_students[stu_i][0])
            #print(sorted_similar_students[stu_i][1][students_index])
            #print(matrix[course][sorted_similar_students[stu_i][0]])
    prediction = stu_pred.mean()
    return prediction

In [5]:
def number_to_grade(number):
    
    if number < 1.5:
        return 'A'
    if number < 2:
        return 'B'
    if number < 2.5:
        return 'C'
    if number < 3:
        return 'D'
    if number < 4:
        return 'E'
    return 'F'

In [6]:
def predict(student, matrix, nan_filler, s_num):
    for metrics in ['euclidean', 'cityblock', 'chebyshev', 'hamming', 'cosine']:
        print('\033[1m' + metrics + '\033[0m')
        spravne = 0
        celkem = 0
        courses = matrix.columns.unique()
        for course in courses:
            print(course)
            if not math.isnan(matrix.loc[student , course]):
                celkem += 1
            print('real: ' + str(matrix.loc[student , course]) + " "  + number_to_grade(matrix.loc[student , course]))
            prediction = get_grade_predict(student, course, matrix, metrics, nan_filler, s_num)
            prediction_grade = number_to_grade(prediction)
            print('predicted: ' + str(prediction) + " " + prediction_grade)
            if prediction_grade == number_to_grade(matrix.loc[student , course]):
                spravne += 1
            print()
        print('\033[4m' + 'spravne = ' + str(spravne) + ' z ' + str(celkem) + ' absolvovanych predmetu' +'\033[0m')
        print()    

In [7]:
predict(17937206, class_matrix_bak, 0, 5)

[1meuclidean[0m
BI-PA1
real: 2.0 C
predicted: 3.3 E

BI-PAI
real: 1.0 A
predicted: 1.5 B

BI-CAO
real: 1.0 A
predicted: 1.1 A

BI-PS1
real: 1.5 B
predicted: 2.0 C

BI-MLO
real: 1.5 B
predicted: 1.9 B

BI-ZMA
real: 2.0 C
predicted: 2.2 C

BI-PA2
real: 4.0 F
predicted: 2.4 C

BI-DBS
real: 1.5 B
predicted: 1.4 A

BI-SAP
real: 1.5 B
predicted: 1.3 A

BI-LIN
real: 2.5 D
predicted: 2.6 D

BI-AG1
real: 4.0 F
predicted: 4.0 F

BI-AAG
real: 4.0 F
predicted: 4.0 F

BI-ZDM
real: 4.0 F
predicted: 4.0 F

BI-OSY
real: nan F
predicted: 1.8333333333333333 B

BI-PSI
real: nan F
predicted: 2.0 C

BI-BEZ
real: nan F
predicted: 1.5 B

BI-PST
real: nan F
predicted: 2.0833333333333335 C

BI-DPR
real: nan F
predicted: 1.5 B

BI-SI1.2
real: nan F
predicted: 1.3333333333333333 A

BI-EMP
real: nan F
predicted: 1.5833333333333333 B

[4mspravne = 7 z 13 absolvovanych predmetu[0m

[1mcityblock[0m
BI-PA1
real: 2.0 C
predicted: 2.9 D

BI-PAI
real: 1.0 A
predicted: 1.4 A

BI-CAO
real: 1.0 A
predicted: 1.0 A

BI

In [8]:
predict(17937206, class_matrix_bak, 5, 5)

[1meuclidean[0m
BI-PA1
real: 2.0 C
predicted: 3.3 E

BI-PAI
real: 1.0 A
predicted: 1.6 B

BI-CAO
real: 1.0 A
predicted: 1.1 A

BI-PS1
real: 1.5 B
predicted: 2.0 C

BI-MLO
real: 1.5 B
predicted: 1.9 B

BI-ZMA
real: 2.0 C
predicted: 2.2 C

BI-PA2
real: 4.0 F
predicted: 2.4 C

BI-DBS
real: 1.5 B
predicted: 1.4 A

BI-SAP
real: 1.5 B
predicted: 1.3 A

BI-LIN
real: 2.5 D
predicted: 2.9 D

BI-AG1
real: 4.0 F
predicted: 4.0 F

BI-AAG
real: 4.0 F
predicted: 4.0 F

BI-ZDM
real: 4.0 F
predicted: 4.0 F

BI-OSY
real: nan F
predicted: 1.8333333333333333 B

BI-PSI
real: nan F
predicted: 2.0 C

BI-BEZ
real: nan F
predicted: 1.5 B

BI-PST
real: nan F
predicted: 2.0833333333333335 C

BI-DPR
real: nan F
predicted: 1.5 B

BI-SI1.2
real: nan F
predicted: 1.3333333333333333 A

BI-EMP
real: nan F
predicted: 1.5833333333333333 B

[4mspravne = 7 z 13 absolvovanych predmetu[0m

[1mcityblock[0m
BI-PA1
real: 2.0 C
predicted: 2.9 D

BI-PAI
real: 1.0 A
predicted: 1.4 A

BI-CAO
real: 1.0 A
predicted: 1.0 A

BI