# Recomendações básicas
Esse notebook almeja desenvolver um sistema de recomendação simples para disciplinas optativas.

## Processamento inicial dos dados

In [3]:
import pandas as pd

dados_formados_path = 'input/dados_formados.csv'
dados_formados = pd.read_csv(dados_formados_path, delimiter=';')
dados_formados.columns

Index(['ID_ANONIMO', 'CR', 'PERIODO', 'INGRESSO', 'CODIGO', 'ANO', 'PERIODO.1',
       'NOTA', 'CH', 'SITUACAOALUNO'],
      dtype='object')

In [4]:
dados_regulares_path = 'input/dados_regulares.csv'
dados_regulares = pd.read_csv(dados_regulares_path, delimiter=';')
dados_regulares.columns

Index(['ID_ANONIMO', 'CR', 'PERIODO', 'INGRESSO', 'CODIGO', 'ANO', 'PERIODO.1',
       'NOTA', 'CH', 'SITUACAOALUNO'],
      dtype='object')

In [5]:
dados_desistentes_path = 'input/dados_desistentes.csv'
dados_desistentes = pd.read_csv(dados_desistentes_path, delimiter=';')
dados_desistentes.columns

Index(['ID_ANONIMO', 'CR', 'PERIODO', 'INGRESSO', 'CODIGO', 'ANO', 'PERIODO.1',
       'NOTA', 'CH', 'SITUACAOALUNO'],
      dtype='object')

In [6]:
dados_trancados_path = 'input/dados_trancados.csv'
dados_trancados = pd.read_csv(dados_trancados_path, delimiter=';')
dados_trancados.columns

Index(['ID_ANONIMO', 'CR', 'PERIODO', 'INGRESSO', 'CODIGO', 'ANO', 'PERIODO.1',
       'NOTA', 'CH', 'SITUACAOALUNO'],
      dtype='object')

In [7]:
todos_dados = pd.concat([dados_desistentes, dados_regulares, dados_formados, dados_trancados])

In [8]:
def select_random_student(data : pd.DataFrame, max_tries : int, minimum_entry_year : int) -> int:
    id = 0
    tries = 0
    while id == 0 and tries < max_tries:
        try:
            id = data[[int(x[3].split('/')[2].split(' ')[0]) >= 14 for x in data.values]].sample()
            return int(id.ID_ANONIMO.iloc[0])
        except:
            id = 0
            tries += 1
    return 0

In [9]:
print(str(select_random_student(dados_formados, 100, 14)))

198543


In [10]:
print(str(select_random_student(dados_regulares, 100, 11)))

202702


In [11]:
dados_regulares[[int(x[3].split('/')[2].split(' ')[0]) >= 14 for x in dados_regulares.values]]

Unnamed: 0,ID_ANONIMO,CR,PERIODO,INGRESSO,CODIGO,ANO,PERIODO.1,NOTA,CH,SITUACAOALUNO
0,11357,06967,5,23/01/23,CSD21,2023,1,47,45,Regular
1,11357,06967,5,23/01/23,ES70P,2023,1,76,45,Regular
2,11357,06967,5,23/01/23,CSE20,2023,1,6,60,Regular
3,11357,06967,5,23/01/23,ES70G,2023,1,73,45,Regular
4,11357,06967,5,23/01/23,CSF20,2023,1,63,45,Regular
...,...,...,...,...,...,...,...,...,...,...
19877,261036,0695,2,09/08/23,ICSF13,2023,2,66,90,Regular
19878,261911,07143,1,17/08/23,ICSD21,2023,2,8,45,Regular
19879,261911,07143,1,17/08/23,ELEX10,2023,2,8,45,Regular
19880,261911,07143,1,17/08/23,ICSF13,2023,2,6,90,Regular


In [12]:
def get_all_data_from_student(student_id : int, data : pd.DataFrame):
    return data[[x == student_id for x in data.ID_ANONIMO]]
    pass

In [13]:
get_all_data_from_student(select_random_student(dados_formados, 100, 14), dados_formados)

Unnamed: 0,ID_ANONIMO,CR,PERIODO,INGRESSO,CODIGO,ANO,PERIODO.1,NOTA,CH,SITUACAOALUNO
24718,165431,08502,10,24/09/14,EL68E,2017,2,88,60,Formado
24719,165431,08502,10,24/09/14,FI64C,2016,1,94,60,Formado
24720,165431,08502,10,24/09/14,IF64C,2016,1,95,45,Formado
24721,165431,08502,10,24/09/14,EL65G,2016,1,89,60,Formado
24722,165431,08502,10,24/09/14,EL65A,2016,1,92,90,Formado
...,...,...,...,...,...,...,...,...,...,...
24838,165431,08502,10,24/09/14,CSB51,2018,2,10,45,Formado
24839,165431,08502,10,24/09/14,EEB31,2017,1,92,90,Formado
24840,165431,08502,10,24/09/14,CSX42,2018,1,9,30,Formado
24841,165431,08502,10,24/09/14,CSI41,2018,1,89,45,Formado


## Preparo dos dados

Queremos recomendar disciplinas para estudantes com base em sua performance nas disciplinas prévias. Para tal, precisamos converter os dados do DataFrame para um vetor de _features_. Escolhemos para esse teste simples utilizar a nota do estudante como valor a ser analisado. A nota utilizada será a menor entre todas as vezes que a disciplina foi cursada.

| Curso | Nota | Ultima data ((Ano - 2000)*2 + periodo)|
| -- | -- | -- |
| FI73A | 6 |  20 (2009 - 2o periodo) |
| EL68A | -1 ( nao fez ) |  -1 ( nao fez ) |
| GE60C | 4 (passou, mas reprovou uma vez com 4) |  20 (2009 - 2o periodo) |

In [14]:
from typing import List

def get_user_array(student_id : int, data : pd.DataFrame):
    student_data = get_all_data_from_student(student_id, data)
    student_array : List[ List[string], List[float], List[int] ] = [[],[],[]]
    for course in student_data.values:
        if [course[4]] not in student_array[0]:
            #print([course[4], float(str(course[7]).replace(',','.')),(course[5] - 2000)*2 + course[6]])
            student_array[0].append(course[4])
            student_array[1].append(float(str(course[7]).replace(',','.')))
            student_array[2].append((course[5] - 2000)*2 + course[6])
        else:
            index = student_array[0].index(course[4])
            nota = float(str(course[7]).replace(',','.'))
            student_array[1] = nota if nota < student_array[1][index] else student_array[1][index]
    
    return student_array

In [15]:
print(get_user_array(93846, dados_formados))

[['EL66C', 'IF6AE', 'IF6AG', 'IF6AL', 'IF6AB', 'IF60A', 'ES60F', 'ES60G', 'IF69D', 'ES60A', 'EL62A', 'MA63B', 'MA61A', 'MA63A', 'MA65A', 'MA62A', 'IF60K', 'GE60B', 'IF61C', 'FI62A', 'FI66A', 'ES65A', 'MA61B', 'FI61A', 'IF63F', 'ES61A', 'QB62A', 'IF62J', 'IF66B', 'FI63A', 'EL68F', 'ENADE C', 'GE60D', 'EL65A', 'FI64C', 'IF64C', 'EL63B', 'IF63C', 'IF63E', 'ENADE I', 'ES60B', 'ES60D', 'IF66C', 'IF66D', 'IF65E', 'EL65H', 'IF65D', 'IF64J', 'IF65C', 'EL64H', 'EL64H', 'EL65G', 'IF67C', 'EL66D', 'IF66J', 'IF67B', 'IF67B', 'EL66H', 'EL66I', 'EL66G', 'EL66H', 'IF61B', 'IF60J', 'IF60B', 'EL68E', 'EL68G', 'IF68D', 'IF68E', 'IF67H', 'IF67D', 'EL68A', 'GE60C', 'IF62C', 'GE60A', 'QB60A'], [7.3, 10.0, 9.2, 9.0, 8.9, 8.8, 10.0, 7.8, 6.1, 10.0, 10.0, 9.9, 7.0, 5.2, 8.7, 7.5, 0.0, 8.1, 9.3, 7.3, 8.1, 9.1, 7.2, 8.9, 8.3, 8.6, 7.7, 9.0, 7.8, 7.6, 8.2, nan, 6.5, 7.8, 7.5, 5.0, 5.7, 8.4, 6.9, 0.0, 10.0, 6.8, 8.4, 7.0, 8.6, 5.0, 10.0, 9.6, 6.3, 7.6, 4.0, 6.0, 8.7, 9.5, 7.7, 9.0, 0.0, 4.6, 6.5, 5.1, 6.0, 7.8, 1

## Recomendações "ingênuas"

### Recomendando os mais populares

In [16]:
import numpy as np

disciplinas = []
with open('input/dependencias.txt') as f:
    for line in f:
        curr = line.split(';')
        if 'P8' in curr:
            curr = curr[1].split('\n')[0]
            disciplinas.append(curr)

disciplinas_obrigatorias = ['GE70D', 'EEC31', 'CSS30','EEX23']
disciplinas = np.array(disciplinas)
disciplinas = disciplinas[[disc not in disciplinas_obrigatorias for disc in disciplinas]]

dados_formados_path = 'input/dados_formados.csv'
dados_formados = pd.read_csv(dados_formados_path, delimiter=';')

dados_formados_optativas = dados_formados[[int(x[3].split('/')[2].split(' ')[0]) >= 14 for x in dados_formados.values]]
dados_formados_optativas = dados_formados_optativas[[float(str(x[7]).replace(',','.')) >= 6 for x in dados_formados_optativas.values]]
dados_formados_optativas = dados_formados_optativas[[x[4] != 'ES70N' or x[5] > 2017 for x in dados_formados_optativas.values]]
dados_formados_optativas = dados_formados_optativas[[x[4] != 'FI70D' or x[5] > 2017 for x in dados_formados_optativas.values]]
dados_formados_optativas = dados_formados_optativas[[x[4] != 'FI70A' or x[5] > 2017 for x in dados_formados_optativas.values]]
dados_formados_optativas = dados_formados_optativas[[x[4] != 'GE70F' or x[5] > 2017 for x in dados_formados_optativas.values]]
dados_formados_optativas = dados_formados_optativas[dados_formados_optativas.CODIGO.isin(disciplinas)]

In [17]:
from collections import Counter

popular_courses = Counter([materia for materia in dados_formados_optativas["CODIGO"]])

print(popular_courses)

Counter({'CSH30': 61, 'CSV30': 35, 'CSH42': 33, 'CSI53': 18, 'CSV40': 15, 'CSR41': 14, 'CSR44': 14, 'CSR42': 13, 'MA70C': 12, 'ED70T': 11, 'CSB51': 11, 'CSV45': 10, 'CSI41': 10, 'CSB41': 10, 'DI84D': 10, 'CSM41': 9, 'CSM43': 9, 'CSR43': 9, 'CSM44': 9, 'ES70J': 8, 'ES70B': 8, 'CSM40': 7, 'CSB53': 7, 'CSA44': 6, 'ES70N': 6, 'GE70F': 6, 'CSH44': 5, 'CSB54': 5, 'CSH43': 4, 'CSM30': 4, 'CSE40': 4, 'EEY41': 4, 'CSI56': 4, 'ED70U': 4, 'CSA45': 3, 'CSA42': 3, 'FI70D': 3, 'FI70B': 3, 'EL64B': 2, 'CSI58': 2, 'CSR53': 2, 'CSD41': 2, 'CSD40': 2, 'CSG42': 2, 'CSV52': 2, 'CSB52': 2, 'FI70A': 2, 'EL6AE': 1, 'CSI54': 1, 'CSW47': 1, 'CSD52': 1, 'CSR45': 1, 'EEY42': 1, 'CSH41': 1, 'EEY51': 1, 'EEL51': 1, 'CSR48': 1, 'CSI31': 1, 'IF6BV': 1, 'EL6CB': 1, 'CSG44': 1, 'FI70E': 1, 'CSA43': 1, 'EEY43': 1, 'CSG48': 1, 'EEC41': 1, 'CSH45': 1, 'CSI55': 1, 'CSE43': 1, 'CSW45': 1, 'CSI51': 1, 'CSV41': 1, 'CSI57': 1, 'FCH7HB': 1, 'CSR47': 1, 'EEY44': 1, 'CSA41': 1, 'EL75H': 1})


In [18]:
from typing import List

# Recebe um estudante e sugere as disciplinas mais populares que ele não fez ainda
def sugestoes_populares(student_id : int, max_sugestoes : int, data : pd.DataFrame) -> List:
    suggestions = [interest for interest, _ in popular_courses.most_common() if interest not in get_all_data_from_student(student_id, data)["CODIGO"].unique()]
    return suggestions[:max_sugestoes]

In [19]:
aluno = (select_random_student(dados_formados, 100, 14))
print(aluno)

179397


In [20]:
print(sugestoes_populares(aluno, 5, dados_formados))

['CSV30', 'CSI53', 'CSV40', 'CSR42', 'MA70C']


In [21]:
[x for x in get_all_data_from_student(aluno, dados_formados)["CODIGO"] if x in dados_formados_optativas["CODIGO"].unique()]

['CSH42', 'CSR41', 'CSH30', 'CSR44', 'CSA42', 'ED70T']

In [22]:
disciplinas_optativas = dados_formados_optativas["CODIGO"].unique()
print(disciplinas_optativas)

['CSM41' 'CSH43' 'EL64B' 'MA70C' 'CSM43' 'CSH30' 'EL6AE' 'CSI58' 'CSR53'
 'CSR41' 'CSI54' 'CSW47' 'CSV30' 'CSH42' 'CSM40' 'CSD41' 'CSV45' 'CSM30'
 'CSE40' 'EEY41' 'CSR43' 'CSV40' 'CSR44' 'CSI53' 'CSI41' 'CSA44' 'CSA45'
 'CSB41' 'CSI56' 'CSD52' 'CSR45' 'CSD40' 'ES70J' 'ES70B' 'EEY42' 'DI84D'
 'CSM44' 'CSH41' 'ED70T' 'CSR42' 'CSG42' 'ED70U' 'CSA42' 'CSB51' 'EEY51'
 'EEL51' 'CSB53' 'CSR48' 'ES70N' 'CSH44' 'CSB54' 'CSI31' 'IF6BV' 'GE70F'
 'FI70D' 'EL6CB' 'CSG44' 'FI70E' 'FI70B' 'CSA43' 'EEY43' 'CSG48' 'EEC41'
 'CSH45' 'CSV52' 'CSB52' 'CSI55' 'CSE43' 'CSW45' 'CSI51' 'CSV41' 'CSI57'
 'FI70A' 'FCH7HB' 'CSR47' 'EEY44' 'CSA41' 'EL75H']


## Outros sistemas de recomendação

In [23]:
todas_disciplinas = todos_dados["CODIGO"].unique()
print(todas_disciplinas)

['ES70R' 'FI71M' 'IF61C' 'MA71B' 'IF61B' 'MA71A' 'MA61A' 'FI61A' 'ES61A'
 'MA61B' 'CSD20' 'CSF13' 'ENADE I' 'IF66J' 'FI62A' 'FI63A' 'MA63A' 'EL63B'
 'EL65A' 'IF64J' 'MA63C' 'ET77B' 'ET75E' 'QB70D' 'ET75F' 'QB70E' 'QB60A'
 'MA65A' 'IF62J' 'MA62A' 'EL64H' 'EL66C' 'EL66H' 'EL66D' 'EL68G' 'IF63C'
 'EL65G' 'IF65C' 'EL66I' 'IF62C' 'EL62A' 'ES60A' 'IF64C' 'IF65E' 'IF67D'
 'MA64A' 'FI64A' 'IF60A' 'IF63F' 'EL65H' 'FI64C' 'QB62A' 'FI73A' 'FI72N'
 'MA70G' 'MA70H' 'IF63E' 'IF63O' 'MA72A' 'ES60F' 'ES70N' 'FI72M' 'MA73A'
 'ES70L' 'EL62O' 'IF66B' 'GE60D' 'IF65D' 'IF67B' 'IF6AG' 'GE60C' 'ES60G'
 'GE60B' 'MA63B' 'FI66A' 'IF66C' 'ET30N' 'ET38N' 'EL66G' 'ES65A' 'GE60A'
 'ES70G' 'EEF21' 'CSD21' 'CSF20' 'CSG20' 'QB70C' 'EEX11' 'FI72S' 'FI70A'
 'FI71S' 'EEF11' 'CSE20' 'EEX21' 'CSH30' 'DI64G' 'GE60G' 'EL66K' 'EL64O'
 'QB65A' 'IF66D' 'IF68E' 'ES70Q' 'ME37K' 'IF67H' 'EL68E' 'IF6AB' 'IF6AE'
 'ES70H' 'ES60B' 'EL68F' 'IF68D' 'ES70B' 'IF69D' 'EL68A' 'IF67C' 'ES60D'
 'FI62B' 'CE62A' 'ED60E' 'ED60D' 'ED60C' 'ED60B' 

### Filtragem colaborativa baseada no Usuário
Utilizando semelhança de cossenos, sem considerar as notas

In [24]:
# Vetor de interesses por presença
def make_user_interest_vector(user_interests : List[str]) -> List[int]:
    return [1 if interest in user_interests else 0 for interest in todas_disciplinas]

all_student_ids = dados_formados["ID_ANONIMO"].unique()

vetor_interesses_formados = {user_id: make_user_interest_vector(get_all_data_from_student(user_id, dados_formados)["CODIGO"].values) 
                             for user_id in all_student_ids}

from numpy import dot
from numpy.linalg import norm

def cos_sim(a : List[int], b : List[int]) -> int:
    return dot(a, np.transpose(b))/(norm(a)*norm(b))

user_similarities = {user_id: {other_user_id: cos_sim(interest_vector_i, interest_vector_j) 
                          for other_user_id, interest_vector_j in vetor_interesses_formados.items()}
                          for user_id, interest_vector_i in vetor_interesses_formados.items()}

def most_similar_students_to(user_id : int):
    pairs = [(other_user_id, similarity) for
                other_user_id, similarity in 
                    user_similarities[user_id].items()
                if user_id != other_user_id and similarity > 0]
    return sorted(pairs, key = lambda pair: pair[-1], reverse = True)

In [25]:
# todos os dados

# Vetor de interesses por presença
def make_user_interest_vector_alldata(user_interests : List[str]) -> List[int]:
    return [1 if interest in user_interests else 0 for interest in todas_disciplinas]

all_student_ids = todos_dados["ID_ANONIMO"].unique()

from numpy import dot
from numpy.linalg import norm

def cos_sim(a : List[int], b : List[int]) -> int:
    try:
        return dot(a, np.transpose(b))/(norm(a)*norm(b))
    except:
        return 0

def most_similar_students_to_alldata(user_id : int):
    vetor_interesse = make_user_interest_vector(get_all_data_from_student(user_id, todos_dados)["CODIGO"].values)
    this_user_similarities = {other_user_id: cos_sim(vetor_interesse, interest_vector_j) 
                          for other_user_id, interest_vector_j in vetor_interesses_formados.items()}
    
    pairs = [(other_user_id, similarity) for
                other_user_id, similarity in 
                    this_user_similarities.items()
                if user_id != other_user_id and similarity > 0]
    return sorted(pairs, key = lambda pair: pair[-1], reverse = True)

In [26]:
from collections import defaultdict

def user_based_suggestions_alldata(user_id: int, include_user_interests: bool = False, only_optionals = True):
    # Some as semelhanças
    suggestions : Dict[str, float] = defaultdict(float)

    for other_user_id, similarity in most_similar_students_to_alldata(user_id):
        for interest in get_all_data_from_student(other_user_id, dados_formados)["CODIGO"].values:
            suggestions[interest] += similarity
    # Converta em uma lista classificada 
    suggestions = sorted(suggestions.items(),
                        key = lambda pair: pair[-1],
                        reverse = True)

    # Exclua nao optativas
    if only_optionals:
        suggestions = [(suggestion, weight)
                for suggestion, weight in suggestions
                if suggestion in disciplinas_optativas]
    
    # Exclua interesses existentes
    if include_user_interests:
        return suggestions
    else:
        return [(suggestion, weight)
                for suggestion, weight in suggestions
                if suggestion not in get_all_data_from_student(user_id, todos_dados)["CODIGO"].values]

In [27]:
user_based_suggestions_alldata(160791)

[('CSH30', 63.52194127518745),
 ('ES70N', 56.05199336319832),
 ('FI70D', 49.53789046887516),
 ('FI70A', 47.21253903215579),
 ('CSH42', 34.298148168933764),
 ('GE70F', 31.623626965825242),
 ('CSV30', 28.463915359960588),
 ('CSR41', 24.148154615180694),
 ('CSR44', 21.650917252324355),
 ('CSI53', 16.767879727948134),
 ('ES70B', 14.93048134375213),
 ('MA70C', 13.803869211466031),
 ('CSV40', 13.743085450198617),
 ('CSR42', 13.481809028302806),
 ('CSI41', 12.902991559611655),
 ('CSB51', 11.637744913529925),
 ('CSM43', 9.567134026407807),
 ('CSB41', 9.21874776786959),
 ('ED70T', 8.92712235223443),
 ('CSV45', 7.997799648274994),
 ('CSR43', 7.456736286000127),
 ('DI84D', 7.319231105601114),
 ('CSM44', 6.711587485350914),
 ('CSB53', 6.650441729929046),
 ('ES70J', 6.643092405237685),
 ('CSE40', 6.609737780877982),
 ('CSM40', 6.051782132222358),
 ('CSI56', 5.169469564418043),
 ('CSH44', 5.168608945697874),
 ('CSA44', 4.8404120908212445),
 ('CSD41', 4.77842634349457),
 ('CSA42', 4.261808863110351),

In [28]:
get_all_data_from_student(93846, dados_formados)["CODIGO"].values

array(['EL66C', 'IF6AE', 'IF6AG', 'IF6AL', 'IF6AB', 'IF60A', 'ES60F',
       'ES60G', 'IF69D', 'ES60A', 'EL62A', 'MA63B', 'MA61A', 'MA63A',
       'MA65A', 'MA62A', 'IF60K', 'GE60B', 'IF61C', 'FI62A', 'FI66A',
       'ES65A', 'MA61B', 'FI61A', 'IF63F', 'ES61A', 'QB62A', 'IF62J',
       'IF66B', 'FI63A', 'EL68F', 'ENADE C', 'GE60D', 'EL65A', 'FI64C',
       'IF64C', 'EL63B', 'IF63C', 'IF63E', 'ENADE I', 'ES60B', 'ES60D',
       'IF66C', 'IF66D', 'IF65E', 'EL65H', 'IF65D', 'IF64J', 'IF65C',
       'EL64H', 'EL64H', 'EL65G', 'IF67C', 'EL66D', 'IF66J', 'IF67B',
       'IF67B', 'EL66H', 'EL66I', 'EL66G', 'EL66H', 'IF61B', 'IF60J',
       'IF60B', 'EL68E', 'EL68G', 'IF68D', 'IF68E', 'IF67H', 'IF67D',
       'EL68A', 'GE60C', 'IF62C', 'GE60A', 'QB60A'], dtype=object)

In [29]:
print(all_student_ids)

[  7034.  10262.  10721. ... 255641. 255670.     nan]


In [30]:
print(vetor_interesses_formados)

{13236: [0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [31]:
print(user_similarities)

{13236: {13236: 1.0, 13976: 0.8535204983803663, 30701: 0.8751130125933031, 51624: 0.5938923908220626, 51868: 0.8388668159944593, 54937: 0.8679018950386385, 54968: 0.8545495581918902, 55055: 0.8075936301394427, 55726: 0.8883723006628986, 65057: 0.8618537245237077, 65450: 0.8388668159944593, 65496: 0.8751130125933031, 65541: 0.8606318382097687, 65608: 0.8632061582884765, 65621: 0.8751130125933031, 74306: 0.797921501227884, 75585: 0.8751130125933031, 81780: 0.8679018950386385, 83464: 0.8176262338182698, 83987: 0.8227848101265823, 84004: 0.8718665271495971, 84041: 0.8812542318853867, 84050: 0.8632061582884765, 84061: 0.883414931347818, 88923: 0.869098418897731, 89035: 0.8444407432001202, 90291: 0.8606318382097687, 90294: 0.8679018950386385, 90296: 0.8812542318853867, 90298: 0.8939348005464146, 90299: 0.8618537245237077, 90302: 0.8679018950386385, 90686: 0.3969547986585982, 90729: 0.517054715806478, 90741: 0.8176262338182698, 90758: 0.8176262338182698, 90787: 0.7583301736308593, 90797: 0.38

In [32]:
most_similar_students_to(150892)

[(198372, 0.9474504407420671),
 (150878, 0.9413574486632832),
 (150887, 0.9271726499455306),
 (156158, 0.9190028346233009),
 (198543, 0.9021342216356464),
 (191284, 0.8993491139867924),
 (202261, 0.8725028717782315),
 (145077, 0.8684962373468949),
 (190495, 0.8684962373468949),
 (190499, 0.8684962373468949),
 (190508, 0.8684962373468949),
 (190494, 0.8666666666666665),
 (190493, 0.8629109946080096),
 (190615, 0.8573214099741122),
 (150873, 0.8553372034476995),
 (179408, 0.8541985556144385),
 (151217, 0.8533333333333332),
 (179385, 0.8391405500541003),
 (186024, 0.8367621765895851),
 (189048, 0.8365160106923107),
 (161216, 0.8324187420999798),
 (179652, 0.8314493471508876),
 (182856, 0.8314493471508876),
 (186036, 0.8314493471508876),
 (179823, 0.8294391771908315),
 (180984, 0.8266160642323973),
 (182913, 0.8266160642323973),
 (172248, 0.8261843893231644),
 (179944, 0.8243996016473729),
 (186021, 0.8211203828474677),
 (183413, 0.817059487979028),
 (179397, 0.812403840463596),
 (175962, 

In [33]:
from collections import defaultdict

def user_based_suggestions(user_id: int, include_user_interests: bool = False, only_optionals = True):
    # Some as semelhanças
    suggestions : Dict[str, float] = defaultdict(float)

    for other_user_id, similarity in most_similar_students_to(user_id):
        for interest in get_all_data_from_student(other_user_id, dados_formados)["CODIGO"].values:
            suggestions[interest] += similarity
    # Converta em uma lista classificada 
    suggestions = sorted(suggestions.items(),
                        key = lambda pair: pair[-1],
                        reverse = True)

    # Exclua nao optativas
    if only_optionals:
        suggestions = [(suggestion, weight)
                for suggestion, weight in suggestions
                if suggestion in disciplinas_optativas]
    
    # Exclua interesses existentes
    if include_user_interests:
        return suggestions
    else:
        return [(suggestion, weight)
                for suggestion, weight in suggestions
                if suggestion not in get_all_data_from_student(user_id, dados_formados)["CODIGO"].values]

In [34]:
user_based_suggestions(190615, True)

[('CSH30', 71.10393286968205),
 ('ES70N', 66.54575892495151),
 ('FI70D', 58.36151176720843),
 ('FI70A', 53.141909298993355),
 ('GE70F', 39.60964896604904),
 ('CSH42', 38.2641258810539),
 ('CSV30', 31.282610533574488),
 ('CSR41', 27.18163789104994),
 ('CSR44', 24.208328296652528),
 ('ES70B', 19.827666291777568),
 ('CSI53', 19.032878821989634),
 ('MA70C', 15.929046462727413),
 ('CSV40', 15.596328804976846),
 ('CSR42', 15.337820138842813),
 ('CSI41', 14.68080302467024),
 ('CSM41', 13.565190689051633),
 ('CSB51', 12.475648527194041),
 ('CSM43', 10.537808886993409),
 ('CSB41', 10.357773989290752),
 ('ED70T', 10.188497219128886),
 ('CSV45', 9.193282231685137),
 ('CSR43', 8.478633720890016),
 ('DI84D', 8.17053554523468),
 ('ES70J', 7.828547077495909),
 ('CSM44', 7.482005172575583),
 ('CSE40', 7.472295959603473),
 ('CSB53', 6.815560925035174),
 ('CSM40', 6.699823571829504),
 ('CSA44', 6.3138987929792725),
 ('CSI56', 5.998363864009491),
 ('CSH44', 5.685541411353794),
 ('CSD41', 5.35451185213113

## Filtragem colaborativa por usuário com notas

In [35]:
import math

# Vetor de interesses por presença
def make_user_interest_vector_notas(user_interests) -> List[int]:
    return [user_interests[1][user_interests[0].index(interest)] 
            if interest in user_interests[0] 
            and not math.isnan(user_interests[1][user_interests[0].index(interest)])
            else 0 for interest in todas_disciplinas]

all_student_ids = dados_formados["ID_ANONIMO"].unique()

all_student_ids = [x for x in all_student_ids if x]

vetor_interesses_formados_notas = {user_id: make_user_interest_vector_notas(get_user_array(user_id, dados_formados)) 
                             for user_id in all_student_ids}

user_similarities_notas = {user_id: {other_user_id: cos_sim(interest_vector_i, interest_vector_j) 
                          for other_user_id, interest_vector_j in vetor_interesses_formados_notas.items()}
                          for user_id, interest_vector_i in vetor_interesses_formados_notas.items()}

def most_similar_students_to_notas(user_id : int):
    pairs = [(other_user_id, similarity) for
                other_user_id, similarity in 
                    user_similarities_notas[user_id].items()
                if user_id != other_user_id and similarity > 0]
    return sorted(pairs, key = lambda pair: pair[-1], reverse = True)

from collections import defaultdict

def user_based_suggestions_notas(user_id: int, include_user_interests: bool = False, only_optionals = True):
    # Some as semelhanças
    suggestions : Dict[str, float] = defaultdict(float)

    for other_user_id, similarity in most_similar_students_to_notas(user_id):
        for interest in get_all_data_from_student(other_user_id, dados_formados)["CODIGO"].values:
            suggestions[interest] += similarity
    # Converta em uma lista classificada 
    suggestions = sorted(suggestions.items(),
                        key = lambda pair: pair[-1],
                        reverse = True)

    # Exclua nao optativas
    if only_optionals:
        suggestions = [(suggestion, weight)
                for suggestion, weight in suggestions
                if suggestion in disciplinas_optativas]
    
    # Exclua interesses existentes
    if include_user_interests:
        return suggestions
    else:
        return [(suggestion, weight)
                for suggestion, weight in suggestions
                if suggestion not in get_all_data_from_student(user_id, dados_formados)["CODIGO"].values]

In [36]:
user_based_suggestions_notas(190615, True)

[('CSH30', 67.64158091028911),
 ('ES70N', 61.358675760167166),
 ('FI70D', 54.28447488205224),
 ('FI70A', 50.520334135759256),
 ('CSH42', 36.397821472534034),
 ('GE70F', 35.67988558791086),
 ('CSV30', 29.902468487221988),
 ('CSR41', 25.996461486053793),
 ('CSR44', 23.034335502101946),
 ('CSI53', 18.044663769453344),
 ('ES70B', 17.037954309839666),
 ('CSV40', 14.975961035925291),
 ('MA70C', 14.844693935739683),
 ('CSR42', 14.53692992767101),
 ('CSI41', 13.860327133338707),
 ('CSM41', 12.71794411127994),
 ('CSB51', 11.92805625850115),
 ('CSM43', 10.003769045246283),
 ('CSB41', 9.95577033380634),
 ('ED70T', 9.634271000557494),
 ('CSV45', 8.664185339873752),
 ('CSR43', 8.046098424389193),
 ('DI84D', 7.7839457578429965),
 ('ES70J', 7.173572528358359),
 ('CSM44', 7.142471390162717),
 ('CSE40', 6.994898859835831),
 ('CSB53', 6.515511685777597),
 ('CSM40', 6.463957769529654),
 ('CSA44', 5.64565047112234),
 ('CSI56', 5.604392660272002),
 ('CSH44', 5.45848805573023),
 ('CSD41', 5.086276414589184)

In [37]:
most_similar_students_to(13236)

[(114260, 0.9217710503460783),
 (101561, 0.9079589055788833),
 (109701, 0.9009739556258517),
 (109715, 0.894606568732135),
 (90298, 0.8939348005464146),
 (116513, 0.8917378341287409),
 (55726, 0.8883723006628986),
 (104800, 0.8883723006628986),
 (104766, 0.887526583153824),
 (84061, 0.883414931347818),
 (97274, 0.8822665767598178),
 (84041, 0.8812542318853867),
 (90296, 0.8812542318853867),
 (109705, 0.8812542318853867),
 (120886, 0.8775837459634345),
 (133884, 0.8775837459634345),
 (104767, 0.8762850394746655),
 (113453, 0.8762850394746655),
 (30701, 0.8751130125933031),
 (65496, 0.8751130125933031),
 (65621, 0.8751130125933031),
 (75585, 0.8751130125933031),
 (93845, 0.8751130125933031),
 (104783, 0.8751130125933031),
 (109731, 0.8751130125933031),
 (114263, 0.8751130125933031),
 (92511, 0.8740792106817964),
 (101564, 0.8740792106817964),
 (104771, 0.8740792106817964),
 (84004, 0.8718665271495971),
 (101575, 0.8704235352985854),
 (109697, 0.8704235352985854),
 (109702, 0.870423535298

## Cosseno centralizado - _adjusted cosine similarity_
É um jeito de reduzir o efeito da média de um usuário ser diferente da média de outro, efetivamente ignorando o conceito de um "bom aluno". Para identificar afinidades, talvez seja útil.

O vetor é calculado subtraindo a média dos valores de cada um dos valores.

In [38]:
vetor_interesses_formados_notas

{13236: [0,
  0,
  9.6,
  0,
  7.0,
  0,
  8.8,
  7.0,
  9.2,
  7.2,
  0,
  0,
  0,
  8.0,
  7.0,
  8.2,
  5.2,
  5.6,
  9.3,
  8.7,
  8.6,
  0,
  0,
  0,
  0,
  8.5,
  8.5,
  7.0,
  7.0,
  7.5,
  8.5,
  10.0,
  7.2,
  9.5,
  8.3,
  8.7,
  6.0,
  7.8,
  9.5,
  8.6,
  9.0,
  9.5,
  6.0,
  8.0,
  9.5,
  2.2,
  0,
  6.0,
  8.3,
  5.0,
  7.8,
  6.9,
  0,
  0,
  0,
  0,
  8.0,
  0,
  0,
  7.2,
  0,
  0,
  0,
  0,
  0,
  7.6,
  6.8,
  8.2,
  8.8,
  0,
  9.5,
  8.0,
  7.7,
  5.0,
  7.0,
  8.1,
  0,
  0,
  6.0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  8.1,
  8.2,
  0,
  0,
  8.3,
  6.9,
  0,
  0,
  0,
  0,
  9.2,
  6.8,
  0,
  6.0,
  10.0,
  6.5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  9.8,
  0,
  0,
  0,
  0,
  0,
  9.3,
  0.0,
  0,
  0,
  3.0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [58]:
import math


def most_similar_students_to_notas_ajustadas(user_id : int):
    # Para cada item do vetor, subtraia a media de todos os itens
    def get_nonzero_len(lista : List):
        length = 0
        for x in lista:
            if x:
                length += 1
        return length

    vetor_medias = {user_id: sum(vetor_interesses_formados_notas[user_id])/get_nonzero_len(vetor_interesses_formados_notas[user_id]) for user_id in vetor_interesses_formados_notas.keys()}

    def get_vetor_ajustado(user_id : int):
        return [x - vetor_medias[user_id] if x else 0 for x in vetor_interesses_formados_notas[user_id]]

    vetor_interesses_formados_notas_ajustadas = {user_id: get_vetor_ajustado(user_id) for user_id in vetor_interesses_formados_notas.keys()}
    
    user_similarities_notas_ajustadas = {user_id: {other_user_id: cos_sim(interest_vector_i, interest_vector_j) 
                          for other_user_id, interest_vector_j in vetor_interesses_formados_notas_ajustadas.items()}
                          for user_id, interest_vector_i in vetor_interesses_formados_notas_ajustadas.items()}
    
    pairs = [(other_user_id, similarity) for
                other_user_id, similarity in 
                    user_similarities_notas_ajustadas[user_id].items()
                if user_id != other_user_id and similarity > 0]
    return sorted(pairs, key = lambda pair: pair[-1], reverse = True)

from collections import defaultdict

def user_based_suggestions_notas_ajustadas(user_id: int, include_user_interests: bool = False, only_optionals = True):
    # Some as semelhanças
    suggestions : Dict[str, float] = defaultdict(float)

    for other_user_id, similarity in most_similar_students_to_notas_ajustadas(user_id):
        for interest in get_all_data_from_student(other_user_id, todos_dados)["CODIGO"].values:
            suggestions[interest] += similarity
    # Converta em uma lista classificada 
    suggestions = sorted(suggestions.items(),
                        key = lambda pair: pair[-1],
                        reverse = True)

    # Exclua nao optativas
    if only_optionals:
        suggestions = [(suggestion, weight)
                for suggestion, weight in suggestions
                if suggestion in disciplinas_optativas]
    
    # Exclua interesses existentes
    if include_user_interests:
        return suggestions
    else:
        return [(suggestion, weight)
                for suggestion, weight in suggestions
                if suggestion not in get_all_data_from_student(user_id, todos_dados)["CODIGO"].values]

In [54]:
user_based_suggestions_notas_ajustadas(190615)

[('CSH30', 5.729383166467709),
 ('ES70N', 5.284659254330637),
 ('FI70A', 3.8096259906757055),
 ('CSH42', 3.5864357706526477),
 ('FI70D', 3.488709581152746),
 ('GE70F', 2.9238448252866736),
 ('CSI53', 2.19719430417584),
 ('CSR44', 2.0349771572577056),
 ('CSR41', 1.8696139309160364),
 ('CSV40', 1.4012470248249602),
 ('ES70B', 1.399565450025096),
 ('CSR42', 1.2872311046080767),
 ('ED70T', 1.2851696741833187),
 ('MA70C', 1.2218162727025381),
 ('CSV45', 1.2075705529394225),
 ('CSB41', 1.1596463903849394),
 ('CSI41', 1.0596896829406237),
 ('CSM41', 0.9807725662945548),
 ('CSM40', 0.9603340824940869),
 ('DI84D', 0.8939059476573137),
 ('CSM44', 0.879757680118058),
 ('ES70J', 0.7823564846091785),
 ('CSM43', 0.6744517695233699),
 ('CSB54', 0.634987827580997),
 ('CSI56', 0.6254594921923113),
 ('CSR43', 0.5496055013160805),
 ('CSD41', 0.5228443063342751),
 ('CSE40', 0.5167281123504964),
 ('CSH43', 0.46646678406139436),
 ('CSA44', 0.44018002082641855),
 ('CSM30', 0.42855832837169117),
 ('CSH41', 0.

In [None]:

def most_similar_students_to_notas_ajustadas_alldata(user_id : int):
    vetor_interesse = make_user_interest_vector(get_all_data_from_student(user_id, todos_dados)["CODIGO"].values)
    this_user_similarities = {other_user_id: cos_sim(vetor_interesse, interest_vector_j) 
                          for other_user_id, interest_vector_j in vetor_interesses_formados.items()}
    
    pairs = [(other_user_id, similarity) for
                other_user_id, similarity in 
                    this_user_similarities.items()
                if user_id != other_user_id and similarity > 0]
    return sorted(pairs, key = lambda pair: pair[-1], reverse = True)

def user_based_suggestions_notas_ajustadas_alldata(user_id: int, include_user_interests: bool = False, only_optionals = True):
    # Some as semelhanças
    suggestions : Dict[str, float] = defaultdict(float)

    for other_user_id, similarity in most_similar_students_to_notas_ajustadas(user_id):
        for interest in get_all_data_from_student(other_user_id, todos_dados)["CODIGO"].values:
            suggestions[interest] += similarity
    # Converta em uma lista classificada 
    suggestions = sorted(suggestions.items(),
                        key = lambda pair: pair[-1],
                        reverse = True)

    # Exclua nao optativas
    if only_optionals:
        suggestions = [(suggestion, weight)
                for suggestion, weight in suggestions
                if suggestion in disciplinas_optativas]
    
    # Exclua interesses existentes
    if include_user_interests:
        return suggestions
    else:
        return [(suggestion, weight)
                for suggestion, weight in suggestions
                if suggestion not in get_all_data_from_student(user_id, todos_dados)["CODIGO"].values]

In [45]:
import math

from collections import defaultdict

def user_based_suggestions_notas_ajustadas_balanceadas(user_id: int, include_user_interests: bool = False, only_optionals = True):
    # Some as semelhanças
    suggestions : Dict[str, float] = defaultdict(float)
    suggestion_weight : Dict[str, float] = defaultdict(float)

    for other_user_id, similarity in most_similar_students_to_notas_ajustadas(user_id):
        for interest in get_all_data_from_student(other_user_id, todos_dados)["CODIGO"].values:
            suggestions[interest] += similarity
            suggestion_weight[interest] += 1
            
            
    for key in suggestions.keys():
        suggestions[key] = suggestions[key]/suggestion_weight[key]
        
    # Converta em uma lista classificada 
    suggestions = sorted(suggestions.items(),
                        key = lambda pair: pair[-1],
                        reverse = True)

    
    # Exclua nao optativas
    if only_optionals:
        suggestions = [(suggestion, weight)
                for suggestion, weight in suggestions
                if suggestion in disciplinas_optativas]
    
    # Exclua interesses existentes
    if include_user_interests:
        return suggestions
    else:
        return [(suggestion, weight)
                for suggestion, weight in suggestions
                if suggestion not in get_all_data_from_student(user_id, todos_dados)["CODIGO"].values]

In [57]:
user_based_suggestions_notas_ajustadas_balanceadas(190615,True)

[('CSI31', 0.2219069201469048),
 ('CSV52', 0.1955791720657975),
 ('CSB54', 0.15874695689524926),
 ('CSA45', 0.13904977354801837),
 ('CSB52', 0.133400680923636),
 ('CSB53', 0.12851964970103874),
 ('CSA42', 0.12840761747566826),
 ('CSM44', 0.125679668588294),
 ('CSI55', 0.12261780864226737),
 ('CSE43', 0.12261780864226737),
 ('CSI51', 0.12261780864226737),
 ('CSV41', 0.12261780864226737),
 ('CSG42', 0.12171675496544716),
 ('CSM40', 0.12004176031176086),
 ('FI70B', 0.11727998921367412),
 ('DI84D', 0.11173824345716421),
 ('CSB41', 0.10542239912590358),
 ('CSV30', 0.10335452008558571),
 ('CSI53', 0.0998724683716291),
 ('CSM43', 0.09635025278905285),
 ('CSR48', 0.09367335578581848),
 ('CSV40', 0.09341646832166402),
 ('CSH43', 0.09329335681227888),
 ('CSV45', 0.09289004253380173),
 ('ED70T', 0.09179783387023706),
 ('CSR44', 0.08847726770685677),
 ('CSR53', 0.08778708569588181),
 ('CSH42', 0.08747404318664995),
 ('CSD41', 0.08714071772237919),
 ('EL64B', 0.08208616318453331),
 ('CSH30', 0.0806

In [51]:
print([data for data in get_all_data_from_student(190615, todos_dados)['CODIGO'] if data in disciplinas_optativas])

['CSA41', 'CSV30', 'CSB53', 'CSA42', 'CSB51', 'CSA45']
