# Recomendações básicas
Esse notebook almeja desenvolver um sistema de recomendação simples para disciplinas optativas.

## Processamento inicial dos dados

In [1]:
import pandas as pd

dados_formados_path = 'input/dados_formados.csv'
dados_formados = pd.read_csv(dados_formados_path, delimiter=';')
dados_formados.columns

Index(['ID_ANONIMO', 'CR', 'PERIODO', 'INGRESSO', 'CODIGO', 'ANO', 'PERIODO.1',
       'NOTA', 'CH', 'SITUACAOALUNO'],
      dtype='object')

In [2]:
dados_regulares_path = 'input/dados_regulares.csv'
dados_regulares = pd.read_csv(dados_regulares_path, delimiter=';')
dados_regulares.columns

Index(['ID_ANONIMO', 'CR', 'PERIODO', 'INGRESSO', 'CODIGO', 'ANO', 'PERIODO.1',
       'NOTA', 'CH', 'SITUACAOALUNO'],
      dtype='object')

In [21]:
dados_desistentes_path = 'input/dados_desistentes.csv'
dados_desistentes = pd.read_csv(dados_desistentes_path, delimiter=';')
dados_desistentes.columns

Index(['ID_ANONIMO', 'CR', 'PERIODO', 'INGRESSO', 'CODIGO', 'ANO', 'PERIODO.1',
       'NOTA', 'CH', 'SITUACAOALUNO'],
      dtype='object')

In [22]:
dados_trancados_path = 'input/dados_trancados.csv'
dados_trancados = pd.read_csv(dados_trancados_path, delimiter=';')
dados_trancados.columns

Index(['ID_ANONIMO', 'CR', 'PERIODO', 'INGRESSO', 'CODIGO', 'ANO', 'PERIODO.1',
       'NOTA', 'CH', 'SITUACAOALUNO'],
      dtype='object')

In [24]:
todos_dados = pd.concat([dados_desistentes, dados_regulares, dados_formados, dados_trancados])

In [3]:
def select_random_student(data : pd.DataFrame, max_tries : int, minimum_entry_year : int):
    id = 0
    tries = 0
    while id == 0 and tries < max_tries:
        try:
            id = data[[int(x[3].split('/')[2].split(' ')[0]) >= 14 for x in data.values]].sample()
            return int(id.ID_ANONIMO.iloc[0])
        except:
            id = 0
            tries += 1
    return None

In [4]:
print(str(select_random_student(dados_formados, 100, 14)))

172974


In [5]:
print(str(select_random_student(dados_regulares, 100, 11)))

200458


In [6]:
dados_regulares[[int(x[3].split('/')[2].split(' ')[0]) >= 14 for x in dados_regulares.values]]

Unnamed: 0,ID_ANONIMO,CR,PERIODO,INGRESSO,CODIGO,ANO,PERIODO.1,NOTA,CH,SITUACAOALUNO
0,11357,06967,5,23/01/23,CSD21,2023,1,47,45,Regular
1,11357,06967,5,23/01/23,ES70P,2023,1,76,45,Regular
2,11357,06967,5,23/01/23,CSE20,2023,1,6,60,Regular
3,11357,06967,5,23/01/23,ES70G,2023,1,73,45,Regular
4,11357,06967,5,23/01/23,CSF20,2023,1,63,45,Regular
...,...,...,...,...,...,...,...,...,...,...
19877,261036,0695,2,09/08/23,ICSF13,2023,2,66,90,Regular
19878,261911,07143,1,17/08/23,ICSD21,2023,2,8,45,Regular
19879,261911,07143,1,17/08/23,ELEX10,2023,2,8,45,Regular
19880,261911,07143,1,17/08/23,ICSF13,2023,2,6,90,Regular


In [7]:
def get_all_data_from_student(student_id : int, data : pd.DataFrame):
    return data[[x == student_id for x in data.ID_ANONIMO]]
    pass

In [8]:
get_all_data_from_student(select_random_student(dados_formados, 100, 14), dados_formados)

Unnamed: 0,ID_ANONIMO,CR,PERIODO,INGRESSO,CODIGO,ANO,PERIODO.1,NOTA,CH,SITUACAOALUNO
27149,172248,07135,10,19/08/15,FI74M,2019,2,47,60,Formado
27150,172248,07135,10,19/08/15,EEQ31,2017,2,99,75,Formado
27151,172248,07135,10,19/08/15,CSM43,2022,2,6,60,Formado
27152,172248,07135,10,19/08/15,FI71S,2017,1,8,60,Formado
27153,172248,07135,10,19/08/15,EEE32,2019,2,61,60,Formado
...,...,...,...,...,...,...,...,...,...,...
27253,172248,07135,10,19/08/15,CSF13,2017,1,64,90,Formado
27254,172248,07135,10,19/08/15,FI73M,2017,2,61,60,Formado
27255,172248,07135,10,19/08/15,CSD21,2018,1,6,45,Formado
27256,172248,07135,10,19/08/15,QB70D,2016,1,6,90,Formado


## Preparo dos dados

Queremos recomendar disciplinas para estudantes com base em sua performance nas disciplinas prévias. Para tal, precisamos converter os dados do DataFrame para um vetor de _features_. Escolhemos para esse teste simples utilizar a nota do estudante como valor a ser analisado. A nota utilizada será a menor entre todas as vezes que a disciplina foi cursada.

| Curso | Nota | Ultima data ((Ano - 2000)*2 + periodo)|
| -- | -- | -- |
| FI73A | 6 |  20 (2009 - 2o periodo) |
| EL68A | -1 ( nao fez ) |  -1 ( nao fez ) |
| GE60C | 4 (passou, mas reprovou uma vez com 4) |  20 (2009 - 2o periodo) |

In [9]:
from typing import List

def get_user_array(student_id : int, data : pd.DataFrame):
    student_data = get_all_data_from_student(student_id, data)
    student_array : List[ List[string], List[float], List[int] ] = [[],[],[]]
    for course in student_data.values:
        if [course[4]] not in student_array[0]:
            print([course[4], float(str(course[7]).replace(',','.')),(course[5] - 2000)*2 + course[6]])
            student_array[0].append(course[4])
            student_array[1].append(float(str(course[7]).replace(',','.')))
            student_array[2].append((course[5] - 2000)*2 + course[6])
        else:
            index = student_array[0].index(course[4])
            nota = float(str(course[7]).replace(',','.'))
            student_array[1] = nota if nota < student_array[1][index] else student_array[1][index]
    
    pass

In [10]:
print(get_user_array(93846, dados_formados))

['EL66C', 7.3, 20]
['IF6AE', 10.0, 25]
['IF6AG', 9.2, 24]
['IF6AL', 9.0, 25]
['IF6AB', 8.9, 25]
['IF60A', 8.8, 24]
['ES60F', 10.0, 19]
['ES60G', 7.8, 20]
['IF69D', 6.1, 24]
['ES60A', 10.0, 18]
['EL62A', 10.0, 17]
['MA63B', 9.9, 18]
['MA61A', 7.0, 16]
['MA63A', 5.2, 18]
['MA65A', 8.7, 18]
['MA62A', 7.5, 17]
['IF60K', 0.0, 25]
['GE60B', 8.1, 22]
['IF61C', 9.3, 16]
['FI62A', 7.3, 17]
['FI66A', 8.1, 20]
['ES65A', 9.1, 24]
['MA61B', 7.2, 16]
['FI61A', 8.9, 16]
['IF63F', 8.3, 18]
['ES61A', 8.6, 16]
['QB62A', 7.7, 17]
['IF62J', 9.0, 17]
['IF66B', 7.8, 21]
['FI63A', 7.6, 17]
['EL68F', 8.2, 23]
['ENADE C', nan, 26]
['GE60D', 6.5, 21]
['EL65A', 7.8, 19]
['FI64C', 7.5, 19]
['IF64C', 5.0, 19]
['EL63B', 5.7, 18]
['IF63C', 8.4, 18]
['IF63E', 6.9, 18]
['ENADE I', 0.0, 18]
['ES60B', 10.0, 24]
['ES60D', 6.8, 25]
['IF66C', 8.4, 21]
['IF66D', 7.0, 21]
['IF65E', 8.6, 20]
['EL65H', 5.0, 20]
['IF65D', 10.0, 20]
['IF64J', 9.6, 19]
['IF65C', 6.3, 20]
['EL64H', 7.6, 20]
['EL64H', 4.0, 19]
['EL65G', 6.0, 19]
['

## Recomendações "ingênuas"

### Recomendando os mais populares

In [11]:
import numpy as np

disciplinas = []
with open('input/dependencias.txt') as f:
    for line in f:
        curr = line.split(';')
        if 'P8' in curr:
            curr = curr[1].split('\n')[0]
            disciplinas.append(curr)

disciplinas_obrigatorias = ['GE70D', 'EEC31', 'CSS30','EEX23']
disciplinas = np.array(disciplinas)
disciplinas = disciplinas[[disc not in disciplinas_obrigatorias for disc in disciplinas]]

dados_formados_path = 'input/dados_formados.csv'
dados_formados = pd.read_csv(dados_formados_path, delimiter=';')

dados_formados_optativas = dados_formados[[int(x[3].split('/')[2].split(' ')[0]) >= 14 for x in dados_formados.values]]
dados_formados_optativas = dados_formados_optativas[[float(str(x[7]).replace(',','.')) >= 6 for x in dados_formados_optativas.values]]
dados_formados_optativas = dados_formados_optativas[[x[4] != 'ES70N' or x[5] > 2017 for x in dados_formados_optativas.values]]
dados_formados_optativas = dados_formados_optativas[[x[4] != 'FI70D' or x[5] > 2017 for x in dados_formados_optativas.values]]
dados_formados_optativas = dados_formados_optativas[[x[4] != 'FI70A' or x[5] > 2017 for x in dados_formados_optativas.values]]
dados_formados_optativas = dados_formados_optativas[[x[4] != 'GE70F' or x[5] > 2017 for x in dados_formados_optativas.values]]
dados_formados_optativas = dados_formados_optativas[dados_formados_optativas.CODIGO.isin(disciplinas)]

In [12]:
from collections import Counter

popular_courses = Counter([materia for materia in dados_formados_optativas["CODIGO"]])

print(popular_courses)

Counter({'CSH30': 61, 'CSV30': 35, 'CSH42': 33, 'CSI53': 18, 'CSV40': 15, 'CSR41': 14, 'CSR44': 14, 'CSR42': 13, 'MA70C': 12, 'ED70T': 11, 'CSB51': 11, 'CSV45': 10, 'CSI41': 10, 'CSB41': 10, 'DI84D': 10, 'CSM41': 9, 'CSM43': 9, 'CSR43': 9, 'CSM44': 9, 'ES70J': 8, 'ES70B': 8, 'CSM40': 7, 'CSB53': 7, 'CSA44': 6, 'ES70N': 6, 'GE70F': 6, 'CSH44': 5, 'CSB54': 5, 'CSH43': 4, 'CSM30': 4, 'CSE40': 4, 'EEY41': 4, 'CSI56': 4, 'ED70U': 4, 'CSA45': 3, 'CSA42': 3, 'FI70D': 3, 'FI70B': 3, 'EL64B': 2, 'CSI58': 2, 'CSR53': 2, 'CSD41': 2, 'CSD40': 2, 'CSG42': 2, 'CSV52': 2, 'CSB52': 2, 'FI70A': 2, 'EL6AE': 1, 'CSI54': 1, 'CSW47': 1, 'CSD52': 1, 'CSR45': 1, 'EEY42': 1, 'CSH41': 1, 'EEY51': 1, 'EEL51': 1, 'CSR48': 1, 'CSI31': 1, 'IF6BV': 1, 'EL6CB': 1, 'CSG44': 1, 'FI70E': 1, 'CSA43': 1, 'EEY43': 1, 'CSG48': 1, 'EEC41': 1, 'CSH45': 1, 'CSI55': 1, 'CSE43': 1, 'CSW45': 1, 'CSI51': 1, 'CSV41': 1, 'CSI57': 1, 'FCH7HB': 1, 'CSR47': 1, 'EEY44': 1, 'CSA41': 1, 'EL75H': 1})


In [13]:
from typing import List

# Recebe um estudante e sugere as disciplinas mais populares que ele não fez ainda
def sugestoes_populares(student_id : int, max_sugestoes : int, data : pd.DataFrame) -> List:
    suggestions = [interest for interest, _ in popular_courses.most_common() if interest not in get_all_data_from_student(student_id, data)["CODIGO"].unique()]
    return suggestions[:max_sugestoes]

In [14]:
aluno = (select_random_student(dados_formados, 100, 14))
print(aluno)

165894


In [15]:
print(sugestoes_populares(aluno, 5, dados_formados))

['CSH42', 'CSI53', 'CSR41', 'CSR44', 'CSR42']


In [16]:
[x for x in get_all_data_from_student(aluno, dados_formados)["CODIGO"] if x in dados_formados_optativas["CODIGO"].unique()]

['CSV40', 'CSM43', 'CSG44', 'CSV30', 'CSH30', 'ES70N', 'FI70D', 'FI70A']

In [26]:
disciplinas_optativas = dados_formados_optativas["CODIGO"].unique()
print(disciplinas_optativas)

['CSM41' 'CSH43' 'EL64B' 'MA70C' 'CSM43' 'CSH30' 'EL6AE' 'CSI58' 'CSR53'
 'CSR41' 'CSI54' 'CSW47' 'CSV30' 'CSH42' 'CSM40' 'CSD41' 'CSV45' 'CSM30'
 'CSE40' 'EEY41' 'CSR43' 'CSV40' 'CSR44' 'CSI53' 'CSI41' 'CSA44' 'CSA45'
 'CSB41' 'CSI56' 'CSD52' 'CSR45' 'CSD40' 'ES70J' 'ES70B' 'EEY42' 'DI84D'
 'CSM44' 'CSH41' 'ED70T' 'CSR42' 'CSG42' 'ED70U' 'CSA42' 'CSB51' 'EEY51'
 'EEL51' 'CSB53' 'CSR48' 'ES70N' 'CSH44' 'CSB54' 'CSI31' 'IF6BV' 'GE70F'
 'FI70D' 'EL6CB' 'CSG44' 'FI70E' 'FI70B' 'CSA43' 'EEY43' 'CSG48' 'EEC41'
 'CSH45' 'CSV52' 'CSB52' 'CSI55' 'CSE43' 'CSW45' 'CSI51' 'CSV41' 'CSI57'
 'FI70A' 'FCH7HB' 'CSR47' 'EEY44' 'CSA41' 'EL75H']


## Outros sistemas de recomendação

In [25]:
todas_disciplinas = todos_dados["CODIGO"].unique()
print(todas_disciplinas)

['ES70R' 'FI71M' 'IF61C' 'MA71B' 'IF61B' 'MA71A' 'MA61A' 'FI61A' 'ES61A'
 'MA61B' 'CSD20' 'CSF13' 'ENADE I' 'IF66J' 'FI62A' 'FI63A' 'MA63A' 'EL63B'
 'EL65A' 'IF64J' 'MA63C' 'ET77B' 'ET75E' 'QB70D' 'ET75F' 'QB70E' 'QB60A'
 'MA65A' 'IF62J' 'MA62A' 'EL64H' 'EL66C' 'EL66H' 'EL66D' 'EL68G' 'IF63C'
 'EL65G' 'IF65C' 'EL66I' 'IF62C' 'EL62A' 'ES60A' 'IF64C' 'IF65E' 'IF67D'
 'MA64A' 'FI64A' 'IF60A' 'IF63F' 'EL65H' 'FI64C' 'QB62A' 'FI73A' 'FI72N'
 'MA70G' 'MA70H' 'IF63E' 'IF63O' 'MA72A' 'ES60F' 'ES70N' 'FI72M' 'MA73A'
 'ES70L' 'EL62O' 'IF66B' 'GE60D' 'IF65D' 'IF67B' 'IF6AG' 'GE60C' 'ES60G'
 'GE60B' 'MA63B' 'FI66A' 'IF66C' 'ET30N' 'ET38N' 'EL66G' 'ES65A' 'GE60A'
 'ES70G' 'EEF21' 'CSD21' 'CSF20' 'CSG20' 'QB70C' 'EEX11' 'FI72S' 'FI70A'
 'FI71S' 'EEF11' 'CSE20' 'EEX21' 'CSH30' 'DI64G' 'GE60G' 'EL66K' 'EL64O'
 'QB65A' 'IF66D' 'IF68E' 'ES70Q' 'ME37K' 'IF67H' 'EL68E' 'IF6AB' 'IF6AE'
 'ES70H' 'ES60B' 'EL68F' 'IF68D' 'ES70B' 'IF69D' 'EL68A' 'IF67C' 'ES60D'
 'FI62B' 'CE62A' 'ED60E' 'ED60D' 'ED60C' 'ED60B' 

### Filtragem colaborativa baseada no Usuário
Utilizando semelhança de cossenos, sem considerar as notas

In [236]:
# Vetor de interesses por presença
def make_user_interest_vector(user_interests : List[str]) -> List[int]:
    return [1 if interest in user_interests else 0 for interest in todas_disciplinas]

all_student_ids = dados_formados["ID_ANONIMO"].unique()

vetor_interesses_formados = {user_id: make_user_interest_vector(get_all_data_from_student(user_id, dados_formados)["CODIGO"].values) 
                             for user_id in all_student_ids}

from numpy import dot
from numpy.linalg import norm

def cos_sim(a : List[int], b : List[int]) -> int:
    return dot(a, np.transpose(b))/(norm(a)*norm(b))

user_similarities = {user_id: {other_user_id: cos_sim(interest_vector_i, interest_vector_j) 
                          for other_user_id, interest_vector_j in vetor_interesses_formados.items()}
                          for user_id, interest_vector_i in vetor_interesses_formados.items()}

def most_similar_students_to(user_id : int):
    pairs = [(other_user_id, similarity) for
                other_user_id, similarity in 
                    user_similarities[user_id].items()
                if user_id != other_user_id and similarity > 0]
    return sorted(pairs, key = lambda pair: pair[-1], reverse = True)

In [237]:
make_user_interest_vector(get_all_data_from_student(93846, dados_formados)["CODIGO"].values)

[0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [238]:
get_all_data_from_student(93846, dados_formados)["CODIGO"].values

array(['EL66C', 'IF6AE', 'IF6AG', 'IF6AL', 'IF6AB', 'IF60A', 'ES60F',
       'ES60G', 'IF69D', 'ES60A', 'EL62A', 'MA63B', 'MA61A', 'MA63A',
       'MA65A', 'MA62A', 'IF60K', 'GE60B', 'IF61C', 'FI62A', 'FI66A',
       'ES65A', 'MA61B', 'FI61A', 'IF63F', 'ES61A', 'QB62A', 'IF62J',
       'IF66B', 'FI63A', 'EL68F', 'ENADE C', 'GE60D', 'EL65A', 'FI64C',
       'IF64C', 'EL63B', 'IF63C', 'IF63E', 'ENADE I', 'ES60B', 'ES60D',
       'IF66C', 'IF66D', 'IF65E', 'EL65H', 'IF65D', 'IF64J', 'IF65C',
       'EL64H', 'EL64H', 'EL65G', 'IF67C', 'EL66D', 'IF66J', 'IF67B',
       'IF67B', 'EL66H', 'EL66I', 'EL66G', 'EL66H', 'IF61B', 'IF60J',
       'IF60B', 'EL68E', 'EL68G', 'IF68D', 'IF68E', 'IF67H', 'IF67D',
       'EL68A', 'GE60C', 'IF62C', 'GE60A', 'QB60A'], dtype=object)

In [239]:
print(all_student_ids)

[ 13236  13976  30701  51624  51868  54937  54968  55055  55726  65057
  65450  65496  65541  65608  65621  74306  75585  81780  83464  83987
  84004  84041  84050  84061  88923  89035  90291  90294  90296  90298
  90299  90302  90686  90729  90741  90758  90787  90797  90801  90813
  92324  92511  92847  93845  93846  93847  93848  93849  93852  93854
  93857  93958  94343  94397  94405  94442  96838  96887  96956  96962
  97274  97494 101327 101334 101337 101560 101561 101562 101563 101564
 101565 101571 101572 101574 101575 103432 104355 104448 104766 104767
 104771 104779 104780 104783 104784 104787 104797 104800 104824 106138
 107939 109365 109369 109478 109693 109694 109695 109696 109697 109701
 109702 109703 109704 109705 109707 109710 109713 109714 109715 109720
 109721 109722 109723 109726 109731 113453 113853 113868 114245 114250
 114260 114263 114483 114484 114486 114666 114705 114709 114713 114720
 114964 115160 115577 115579 116470 116471 116504 116513 116616 116620
 11697

In [240]:
print(vetor_interesses_formados)

{13236: [0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [241]:
print(user_similarities)

{13236: {13236: 1.0, 13976: 0.8535204983803663, 30701: 0.8751130125933031, 51624: 0.5938923908220626, 51868: 0.8388668159944593, 54937: 0.8679018950386385, 54968: 0.8545495581918902, 55055: 0.8075936301394427, 55726: 0.8883723006628986, 65057: 0.8618537245237077, 65450: 0.8388668159944593, 65496: 0.8751130125933031, 65541: 0.8606318382097687, 65608: 0.8632061582884765, 65621: 0.8751130125933031, 74306: 0.797921501227884, 75585: 0.8751130125933031, 81780: 0.8679018950386385, 83464: 0.8176262338182698, 83987: 0.8227848101265823, 84004: 0.8718665271495971, 84041: 0.8812542318853867, 84050: 0.8632061582884765, 84061: 0.883414931347818, 88923: 0.869098418897731, 89035: 0.8444407432001202, 90291: 0.8606318382097687, 90294: 0.8679018950386385, 90296: 0.8812542318853867, 90298: 0.8939348005464146, 90299: 0.8618537245237077, 90302: 0.8679018950386385, 90686: 0.3969547986585982, 90729: 0.517054715806478, 90741: 0.8176262338182698, 90758: 0.8176262338182698, 90787: 0.7583301736308593, 90797: 0.38

In [242]:
most_similar_students_to(150892)

[(198372, 0.9474504407420671),
 (150878, 0.9413574486632832),
 (150887, 0.9271726499455306),
 (156158, 0.9190028346233009),
 (198543, 0.9021342216356464),
 (191284, 0.8993491139867924),
 (202261, 0.8725028717782315),
 (145077, 0.8684962373468949),
 (190495, 0.8684962373468949),
 (190499, 0.8684962373468949),
 (190508, 0.8684962373468949),
 (190494, 0.8666666666666665),
 (190493, 0.8629109946080096),
 (190615, 0.8573214099741122),
 (150873, 0.8553372034476995),
 (179408, 0.8541985556144385),
 (151217, 0.8533333333333332),
 (179385, 0.8391405500541003),
 (186024, 0.8367621765895851),
 (189048, 0.8365160106923107),
 (161216, 0.8324187420999798),
 (179652, 0.8314493471508876),
 (182856, 0.8314493471508876),
 (186036, 0.8314493471508876),
 (179823, 0.8294391771908315),
 (180984, 0.8266160642323973),
 (182913, 0.8266160642323973),
 (172248, 0.8261843893231644),
 (179944, 0.8243996016473729),
 (186021, 0.8211203828474677),
 (183413, 0.817059487979028),
 (179397, 0.812403840463596),
 (175962, 

In [243]:
from collections import defaultdict

def user_based_suggestions(user_id: int, include_user_interests: bool = False, only_optionals = True):
    # Some as semelhanças
    suggestions : Dict[str, float] = defaultdict(float)

    for other_user_id, similarity in most_similar_students_to(user_id):
        for interest in get_all_data_from_student(other_user_id, dados_formados)["CODIGO"].values:
            suggestions[interest] += similarity
    # Converta em uma lista classificada 
    suggestions = sorted(suggestions.items(),
                        key = lambda pair: pair[-1],
                        reverse = True)

    # Exclua nao optativas
    if only_optionals:
        suggestions = [(suggestion, weight)
                for suggestion, weight in suggestions
                if suggestion in disciplinas_optativas]
    
    # Exclua interesses existentes
    if include_user_interests:
        return suggestions
    else:
        return [(suggestion, weight)
                for suggestion, weight in suggestions
                if suggestion not in get_all_data_from_student(user_id, dados_formados)["CODIGO"].values]

In [244]:
user_based_suggestions(150892, True)

[('CSH30', 71.9266330299655),
 ('ES70N', 67.61032616460733),
 ('FI70D', 59.49668104725401),
 ('FI70A', 54.15696245004997),
 ('GE70F', 40.35908194630637),
 ('CSH42', 38.80656174590277),
 ('CSV30', 32.09922367903795),
 ('CSR41', 27.430820371172718),
 ('CSR44', 24.444183374316836),
 ('ES70B', 19.8527134751005),
 ('CSI53', 19.154444071447585),
 ('MA70C', 16.21218114018863),
 ('CSR42', 15.637422588791624),
 ('CSV40', 15.548774024537922),
 ('CSI41', 14.8589373344407),
 ('CSM41', 13.407573433696573),
 ('CSB51', 13.168681801945535),
 ('CSB41', 10.41653207137993),
 ('ED70T', 10.31869920275294),
 ('CSM43', 10.208155036983726),
 ('CSV45', 9.265001340128295),
 ('CSR43', 8.686613344902755),
 ('DI84D', 8.293149943659854),
 ('CSE40', 7.699110244251666),
 ('CSM44', 7.607989783956705),
 ('CSB53', 7.546268463852839),
 ('ES70J', 7.503458198792877),
 ('CSM40', 6.84597259197361),
 ('CSA44', 6.313682727352546),
 ('CSI56', 6.05436936363045),
 ('CSH44', 5.9126030970613535),
 ('CSD41', 5.458036157537697),
 ('C