In [36]:
import os
import os.path as osp
import pandas as pd
import numpy as np
import re
import csv

CSV_PATH = './UC.csv'
MATRIZES_ROOT = './matrizes'
TRAJECTORY_ROOT = MATRIZES_ROOT
TRAJECTORY_FNAME = None

# Read CSV
df = pd.read_csv(CSV_PATH,quoting=csv.QUOTE_NONNUMERIC)


df.to_csv('UC.csv',quoting=csv.QUOTE_NONNUMERIC, index=False)

def restrict_to_trajectory(df, path_to_txt):
    # Map values at columns to id
    dct = {value:key for key,value in df['ucname_file'].to_dict().items()}
    dct.update({value:key for key,value in df['nome'].to_dict().items()})
    
    fname = osp.split(path_to_txt)[-1].split('.txt')[0]

    with open(path_to_txt,'r') as f:
        nodes = [x.lower().strip() for x in f.readlines()]
        nodes = [x for x in nodes if len(x) > 0]
        for n in nodes:
            if not n in dct:
                raise ValueError(f'{n} not found. Please ensure it matches some entry in the UC CSV.')
        df1 = df.loc[df.ucname_file.isin(nodes),:].copy()
        df1 = df1.reset_index(drop=True)
    return df1
if TRAJECTORY_FNAME is not None:
    if not TRAJECTORY_FNAME.endswith('.txt'):
        raise Exception('Trajectory file must end with .txt')
    tpath = osp.join(TRAJECTORY_ROOT, TRAJECTORY_FNAME)
    print(tpath)
    df = restrict_to_trajectory(df, path_to_txt=tpath)
display(df)

Unnamed: 0,ucname_file,error_flag,nome,Pré-Requisitos,Carga Horária Total,Carga Horária Prática,Carga Horária Teórica,Carga Horária Extensão
0,algoritmos e estruturas de dados i,False,algoritmos e estruturas de dados i,lógica de programação,72.0,36.0,36.0,0.0
1,algoritmos e estruturas de dados ii,False,algoritmos e estruturas de dados ii,algoritmos e estruturas de dados i,72.0,26.0,46.0,0.0
2,algoritmos em bioinformática,False,algoritmos em bioinformática,lógica de programação,72.0,36.0,36.0,0.0
3,anatomia,False,anatomia,não há,36.0,10.0,26.0,0.0
4,análise de sinais,False,análise de sinais,séries e equações diferenciais ordinárias,72.0,0.0,72.0,0.0
...,...,...,...,...,...,...,...,...
248,"vidros, vitrocerâmicas e vidrados",False,"vidros, vitrocerâmicos e vidrados",materiais cerâmicos,72.0,18.0,54.0,0.0
249,álgebra linear,False,álgebra linear,geometria analítica,72.0,10.0,62.0,0.0
250,álgebra linear computacional,False,álgebra linear computacional,cálculo numérico,72.0,22.0,50.0,0.0
251,álgebra linear ii,False,álgebra linear ii,álgebra linear,72.0,0.0,72.0,0.0


In [37]:

def get_edges(df):
    """ Creates list of tuples representing directed edges. 
        The values in each tuple are taken from the ``ucname_file`` column.
    
        Returns:
            List[Tuple[str,str]]
    """
    
    # Create 'id' attribute
    df['id'] = np.arange(df.shape[0])

    # Map values at columns to id
    dct = {value:key for key,value in df['ucname_file'].to_dict().items()}
    dct.update({value:key for key,value in df['nome'].to_dict().items()})

    edges = []
    for i, row in df.iterrows():
        prs = row['Pré-Requisitos'].strip().replace(',',';').split(';')
        name1 = row['ucname_file']
        name2 =  row['nome']
        for pr in prs:
            pr = pr.strip()
            if re.search('não há',pr) is not None:
                break
            if len(pr) == 0:
                continue
            if pr not in dct:
                print(f"'{pr}' not found in dictionary, but present in {name1}")
            else:
                edges.append( (df['ucname_file'].values[dct[pr]], name1))
    return edges     
def get_edge_dataframe(edges):
    return pd.DataFrame({'A':[x[0] for x in edges],'B':[x[1] for x in edges]})

edges = get_edges(df)
df_edges = get_edge_dataframe(edges)

# Get output path
df_edges_fpath = osp.join(TRAJECTORY_ROOT,'edges.csv')
if TRAJECTORY_FNAME is not None:
    df_edges_fpath = osp.join(TRAJECTORY_ROOT,TRAJECTORY_FNAME.split('.txt')[0] + '_edges.csv')
    
# Save to CSV
df_edges.to_csv(df_edges_fpath,index=False)
print('saved to ' + df_edges_fpath)

saved to ./matrizes/edges.csv


In [38]:
import networkx as nx
def create_graph(df, edges):
    """ Creates a NetworkX graph from the node dataframe and edge list.
    
        df (pd.DataFrame): node dataframe.
        edges (List[Tuple[str,str]]): list of directed edges.
        
        Returns:
            nx.Digraph
    """
    df_mini = df.loc[:,['ucname_file']].rename(columns={'ucname_file':'id'})
    
    G = nx.DiGraph()
    nodes = []
    for i, row in df_mini.iterrows():
        nodes.append((i, row.to_dict()) )
    G.add_nodes_from(nodes)
    G.add_edges_from(edges)
    G.remove_nodes_from(range(df.shape[0]))
    return G

def add_topological_info(df,G):
    """ Add information to later be used for the interactive visualization. """
    num_gen = len(list(nx.topological_generations(G)))
    df['group'] = '000'
    for i, gen in  enumerate(nx.topological_generations(G)):
        b = df['ucname_file'].isin(gen)
        df.loc[b,'group'] = str(num_gen-i).zfill(3)
        df.loc[b,'topgen'] = str(i)
        
    num_gen = len(list(nx.topological_generations(G.reverse(copy=True))))
    for i, gen in  enumerate(nx.topological_generations(G.reverse(copy=True))):
        b = df['ucname_file'].isin(gen)
        df.loc[b,'size'] = 1 + i
    

    
# Create graph
G = create_graph(df, edges)
add_topological_info(df,G)

# Get output path
df_fpath = 'nodes.txt'
df_nodes_fpath = osp.join(TRAJECTORY_ROOT,'nodes.csv')
if TRAJECTORY_FNAME is not None:
    df_nodes_fpath = osp.join(TRAJECTORY_ROOT,TRAJECTORY_FNAME .split('.txt')[0] + '_nodes.csv')

# Save node CSV
df.to_csv(df_nodes_fpath,index=False,quoting=csv.QUOTE_NONNUMERIC)
print('saved to ' + df_nodes_fpath)


saved to ./matrizes/nodes.csv


In [35]:
print(df1.shape,df2.shape, df.shape)

NameError: name 'df1' is not defined