In [7]:
# Constructing a graph

# 1048576
def create_graph(n):
    """Add the first ``n`` rows of ``edge_df`` to the module-level ``geneGraph``.

    Each ``source_target_key`` value is split on ``';'`` and converted to a
    tuple of ints, which is added as an edge.  The edge's ``weight`` attribute
    is set to the row's ``interaction_count``.  For every row whose
    ``reciprocal`` value is truthy, the reversed edge is added as well.

    Args:
        n: Number of leading ``edge_df`` rows to load.

    Returns:
        None.  ``geneGraph`` is modified in place.
    """
    # Slice once so edges, weights and reciprocal flags all come from the
    # same rows (and to avoid one positional lookup per edge).
    rows = edge_df.iloc[:n]

    # Convert each "source;target" key into a tuple of ints
    edge_list = [tuple(map(int, key.split(';'))) for key in rows['source_target_key']]

    # Add edges to graph
    geneGraph.add_edges_from(edge_list)

    # Weighting edges on the interaction_count column
    weights = dict(zip(edge_list, rows['interaction_count']))
    nx.set_edge_attributes(geneGraph, weights, 'weight')

    # Add bidirectional edges based on reciprocal value.  The flag is tested
    # for every edge; previously the test sat outside the loop, so only the
    # last row's flag was honoured (and an empty edge list raised NameError).
    bidirectional_edges = [
        (edge[1], edge[0])
        for edge, reciprocal in zip(edge_list, rows['reciprocal'])
        if reciprocal
    ]

    # NOTE(review): the reversed edges are added after the weights are set and
    # carry no 'weight' of their own -- confirm whether that is intended if
    # geneGraph is a directed graph.
    geneGraph.add_edges_from(bidirectional_edges)


In [2]:
# Nodes color spectrum based on publication count

# Map values to colors based on color map
def get_colors(nodes, cmap):
    """Map each node's value to a colour on a log-scaled colormap.

    Args:
        nodes: Dict mapping node -> numeric value (the caller passes
            publication counts).
        cmap: Name of the colormap to look up via ``cm.get_cmap``.

    Returns:
        Dict mapping node -> colour returned by the colormap.  An empty
        ``nodes`` dict yields an empty dict.
    """
    if not nodes:
        return {}

    # Work on log1p(value) throughout.  The normaliser's bounds were already
    # computed on the log scale, but the raw values used to be fed to it,
    # which pushed most values above vmax.
    log_values = {node: numpy.log1p(value) for node, value in nodes.items()}

    # Normalizing all values on a scale of 0 to 1 for coloring
    norm = mcolors.Normalize(vmin=min(log_values.values()), vmax=max(log_values.values()))
    colormap = cm.get_cmap(cmap)
    return {node: colormap(norm(log_value)) for node, log_value in log_values.items()}

# Apply colormap to each category's nodes and collect colors
def nodes_apply_colormap(G):
    """Return one colour per node of ``G``, in ``G.nodes`` order.

    Nodes are grouped by the ``disease_assoc_cat`` value found in ``node_df``
    (looked up by index label) and coloured with a per-category colormap via
    ``get_colors``, using ``publication_count`` as the value.  Nodes that are
    missing from ``node_df.index`` or have any other category value go into a
    fallback group (key ``1``) in which every node has the value 1.

    Args:
        G: Graph whose nodes should be coloured.

    Returns:
        List of colours aligned with the iteration order of ``G.nodes``.
    """
    # Identifying base colors for each category (1 is the fallback group)
    base_colors = {'disease_assoc': 'Reds', 'non_disease_assoc': 'Blues', 'non_omim': 'Grays', 1: 'Grays'}
    known_categories = ('disease_assoc', 'non_disease_assoc', 'non_omim')

    # node -> value, kept separately per category so that each category is
    # normalised on its own scale
    all_nodes = {category: {} for category in base_colors}
    for node in G.nodes:
        if node not in node_df.index:
            all_nodes[1][node] = 1
            continue
        row = node_df.loc[node]
        disease_cat = row['disease_assoc_cat']
        if disease_cat in known_categories:
            all_nodes[disease_cat][node] = row['publication_count']
        else:
            all_nodes[1][node] = 1

    # NOTE(review): the previous version had a branch painting an 'unknown'
    # category black, but no such category is ever built, so it never ran.
    # Fallback nodes keep going through the 'Grays' colormap as before.
    color_of = {}
    for category, cmap_name in base_colors.items():
        color_of.update(get_colors(all_nodes[category], cmap_name))

    # Emit colours in node order.  Appending category by category (as before)
    # produced a list that did not line up with G.nodes.
    return [color_of[node] for node in G.nodes]

In [1]:
def get_edges(list_of_lists):
    """Return all pairs of consecutive elements from every inner sequence.

    Args:
        list_of_lists: Iterable of indexable sequences.

    Returns:
        A flat list of ``(element, next_element)`` tuples, in input order.
        Sequences with fewer than two elements contribute nothing.
    """
    return [
        (seq[pos - 1], seq[pos])
        for seq in list_of_lists
        for pos in range(1, len(seq))
    ]

In [2]:
# Coloring edges by 6 sections of interaction_type

# The 84 interaction_type items are grouped into the 6 sections suggested by GPT

def color_edge_section(G):
    """Colour the edges of ``G`` by the section their interaction type falls in.

    Every row of ``edge_df`` is classified by its ``interaction_type`` into
    one of six sections (or 'Unknown', which is also reported with a print).
    The section is stored on the matching ``(source, target)`` edge of ``G``
    as the ``'category'`` attribute, and a colour is returned for each edge.

    Args:
        G: Graph whose edges should be coloured.

    Returns:
        List with one colour name per edge, in ``G.edges`` order.  Edges with
        no 'category' attribute, or an unmapped one, are 'black'.
    """
    # Section name paired with the interaction_type values that belong to it
    sections = (
        ('Genetic Interaction', ['cisphenotypic genetic ','mutual genetic over-su','association', 'genetic interaction', 'genetic suppression', 'genetic suppression (p', 'genetic suppression (c', 'genetic enhancement', 'genetic epistasis (sen', 'phenotypic suppression', 'asynthetic genetic int', 'cisphenotypic co-suppr', 'opposing genetic epist', 'minimal genetic epista', 'synthetic growth defec', 'transphenotypic enhanc', 'transphenotypic geneti', 'genetic over-suppressi', 'negative genetic inter', 'positive genetic inter']),
        ('Physical Interaction', ['self interaction','physical association', 'direct interaction', 'colocalization', 'proximity', 'protein cleavage', 'disulfide bond', 'Co-localization', 'Co-fractionation', 'Co-purification', 'Co-crystal Structure', 'Protein-peptide', 'Protein-RNA']),
        ('Post Translational Modification', ['phosphorylation','demethylation reaction', 'phosphorylation reacti', 'acetylation reaction', 'covalent binding', 'cleavage reaction', 'dephosphorylation reac', 'ubiquitination reactio', 'methylation reaction', 'deacetylation', 'cleavage', 'dephosphorylation', 'methylation', 'ubiquitination', 'deformylation reaction']),
        ('Experimental Method', ['Affinity Capture-RNA', 'Affinity Capture-Weste', 'Affinity Capture-MS', 'Two-hybrid', 'Biochemical Activity', 'Reconstituted Complex', 'Negative Genetic', 'Positive Genetic', 'PCA', 'FRET', 'Proximity Label-MS', 'Far Western', 'Affinity Capture-Lumin']),
        ('Synthetic Rescue and Dosage Interaction', ['dosage lethality (sens','synthetic genetic inte', 'mutual genetic enhance', 'maximal genetic epista', 'synthetic rescue (sens', 'dosage rescue (sensu B', 'synthetic rescue (sens', 'synthetic lethality (s', 'synthetic haploinsuffi', 'dosage rescue (sensu b', 'dosage growth defect (', 'Synthetic Rescue', 'Synthetic Lethality', 'Dosage Lethality', 'Dosage Rescue',  'Dosage Growth Defect', 'Synthetic Growth Defec', 'Synthetic Haploinsuffi']),
        ('Phenotypic Interaction', ['monophenotypic genetic', 'phenotypic enhancement', 'putative self interact', 'cisphenotypic inter-su', 'cisphenotypic inter-suppr', 'Phenotypic Suppression', 'Phenotypic Enhancement']),
    )

    # Flatten into interaction_type -> section.  setdefault keeps the first
    # section that lists a type, matching the top-to-bottom precedence of an
    # if/elif chain over the same lists.
    section_of = {}
    for section_name, members in sections:
        for member in members:
            section_of.setdefault(member, section_name)

    # Creating a dict that assigns each edge to its respective section
    interaction_section_dict = {}
    for _, row in edge_df.iterrows():
        edge = (row['source'], row['target'])
        interaction_type = row['interaction_type']
        section_name = section_of.get(interaction_type)
        if section_name is None:
            print(f'edge color failed for {interaction_type}')
            section_name = 'Unknown'
        interaction_section_dict[edge] = {'category': section_name}

    # Assigning attributes to edge
    nx.set_edge_attributes(G, interaction_section_dict)

    # Creating a color map that maps each section to a color
    edge_section_color_map = {'Genetic Interaction':'orange', 'Physical Interaction':'yellow', 'Post Translational Modification':'green','Experimental Method':'purple','Synthetic Rescue and Dosage Interaction':'pink','Phenotypic Interaction':'brown', 'Unknown':'black'}

    edge_colors_int_section = []
    for edge in G.edges:
        category = G.edges[edge].get('category', 'Unknown')
        edge_colors_int_section.append(edge_section_color_map.get(category, 'black'))
    return edge_colors_int_section


In [3]:
# Changing style of edges by interaction_cat

def edge_style(G):
    """Return a line style for every edge of ``G`` based on ``interaction_cat``.

    The category of an edge is taken from the ``edge_df`` row whose
    ``(source, target)`` pair matches the edge, trying the reversed pair as a
    fallback.  It is stored on the edge as the ``'category'`` attribute (the
    same attribute name that ``color_edge_section`` writes).

    Args:
        G: Graph whose edges should be styled.

    Returns:
        List with one style name per edge, in ``G.edges`` order:
        'dotted' for physical, 'solid' for genetic, 'dashed' otherwise.
    """
    # (source, target) -> interaction_cat.  Looking edges up by their
    # endpoints replaces the previous positional lookup, which assumed the
    # i-th edge of G.edges was described by edge_df row label i.
    # NOTE(review): assumes the source/target columns hold the same values
    # as the graph's node ids -- confirm against how the graph is built.
    cat_by_pair = dict(
        zip(zip(edge_df['source'], edge_df['target']), edge_df['interaction_cat'])
    )

    # Creating a style map for the categories
    edge_cat_style_map = {'physical': 'dotted', 'genetic': 'solid', 'unknown': 'dashed'}

    int_cat_dict = {}
    edge_styles_int_cat = []
    for edge in G.edges:
        source, target = edge[0], edge[1]
        interaction_cat = cat_by_pair.get((source, target))
        if interaction_cat is None:
            # Edge may be stored in the opposite orientation (reciprocal
            # edges, undirected graphs).
            interaction_cat = cat_by_pair.get((target, source))
        if interaction_cat not in ('physical', 'genetic'):
            print('category not found')
            interaction_cat = 'unknown'
        int_cat_dict[edge] = {'category': interaction_cat}
        edge_styles_int_cat.append(edge_cat_style_map[interaction_cat])

    # Assigning attributes to edge
    nx.set_edge_attributes(G, int_cat_dict)
    return edge_styles_int_cat


In [4]:
# Random walk (second order)

def second_order_random_walk(graph, steps, n, p, q, start_node=None):
    """Run one second-order (p/q-biased) random walk and return it.

    After the first step, each neighbour of the current node is weighted by
    its relation to the previously visited node:

    * ``1/p`` if it is the previous node (stepping back),
    * ``1``   if the graph has an edge from the previous node to it,
    * ``1/q`` otherwise.

    Edge weights are not taken into account.

    Args:
        graph: Object providing ``nodes()``, ``neighbors(node)`` and
            ``has_edge(u, v)``.
        steps: Number of transitions to attempt; a complete walk holds
            ``steps + 1`` nodes.
        n: Unused.  Kept so existing positional calls keep working; the
            number of walks per node is handled by ``generate_random_walks``.
        p: Return parameter.
        q: In-out parameter.
        start_node: Node to start from.  When omitted, the first node yielded
            by ``graph.nodes()`` is used.

    Returns:
        List of visited nodes beginning with the start node.  The walk ends
        early at a node without neighbours.  An empty graph with no explicit
        ``start_node`` yields ``[]``.
    """
    if start_node is None:
        start_node = next(iter(graph.nodes()), None)
        if start_node is None:
            return []

    walk = [start_node]
    current_node = start_node
    previous_node = None

    while len(walk) <= steps:
        neighbors = list(graph.neighbors(current_node))
        if not neighbors:
            break

        if len(walk) == 1:
            # First step, no previous node to bias against
            next_node = random.choice(neighbors)
        else:
            # Unnormalised transition weights; random.choices normalises
            # them itself and returns the original node objects.
            weights = []
            for neighbor in neighbors:
                if neighbor == previous_node:
                    weights.append(1 / p)
                elif graph.has_edge(previous_node, neighbor):
                    weights.append(1)
                else:
                    weights.append(1 / q)
            next_node = random.choices(neighbors, weights=weights, k=1)[0]

        walk.append(next_node)
        previous_node = current_node
        current_node = next_node

    return walk


def generate_random_walks(graph, steps, n, p, q):
    """Generate ``n`` second-order random walks starting from every node.

    The node order is reshuffled before each of the ``n`` rounds.

    Args:
        graph: Graph to walk on (see ``second_order_random_walk``).
        steps: Number of transitions per walk.
        n: Number of walks started from each node.
        p: Return parameter.
        q: In-out parameter.

    Returns:
        List of ``n * number_of_nodes`` walks, each a list of nodes.
    """
    walks = []
    nodes = list(graph.nodes())
    for _ in range(n):
        random.shuffle(nodes)
        # Pass the node explicitly: previously it was dropped, so every walk
        # started from the first node of the graph.
        walks.extend(
            second_order_random_walk(graph, steps, n, p, q, start_node=node)
            for node in nodes
        )
    return walks

In [28]:
def generate_vectors(graph, steps, n, p, q):
    """Embed graph nodes by training Word2Vec on random walks.

    ``create_graph(30000)`` is called first, which loads edges into the
    module-level ``geneGraph``; walks are then generated on ``graph``.

    Args:
        graph: Graph passed to ``generate_random_walks``.
        steps: Walk length passed to ``generate_random_walks``.
        n: Walks per node passed to ``generate_random_walks``.
        p: Return parameter passed to ``generate_random_walks``.
        q: In-out parameter passed to ``generate_random_walks``.

    Returns:
        Tuple ``(vectors, int_node_ids)``: the model's vector matrix and a
        numpy array with the integer node id of each row.
    """
    # Populate the graph before walking on it
    create_graph(30000)   # 1048576

    # Word2Vec expects sequences of string tokens, so stringify each node
    sentences = [
        [str(node) for node in walk]
        for walk in generate_random_walks(graph, steps, n, p, q)
    ]
    model = Word2Vec(sentences, vector_size=128, window=5, min_count=0, sg=0, workers=2, hs=0, epochs=1)

    # Vocabulary keys are the stringified node ids; turn them back into ints
    int_node_ids = numpy.array([int(key) for key in list(model.wv.index_to_key)])
    vectors = model.wv.vectors
    return vectors, int_node_ids

In [29]:
def train_model(vectors, int_node_ids):
    """Train an MLP to predict ``disease_assoc_cat`` from node vectors.

    Side effect: if ``node_df`` is not already indexed by ``'id'``, its index
    is set to that column in place.

    Args:
        vectors: 2-D array-like with one embedding row per node.
        int_node_ids: Node ids aligned row by row with ``vectors``.

    Returns:
        The dict produced by ``classification_report(..., output_dict=True)``
        on the held-out 20% of the data.
    """
    if node_df.index.name != 'id':
        node_df.set_index('id', inplace=True)

    # Label lookup for all nodes at once.  Nodes absent from node_df come
    # back as NaN instead of raising KeyError, and are dropped below because
    # they cannot be used for supervised training.
    labels = node_df['disease_assoc_cat'].reindex(int_node_ids).to_numpy()
    has_label = pd.notna(labels)

    # Creating features: only the embedding columns.  The gene id used to be
    # part of X because only the last column was stripped from the combined
    # table; an identifier is not a meaningful feature.
    X = numpy.asarray(vectors)[has_label]
    y = labels[has_label]

    # Splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

    # Feature scaling (fit on the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initializing the classifier; parameters found using GridSearchCV
    mlp_model = MLPClassifier(activation='tanh', alpha=0.0001, hidden_layer_sizes=(100, 100, 50), learning_rate='adaptive', max_iter=400, solver='adam')

    # Train the classifier
    mlp_model.fit(X_train_scaled, y_train)

    # Evaluate the model
    y_pred = mlp_model.predict(X_test_scaled)
    report = classification_report(y_test, y_pred, output_dict=True)

    return report



In [52]:
def create_node_graph(n):
    """Add the edges incident to ``n`` randomly sampled nodes to ``geneGraph``.

    ``n`` rows are sampled from ``node_df``; every ``edge_df`` row whose
    ``source`` or ``target`` equals one of the sampled index labels is added
    to the module-level ``geneGraph``.  Each edge gets its row's
    ``interaction_count`` as ``weight``, and for rows with a truthy
    ``reciprocal`` value the reversed edge is added too.

    Args:
        n: Number of nodes to sample from ``node_df``.

    Returns:
        None.  ``geneGraph`` is modified in place.
    """
    # Taking random nodes and incident edges
    # NOTE(review): assumes node_df's index labels are comparable to the
    # source/target columns of edge_df -- confirm node_df is indexed by id.
    sampled_ids = node_df.sample(n=n).index
    incident = edge_df['source'].isin(sampled_ids) | edge_df['target'].isin(sampled_ids)
    rows = edge_df[incident]

    # Convert each "source;target" key into a tuple of ints
    edge_list = [tuple(map(int, key.split(';'))) for key in rows['source_target_key']]
    geneGraph.add_edges_from(edge_list)

    # Weighting edges on the interaction_count column.  Values come from the
    # selected rows themselves; previously they were read from the first
    # len(edge_list) rows of edge_df, which are unrelated to the sample.
    weights = dict(zip(edge_list, rows['interaction_count']))
    nx.set_edge_attributes(geneGraph, weights, 'weight')

    # Add bidirectional edges based on reciprocal value, tested per edge
    # (the test used to sit outside the loop and only saw the last row).
    bidirectional_edges = [
        (edge[1], edge[0])
        for edge, reciprocal in zip(edge_list, rows['reciprocal'])
        if reciprocal
    ]

    # Adding edges to graph
    geneGraph.add_edges_from(bidirectional_edges)