In [13]:
import ast
import pandas as pd
import re
import numpy as np

In [2]:
def extract_numbers(s):
    numbers = re.findall('\d+', s)
    numbers = [int(number) for number in numbers]
    return numbers

In [3]:
def convert_to_array(s):
    # Find all arrays in the string
    arrays_str = re.findall(r'array\(\[.*?\]\)', s.replace('\n', ''))

    arrays = []
    for arr_str in arrays_str:
        # Remove 'array([' from the start and '])' from the end
        s_trimmed = arr_str[7:-2]
        # Convert string to array
        array = np.fromstring(s_trimmed, sep=',')
        arrays.append(array)
    
    return arrays

In [85]:
def avg_min_max(sampe_array_list):
    if len(sampe_array_list) >1:
        avg_val=np.mean([np.mean(item) for item in sampe_array_list])
        min_val=min([min(item) for item in sampe_array_list])
        max_val=max([max(item) for item in sampe_array_list])
    else:
        avg_val = np.mean(sampe_array_list[0])  # get the mean of the single array 
        min_val = min(sampe_array_list[0])  # get the min value of the single array
        max_val = max(sampe_array_list[0])  # get the max value of the single array
    return avg_val, min_val, max_val

In [72]:
def read_motif_score_file(path):
    df_raw=pd.read_csv(path, header=None)
    df=df_raw.T
    df=df.dropna()
    df.columns=["seq", "index", "position", "score"]
    df=df[["index", "position","seq","score"]]
    df["index"]=df["index"].apply(extract_numbers)
    df["position"]=df["position"].apply(ast.literal_eval)
    df["score"]=df["score"].apply(convert_to_array)
    return df

In [86]:
def filtered_motif_n_score(score_path,filter_path):
    """
    score_path: the file for extracting motif and corresponding attention scores
    filter_path: the file consists of "motif", "N", "K", "n", "x", "p"
    """
    # motif scores are extracted for "all high attention area" meets the conditions
    df=read_motif_score_file(score_path)
    # filter is for p<0.05
    fil_df=pd.read_csv(filter_path)
    fil_df.rename(columns={'motif': 'seq'}, inplace=True)
    df_merged=pd.merge(df, fil_df, on="seq")
    avg_all=[]
    min_all=[]
    max_all=[]
    for lst_item in df_merged["score"]:
        avg_val, min_val, max_val=avg_min_max(lst_item)
        avg_all.append(avg_val)
        min_all.append(min_val)
        max_all.append(max_val)
    print(len(avg_all))
    df_merged["score_mean"]=avg_all
    df_merged["score_min"]=min_all
    df_merged["score_max"]=max_all
    df_merged.rename(columns={'x': 'appeared'}, inplace=True)
    df_merged=df_merged[["seq","p","score_mean","score_max","score_min","score","appeared","index", "position"]]
    return df_merged

In [98]:
df_merged=filtered_motif_n_score(score_path="test_extract_df.csv",filter_path="init_df.csv")

100


In [99]:
df_merged #compNg

Unnamed: 0,seq,p,score_mean,score_max,score_min,score,appeared,index,position
0,EDDDDDDDDD,3.481513e-22,0.168107,0.460114,0.086554,"[[0.46011406, 0.27481819, 0.20763572, 0.177448...",71,[3],"[(0, 10)]"
1,GGEEGE,4.238938e-02,0.364030,0.537039,0.127413,"[[0.12741323, 0.4328789, 0.51694293, 0.5370389...",7,[5],"[(60, 66)]"
2,EDDDDEE,8.823711e-05,0.278837,0.488326,0.147065,"[[0.21659432, 0.25254921, 0.27884906, 0.329665...",14,"[6, 12, 46, 130]","[(10, 17), (9, 16), (10, 17), (9, 16)]"
3,MMMMLL,3.621162e-04,0.190739,0.306284,0.096671,"[[0.09921008, 0.10074106, 0.09667146, 0.306284...",32,[8],"[(66, 72)]"
4,JJJJJJMM,3.557096e-02,0.137660,0.173266,0.087962,"[[0.123083, 0.13560234, 0.1413136, 0.15140216,...",16,[17],"[(0, 8)]"
...,...,...,...,...,...,...,...,...,...
95,DDDDDD,6.001000e-36,0.115945,0.155103,0.074618,"[[0.09378298, 0.12188744, 0.1551031, 0.1354113...",118,[497],"[(22, 28)]"
96,GEEEEEE,4.569201e-04,0.345391,0.540571,0.177684,"[[0.3632555, 0.42026314, 0.44964689, 0.5405707...",123,[500],"[(12, 19)]"
97,EEEEEBB,4.226345e-03,0.302297,0.389891,0.159809,"[[0.26188118, 0.30539507, 0.32995011, 0.330835...",11,[502],"[(18, 25)]"
98,AAAAAA,1.062472e-12,0.155020,0.240345,0.054338,"[[0.08126886, 0.24034504, 0.22740874, 0.173166...",98,[504],"[(0, 6)]"


In [102]:
type(df_merged["seq"].to_list())

list

In [123]:
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity

def one_hot_embed(df_merged):
    from keras.preprocessing.sequence import pad_sequences
    from sklearn.metrics.pairwise import cosine_similarity 
    
    sequences=df_merged["seq"].to_list()
    # One-hot encode the sequences
    encoded_sequences = []
    for sequence in sequences:
        encoded_sequence = []
        for char in sequence:
            one_hot = np.zeros(15)
            one_hot[char_to_int[char]] = 1
            encoded_sequence.append(one_hot)
        encoded_sequences.append(np.array(encoded_sequence))

    # Print the one-hot encoded sequences
    for i, encoded_sequence in enumerate(encoded_sequences):
        print(f"Sequence {i + 1}:")
        print(encoded_sequence)
        print()
        
    # Pad the sequences with zeros
    padded_one_hot_vectors = pad_sequences(encoded_sequences)
    # Flatten each sequence into a 1D array
    flattened_vectors = [vector.flatten() for vector in padded_one_hot_vectors]
    # Now compute the similarity matrix
    similarity_matrix = cosine_similarity(flattened_vectors)
        
    return padded_one_hot_vectors

In [124]:
padded_one_hot_vectors=one_hot_embed(df_merged)

Sequence 1:
[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

Sequence 2:
[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

Sequence 3:
[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [111]:
type(encoded_sequences)

list

In [106]:
import matplotlib.pyplot as plt

In [107]:
weights=df_merged["score_mean"]

In [115]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity

# Assuming 'one_hot_vectors' is a list of your one-hot-encoded vectors
# Pad the sequences with zeros
padded_one_hot_vectors = pad_sequences(encoded_sequences)

# Flatten each sequence into a 1D array
flattened_vectors = [vector.flatten() for vector in padded_one_hot_vectors]

# Now compute the similarity matrix
similarity_matrix = cosine_similarity(flattened_vectors)

In [122]:
np.size(similarity_matrix,0)

100

In [127]:
sequences=df_merged["seq"].to_list()
weights=df_merged["score_mean"].to_list()

In [128]:
import networkx as nx

# Create an empty graph
G = nx.Graph()

# Add nodes
for seq, score in zip(sequences, weights):
    G.add_node(seq, weight=score)

# Add edges with weights
for i, seq1 in enumerate(sequences):
    for j, seq2 in enumerate(sequences[i+1:], start=i+1):
        G.add_edge(seq1, seq2, weight=similarity_matrix[i, j])

# You can now visualize the graph
nx.draw(G, with_labels=True)


TypeError: '_AxesStack' object is not callable

<Figure size 640x480 with 0 Axes>