In [1]:
from Bio import SeqIO
# Input FASTA files, one per class as indicated by the file names (nucleus and
# cytoplasm). Paths are relative to the notebook's working directory.
# NOTE(review): both names are rebound further down in this notebook (first to
# parsed records, then to plain sequence strings), so they hold path strings
# only until the loading statements have run.
Nucleus_data = "./data/TestSet/mRNA_sublocation_TestSet_nucleus_data.fasta"
Cytoplasm_data = "./data/TestSet/mRNA_sublocation_TestSet_cytoplasm_data.fasta"

def read_fasta_file(file_path):
    """Load every record of a FASTA file as an ``(id, sequence)`` pair.

    Parameters
    ----------
    file_path
        Whatever ``SeqIO.parse`` accepts as its first argument; it is passed
        through unchanged together with the format name ``"fasta"``.

    Returns
    -------
    list of tuple
        One ``(record.id, str(record.seq))`` tuple per parsed record, in the
        order the parser yields them.
    """
    return [
        (entry.id, str(entry.seq))
        for entry in SeqIO.parse(file_path, "fasta")
    ]
# Parse both FASTA files into lists of (record id, sequence string) tuples.
# NOTE(review): each assignment overwrites the path string previously held by
# the same name, so these statements cannot be re-run on their own without
# first re-assigning the paths.
Nucleus_data = read_fasta_file(Nucleus_data)
Cytoplasm_data = read_fasta_file(Cytoplasm_data)
# Bare trailing expression: the notebook displays the parsed nucleus records.
Nucleus_data

[('ENST00000229416',
  'ATAAAACCTGGAGCGCAGGATCGCGCCCAGGAGCGGCGAGCTAGCGGACGCAAAGACTGGGCATGCTCCGCGGCGGCGCAGGTTTTGGTCACAAGTAGGAAGAAGCCAGTGCACCAGACCGGCAAAGAGAAGCGGGAGCCGCCGCGGCAGCGCGGCCGTGGGGTCCGCCGCCGCCGCATCGGAGCGGGAGGAGGAGCAGCGGGGAGGGCGAGGCCGCCGGGCCGAGAGCCGTCCCGCCTGCTCTCGGTCTTCTGCCTTCGCCTCCGCGCGGTGCGTCGGACCCAGGGTCTGTCACCTGGGCGCCAGGGGCCGCCGCCGGGGAGCCGGAGCGGGCAGGACCCTCCCTCCGCCGACTGCGGCCCGAGAGCGCCCCCGCGGGGTGGAGCGGCAGCCGCCTTCTGCGGGCGGCTGAGTGTCCGTCTCGCGCCCGGAGCGGGCGACCGCCGTCAGCCCGGAGGAGGAGGAGGAGGAGGAGGGGGCGGCCATGGGGCTGCTGTCCCAGGGCTCGCCGCTGAGCTGGGAGGAAACCAAGCGCCATGCCGACCACGTGCGGCGGCACGGGATCCTCCAGTTCCTGCACATCTACCACGCCGTCAAGGACCGGCACAAGGACGTTCTCAAGTGGGGCGATGAGGTGGAATACATGTTGGTATCTTTTGATCATGAAAATAAAAAAGTCCGGTTGGTCCTGTCTGGGGAGAAAGTTCTTGAAACTCTGCAAGAGAAGGGGGAAAGGACAAACCCAAACCATCCTACCCTTTGGAGACCAGAGTATGGGAGTTACATGATTGAAGGGACACCAGGACAGCCCTACGGAGGAACAATGTCCGAGTTCAATACAGTTGAGGCCAACATGCGAAAACGCCGGAAGGAGGCTACTTCTATATTAGAAGAAAATCAGGCTCTTTGCACAATAACTTCATTTCCCAGATTAGGCTGTCCTGGGTTCACACTGCCCGAGGTCAAACCCAACCCA

In [2]:
def extract_elements(input_list):
    """Collect the second element of every item that has at least two elements.

    Parameters
    ----------
    input_list : iterable
        Items supporting ``len()`` and indexing (for example tuples or lists).

    Returns
    -------
    list
        ``item[1]`` for each qualifying item, in input order. Items with fewer
        than two elements are skipped without any error.
    """
    return [entry[1] for entry in input_list if len(entry) >= 2]

In [3]:
# Drop the record ids: keep only the sequence string of each (id, sequence) pair.
# NOTE(review): running this cell twice is NOT a no-op. After the first run the
# names hold plain strings, and extract_elements takes item[1] of anything with
# len >= 2, so a second run would return the second character of each sequence.
# Re-run the loading cell before re-running this one.
Nucleus_data = extract_elements(Nucleus_data)
Cytoplasm_data = extract_elements(Cytoplasm_data)
# Bare trailing expression: the notebook displays the nucleus sequences.
Nucleus_data

['ATAAAACCTGGAGCGCAGGATCGCGCCCAGGAGCGGCGAGCTAGCGGACGCAAAGACTGGGCATGCTCCGCGGCGGCGCAGGTTTTGGTCACAAGTAGGAAGAAGCCAGTGCACCAGACCGGCAAAGAGAAGCGGGAGCCGCCGCGGCAGCGCGGCCGTGGGGTCCGCCGCCGCCGCATCGGAGCGGGAGGAGGAGCAGCGGGGAGGGCGAGGCCGCCGGGCCGAGAGCCGTCCCGCCTGCTCTCGGTCTTCTGCCTTCGCCTCCGCGCGGTGCGTCGGACCCAGGGTCTGTCACCTGGGCGCCAGGGGCCGCCGCCGGGGAGCCGGAGCGGGCAGGACCCTCCCTCCGCCGACTGCGGCCCGAGAGCGCCCCCGCGGGGTGGAGCGGCAGCCGCCTTCTGCGGGCGGCTGAGTGTCCGTCTCGCGCCCGGAGCGGGCGACCGCCGTCAGCCCGGAGGAGGAGGAGGAGGAGGAGGGGGCGGCCATGGGGCTGCTGTCCCAGGGCTCGCCGCTGAGCTGGGAGGAAACCAAGCGCCATGCCGACCACGTGCGGCGGCACGGGATCCTCCAGTTCCTGCACATCTACCACGCCGTCAAGGACCGGCACAAGGACGTTCTCAAGTGGGGCGATGAGGTGGAATACATGTTGGTATCTTTTGATCATGAAAATAAAAAAGTCCGGTTGGTCCTGTCTGGGGAGAAAGTTCTTGAAACTCTGCAAGAGAAGGGGGAAAGGACAAACCCAAACCATCCTACCCTTTGGAGACCAGAGTATGGGAGTTACATGATTGAAGGGACACCAGGACAGCCCTACGGAGGAACAATGTCCGAGTTCAATACAGTTGAGGCCAACATGCGAAAACGCCGGAAGGAGGCTACTTCTATATTAGAAGAAAATCAGGCTCTTTGCACAATAACTTCATTTCCCAGATTAGGCTGTCCTGGGTTCACACTGCCCGAGGTCAAACCCAACCCAGTGGAAGGAGGAGCTTCCAAGT

In [4]:
import numpy as np
import pandas as pd

# EIIP value of each single nucleotide. The key order (A, C, G, T) doubles as
# the alphabet order used for everything derived below.
EIIP_values = {'A': 0.1260, 'C': 0.1340, 'G': 0.0806, 'T': 0.1335}

# Nucleotide alphabet, read off the table keys so the two cannot drift apart.
nucleos = list(EIIP_values)

# All 64 trinucleotides, with the first position varying slowest and the last
# position varying fastest (AAA, AAC, AAG, AAT, ACA, ...).
trinucleos = [
    first + second + third
    for first in nucleos
    for second in nucleos
    for third in nucleos
]

# Build the trinucleotide -> EIIP lookup table.
def calculate_EIIP():
    """Return a dict mapping every trinucleotide to its EIIP value.

    The value of a trinucleotide is the sum of the per-nucleotide entries of
    the module-level ``EIIP_values`` for its three letters, rounded to four
    decimal places. Keys cover every ordered triple drawn from the
    module-level ``nucleos`` list, first position varying slowest.
    """
    return {
        first + second + third: round(
            EIIP_values[first] + EIIP_values[second] + EIIP_values[third], 4
        )
        for first in nucleos
        for second in nucleos
        for third in nucleos
    }

# Build the trinucleotide lookup table once; the feature-extraction statements
# later in this notebook pass this dictionary to EIIP().
EIIP_dic = calculate_EIIP()


In [5]:
def calculate_frequency(sequence, keyword):
    """Return the fraction of windows of ``sequence`` that equal ``keyword``.

    Every window of ``len(keyword)`` consecutive characters is compared with
    ``keyword`` (overlapping occurrences are all counted), and the number of
    matches is divided by the number of windows.

    Parameters
    ----------
    sequence : str
        Text to scan. The comparison is exact and case-sensitive.
    keyword : str
        Substring to count.

    Returns
    -------
    float
        ``matches / windows``; ``0.0`` when ``sequence`` is too short to
        contain even one window of ``len(keyword)`` characters.
    """
    width = len(keyword)
    window_count = len(sequence) - width + 1
    if window_count <= 0:
        # No window fits. Without this guard the division below raised
        # ZeroDivisionError when the sequence was exactly one character too
        # short, and produced -0.0 when it was shorter still.
        return 0.0
    matches = sum(
        1
        for start in range(window_count)
        if sequence[start:start + width] == keyword
    )
    return matches / window_count

In [6]:
def EIIP(sequence, k_tuple, EIIP_dic):
    """Encode a sequence as a vector of EIIP-weighted k-mer frequencies.

    For every k-mer over the alphabet A, C, G, T (enumerated with the first
    position varying slowest) the result holds
    ``(occurrences / number_of_windows) * EIIP_dic[k-mer]``, where a window is
    any run of ``k_tuple`` consecutive characters of ``sequence`` and
    overlapping occurrences all count.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence. Matching is exact and case-sensitive; windows
        containing other characters simply match no k-mer.
    k_tuple : int
        Length of the k-mers. This argument used to be ignored (trinucleotides
        were hard-coded); it is now honored, so ``EIIP_dic`` must contain an
        entry for every k-mer of this length.
    EIIP_dic : dict
        Maps each k-mer string to its EIIP value.

    Returns
    -------
    list of float
        ``4 ** k_tuple`` values (64 for ``k_tuple=3``). All zeros when the
        sequence is shorter than ``k_tuple``.

    Raises
    ------
    ValueError
        If ``k_tuple`` is smaller than 1.
    KeyError
        If ``EIIP_dic`` lacks a k-mer of length ``k_tuple``.
    """
    if k_tuple < 1:
        raise ValueError("k_tuple must be a positive integer")

    nucleos = ['A', 'C', 'G', 'T']

    # Grow the k-mer list one position at a time. Extending every prefix with
    # each nucleotide keeps the same order as nested loops over the alphabet.
    keywords = ['']
    for _ in range(k_tuple):
        keywords = [prefix + base for prefix in keywords for base in nucleos]

    total_count = len(sequence) - k_tuple + 1
    if total_count <= 0:
        # No window fits, so every frequency is zero (and dividing by
        # total_count would be meaningless or raise ZeroDivisionError).
        return [0.0] * len(keywords)

    # Tally every window in a single pass instead of rescanning the whole
    # sequence once per k-mer.
    counts = {}
    for start in range(total_count):
        window = sequence[start:start + k_tuple]
        counts[window] = counts.get(window, 0) + 1

    return [
        (counts.get(keyword, 0) / total_count) * EIIP_dic[keyword]
        for keyword in keywords
    ]

In [7]:
import numpy as np
# Encode every sequence as its trinucleotide EIIP feature vector
# (one list of floats per sequence).
Nucleus_feature = [EIIP(sequence, 3, EIIP_dic) for sequence in Nucleus_data]
Cytoplasm_feature = [EIIP(sequence, 3, EIIP_dic) for sequence in Cytoplasm_data]


In [8]:
import pandas as pd

# Wrap the nucleus feature vectors in a DataFrame: one row per sequence, one
# integer-labelled column per trinucleotide feature.
# NOTE(review): the name is rebound here from a list of lists to a DataFrame.
Nucleus_feature = pd.DataFrame(Nucleus_feature)
# Bare trailing expression: the notebook renders the frame.
Nucleus_feature


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.011010,0.005166,0.007069,0.009104,0.006584,0.004962,0.000983,0.006195,0.007331,0.005005,...,0.001461,0.007892,0.007139,0.005115,0.005413,0.007023,0.006909,0.006419,0.007570,0.013872
1,0.001138,0.002840,0.004449,0.000387,0.005293,0.008433,0.003076,0.004343,0.004783,0.009227,...,0.003609,0.006303,0.005346,0.008732,0.007491,0.004069,0.000920,0.007644,0.002325,0.003215
2,0.003841,0.003923,0.006374,0.003134,0.005604,0.007207,0.001879,0.005027,0.005505,0.008505,...,0.002224,0.006520,0.006814,0.009703,0.008557,0.007166,0.001027,0.005938,0.003533,0.003954
3,0.004768,0.003772,0.005141,0.003835,0.004800,0.007349,0.002178,0.004614,0.005968,0.006595,...,0.002535,0.009118,0.005982,0.006555,0.008167,0.005805,0.003351,0.008834,0.005558,0.009890
4,0.014964,0.005276,0.007248,0.011185,0.005625,0.003709,0.000834,0.006393,0.007677,0.003865,...,0.000673,0.008790,0.008157,0.004893,0.004712,0.009413,0.011352,0.009256,0.007978,0.021691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,0.011004,0.006933,0.007622,0.007641,0.008487,0.004149,0.001688,0.006824,0.006489,0.006329,...,0.001294,0.006209,0.007689,0.006253,0.007393,0.006459,0.004990,0.007948,0.008827,0.011535
224,0.001295,0.001763,0.003417,0.000440,0.002644,0.009895,0.004277,0.002246,0.003037,0.008943,...,0.006358,0.004120,0.004271,0.009934,0.008410,0.003571,0.001346,0.004120,0.001190,0.001372
225,0.013675,0.006324,0.009535,0.004210,0.005270,0.004034,0.002557,0.005103,0.008173,0.006742,...,0.002138,0.006843,0.009054,0.003327,0.006437,0.006406,0.006706,0.007117,0.007830,0.016403
226,0.013304,0.005697,0.006495,0.012080,0.008327,0.005010,0.000309,0.006523,0.010120,0.004099,...,0.000553,0.009379,0.007414,0.004584,0.006022,0.008840,0.008656,0.007466,0.006946,0.010822


In [9]:
# Wrap the cytoplasm feature vectors in a DataFrame: one row per sequence, one
# integer-labelled column per trinucleotide feature.
# NOTE(review): the name is rebound here from a list of lists to a DataFrame.
Cytoplasm_feature = pd.DataFrame(Cytoplasm_feature)
# Bare trailing expression: the notebook renders the frame.
Cytoplasm_feature

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.008476,0.005618,0.005495,0.004095,0.007744,0.006975,0.001876,0.006966,0.005234,0.006431,...,0.001643,0.008045,0.004950,0.005888,0.007304,0.005333,0.004793,0.009149,0.004102,0.011186
1,0.007821,0.003254,0.007136,0.006499,0.006507,0.002717,0.000783,0.006634,0.004588,0.006003,...,0.002934,0.007989,0.007818,0.007469,0.006097,0.007192,0.003614,0.008604,0.005594,0.011969
2,0.007316,0.004981,0.007868,0.003316,0.007471,0.010591,0.002197,0.003808,0.007868,0.006958,...,0.002620,0.004312,0.005120,0.005989,0.004753,0.005233,0.002958,0.006899,0.003738,0.003876
3,0.001321,0.003371,0.002324,0.002357,0.006405,0.005850,0.002380,0.005155,0.004067,0.006544,...,0.001824,0.005603,0.005347,0.010337,0.010038,0.010018,0.000686,0.004903,0.005464,0.003848
4,0.014170,0.005632,0.005151,0.011595,0.006238,0.003184,0.000917,0.007066,0.005077,0.003593,...,0.001094,0.011161,0.005039,0.003594,0.005358,0.008505,0.011909,0.011881,0.007256,0.021126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285,0.002883,0.003271,0.005074,0.002614,0.003925,0.005676,0.004618,0.003668,0.004228,0.007505,...,0.004130,0.007816,0.003747,0.007670,0.006743,0.003829,0.001998,0.004078,0.003535,0.002715
286,0.008520,0.004350,0.004851,0.005622,0.003838,0.005746,0.001919,0.007434,0.004079,0.004629,...,0.002077,0.009038,0.004622,0.007038,0.006545,0.007604,0.007295,0.009038,0.006452,0.013806
287,0.003307,0.003241,0.005121,0.002833,0.005537,0.005652,0.001430,0.006058,0.006633,0.008223,...,0.001705,0.006735,0.006188,0.006334,0.007630,0.005716,0.003163,0.005612,0.005108,0.005465
288,0.004880,0.002683,0.006275,0.003445,0.004216,0.005869,0.002368,0.002345,0.005615,0.008456,...,0.003802,0.002787,0.004391,0.007605,0.007902,0.005868,0.000000,0.003186,0.003107,0.001193


In [10]:
# Stack the nucleus rows on top of the cytoplasm rows; ignore_index=True
# renumbers the combined rows from 0.
# NOTE(review): no class-label column is added, so after this step the only
# way to tell the two classes apart is row position (nucleus rows come first).
data = pd.concat([Nucleus_feature, Cytoplasm_feature], ignore_index=True)
# Bare trailing expression: the notebook renders the combined frame.
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.011010,0.005166,0.007069,0.009104,0.006584,0.004962,0.000983,0.006195,0.007331,0.005005,...,0.001461,0.007892,0.007139,0.005115,0.005413,0.007023,0.006909,0.006419,0.007570,0.013872
1,0.001138,0.002840,0.004449,0.000387,0.005293,0.008433,0.003076,0.004343,0.004783,0.009227,...,0.003609,0.006303,0.005346,0.008732,0.007491,0.004069,0.000920,0.007644,0.002325,0.003215
2,0.003841,0.003923,0.006374,0.003134,0.005604,0.007207,0.001879,0.005027,0.005505,0.008505,...,0.002224,0.006520,0.006814,0.009703,0.008557,0.007166,0.001027,0.005938,0.003533,0.003954
3,0.004768,0.003772,0.005141,0.003835,0.004800,0.007349,0.002178,0.004614,0.005968,0.006595,...,0.002535,0.009118,0.005982,0.006555,0.008167,0.005805,0.003351,0.008834,0.005558,0.009890
4,0.014964,0.005276,0.007248,0.011185,0.005625,0.003709,0.000834,0.006393,0.007677,0.003865,...,0.000673,0.008790,0.008157,0.004893,0.004712,0.009413,0.011352,0.009256,0.007978,0.021691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,0.002883,0.003271,0.005074,0.002614,0.003925,0.005676,0.004618,0.003668,0.004228,0.007505,...,0.004130,0.007816,0.003747,0.007670,0.006743,0.003829,0.001998,0.004078,0.003535,0.002715
514,0.008520,0.004350,0.004851,0.005622,0.003838,0.005746,0.001919,0.007434,0.004079,0.004629,...,0.002077,0.009038,0.004622,0.007038,0.006545,0.007604,0.007295,0.009038,0.006452,0.013806
515,0.003307,0.003241,0.005121,0.002833,0.005537,0.005652,0.001430,0.006058,0.006633,0.008223,...,0.001705,0.006735,0.006188,0.006334,0.007630,0.005716,0.003163,0.005612,0.005108,0.005465
516,0.004880,0.002683,0.006275,0.003445,0.004216,0.005869,0.002368,0.002345,0.005615,0.008456,...,0.003802,0.002787,0.004391,0.007605,0.007902,0.005868,0.000000,0.003186,0.003107,0.001193


In [11]:
import pandas as pd

# Destination of the combined feature table (relative to the notebook's
# working directory).
csv_file_path = './data/TestSet/mRNA_sublocation_TestSet_EIIP_data.csv'

# Write the features as CSV. index=False leaves the row index out of the file,
# so it holds only the header row and the feature columns.
data.to_csv(csv_file_path, index=False)
