In [1]:
from Bio import SeqIO
# Input locations of the two training-set FASTA files (nucleus set, cytoplasm set).
Nucleus_data, Cytoplasm_data = (
    "./data/TrainingSet/mRNA_sublocation_TrainingSet_nucleus_data.fasta",
    "./data/TrainingSet/mRNA_sublocation_TrainingSet_cytoplasm_data.fasta",
)

def read_fasta_file(file_path):
    """Load every record of a FASTA file as an ``(id, sequence)`` pair.

    Parameters
    ----------
    file_path
        Handed unchanged to ``SeqIO.parse`` together with the ``"fasta"``
        format name.

    Returns
    -------
    list of tuple
        One ``(record.id, str(record.seq))`` tuple per parsed record, in the
        order the parser yields them.
    """
    return [
        (entry.id, str(entry.seq))
        for entry in SeqIO.parse(file_path, "fasta")
    ]
# Read sequences from the FASTA files.
# NOTE: each name is rebound from the file path it held to the list returned by
# read_fasta_file, so these lines are not idempotent -- running them a second
# time would hand that list (not a path) to read_fasta_file.
Nucleus_data = read_fasta_file(Nucleus_data)
Cytoplasm_data = read_fasta_file(Cytoplasm_data)
# Bare expression on the last line of the cell: displays the nucleus records.
Nucleus_data

[('ENST00000341376',
  'GATTCCCCTTTGTTCGGGTTCGCCATTTTGCTAGGCAGCGGCAGTGGCGGCGGCAGCGGCGGCTGGAGCCTCTGATTGGGTTTCGGAGTCCGGTACTGGAGCCAATCAGCGCGGGCAGCGAACCGGGGGAGCGAGGCACGGAGTGTACCTCACAGCCTTCTAGGATCTCCAGAGTGGACAGGAATCTCACTTGGAGGGACCATGGAGCAGTATACAGCAAACAGCAATAGTTCGACAGAGCAGATTGTTGTCCAGGCAGGACAGATTCAGCAGCAGCAGCAGGGTGGTGTCACTGCTGTGCAGTTGCAGACTGAGGCCCAGGTGGCATCCGCCTCAGGCCAGCAAGTCCAGACCCTCCAGGTAGTCCAAGGGCAGCCATTAATGGTGCAGGTCAGTGGAGGCCAGCTAATCACATCAACTGGCCAACCCATCATGGTCCAGGCTGTCCCTGGTGGACAAGGTCAAACCATCATGCAAGTACCTGTTTCTGGAACACAGGGTTTGCAGCAAATACAGTTGGTCCCACCTGGACAGATCCAGATCCAGGGTGGACAGGCTGTGCAGGTGCAGGGCCAGCAGGGCCAGACCCAGCAGATCATCATCCAGCAGCCCCAGACGGCTGTCACTGCTGGCCAGACTCAGACACAGCAGCAGATTGCTGTCCAGGGACAGCAAGTGGCACAGACTGCTGAAGGGCAGACCATCGTCTATCAACCAGTTAATGCAGATGGCACCATTCTCCAGCAAGTTACAGTCCCTGTTTCAGGCATGATCACTATCCCAGCAGCCAGTTTGGCAGGAGCACAGATTGTTCAAACAGGAGCCAATACCAACACAACCAGCAGTGGGCAAGGGACTGTCACTGTGACACTACCAGTGGCAGGCAATGTGGTCAATTCAGGAGGGATGGTCATGATGGTTCCTGGGGCTGGCTCTGTGCCTGCTATCCAAAGAATCCCTCTACCTGGAGCAGAGA

In [2]:
def extract_elements(input_list):
    """Collect the second element of every item that has at least two.

    Parameters
    ----------
    input_list
        Iterable of sized, indexable items (for example ``(id, sequence)``
        tuples).

    Returns
    -------
    list
        ``item[1]`` for each item whose length is 2 or more, in input order.
        Shorter items are skipped without raising.
    """
    return [entry[1] for entry in input_list if len(entry) >= 2]

In [3]:
# Keep only the sequence strings (element 1 of each (id, sequence) pair).
# NOTE: the names are rebound in place, so this step is not idempotent: run a
# second time, the items would be the sequence strings themselves and
# extract_elements would return the second character of every string of
# length >= 2 without raising.
Nucleus_data = extract_elements(Nucleus_data)
Cytoplasm_data = extract_elements(Cytoplasm_data)
# Bare expression on the last line of the cell: displays the resulting list.
Nucleus_data

['GATTCCCCTTTGTTCGGGTTCGCCATTTTGCTAGGCAGCGGCAGTGGCGGCGGCAGCGGCGGCTGGAGCCTCTGATTGGGTTTCGGAGTCCGGTACTGGAGCCAATCAGCGCGGGCAGCGAACCGGGGGAGCGAGGCACGGAGTGTACCTCACAGCCTTCTAGGATCTCCAGAGTGGACAGGAATCTCACTTGGAGGGACCATGGAGCAGTATACAGCAAACAGCAATAGTTCGACAGAGCAGATTGTTGTCCAGGCAGGACAGATTCAGCAGCAGCAGCAGGGTGGTGTCACTGCTGTGCAGTTGCAGACTGAGGCCCAGGTGGCATCCGCCTCAGGCCAGCAAGTCCAGACCCTCCAGGTAGTCCAAGGGCAGCCATTAATGGTGCAGGTCAGTGGAGGCCAGCTAATCACATCAACTGGCCAACCCATCATGGTCCAGGCTGTCCCTGGTGGACAAGGTCAAACCATCATGCAAGTACCTGTTTCTGGAACACAGGGTTTGCAGCAAATACAGTTGGTCCCACCTGGACAGATCCAGATCCAGGGTGGACAGGCTGTGCAGGTGCAGGGCCAGCAGGGCCAGACCCAGCAGATCATCATCCAGCAGCCCCAGACGGCTGTCACTGCTGGCCAGACTCAGACACAGCAGCAGATTGCTGTCCAGGGACAGCAAGTGGCACAGACTGCTGAAGGGCAGACCATCGTCTATCAACCAGTTAATGCAGATGGCACCATTCTCCAGCAAGTTACAGTCCCTGTTTCAGGCATGATCACTATCCCAGCAGCCAGTTTGGCAGGAGCACAGATTGTTCAAACAGGAGCCAATACCAACACAACCAGCAGTGGGCAAGGGACTGTCACTGTGACACTACCAGTGGCAGGCAATGTGGTCAATTCAGGAGGGATGGTCATGATGGTTCCTGGGGCTGGCTCTGTGCCTGCTATCCAAAGAATCCCTCTACCTGGAGCAGAGATGCTTGAAGAAGAGCCTCTCTA

In [4]:
import numpy as np
import pandas as pd

# EIIP value assigned to each single nucleotide.
EIIP_values = dict(A=0.1260, C=0.1340, G=0.0806, T=0.1335)

# Nucleotide alphabet, in the order used to enumerate trinucleotides.
nucleos = ['A', 'C', 'G', 'T']

# All 64 trinucleotides; the first position varies slowest, the last fastest.
trinucleos = [
    first + second + third
    for first in nucleos
    for second in nucleos
    for third in nucleos
]

# Build the trinucleotide -> EIIP lookup table.
def calculate_EIIP():
    """Return the EIIP value of every trinucleotide over ``nucleos``.

    Each value is the sum of the three single-nucleotide entries of
    ``EIIP_values``, rounded to 4 decimal places.  Both ``nucleos`` and
    ``EIIP_values`` are read from module scope.

    Returns
    -------
    dict
        Maps each three-letter string to its rounded, summed EIIP value.
        Keys are inserted with the first position varying slowest.
    """
    return {
        first + second + third: round(
            EIIP_values[first] + EIIP_values[second] + EIIP_values[third], 4
        )
        for first in nucleos
        for second in nucleos
        for third in nucleos
    }

# Calculate and store the EIIP dictionary (trinucleotide -> summed EIIP value);
# it is the lookup table later passed to EIIP() when features are built.
EIIP_dic = calculate_EIIP()


In [5]:
def calculate_frequency(sequence, keyword):
    """Return the fraction of sliding windows of ``sequence`` equal to ``keyword``.

    A window is every substring of ``len(keyword)`` characters, taken at each
    start position (overlapping occurrences are all counted).

    Parameters
    ----------
    sequence : str
        Text to scan.
    keyword : str
        Substring to look for.

    Returns
    -------
    float
        ``matches / number_of_windows``.  ``0.0`` when the sequence is shorter
        than the keyword, i.e. when there is no window to inspect.
    """
    width = len(keyword)
    total_count = len(sequence) - width + 1
    # No window fits: report 0.0 instead of dividing by zero (sequence exactly
    # one character short) or by a negative count (even shorter sequence).
    if total_count <= 0:
        return 0.0
    count = sum(
        1
        for start in range(total_count)
        if sequence[start:start + width] == keyword
    )
    return count / total_count

In [6]:
def EIIP(sequence, k_tuple, EIIP_dic):
    """Build the EIIP-weighted k-mer frequency vector of one sequence.

    For every k-mer over ``A, C, G, T`` the result holds
    ``frequency(k-mer) * EIIP_dic[k-mer]``, where the frequency is the number
    of (overlapping) windows of length ``k_tuple`` equal to that k-mer divided
    by the total number of windows.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence.  Windows containing other characters count
        towards the total but match no k-mer.
    k_tuple : int
        k-mer length.  ``EIIP_dic`` must hold an entry for every k-mer of
        this length (``k_tuple=3`` for a trinucleotide table).
    EIIP_dic : dict
        Maps each k-mer string to its EIIP value.

    Returns
    -------
    list of float
        ``4 ** k_tuple`` values, ordered with the first k-mer position varying
        slowest (``AAA, AAC, AAG, AAT, ACA, ...`` for ``k_tuple=3``).  All
        zeros when the sequence is shorter than ``k_tuple``.

    Raises
    ------
    ValueError
        If ``k_tuple`` is smaller than 1.
    KeyError
        If ``EIIP_dic`` lacks one of the k-mers.
    """
    if k_tuple < 1:
        raise ValueError("k_tuple must be a positive integer")

    nucleos = ['A', 'C', 'G', 'T']
    # Enumerate every k-mer; extending each prefix by one base per round gives
    # the same ordering as k nested loops over `nucleos`.
    keywords = ['']
    for _ in range(k_tuple):
        keywords = [prefix + base for prefix in keywords for base in nucleos]

    # Number of windows of length k_tuple; <= 0 when the sequence is too short.
    total_count = len(sequence) - k_tuple + 1

    # Tally all windows in one pass rather than rescanning once per k-mer.
    counts = {}
    for start in range(total_count):
        window = sequence[start:start + k_tuple]
        counts[window] = counts.get(window, 0) + 1

    vector = []
    for keyword in keywords:
        # Guard the division: a too-short sequence has frequency 0 everywhere.
        if total_count > 0:
            frequency_value = counts.get(keyword, 0) / total_count
        else:
            frequency_value = 0.0
        vector.append(frequency_value * EIIP_dic[keyword])
    return vector

In [7]:
import numpy as np
# One EIIP feature vector (trinucleotides, k = 3) per sequence of each set.
Nucleus_feature = [
    EIIP(nucleus_seq, 3, EIIP_dic)
    for nucleus_seq in Nucleus_data
]
Cytoplasm_feature = [
    EIIP(cytoplasm_seq, 3, EIIP_dic)
    for cytoplasm_seq in Cytoplasm_data
]


In [8]:
import pandas as pd

# Turn the list of feature vectors into a DataFrame (one row per sequence, one
# column per vector position); the name Nucleus_feature is rebound to the frame.
Nucleus_feature = pd.DataFrame(Nucleus_feature)
# Bare expression on the last line of the cell: displays the frame.
Nucleus_feature


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.008038,0.005472,0.007248,0.007894,0.007398,0.004448,0.000894,0.007232,0.009605,0.007154,...,0.001097,0.006948,0.006429,0.005483,0.007041,0.006114,0.004643,0.008212,0.007301,0.012933
1,0.006421,0.004846,0.007124,0.006833,0.007555,0.005529,0.002641,0.006684,0.008229,0.006792,...,0.001285,0.007552,0.007535,0.006042,0.006530,0.006931,0.004789,0.005775,0.007317,0.014494
2,0.012833,0.005971,0.006511,0.007626,0.007715,0.005825,0.001119,0.006599,0.007285,0.005525,...,0.001001,0.008097,0.006634,0.004670,0.006112,0.006233,0.007101,0.006999,0.006162,0.012829
3,0.013917,0.006805,0.005778,0.013393,0.008006,0.004290,0.000706,0.007448,0.006812,0.002914,...,0.000812,0.008213,0.007142,0.005505,0.004050,0.007390,0.010495,0.011332,0.006759,0.023986
4,0.017324,0.006722,0.007164,0.007420,0.007784,0.003431,0.001093,0.007214,0.006707,0.006244,...,0.000638,0.004594,0.008417,0.005903,0.007023,0.006213,0.006304,0.006432,0.006850,0.013950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,0.002883,0.002576,0.005073,0.001286,0.002392,0.007512,0.003247,0.003751,0.004439,0.008442,...,0.001493,0.004014,0.004539,0.009457,0.010816,0.003811,0.001873,0.002102,0.003645,0.002482
2024,0.013137,0.004329,0.006452,0.007361,0.005811,0.004976,0.000723,0.005884,0.007359,0.005403,...,0.001055,0.008508,0.006116,0.005346,0.006252,0.008253,0.006432,0.007049,0.008007,0.014526
2025,0.003482,0.002963,0.006010,0.001457,0.003784,0.006980,0.002896,0.004276,0.006128,0.007643,...,0.003125,0.004452,0.004539,0.007976,0.007100,0.004885,0.000882,0.004357,0.002463,0.002412
2026,0.005514,0.005839,0.007727,0.005415,0.008341,0.007663,0.001656,0.004252,0.010062,0.011593,...,0.002069,0.007366,0.008819,0.005642,0.004458,0.003005,0.002760,0.007799,0.004883,0.004976


In [9]:
# Turn the list of feature vectors into a DataFrame (one row per sequence, one
# column per vector position); the name Cytoplasm_feature is rebound to the frame.
Cytoplasm_feature = pd.DataFrame(Cytoplasm_feature)
# Bare expression on the last line of the cell: displays the frame.
Cytoplasm_feature

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.010694,0.004016,0.008074,0.005207,0.005425,0.004674,0.001678,0.005746,0.006374,0.006279,...,0.001271,0.008563,0.007014,0.006735,0.007853,0.006217,0.005021,0.007319,0.005773,0.009137
1,0.018240,0.006153,0.006950,0.012872,0.007318,0.003565,0.000660,0.006612,0.006592,0.003008,...,0.001125,0.006306,0.006887,0.005024,0.004381,0.009585,0.013123,0.006738,0.008462,0.017946
2,0.002854,0.002498,0.005741,0.001248,0.002082,0.004675,0.003307,0.003396,0.004664,0.010655,...,0.004506,0.002595,0.002935,0.004131,0.006994,0.003000,0.001272,0.004758,0.003375,0.004752
3,0.012651,0.006110,0.007371,0.007149,0.007332,0.003920,0.003543,0.004805,0.007070,0.005084,...,0.002362,0.007980,0.007076,0.007557,0.005998,0.005188,0.005155,0.005078,0.006917,0.008695
4,0.013743,0.005046,0.005706,0.005669,0.007726,0.004667,0.001113,0.007394,0.006386,0.004870,...,0.001138,0.010811,0.005279,0.005830,0.008547,0.007668,0.005779,0.008354,0.007384,0.015051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2629,0.020823,0.006709,0.006760,0.009948,0.007638,0.003582,0.001047,0.007103,0.007427,0.004873,...,0.000931,0.007989,0.008185,0.005027,0.004689,0.006042,0.009458,0.007024,0.007436,0.016279
2630,0.002121,0.007038,0.006064,0.003244,0.007038,0.005526,0.002388,0.005519,0.008397,0.006210,...,0.003418,0.006749,0.006678,0.005859,0.005787,0.009263,0.003307,0.009561,0.003900,0.006741
2631,0.018389,0.005796,0.007591,0.005788,0.005100,0.006153,0.002046,0.007090,0.005993,0.006751,...,0.001673,0.006744,0.006332,0.005436,0.005841,0.004175,0.007081,0.008189,0.004802,0.011305
2632,0.001512,0.001544,0.005322,0.000000,0.003088,0.007880,0.001362,0.001574,0.003991,0.010899,...,0.002785,0.003208,0.002721,0.006962,0.008252,0.001390,0.000000,0.006416,0.000000,0.000000


In [10]:
# Stack the nucleus rows on top of the cytoplasm rows and renumber the index
# from 0 (ignore_index=True).
# NOTE(review): no class-label column is added in the code shown here, so row
# order (nucleus first, then cytoplasm) is the only record of which set a row
# came from -- confirm that downstream consumers rely on that.
data = pd.concat([Nucleus_feature, Cytoplasm_feature], ignore_index=True)
# Bare expression on the last line of the cell: displays the combined frame.
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.008038,0.005472,0.007248,0.007894,0.007398,0.004448,0.000894,0.007232,0.009605,0.007154,...,0.001097,0.006948,0.006429,0.005483,0.007041,0.006114,0.004643,0.008212,0.007301,0.012933
1,0.006421,0.004846,0.007124,0.006833,0.007555,0.005529,0.002641,0.006684,0.008229,0.006792,...,0.001285,0.007552,0.007535,0.006042,0.006530,0.006931,0.004789,0.005775,0.007317,0.014494
2,0.012833,0.005971,0.006511,0.007626,0.007715,0.005825,0.001119,0.006599,0.007285,0.005525,...,0.001001,0.008097,0.006634,0.004670,0.006112,0.006233,0.007101,0.006999,0.006162,0.012829
3,0.013917,0.006805,0.005778,0.013393,0.008006,0.004290,0.000706,0.007448,0.006812,0.002914,...,0.000812,0.008213,0.007142,0.005505,0.004050,0.007390,0.010495,0.011332,0.006759,0.023986
4,0.017324,0.006722,0.007164,0.007420,0.007784,0.003431,0.001093,0.007214,0.006707,0.006244,...,0.000638,0.004594,0.008417,0.005903,0.007023,0.006213,0.006304,0.006432,0.006850,0.013950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4657,0.020823,0.006709,0.006760,0.009948,0.007638,0.003582,0.001047,0.007103,0.007427,0.004873,...,0.000931,0.007989,0.008185,0.005027,0.004689,0.006042,0.009458,0.007024,0.007436,0.016279
4658,0.002121,0.007038,0.006064,0.003244,0.007038,0.005526,0.002388,0.005519,0.008397,0.006210,...,0.003418,0.006749,0.006678,0.005859,0.005787,0.009263,0.003307,0.009561,0.003900,0.006741
4659,0.018389,0.005796,0.007591,0.005788,0.005100,0.006153,0.002046,0.007090,0.005993,0.006751,...,0.001673,0.006744,0.006332,0.005436,0.005841,0.004175,0.007081,0.008189,0.004802,0.011305
4660,0.001512,0.001544,0.005322,0.000000,0.003088,0.007880,0.001362,0.001574,0.003991,0.010899,...,0.002785,0.003208,0.002721,0.006962,0.008252,0.001390,0.000000,0.006416,0.000000,0.000000


In [11]:
import pandas as pd
# Define the file path for saving the CSV file.
csv_file_path = './data/TrainingSet/mRNA_sublocation_TrainingSet_EIIP_data.csv'
# Save the DataFrame to a CSV file without including the index.
# Only index=False is overridden; with pandas' default header setting the
# column labels are still written as the first line of the file.
data.to_csv(csv_file_path, index=False)