 # Prediction of gene expression levels in *Streptococcus thermophilus*

In [1]:
# Importing the libraries

import pandas as pd

## Extraction of k-mer counts from the potential promoters of the genes in the dataset

In [5]:
# Load the promoter sequences
# Create a dataframe with the promoter sequences

# NOTE(review): absolute, machine-specific path; the file must exist at this location.
filtered_promoter_file_path = r"C:\Users\anaca\Documents\Bioinformática\2º Semestre\projeto\promoter-extraction-ML-pipeline\promoter_extraction\filtered_promoters_100_bp.fasta"

# Keep only the text before the first tab of every stripped line.
with open(filtered_promoter_file_path, "r") as f:
    lines = [line.strip().split("\t")[0] for line in f]

df = pd.DataFrame(lines)
df.columns = ['Promoter Type']
# `n` and `expand` are passed by keyword because pandas 2.0+ no longer accepts
# them positionally. n=3 caps the split at four fields so the result always
# matches the four target columns; any further ';' stays in the last field.
df[['Promoter Type', 'Locus Tag', "Location Coordinates", "Promoter Sequence"]] = df['Promoter Type'].str.split(';', n=3, expand=True)
print(df)

                   Promoter Type       Locus Tag  Location Coordinates  \
0    Promoter reverse complement  ['T303_00015']        [2011:2347](-)   
1    Promoter reverse complement  ['T303_00025']        [4153:5353](-)   
2    Promoter reverse complement  ['T303_00030']        [6254:7709](-)   
3    Promoter reverse complement  ['T303_00035']        [7890:8945](-)   
4                       Promoter  ['T303_00050']      [10422:10776](+)   
..                           ...             ...                   ...   
803                     Promoter  ['T303_09885']  [1840783:1840921](+)   
804  Promoter reverse complement  ['T303_09890']  [1841032:1842175](-)   
805  Promoter reverse complement  ['T303_09895']  [1842429:1842711](-)   
806  Promoter reverse complement  ['T303_09900']  [1843080:1843215](-)   
807                     Promoter  ['T303_09905']  [1844132:1845461](+)   

                                     Promoter Sequence  
0    AAGTCTCAACAAGTAAGTCTCTAAGCCCCTAACCATAAGGTTTTGG...

In [None]:

## Extraction of Kmer counts from the potential promoters of the genes in the dataset
# Load the promoter sequences
# Create a dataframe with the promoter sequences

# NOTE(review): absolute, machine-specific path; every call below reads the file from this location.
filtered_promoter_file_path = r"C:\Users\anaca\Documents\Bioinformática\2º Semestre\projeto\promoter-extraction-ML-pipeline\promoter_extraction\filtered_promoters_100_bp.fasta"


def read_filtered_promoters(file_path):
    """Read a promoter file and return one cleaned record per line.

    Each line is stripped of surrounding whitespace and then cut at its first
    tab, so only the leading tab-delimited field is kept.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.

    Returns
    -------
    list of str
        The first tab-separated field of every stripped line, in file order.
        A blank line yields an empty string.
    """
    records = []
    with open(file_path, "r") as handle:
        for raw_line in handle:
            records.append(raw_line.strip().split("\t")[0])
    return records


# read_filtered_promoters(filtered_promoter_file_path)

def create_filtered_promoters_df(filename):
    """Load the filtered promoter file into a four-column DataFrame.

    Parameters
    ----------
    filename : str
        Path passed to ``read_filtered_promoters``. Each record it returns is
        split on ';' into four fields.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns 'Promoter Type', 'Locus Tag',
        'Location Coordinates' and 'Promoter Sequence'.
    """
    lines = read_filtered_promoters(filename)
    df = pd.DataFrame(lines)
    df.columns = ['Promoter Type']
    # `n` and `expand` are passed by keyword because pandas 2.0+ no longer
    # accepts them positionally. n=3 caps the split at four fields so the
    # result always matches the four target columns; any further ';' stays in
    # the last field instead of producing a fifth column.
    df[['Promoter Type', 'Locus Tag', "Location Coordinates", "Promoter Sequence"]] = (
        df['Promoter Type'].str.split(';', n=3, expand=True)
    )
    return df


# Builds the promoter DataFrame from the file; the result is not bound to a name.
create_filtered_promoters_df(filtered_promoter_file_path)


# Create dictionary with locus tag as key and promoter sequence as value

def create_promoter_dict(df):
    """Map every locus tag in ``df`` to its promoter sequence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'Locus Tag' and 'Promoter Sequence'.

    Returns
    -------
    dict
        ``{locus tag: promoter sequence}`` in row order; when a locus tag
        occurs more than once, the last row wins.
    """
    return dict(zip(df['Locus Tag'], df['Promoter Sequence']))


# Re-reads the file and builds the locus-tag -> sequence dictionary; the result is not bound to a name.
create_promoter_dict(create_filtered_promoters_df(filtered_promoter_file_path))


# Generate Kmer from the promoter sequences

def generate_kmer(promoter_dict, k):
    """Build a zero-initialised k-mer table for every promoter.

    Parameters
    ----------
    promoter_dict : dict
        Maps a key (the locus tag) to its promoter sequence.
    k : int
        Length of the sliding window.

    Returns
    -------
    dict
        ``{key: {kmer: 0}}`` holding each distinct length-``k`` substring of
        the sequence, in order of first occurrence. A sequence shorter than
        ``k`` gets an empty inner dictionary.
    """
    return {
        locus_tag: dict.fromkeys(
            (sequence[start:start + k] for start in range(len(sequence) - k + 1)),
            0,
        )
        for locus_tag, sequence in promoter_dict.items()
    }


# Re-reads the file and builds the zero-initialised table for k=10; the result is not bound to a name.
generate_kmer(create_promoter_dict(create_filtered_promoters_df(filtered_promoter_file_path)), 10)


def count_kmer(kmer_dict, promoter_dict, k):
    """Count k-mer occurrences per promoter and return them as a table.

    Parameters
    ----------
    kmer_dict : dict
        ``{key: {kmer: count}}`` that already holds an entry for every k-mer
        of every sequence (as produced by ``generate_kmer``). It is updated in
        place; with plain dicts a missing entry raises ``KeyError``.
    promoter_dict : dict
        Maps the same keys to their promoter sequences.
    k : int
        Length of the sliding window.

    Returns
    -------
    pandas.DataFrame
        ``pd.DataFrame.from_dict(kmer_dict)`` built from the updated counts.
    """
    for locus_tag, sequence in promoter_dict.items():
        windows = (sequence[start:start + k] for start in range(len(sequence) - k + 1))
        for kmer in windows:
            # Look the entry up per window so a sequence shorter than k never touches kmer_dict.
            kmer_dict[locus_tag][kmer] += 1
    return pd.DataFrame.from_dict(kmer_dict)


# Builds the promoter dictionary twice (two file reads): once for the zeroed
# k=10 table and once as the sequences to count. The result is not bound to a name.
count_kmer(generate_kmer(create_promoter_dict(create_filtered_promoters_df(filtered_promoter_file_path)), 10),
           create_promoter_dict(create_filtered_promoters_df(filtered_promoter_file_path)), 10)


def return_kmers_with_counts_above_threshold(kmer_dict, threshold):
    """Keep only the k-mers whose count is strictly greater than ``threshold``.

    Parameters
    ----------
    kmer_dict : mapping
        Outer object whose ``.items()`` yields ``(key, inner)`` pairs, where
        ``inner.items()`` yields ``(kmer, count)`` pairs.
    threshold : number
        Exclusive lower bound for a count to be kept.

    Returns
    -------
    dict
        ``{key: {kmer: count}}`` with every outer key preserved; a key with no
        k-mer above the threshold maps to an empty dictionary.
    """
    return {
        locus_tag: {
            kmer: occurrences
            for kmer, occurrences in counts.items()
            if occurrences > threshold
        }
        for locus_tag, counts in kmer_dict.items()
    }


# Re-runs the counting pipeline for k=10 and filters with threshold 1.
# The value passed in is the DataFrame returned by count_kmer, not a dict.
return_kmers_with_counts_above_threshold(
    count_kmer(generate_kmer(create_promoter_dict(create_filtered_promoters_df(filtered_promoter_file_path)), 10),
               create_promoter_dict(create_filtered_promoters_df(filtered_promoter_file_path)), 10), 1)


def generate_kmer_count_dataframe(kmer_dict):
    """Turn nested k-mer counts into a long-format table.

    The previous implementation built a wide frame (one column per locus tag)
    and then assigned three column names to it, which fails with a length
    mismatch unless there are exactly three locus tags. This version emits one
    row per (locus tag, k-mer) pair instead.

    Parameters
    ----------
    kmer_dict : dict or pandas.DataFrame
        Either ``{locus tag: {kmer: count}}`` or the wide DataFrame returned
        by ``count_kmer`` (columns are locus tags, index is k-mers).

    Returns
    -------
    pandas.DataFrame
        Columns 'Locus Tag', 'Kmer' and 'Count'. Empty input yields an empty
        frame that still has these three columns.
    """
    records = [
        (locus_tag, kmer, int(count))
        for locus_tag, counts in kmer_dict.items()
        for kmer, count in counts.items()
        # A wide DataFrame holds NaN where a k-mer does not occur in a locus.
        if pd.notna(count)
    ]
    return pd.DataFrame(records, columns=['Locus Tag', 'Kmer', 'Count'])


# Re-runs the counting pipeline for k=10 and passes the DataFrame returned by
# count_kmer on to generate_kmer_count_dataframe.
generate_kmer_count_dataframe(
    count_kmer(generate_kmer(create_promoter_dict(create_filtered_promoters_df(filtered_promoter_file_path)), 10),
               create_promoter_dict(create_filtered_promoters_df(filtered_promoter_file_path)), 10))