In [178]:
metadata = {
    'Author      ': 'Jay Annadurai',
    'Date        ': '12 Apr 2024',
    'Project     ': 'A4-PSWMPredictor',
    'Version     ': 1.0,
    'Description ': 'ArgR Binding Site Motif Predictor with PSWM'
}

In [179]:
# ~~~~~~~~~~~~~~~~~~
#  Import Libraries
# ~~~~~~~~~~~~~~~~~~
import pandas as pd  # Data Reading
import numpy as np  # Computation
import scipy as sp # Statistical Methods
import seaborn as sb  # Advanced Data Visualization
import matplotlib.pyplot as plt  # Data Visualization
# Set Matplot to Inline
%matplotlib inline 

# ~~~~~~~~~~~~~~~~~~~~~~~
#  Import Utlity Classes
# ~~~~~~~~~~~~~~~~~~~~~~~
from pprint import pprint as print  # Override the standard print function with Pretty Print
from JayUtilities import DataIO as Jio  # Data Input/Output Processing Utility Class

In [180]:
# ~~~~~~~~~~~~~~~
#  Script Config
# ~~~~~~~~~~~~~~~
# Input Config
Jio.input_folder = "Input/"  # Sets the Input Folder for the DataIO Class
input_file_paths = { # Name of the Input File
    'ArgR-Motif': 'argR-counts-matrix.txt',
    'Ecoli-Genetic-Regions':'E_coli_K12_MG1655.400_50'
}

input_format = 'tsv'  # Optionally force the encode format of the Input File
header_rows = 0  # Ptt files contain 2 rows of header information


# Output Config
Jio.output_folder = "Output/"  # Sets the Output Folder for the DataIO Class
save_file = True  # Sets whether the script should save the outputs or not
output_file = None  # Name of the Output File is generated Dynamically
output_format = 'tsv'  # Format of the file to save the Output as

In [181]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
#  Import Counts Matrix Data
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Import the File
counts_matrix_dict = Jio.file_to_df(
    file_name=input_file_paths['ArgR-Motif'], 
    force_encode_format=input_format,
    read_args= { "header": None} 
)

# Bind the Dataframe
raw_counts_matrix_df = counts_matrix_dict['df']

# View Data
Jio.print_df(raw_counts_matrix_df, df_name="ArgR Binding Motif Counts Matrix Raw")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ArgR Binding Motif Counts Matrix Raw: 4 Row x 20 Col
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  0  1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16  17  18  \
0  a  |   8  12  21   9   4   2  21  21   3  10   8   5   7  25   4   2   2   
1  c  |   7   4   1   6   2   3   3   3   1   0   2   0   7   0   3   3  24   
2  g  |   3   2   1   8   2  21   2   2   0   1   0   1   0   1   0  15   0   
3  t  |   9   9   4   4  19   1   1   1  23  16  17  21  13   1  20   7   1   

   19  
0  25  
1   0  
2   2  
3   0  
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




In [182]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#  Counts Matrix Preprocessing
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Copy DF to prevent any code issues
counts_matrix_df = raw_counts_matrix_df.copy()

# Drop the original index and use the base as the new index
counts_matrix_df.set_index(0, inplace=True)
# Drop the formatting column
counts_matrix_df.drop(1,axis=1,inplace=True)

# Renaming columns to P1 to P18 to represent the position
new_column_names = []
for i in range(1, len(counts_matrix_df.columns) + 1):
    new_column_name = f"P{i}"
    new_column_names.append(new_column_name)

counts_matrix_df.columns = new_column_names

# View Data
Jio.print_df(counts_matrix_df, df_name="ArgR Binding Motif Counts Matrix")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ArgR Binding Motif Counts Matrix: 4 Row x 18 Col
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

   P1  P2  P3  P4  P5  P6  P7  P8  P9  P10  P11  P12  P13  P14  P15  P16  P17  \
0                                                                               
a   8  12  21   9   4   2  21  21   3   10    8    5    7   25    4    2    2   
c   7   4   1   6   2   3   3   3   1    0    2    0    7    0    3    3   24   
g   3   2   1   8   2  21   2   2   0    1    0    1    0    1    0   15    0   
t   9   9   4   4  19   1   1   1  23   16   17   21   13    1   20    7    1   

   P18  
0       
a   25  
c    0  
g    2  
t    0  
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




In [183]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#  Counts Matrix with Psuedocount
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Function takes a counts matrix df and returns it with the psuedo count applied
def add_psuedocount(counts_matrix_df, psuedocount=1):
    # Add one to all positions in the counts matrix
    pseudocounts_matrix_df = counts_matrix_df + psuedocount
    return pseudocounts_matrix_df

In [184]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#  Nucleotide Frequency Matrix
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Computes the Frequency Matrix from the Counts Matrix
def counts_to_frequency_matrix_df(counts_matrix_df):
    # Calculate the sum of each column (each position in the motif)
    column_sums = counts_matrix_df.sum()

    # Divide each element in the DataFrame by its column sum to get frequencies
    frequency_matrix_df = counts_matrix_df.div(column_sums, axis='columns')

    # Return the Frequency Matrix
    return frequency_matrix_df
    

In [185]:
# Apply and View the Psuedocount Transformation
psuedocounts_matrix_df = add_psuedocount(counts_matrix_df)

Jio.print_df(psuedocounts_matrix_df,df_name="ArgR Binding Motif Psuedocounts Matrix")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ArgR Binding Motif Psuedocounts Matrix: 4 Row x 18 Col
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

   P1  P2  P3  P4  P5  P6  P7  P8  P9  P10  P11  P12  P13  P14  P15  P16  P17  \
0                                                                               
a   9  13  22  10   5   3  22  22   4   11    9    6    8   26    5    3    3   
c   8   5   2   7   3   4   4   4   2    1    3    1    8    1    4    4   25   
g   4   3   2   9   3  22   3   3   1    2    1    2    1    2    1   16    1   
t  10  10   5   5  20   2   2   2  24   17   18   22   14    2   21    8    2   

   P18  
0       
a   26  
c    1  
g    3  
t    1  
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




In [186]:
# Apply and View the Frequency Transformation
frequency_matrix_df = counts_to_frequency_matrix_df(psuedocounts_matrix_df)

Jio.print_df(frequency_matrix_df,df_name="ArgR Binding Motif Frequency Matrix (with Psuedocount)")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ArgR Binding Motif Frequency Matrix (with Psuedocount): 4 Row x 18 Col
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

         P1        P2        P3        P4        P5        P6        P7  \
0                                                                         
a  0.290323  0.419355  0.709677  0.322581  0.161290  0.096774  0.709677   
c  0.258065  0.161290  0.064516  0.225806  0.096774  0.129032  0.129032   
g  0.129032  0.096774  0.064516  0.290323  0.096774  0.709677  0.096774   
t  0.322581  0.322581  0.161290  0.161290  0.645161  0.064516  0.064516   

         P8        P9       P10       P11       P12       P13       P14  \
0                                                                         
a  0.709677  0.129032  0.354839  0.290323  0.193548  0.258065  0.838710   
c  0.129032  0.064516  0.032258  0.096774  0.032258  0.258065  0.032258   
g  0.096774  0.032258  0.064516  0.

In [187]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#  Log Odds Ratio Frequency Matrix
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Computes matrix of the log odds ratio between frequency and expected frequency
# Expected P is the probability a nucleotide should appear (base is 1 / 4)
# Background is set to 1
def frequency_to_log_odds_matrix_df(frequency_matrix_df, expected_probability = (1/4), background = 1, frequency_uses_pseudocount=True):

    # If the frequency matrix already adjusts via psuedocount
    if frequency_uses_pseudocount:
        # Formula is Ln ( Frequency / Expected Frequency ) / Background
        log_odds_matrix_df = (
                np.log( frequency_matrix_df / expected_probability )
                /
                background
        )
        
    # If the psuedocount is not previously adjusted, add a minor amount to prevent ln(0)
    else:
        # Set an insignificant adjustment to prevent division log of zero
        adjustment = 1e-100

        # Return the Log Odds Matrix with an Adjustment Factor
        log_odds_matrix_df = (
            np.log(
                (frequency_matrix_df + adjustment) 
                / 
                (expected_probability + adjustment)
            ) 
            /
            background
        )

    # Return the Log Odds Matrix DF
    return log_odds_matrix_df

In [188]:
# Apply and View the Log Odds Ratio Transformation
log_odds_matrix_df = frequency_to_log_odds_matrix_df(frequency_matrix_df)

Jio.print_df(log_odds_matrix_df,df_name="ArgR Binding Motif Log Odds Frequency Matrix")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ArgR Binding Motif Log Odds Frequency Matrix: 4 Row x 18 Col
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

         P1        P2        P3        P4        P5        P6        P7  \
0                                                                         
a  0.149532  0.517257  1.043350  0.254892 -0.438255 -0.949081  1.043350   
c  0.031749 -0.438255 -1.354546 -0.101783 -0.949081 -0.661398 -0.661398   
g -0.661398 -0.949081 -1.354546  0.149532 -0.949081  1.043350 -0.949081   
t  0.254892  0.254892 -0.438255 -0.438255  0.948039 -1.354546 -1.354546   

         P8        P9       P10       P11       P12       P13       P14  \
0                                                                         
a  1.043350 -0.661398  0.350202  0.149532 -0.255933  0.031749  1.210404   
c -0.661398 -1.354546 -2.047693 -0.949081 -2.047693  0.031749 -2.047693   
g -0.949081 -2.047693 -1.354546 -2.047693 -1.354546 -2.047693 -1.

In [189]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#  Position-Specific Weight|Scoring Matrix (PSWM or PSSM)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Function combines the stages of generating a PSW|PSS Matrix
def count_matrix_to_pswm_df(counts_matrix_df):
    
    # Get the Psuedocount Matrix
    psuedocounts_matrix_df = add_psuedocount(counts_matrix_df)
    
    # Get the Frequency Matrix from the Counts Matrix with Psuedocount
    psuedocount_frequency_matrix_df = counts_to_frequency_matrix_df(psuedocounts_matrix_df)
    
    # Get the Log Odds Ratio Frequency Matrix from the Frequency Matrix
    pswm_df = frequency_to_log_odds_matrix_df(psuedocount_frequency_matrix_df, expected_probability = (1/4), background = 1, frequency_uses_pseudocount = True)
    
    # Return the Position Specific Weight|Scoring Matrix
    return pswm_df

In [190]:
# One-step conversion from the Counts Matrix to the PSWM Matrix
pswm_df = count_matrix_to_pswm_df(counts_matrix_df)

Jio.print_df(log_odds_matrix_df,df_name="ArgR Binding Motif PSWM")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ArgR Binding Motif PSWM: 4 Row x 18 Col
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

         P1        P2        P3        P4        P5        P6        P7  \
0                                                                         
a  0.149532  0.517257  1.043350  0.254892 -0.438255 -0.949081  1.043350   
c  0.031749 -0.438255 -1.354546 -0.101783 -0.949081 -0.661398 -0.661398   
g -0.661398 -0.949081 -1.354546  0.149532 -0.949081  1.043350 -0.949081   
t  0.254892  0.254892 -0.438255 -0.438255  0.948039 -1.354546 -1.354546   

         P8        P9       P10       P11       P12       P13       P14  \
0                                                                         
a  1.043350 -0.661398  0.350202  0.149532 -0.255933  0.031749  1.210404   
c -0.661398 -1.354546 -2.047693 -0.949081 -2.047693  0.031749 -2.047693   
g -0.949081 -2.047693 -1.354546 -2.047693 -1.354546 -2.047693 -1.354546   
t -1.354546  1.130361  0.785521  0.842679  1.043350  

In [191]:
# Save the PSWM for the Arg-R Binding Motif Region
if save_file:
    
    # The PSWM DF to Save includes the Index as a Column
    pswm_df_to_save = pswm_df.copy().reset_index(drop=False)
    
    # Save the DF in the specified file format
    Jio.df_to_file(
        df=pswm_df_to_save,
        file_name=f"ArgR-Motif-PSWM",
        file_format=output_format
    )

In [192]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#  Import Ecoli Genetic Regions Data
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Import the File
genetic_regions_dict = Jio.file_to_df(
    file_name=input_file_paths['Ecoli-Genetic-Regions'],
    force_encode_format=input_format,
    read_args= { "header": None}
)

# Bind the Dataframe
raw_genetic_regions_df = genetic_regions_dict['df']

# View Data
Jio.print_df(raw_genetic_regions_df, df_name="E coli Genetic Regions Raw")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
E coli Genetic Regions Raw: 4319 Row x 1 Col
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

                                                   0
0  16127995 \ aacggcagaccaacatcaactgcaagctttacgcg...
1  16127996 \ ccgtttgctgcatgatattgaaaaaaatatcacca...
2  16127997 \ gaaacgggacgtgaactggagctggcggatattga...
3  16127998 \ ggggattaaagtctcgacggcagaagccagggcta...
4  16127999 \ aggcgaatatggcttgttcctcggcaccgcgcatc...
5  16128000 \ ctacatttatgccgaaaacaatctcttctttttac...
6  16128001 \ ttaagaatgagagaagggttggttgtggcatcctg...
7  16128002 \ cggatgtagcgaaactgcacaaatccggtgcgaaa...
8  16128003 \ gggcgcaagcttccgtaacatcggcgaaattctgg...
9  16128004 \ ctgatgaaagacaaattattggcaaaaggtgtgtc...
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




In [193]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#  Ecoli Genetic Regions Preprocessing
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Copy DF to prevent any code issues
genetic_regions_df = raw_genetic_regions_df.copy()

# Split the FASTA Nucleotide Sequence and the Gene ID from the single Row value
genetic_regions_df[['GeneID', 'Sequence']] = (
    genetic_regions_df[0].apply(lambda x: pd.Series(x.split(' \ ')))
)

# # Set the GeneID as the Index 
# genetic_regions_df.set_index('GeneID', inplace=True)

# Remove the trailing backslash ' \' from the 'Sequence' column
genetic_regions_df['Sequence'] = genetic_regions_df['Sequence'].str.rstrip(' \\')


# Split the Upstream and Downstream Regions of the Genetic Region
genetic_regions_df['Upstream'] = genetic_regions_df['Sequence'].str.slice(0, 400)
genetic_regions_df['Downstream'] = genetic_regions_df['Sequence'].str.slice(400)

# Drop the original column if no longer needed
genetic_regions_df.drop(0, axis='columns', inplace=True)

# View Data
Jio.print_df(genetic_regions_df, df_name="E coli Genetic Regions")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
E coli Genetic Regions: 4319 Row x 4 Col
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

     GeneID                                           Sequence  \
0  16127995  aacggcagaccaacatcaactgcaagctttacgcgaacgagccatg...   
1  16127996  ccgtttgctgcatgatattgaaaaaaatatcaccaaataaaaaacg...   
2  16127997  gaaacgggacgtgaactggagctggcggatattgaaattgaacctg...   
3  16127998  ggggattaaagtctcgacggcagaagccagggctattttaccggcg...   
4  16127999  aggcgaatatggcttgttcctcggcaccgcgcatccggcgaaattt...   
5  16128000  ctacatttatgccgaaaacaatctcttctttttacgcctgaacaac...   
6  16128001  ttaagaatgagagaagggttggttgtggcatcctgcggttgataca...   
7  16128002  cggatgtagcgaaactgcacaaatccggtgcgaaaagtgaaccaac...   
8  16128003  gggcgcaagcttccgtaacatcggcgaaattctggaactggcaggc...   
9  16128004  ctgatgaaagacaaattattggcaaaaggtgtgtcgcatttgcttt...   

                                            Upstream  \
0  aacggcagaccaacatcaactgcaagctttacgcgaacgagccatg...   
1  ccgtttgctgcatgatattgaaaaaaatatcacc

In [194]:
# Check Lengths of Sequences to ensure they meet the specifications

# Copy DF to prevent any code issues
region_lengths_df = genetic_regions_df.copy()

# Calculate lengths for Sequence, Upstream, and Downstream columns
# Should be 450
region_lengths_df['Seq_length'] = region_lengths_df['Sequence'].apply(len)
# Should be 400
region_lengths_df['Upstream_length'] = region_lengths_df['Upstream'].apply(len)
# Should be  50
region_lengths_df['Downstream_length'] = region_lengths_df['Downstream'].apply(len)

# Drop the original columns
region_lengths_df.drop(
    ['Sequence','Upstream','Downstream'],
    axis='columns', inplace=True
)

# Inpsect the Metadata regarding Length per 
Jio.print_df(region_lengths_df, df_name="E coli Genetic Regions Metadata",rows=3)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
E coli Genetic Regions Metadata: 4319 Row x 4 Col
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

     GeneID  Seq_length  Upstream_length  Downstream_length
0  16127995         450              400                 50
1  16127996         450              400                 50
2  16127997         450              400                 50
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




In [195]:
genetic_regions_df

Unnamed: 0,GeneID,Sequence,Upstream,Downstream
0,16127995,aacggcagaccaacatcaactgcaagctttacgcgaacgagccatg...,aacggcagaccaacatcaactgcaagctttacgcgaacgagccatg...,atgaaacgcattagcaccaccattaccaccaccatcaccattacca...
1,16127996,ccgtttgctgcatgatattgaaaaaaatatcaccaaataaaaaacg...,ccgtttgctgcatgatattgaaaaaaatatcaccaaataaaaaacg...,atgcgagtgttgaagttcggcggtacatcagtggcaaatgcagaac...
2,16127997,gaaacgggacgtgaactggagctggcggatattgaaattgaacctg...,gaaacgggacgtgaactggagctggcggatattgaaattgaacctg...,atggttaaagtttatgccccggcttccagtgccaatatgagcgtcg...
3,16127998,ggggattaaagtctcgacggcagaagccagggctattttaccggcg...,ggggattaaagtctcgacggcagaagccagggctattttaccggcg...,atgaaactctacaatctgaaagatcacaacgagcaggtcagctttg...
4,16127999,aggcgaatatggcttgttcctcggcaccgcgcatccggcgaaattt...,aggcgaatatggcttgttcctcggcaccgcgcatccggcgaaattt...,gtgaaaaagatgcaatctatcgtactcgcactttccctggttctgg...
...,...,...,...,...
4314,16132216,gtgctgacgactacgtggctaaaccgttttcaccccgcgaagtgtg...,gtgctgacgactacgtggctaaaccgttttcaccccgcgaagtgtg...,atgcgtatcggcatgcggttgttgctgggctattttttactggtgg...
4315,16132217,tgctgctgaaccggcgttactggagcaggcgctgggaaatttactg...,tgctgctgaaccggcgttactggagcaggcgctgggaaatttactg...,atgttgaaatcccccctgttctggaaaatgactagcctgtttggtg...
4316,16132218,acgtttattagttgtatgatgcaactagttggattattaaaataat...,acgtttattagttgtatgatgcaactagttggattattaaaataat...,atgcagaccccgcacattcttatcgttgaagacgagttggtaacac...
4317,16132219,gtgatgtagtcatctgcaccgatttcgaggccgagaattttatcga...,gtgatgtagtcatctgcaccgatttcgaggccgagaattttatcga...,atgactaaagtacgtaattgcgttcttgatgcactttccatcaacg...


In [196]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#  Prepare the Genetic Regions DF to be evaluated by PSWM
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [197]:
# Function to Partition a Dataframe by Chunks
def partition_df(df, partition_size, partition_col='Sequence', identifier_col=None, partial_partitions=True, overlaps=False):
    # Prepare a list to collect the partitioned sequences
    partitions = []

    # Extract index name if identifier_col is None and index has a name
    index_name = df.index.name if identifier_col is None and df.index.name is not None else 'Index'

    # Determine the step size based on whether overlaps are allowed
    step_size = 1 if overlaps else partition_size

    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        # Determine the identifier based on whether a specific identifier column is provided
        identifier = row[identifier_col] if identifier_col else index

        # Extract the sequence
        sequence = row[partition_col]

        # Generate partitions of the sequence
        for i in range(0, len(sequence) - partition_size + 1, step_size):
            partitioned_sequence = sequence[i:i + partition_size]
            start_point = i
            end_point = i + len(partitioned_sequence) - 1

            # Prepare the data tuple
            partition_data = (
                identifier,
                partitioned_sequence,
                start_point,
                end_point
            )

            # Add the partition depending on whether partial partitions are allowed
            if partial_partitions or len(partitioned_sequence) == partition_size:
                partitions.append(partition_data)

    # Define column names based on whether an identifier column is provided
    columns = [identifier_col if identifier_col else index_name, f'{partition_col}_Partition', 'Start', 'End']

    # Create a new DataFrame from the partitions list
    long_df = pd.DataFrame(partitions, columns=columns)

    # Set the index of the DataFrame if no identifier column was specified
    if identifier_col is None:
        long_df.set_index(index_name, inplace=True)

    # Return the long DataFrame
    return long_df

In [198]:
# Partition Each Sequence of the Gene 
partitioned_genetic_regions_df = partition_df(genetic_regions_df, partition_size=len(pswm_df.columns), identifier_col = 'GeneID', overlaps=True)

In [199]:
# Generate all the Partitions that Need to be Evaluated
Jio.print_df(partitioned_genetic_regions_df, df_name="E coli Genetic Regions partitioned by PSWM Candidates")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
E coli Genetic Regions partitioned by PSWM Candidates: 1870127 Row x 4 Col
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

     GeneID  Sequence_Partition  Start  End
0  16127995  aacggcagaccaacatca      0   17
1  16127995  acggcagaccaacatcaa      1   18
2  16127995  cggcagaccaacatcaac      2   19
3  16127995  ggcagaccaacatcaact      3   20
4  16127995  gcagaccaacatcaactg      4   21
5  16127995  cagaccaacatcaactgc      5   22
6  16127995  agaccaacatcaactgca      6   23
7  16127995  gaccaacatcaactgcaa      7   24
8  16127995  accaacatcaactgcaag      8   25
9  16127995  ccaacatcaactgcaagc      9   26
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




In [200]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#  Evaluate Partitioned Genetic Regions DF with PSWM
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Computes the Score of a candidate sequence with the PSWM
def evaluate_sequence_by_pswm_df(sequence,pswm_df,vectorized_approach=True):
    # Check if the sequence length matches the PSWM's number of columns
    if len(sequence) != len(pswm_df.columns):
        # Otherwise raise an error
        raise ValueError(f"Sequence length {len(sequence)} does not match PSWM length {len(pswm_df.columns)}.")

    # Check if all unique characters in the sequence are represented in the PSWM index
    if not set(sequence).issubset(set(pswm_df.index)):
        # Otherwise raise an error
        raise ValueError(f"Sequence contains characters not represented in the PSWM: {set(sequence) - set(pswm_df.index)}")

    # SLOW NESTED LOOP METHOD
    if not vectorized_approach:
        # Evaluate the sequence with the PSWM
        score = 0 # Initalize a score
        for i, char in enumerate(sequence):
            score += pswm_df.loc[char].iloc[i]

    # FAST VECTORIZED METHOD 
    else:
        # Suggested by ChatGPT
        # Convert sequence characters to indices using the PSWM index
        # This presumes that PSWM's index is ordered as needed by the sequence characters
        indices = [pswm_df.index.get_loc(char) for char in sequence]
    
        # Select the appropriate scores using advanced indexing
        scores = pswm_df.values[indices, np.arange(len(sequence))]
    
        # Sum up the scores
        score = np.sum(scores)

    # Return the composite score of the sequence
    return score


In [201]:
# Original Approach by Jay took 3 seconds and 700 ms

# Calculate the Scores for 1% of the DataFrame to Test Validity of the Logic
subset_size = int(len(partitioned_genetic_regions_df) * 0.01)
subset_df = partitioned_genetic_regions_df.iloc[:subset_size].copy()  


# Apply the function to each row of the subset DataFrame
subset_df['Score'] = subset_df.apply(
    lambda row: evaluate_sequence_by_pswm_df(row['Sequence_Partition'], pswm_df,vectorized_approach=False),
    axis='columns'
)


In [202]:
# Vectorized Approach with Numpy as suggested by ChatGPT takes 273 ms

# Calculate the Scores for 1% of the DataFrame to Test Validity of the Logic
subset_size = int(len(partitioned_genetic_regions_df) * 0.01)
subset_df = partitioned_genetic_regions_df.iloc[:subset_size].copy()

# Apply the function to each row of the subset DataFrame
subset_df['Score'] = subset_df.apply(
    lambda row: evaluate_sequence_by_pswm_df(row['Sequence_Partition'], pswm_df,vectorized_approach=True),
    axis='columns'
)

In [203]:
# Vectorized Approach with Numpy as suggested by ChatGPT takes 273 ms
# Original Approach by Jay took 3 seconds and 809 ms

# Scored Genetic Regions DF
scored_genetic_regions_df = partitioned_genetic_regions_df.copy()

# Apply the function to each row of the DataFrame
scored_genetic_regions_df['Score'] = scored_genetic_regions_df.apply(
    lambda row: evaluate_sequence_by_pswm_df(row['Sequence_Partition'], pswm_df),
    axis='columns'
)

In [204]:
# Display the DataFrame with the new 'Score' column
print(partitioned_genetic_regions_df)

           GeneID  Sequence_Partition  Start  End
0        16127995  aacggcagaccaacatca      0   17
1        16127995  acggcagaccaacatcaa      1   18
2        16127995  cggcagaccaacatcaac      2   19
3        16127995  ggcagaccaacatcaact      3   20
4        16127995  gcagaccaacatcaactg      4   21
...           ...                 ...    ...  ...
1870122  16132220  ccgccagagcagaaaata    428  445
1870123  16132220  cgccagagcagaaaatat    429  446
1870124  16132220  gccagagcagaaaatatt    430  447
1870125  16132220  ccagagcagaaaatattg    431  448
1870126  16132220  cagagcagaaaatattgg    432  449

[1870127 rows x 4 columns]


In [205]:
# Modify the Start and End Positions to Correspond to Upstream and Downstream
# Upstream = Negative | Downstream = Positive

scored_genetic_regions_df['Start'] -= 400
scored_genetic_regions_df['End'] -= 400

In [206]:
Jio.print_df(scored_genetic_regions_df,'E coli candidate motif regions as evaluated by PSWM')

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
E coli candidate motif regions as evaluated by PSWM: 1870127 Row x 5 Col
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

     GeneID  Sequence_Partition  Start  End      Score
0  16127995  aacggcagaccaacatca   -400 -383  -6.009405
1  16127995  acggcagaccaacatcaa   -399 -382  -4.967363
2  16127995  cggcagaccaacatcaac   -398 -381 -10.219168
3  16127995  ggcagaccaacatcaact   -397 -380 -11.167474
4  16127995  gcagaccaacatcaactg   -396 -379  -4.301812
5  16127995  cagaccaacatcaactgc   -395 -378  -5.766668
6  16127995  agaccaacatcaactgca   -394 -377  -0.367716
7  16127995  gaccaacatcaactgcaa   -393 -376  -7.400153
8  16127995  accaacatcaactgcaag   -392 -375  -9.973552
9  16127995  ccaacatcaactgcaagc   -391 -374 -12.817403
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




In [207]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#  Gather Top Genes in terms of Motif Consensus
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

top_genes_threshold = 30

# Group by 'GeneID' and find the row index of the max 'Score' per group
idx_max_scores = scored_genetic_regions_df.groupby('GeneID')['Score'].idxmax()

# Use the indices to subset the rows with the max 'Score' from the original DataFrame
top_genes_df = scored_genetic_regions_df.loc[idx_max_scores]

# Sort subsetted rows by 'Score' in descending order and get the top 30
top_genes_df = top_genes_df.sort_values(by='Score', ascending=False).head(top_genes_threshold)

In [208]:
# View the Top 30 Genes
Jio.print_df(top_genes_df ,'E coli genes with highest motif candidacy as evaluated by PSWM')

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
E coli genes with highest motif candidacy as evaluated by PSWM: 30 Row x 5 Col
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

           GeneID  Sequence_Partition  Start  End      Score
1341103     b3171  tcactgaatttttatgca   -298 -281  14.543283
113799   16128258  aaagtgaattttaattca    -47  -30  14.391985
1808996  16132076  aaattgaattttaattca    -45  -28  13.804199
1369154  16131126  cagatgaatttttatgca   -392 -375  13.234430
1421868  16131238  ttagtgattttttatgca    -71  -54  13.089849
567057   16129301  tgaatgaataatcatgca   -140 -123  12.701051
292111   16128684  ctaatgtatatttatgca   -131 -114  12.536748
1129829  16130583  atattgcatttttattca   -268 -251  12.396702
1679074  16131795  ttagtgtatttttattca    -67  -50  12.396702
13339    16128026  attgtgaattaatatgca    -51  -34  11.908349
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




In [209]:
# Save the Results of the PSWM Evaluation
if save_file:

    # Establish the Result DFs to Save
    result_dfs = {
        'Top30': top_genes_df,
        'All': scored_genetic_regions_df,
    }
    
    # Save the Result DFs
    for subset, result_df in result_dfs.items():
        # Save the DF in the specified file format
        Jio.df_to_file(
            df=result_df,
            file_name=f"Gene_Binding_Site_Evaluations_{subset}",
            file_format=output_format
        )


In [210]:
# ~~~~~~~~~~~~~~~
#  End of Script
# ~~~~~~~~~~~~~~~
