In [69]:
metadata = {
    'Author      ': 'Jay Annadurai',
    'Date        ': '14 Feb 2024',
    'Project     ': 'A1-Matrices',
    'Version     ': 1.0,
    'Description ': 'Reads and Compares Gene Expression Matrices in RPKM and Generates corresponding Correlation Matrices and  Correlation Heatmaps'
}

In [70]:
# Input Matrix Files
matrix = "DecayTimecourse"
input_folder = "data/"
output_folder = "output/"

In [71]:
# Import Libraries
import pandas as pd  # Data Reading
import seaborn as sb  # Advanced Data Visualization
import matplotlib.pyplot as mplot  # Data Visualization
import numpy as numpy  # Computation

In [72]:
###############
# Import Data #
###############

def read_matrix_to_df(folder: str, file_name: str, extension: str):
    # Function Reads the Matrix and Returns it as a Pandas DF

    # Read the TSV
    extension = "." + extension.lower()
    file_path = folder + file_name + extension
    matrix_df = pd.read_csv(file_path, sep='\t', header=0)

    # Drop the Extra Column Produced by Extraneous Delimiters
    matrix_df = matrix_df.drop(columns=['Unnamed: 13'])

    # Excluding the non-numeric column (assuming it's the first column)
    numeric_df = matrix_df.select_dtypes(include=[float, int])

    # Generate the 1D Vector of Values
    matrix_vector = numeric_df.values.flatten()

    # Return a Dict with the File Name, the Original DF, the Numeric DF, and the Flattened Vector
    return {"name": file_name, "df": matrix_df, "numeric_df": numeric_df, "vector": matrix_vector}

In [73]:
# Import the Dataframe
decay_df_dict = read_matrix_to_df(input_folder, matrix,"txt")
decay_df = decay_df_dict["df"]

In [74]:
decay_df

Unnamed: 0,Time course #,timecourse1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 18,timecourse3,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27
0,YORF,0.0,5.000000,10.000000,15.000000,20.000000,30.000000,40.000000,50.000000,60.000000,...,60.000000,0.0,5.000000,10.000000,15.000000,20.000000,30.000000,40.000000,50.000000,60.000000
1,YAL026C,1.0,0.703032,,,,0.713088,0.332218,0.215325,0.226495,...,,,,,,,,,,
2,YDR148C,,,,,,,,,,...,0.374799,1.0,0.968435,,0.551961,0.430054,0.483829,0.398754,0.223900,0.246680
3,YIL125W,1.0,1.176639,0.823170,0.556528,,0.712678,0.427962,0.320204,0.276578,...,,1.0,0.830889,0.577928,0.812495,0.591393,0.564740,0.450448,0.494973,0.374799
4,YLR240W,,,,,,,,,,...,,1.0,0.930606,,0.637943,,0.610046,0.340338,0.260625,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6180,YHR217C,1.0,,,,,1.086471,-0.683673,0.853815,0.131799,...,0.621047,1.0,0.720896,0.217361,0.372403,-0.245485,-1.210194,0.255268,-1.275420,
6181,YPR202W,1.0,-1.614135,-1.987968,-1.572060,-2.140755,,-2.624831,,-2.929300,...,-2.435356,1.0,-1.062584,-1.766918,-1.066211,-1.900540,-1.659776,-2.599347,-3.163888,-2.765729
6182,YFL063W,1.0,0.709353,,,0.384163,-0.003912,-1.611686,,-1.417343,...,,1.0,,-1.107214,-0.985987,-1.409186,-1.515253,-2.073723,-2.828369,
6183,YDR543C,1.0,0.384179,-11.808840,-12.892562,,-88.868870,,,,...,,1.0,,,,,,,,


In [75]:
###################################################
# Split the Dataframes per Time Course Experiment #
###################################################

# Initialize a list to keep track of the start indices of each 'timecourse' set
timecourse_start_indices = []

# Identify columns that start with 'timecourse'
# Generate the index intervals for each timecourse dataset
for col_index, col in enumerate(decay_df.columns):
    if col.startswith('timecourse'):
        timecourse_start_indices.append(col_index)

# Add the last column index to cover the range for the last segment
timecourse_start_indices.append(len(decay_df.columns))

# Generate the actual intervals as ranges
timecourse_interval_ranges = []
for i,start_index in enumerate(timecourse_start_indices):
    # Skip the first element
    if i == 0: continue
    
    # The interval from the previous index to the current index are the interval endpoints for each time course set
    interval_range = range(timecourse_start_indices[i-1],timecourse_start_indices[i])
    
    # Append the Interval Range into the List
    timecourse_interval_ranges.append(interval_range)

In [88]:
# Initialize the List to store the individual Time Course DFs
decay_timecourse_dfs = []

# Iterate through the Ranges corresponding to the Endpoints of each Time Course
for timecourse_interval_range in timecourse_interval_ranges:
    
    # Select the Range of each Time Course 
    # Note iloc is the subset of the dataframe
    # iloc uses indices verus loc which uses column and row labels
    decay_timecourse_df = decay_df.iloc[:,timecourse_interval_range]
    
    # Add the first column which is the label for each row
    # The first column was lost when the df was split and must be fused
    # Fusion is performed via Pandas' df concat along the 
    decay_df_row_labels = decay_df.iloc[:, 0]
    decay_timecourse_df = pd.concat([decay_df_row_labels, decay_timecourse_df], axis="columns")
    
    # Set the Column Labels, i.e. the Headers, to the Row of Index = 0
    decay_timecourse_df.columns = decay_timecourse_df.iloc[0,:]

    # Drop the First Row which is the Header but Keep All the Columns
    decay_timecourse_df = decay_timecourse_df.iloc[1:,:]
    
    # Reset the Index In-Place
    # decay_timecourse_df.reset_index(drop=True, inplace=True)

    # Attach the Individual DF to the List of DFs
    decay_timecourse_dfs.append(decay_timecourse_df)

    # Debug, display the Header
    print(f"Decay Timecourse Dataframe from {str(timecourse_interval_range)} \n\n", decay_timecourse_df.head(3), "\n")
    
# decay_timecourse_dfs now contains individual dfs for each time course

Decay Timecourse Dataframe from range(1, 10) 

 0     YORF  0.0       5.0     10.0      15.0  20.0      30.0      40.0  \
1  YAL026C  1.0  0.703032      NaN       NaN   NaN  0.713088  0.332218   
2  YDR148C  NaN       NaN      NaN       NaN   NaN       NaN       NaN   
3  YIL125W  1.0  1.176639  0.82317  0.556528   NaN  0.712678  0.427962   

0      50.0      60.0  
1  0.215325  0.226495  
2       NaN       NaN  
3  0.320204  0.276578   

Decay Timecourse Dataframe from range(10, 18) 

 0     YORF  0.0       5.0      10.0      20.0      30.0      40.0      50.0  \
1  YAL026C  1.0  1.160240       NaN  1.129715  0.645523  0.344193  0.635076   
2  YDR148C  1.0  0.830889  0.577928  0.591393  0.564740  0.450448  0.494973   
3  YIL125W  1.0  0.978603  0.574234  0.685354  0.705738  0.403536  0.480144   

0      60.0  
1       NaN  
2  0.374799  
3       NaN   

Decay Timecourse Dataframe from range(18, 27) 

 0     YORF  0.0       5.0      10.0      15.0      20.0      30.0      40.0  \
1  YA

In [76]:
###################################################
# Restructure DFs into Optimal Form for Analytics #
###################################################

for decay_timecourse_df in decay_timecourse_dfs:
    # Use Pandas Melt