In [18]:
# Notebook provenance record. The keys are padded with trailing spaces to a
# common width, so they must be looked up with that padding included.
metadata = dict([
    ('Author      ', 'Jay Annadurai'),
    ('Date        ', '20 Mar 2024'),
    ('Project     ', 'PQ-B528'),
    ('Version     ', 1.0),
    ('Description ', ' '),
])

In [19]:
# ~~~~~~~~~~~~~~~~~~
#  Import Libraries
# ~~~~~~~~~~~~~~~~~~
import pandas as pd  # Data Reading
import seaborn as sb  # Advanced Data Visualization
import matplotlib.pyplot as plt  # Data Visualization
import numpy as np  # Computation
import scipy as sp # Statistical Methods

# ~~~~~~~~~~~~~~~~~~~~~~~~
#  Import Utility Classes
# ~~~~~~~~~~~~~~~~~~~~~~~~
from pprint import pprint as print  # Override the standard print function with Pretty Print
from JayUtilities import DataIO as Jio  # Data Input/Output Processing Utility Class

In [20]:
# ~~~~~~~~~~~~~~~
#  Script Config
# ~~~~~~~~~~~~~~~
# --- Input Config ---
# Folder the DataIO class reads input files from
Jio.input_folder = 'Input/'
# Name of the input file
input_file = "SampleInput.xlsx"
# Optionally force the encode format of the input file
input_format = "xlsx"

# --- Output Config ---
# Folder the DataIO class writes output files to
Jio.output_folder = 'Output/'
# Whether the script should save the final file or not.
# This must be a plain bool: a trailing comma after the value would turn it
# into a one-element tuple such as (True,) or (False,), and any non-empty
# tuple is truthy, so the `if save_file:` guard could never be switched off.
save_file = True
output_file = 'SampleOutput'  # Name of the file to save the Output to
output_format = 'tsv'  # Format of the file to save the Output as

In [27]:
# ~~~~~~~~~~~~~
#  Import Data
# ~~~~~~~~~~~~~
# Load the configured input file through the DataIO helper. The returned
# mapping is indexed below by "df" (the frame used by the following cells)
# and by "metadata" (shown here for inspection).
df_dict = Jio.file_to_df(
    file_name=input_file,
    force_encode_format=input_format,
)
dataframe = df_dict['df']
# `print` is rebound to pprint in the import cell, so this pretty-prints.
print(df_dict["metadata"])

{'cols': {'count': 19,
          'dtypes': {'GSM820516': dtype('float64'),
                     'GSM820517': dtype('float64'),
                     'GSM820518': dtype('float64'),
                     'GSM820519': dtype('float64'),
                     'GSM820520': dtype('float64'),
                     'GSM820521': dtype('float64'),
                     'GSM820522': dtype('float64'),
                     'GSM820523': dtype('float64'),
                     'GSM820524': dtype('float64'),
                     'GSM820525': dtype('float64'),
                     'GSM820526': dtype('float64'),
                     'GSM820527': dtype('float64'),
                     'GSM820528': dtype('float64'),
                     'GSM820529': dtype('float64'),
                     'GSM820530': dtype('float64'),
                     'GSM820531': dtype('float64'),
                     'GSM820532': dtype('float64'),
                     'GSM820533': dtype('float64'),
                     'Probe_ID': dtype('O

In [22]:
# Preview the loaded frame under a descriptive title before it is reshaped
Jio.print_df(dataframe, 'Gene Expression Data')

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Gene Expression Data: 48804 Row x 19 Col
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

       Probe_ID   GSM820516   GSM820517   GSM820518   GSM820519   GSM820520  \
0  ILMN_1343291      0.0000      0.0000      0.0000      0.0000      0.0000   
1  ILMN_1343291  66665.3800  69404.6700  64128.0700  68943.9700  67827.2200   
2  ILMN_1343295  22040.1100  13046.3400  38678.9600  16641.8900  33719.8900   
3  ILMN_1651199      0.0000    205.4483    217.2475    229.0451    226.3029   
4  ILMN_1651209   -278.5710    253.7044    211.8002    278.0423    259.8059   
5  ILMN_1651210         NaN    195.9835    175.3356    193.9065    229.5674   
6  ILMN_1651221         NaN    206.0723    219.5992    205.0462    194.7481   
7  ILMN_1651228  17556.5200   9639.6110  20990.7000  19182.0300  24082.5100   
8  ILMN_1651229    547.4667    799.7516    790.4313   1127.9260    935.4859   
9  ILMN_1651230    227.9082    219.9870    211.4823    212.2066    201.5164   

    GS

In [23]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#  Melt the DF into Long Format
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Reshape via the DataIO helper: "Probe_ID" is passed as the grouping column,
# "Sample" as the name for the former column headers and "Expression" as the
# name for the value column.
long_dataframe = Jio.wide_to_long(
    df=dataframe,
    group_by='Probe_ID',
    grouped_columns_name='Sample',
    dependent_variable_name='Expression',
)

Jio.print_df(long_dataframe, 'Long Gene Expression')

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Long Gene Expression: 878472 Row x 3 Col
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

       Probe_ID     Sample  Expression
0  ILMN_1343291  GSM820516      0.0000
1  ILMN_1343291  GSM820516  66665.3800
2  ILMN_1343295  GSM820516  22040.1100
3  ILMN_1651199  GSM820516      0.0000
4  ILMN_1651209  GSM820516   -278.5710
5  ILMN_1651210  GSM820516         NaN
6  ILMN_1651221  GSM820516         NaN
7  ILMN_1651228  GSM820516  17556.5200
8  ILMN_1651229  GSM820516    547.4667
9  ILMN_1651230  GSM820516    227.9082
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [24]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#  Process the Long Format DF
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Clean the "Expression" column with the DataIO helper. The policy is given
# explicitly per value class: NaNs and negatives are set to "Drop", zeroes to
# "Keep", and the column type is forced to "float".
final_df = Jio.preprocess_long_df_values(
    long_df=long_dataframe,
    data_columns="Expression",
    behavior_nans="Drop",
    behavior_negs="Drop",
    behavior_zeroes="Keep",
    force_type="float",
)

Jio.print_df(final_df, 'Processed Long Gene Expression')


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Processed Long Gene Expression: 878467 Row x 3 Col
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

       Probe_ID     Sample  Expression
0  ILMN_1343291  GSM820516      0.0000
1  ILMN_1343291  GSM820516  66665.3800
2  ILMN_1343295  GSM820516  22040.1100
3  ILMN_1651199  GSM820516      0.0000
4  ILMN_1651228  GSM820516  17556.5200
5  ILMN_1651229  GSM820516    547.4667
6  ILMN_1651230  GSM820516    227.9082
7  ILMN_1651232  GSM820516    304.5083
8  ILMN_1651235  GSM820516    245.1882
9  ILMN_1651236  GSM820516    232.6654
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [25]:
# ~~~~~~~~~~~~~~~~~
#  Save the Output
# ~~~~~~~~~~~~~~~~~
# Write the processed frame only when the config flag is truthy; the target
# name and format come from the Script Config cell.
if save_file:
    Jio.df_to_file(
        df=final_df,
        file_name=output_file,
        file_format=output_format,
    )


In [26]:
# ~~~~~~~~~~~~~~~
#  End of Script
# ~~~~~~~~~~~~~~~
