In [1]:
import pandas as pd

### Reading and Formatting Functions

In [4]:
# This file came from running "find . -type f -name "*.mod" | grep -v '/\._\|PEET' " 
# in the directory "grp_tomo_db1_d1/compute/TomoDB1_d1/FlagellarMotor_P1"
def read_data_file(filename):
    """Read a text file that lists one .mod file path per line.

    Each non-blank line is stripped of surrounding whitespace and split on '/'
    so that every path component lands in its own column. The first component
    (the leading "." of paths such as "./run/MS.mod") is dropped, and the
    complete stripped path is kept in a 'file_name' column.

    parameters:
    filename (str): the name of the file to read in

    returns:
    pd.DataFrame: one row per non-blank line; integer-labelled columns
        (1, 2, ...) hold the path components after the first one, and
        'file_name' holds the whole path. Rows with fewer components than
        the deepest path are padded with missing values.
    """

    # Read every line, then close the file before doing any processing.
    with open(filename, 'r') as file:
        stripped_lines = [line.strip() for line in file]

    # Skip blank lines (e.g. a trailing newline at the end of the file) so they
    # do not turn into rows that contain no path information.
    paths = [path for path in stripped_lines if path]

    # One column per path component, split on '/'
    components = pd.DataFrame([path.split('/') for path in paths])

    # Drop the first ("." ) column. The explicit copy makes the result an
    # independent frame, so adding a column below does not write into a slice
    # of `components` (which is what triggers SettingWithCopyWarning).
    path_df = components.iloc[:, 1:].copy()

    # Add a column for the whole file name
    path_df['file_name'] = paths

    return path_df

In [5]:
def df_formatter(df, useful_cols_only=True):
    """
    Derive summary columns from a dataframe of split .mod file paths.

    The input dataframe is not modified; all work is done on a copy.

    Columns that are added:
    - 'Runs': the first string value in the row that contains '20', is at
      least 10 characters long and ends with a digit (None if there is none).
    - 'Bacteria Species', 'segment_2', 'segment_3', ...: the pieces of column
      `1` after splitting it on underscores and spaces. The first piece is
      named 'Bacteria Species'.
    - 'Mod Parts': for the first string value in the row that contains '.mod',
      the text before its first '.' (None if there is none).

    params:
    df (pd.DataFrame): The dataframe to be formatted (that is a dataframe from reading the mod file directories).
        It must have a column labelled `1` and a column labelled 'file_name'.
    useful_cols_only (bool): If True, only the useful columns are returned. If False, all columns are returned

    returns:
    pd.DataFrame: The formatted dataframe. With useful_cols_only=True the columns are
        'Bacteria Species', 'Mod Parts', 'Runs', 'file_name' followed by any 'segment_*' columns.
    """

    # Work on a copy so the caller's dataframe does not silently gain columns
    formatted = df.copy()

    # Heuristic for picking the run name out of a row
    # TODO: This function is not the best and should be improved. Maybe use regex?
    def find_runs(row):
        for col_value in row:
            if (isinstance(col_value, str)
                    and '20' in col_value
                    and len(col_value) >= 10
                    and col_value[-1:].isdigit()):
                return col_value
        return None

    # Create a new column 'Runs' using the find_runs function on each row
    formatted['Runs'] = formatted.apply(find_runs, axis=1)

    # Split column 1 into segments on '_' or ' '. The number of segment columns
    # is taken from the split result itself, which avoids computing a maximum
    # count that turns into a float (and breaks range()) when column 1 has
    # missing values.
    segments = formatted[1].str.split(r'[_ ]', expand=True)
    segment_names = [f'segment_{i}' for i in range(1, segments.shape[1] + 1)]
    formatted[segment_names] = segments
    # Rename the first segment to "Bacteria Species"
    formatted = formatted.rename(columns={'segment_1': 'Bacteria Species'})

    # Heuristic for picking the .mod file type out of a row
    # TODO: This function is not the best and should be improved. Maybe use regex?
    def find_part(row):
        for col_value in row:  # For each value in the row
            if isinstance(col_value, str) and '.mod' in col_value:
                # Keep only the text before the first '.'
                return col_value.split('.')[0]
        return None

    # Create a new column 'Mod Parts' using the find_part function on each row
    formatted['Mod Parts'] = formatted.apply(find_part, axis=1)

    if not useful_cols_only:
        return formatted

    # Return only the created columns
    # and the columns with "segment" in their name
    segment_cols = [col for col in formatted.columns if 'segment' in str(col)]
    return formatted[['Bacteria Species', 'Mod Parts', 'Runs', 'file_name'] + segment_cols]


### Actually Read & Format The Data, and Double Check It

In [6]:
# Load the list of .mod paths and derive the summary columns in a single step
df_FM_P1 = df_formatter(read_data_file('10052_Hpylori_dfliQ.txt'))

# Display the formatted frame as a sanity check
df_FM_P1

Unnamed: 0,Bacteria Species,Mod Parts,Runs,file_name
0,mka2019-11-19-143,MS,mka2019-11-19-143,./mka2019-11-19-143/MS.mod
1,mka2019-11-19-73,MS,mka2019-11-19-73,./mka2019-11-19-73/MS.mod
2,mka2019-11-19-192,MS,mka2019-11-19-192,./mka2019-11-19-192/MS.mod
3,mka2019-11-19-42,MS,mka2019-11-19-42,./mka2019-11-19-42/MS.mod
4,mka2019-11-19-59,hat,mka2019-11-19-59,./mka2019-11-19-59/hat.mod
...,...,...,...,...
102,mka2019-11-19-198,MS,mka2019-11-19-198,./mka2019-11-19-198/MS.mod
103,mka2019-11-19-92,hat,mka2019-11-19-92,./mka2019-11-19-92/hat.mod
104,mka2019-11-19-203,MS,mka2019-11-19-203,./mka2019-11-19-203/MS.mod
105,mka2019-11-19-203,MSC,mka2019-11-19-203,./mka2019-11-19-203/MSC.mod


In [7]:
# Tally how many rows there are for each distinct 'Mod Parts' value
df_FM_P1.loc[:, 'Mod Parts'].value_counts()

Mod Parts
MS     67
hat    33
MSC     7
Name: count, dtype: int64

In [8]:
# Tally how many rows there are for each distinct 'Runs' value
df_FM_P1.loc[:, 'Runs'].value_counts()

Runs
mka2019-11-19-65     3
mka2019-11-19-50     2
mka2019-11-19-203    2
mka2019-11-19-17     2
mka2019-11-19-127    2
                    ..
mka2019-11-19-220    1
mka2019-11-19-4      1
mka2019-11-19-129    1
mka2019-11-19-113    1
mka2019-11-19-144    1
Name: count, Length: 94, dtype: int64