# Generate a table for iModulonDB listing the genes that are deleted in specific conditions

In [1]:
import pandas as pd

In [24]:
# Input tables: the merged gene info table, and the semicolon-delimited table
# of BGC deletions whose 'from'/'to' columns hold the boundary locus_tags.
gene_info_csv = '../../data/processed/modulome/merged_gene_info.csv'
bgc_deletions_csv = '../../data/raw/modulome/J1074_BGC_deletions.csv'

gene_info = pd.read_csv(gene_info_csv)
gene_deletion = pd.read_csv(bgc_deletions_csv, sep=';')

# Preview the deletion table
gene_deletion.head()

Unnamed: 0,BGC,Deletion,Status,%_reduced,strain_after_deletion,from,to
0,19-23,1,Deleted,6.65 (455 kb),J1074_CfBΔ5,XNR_RS27780,XNR_RS31675
1,13,2,Deleted,0.04 (2.7 kb),J1074_CfBΔ6,XNR_RS11580,XNR_RS11585
2,15,3,Deleted,0.63 (42.8 kb),J1074_CfBΔ7,XNR_RS17095,XNR_RS17110
3,5,4,Deleted,0.33 (22.3 kb),J1074_CfBΔ8,XNR_RS02850,XNR_RS02945
4,2,5,Deleted,0.41 (28 kb),J1074_CfBΔ9,XNR_RS00955,XNR_RS01020


## Identify all genes between the two locus_tags and add them to a list.

In [26]:
# For every deletion, collect the locus_tags it removes, plus the running
# (cumulative) list of everything removed up to and including that deletion.
# Assumes the rows of gene_info are ordered along the genome, so that the genes
# "between" two locus_tags are exactly the rows between them -- TODO confirm.

# Positional view of the locus_tag column, built once, so the slicing below
# does not depend on whatever index gene_info happens to carry
locus_tags = gene_info['locus_tag'].reset_index(drop=True)

# One list of deleted locus_tags per row of gene_deletion
deleted_genes_master_list = []
for first_tag, last_tag in zip(gene_deletion['from'], gene_deletion['to']):
    bounds = []
    for tag in (first_tag, last_tag):
        hits = locus_tags.index[locus_tags == tag]
        if len(hits) == 0:
            # Name the offending tag instead of failing with an opaque IndexError
            raise KeyError(f"locus_tag {tag!r} not found in gene_info")
        bounds.append(hits[0])  # first occurrence, should the tag be duplicated

    # Accept the boundaries in either order: a reversed pair would otherwise
    # slice to an empty list and silently record a deletion with no genes
    start, stop = sorted(bounds)

    # Inclusive of both boundary genes
    deleted_genes_master_list.append(locus_tags.iloc[start:stop + 1].tolist())

gene_deletion['deleted_genes'] = deleted_genes_master_list

# Cumulative lists: entry i holds the genes of deletions 0..i, in order
cumulative_deleted_genes_list = []
cumulative_genes = []
for deleted_genes in deleted_genes_master_list:
    cumulative_genes.extend(deleted_genes)
    # Store a snapshot; appending the running list itself would make every
    # entry alias the same, still-growing object
    cumulative_deleted_genes_list.append(list(cumulative_genes))

gene_deletion['cumulative_deleted_genes'] = cumulative_deleted_genes_list

# Save the gene_deletion dataframe to a new CSV file
# (list-valued cells are written as their Python string representation)
gene_deletion.to_csv('../../data/processed/modulome/J1074_BGC_deletions_with_genes.csv', index=False)

# Preview goes last so that the cell actually renders it
gene_deletion.head()


In [18]:
# For every deletion, build one string holding the locus_tags it removes,
# joined with underscores, and store it in the 'deleted_genes' column
# (replacing any previous content of that column).
# Assumes the rows of gene_info are ordered along the genome -- TODO confirm.
# NOTE(review): the locus_tags themselves contain "_" (e.g. XNR_RS27780), so the
# joined string cannot be split back into genes on "_" alone -- confirm that
# this is the format the consumer expects.

# Positional view of the locus_tag column, built once, so the slicing below
# does not depend on whatever index gene_info happens to carry
locus_tags = gene_info['locus_tag'].reset_index(drop=True)

# One formatted string per row of gene_deletion
deleted_genes_master_list = []
for first_tag, last_tag in zip(gene_deletion['from'], gene_deletion['to']):
    bounds = []
    for tag in (first_tag, last_tag):
        hits = locus_tags.index[locus_tags == tag]
        if len(hits) == 0:
            # Name the offending tag instead of failing with an opaque IndexError
            raise KeyError(f"locus_tag {tag!r} not found in gene_info")
        bounds.append(hits[0])  # first occurrence, should the tag be duplicated

    # Accept the boundaries in either order: a reversed pair would otherwise
    # slice to nothing and silently record an empty string for the deletion
    start, stop = sorted(bounds)

    # Inclusive of both boundary genes
    deleted_genes = locus_tags.iloc[start:stop + 1].tolist()
    deleted_genes_master_list.append("_".join(deleted_genes))

# Add the formatted strings to the gene_deletion dataframe
gene_deletion['deleted_genes'] = deleted_genes_master_list

gene_deletion.head()

Unnamed: 0,BGC,Deletion,Status,%_reduced,strain_after_deletion,from,to,deleted_genes
0,19-23,1,Deleted,6.65 (455 kb),J1074_CfBΔ5,XNR_RS27780,XNR_RS31675,XNR_RS27780_XNR_RS27785_XNR_RS27790_XNR_RS2779...
1,13,2,Deleted,0.04 (2.7 kb),J1074_CfBΔ6,XNR_RS11580,XNR_RS11585,XNR_RS11580_XNR_RS11585
2,15,3,Deleted,0.63 (42.8 kb),J1074_CfBΔ7,XNR_RS17095,XNR_RS17110,XNR_RS17095_XNR_RS17100_XNR_RS17105_XNR_RS17110
3,5,4,Deleted,0.33 (22.3 kb),J1074_CfBΔ8,XNR_RS02850,XNR_RS02945,XNR_RS02850_XNR_RS02855_XNR_RS02860_XNR_RS0286...
4,2,5,Deleted,0.41 (28 kb),J1074_CfBΔ9,XNR_RS00955,XNR_RS01020,XNR_RS00955_XNR_RS00960_XNR_RS00965_XNR_RS0097...


In [20]:
# Build, for each deletion step, one underscore-joined string holding the
# genes of that step together with those of every step before it.
cumulative_deletions = []
running_deletions = ""

for step_genes in deleted_genes_master_list:
    # While nothing has been accumulated yet the step is taken as-is, so the
    # result never starts with a separator; afterwards "_" glues the steps
    running_deletions = (
        running_deletions + "_" + step_genes if running_deletions else step_genes
    )
    cumulative_deletions.append(running_deletions)

# Store the per-step cumulative strings as a new column
gene_deletion['cumulative_deletions'] = cumulative_deletions

gene_deletion.head()

Unnamed: 0,BGC,Deletion,Status,%_reduced,strain_after_deletion,from,to,deleted_genes,cumulative_deletions
0,19-23,1,Deleted,6.65 (455 kb),J1074_CfBΔ5,XNR_RS27780,XNR_RS31675,XNR_RS27780_XNR_RS27785_XNR_RS27790_XNR_RS2779...,XNR_RS27780_XNR_RS27785_XNR_RS27790_XNR_RS2779...
1,13,2,Deleted,0.04 (2.7 kb),J1074_CfBΔ6,XNR_RS11580,XNR_RS11585,XNR_RS11580_XNR_RS11585,XNR_RS27780_XNR_RS27785_XNR_RS27790_XNR_RS2779...
2,15,3,Deleted,0.63 (42.8 kb),J1074_CfBΔ7,XNR_RS17095,XNR_RS17110,XNR_RS17095_XNR_RS17100_XNR_RS17105_XNR_RS17110,XNR_RS27780_XNR_RS27785_XNR_RS27790_XNR_RS2779...
3,5,4,Deleted,0.33 (22.3 kb),J1074_CfBΔ8,XNR_RS02850,XNR_RS02945,XNR_RS02850_XNR_RS02855_XNR_RS02860_XNR_RS0286...,XNR_RS27780_XNR_RS27785_XNR_RS27790_XNR_RS2779...
4,2,5,Deleted,0.41 (28 kb),J1074_CfBΔ9,XNR_RS00955,XNR_RS01020,XNR_RS00955_XNR_RS00960_XNR_RS00965_XNR_RS0097...,XNR_RS27780_XNR_RS27785_XNR_RS27790_XNR_RS2779...


In [22]:
# Write the deletion table, with its added gene columns, to disk without the row index
gene_deletion_info_csv = '../../data/processed/modulome/gene_deletion_info.csv'
gene_deletion.to_csv(gene_deletion_info_csv, index=False)

## Create a new dataframe for the Del14 samples

In [33]:
# Load the semicolon-delimited table of possible Del14 deletions
del14_deletion = pd.read_csv('../../data/processed/modulome/gene_deletions_possible_del14_dels.csv', sep=';')

# Collapse every locus_tag of that table into a single "&"-separated string
deleted_genes_str = "&".join(del14_deletion['locus_tag'].tolist())

# One-row frame: the strain label next to its joined gene string
new_df = pd.DataFrame([{'strain': 'Del14', 'deleted_genes': deleted_genes_str}])

# Copy the bare row (no index, no header) to the system clipboard
new_df.to_clipboard(index=False, header=False)