# ESM1b analysis 

#### 1) The E.coli LTEE data was retrived from the authors github page -> https://github.com/benjaminhgood/LTEE-metagenomic
#### 2) The path of the folder is data_files , from here the all population annotated_timecourse text was taken and a masterhseet was prepared with only missense mutations.
#### 3) In order to process this file for ESM1b analysis we need all variants WT protien sequence. We retrived the DNA and protein sequence from NCBI --> https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/017/985/GCF_000017985.1_ASM1798v1/ . 
#### 4) The NCBI files such as cds_genomics.fna and translate_cds.faa along with feature table.txt was important for this analysis.

# Step 1

#### To retrive the DNA sequence for each variants, we need to do the following steps.

In [1]:
import pandas as pd

# Step 1: Open and read the text file
file_path = "../data/GCA_000017985.1_ASM1798v1_cds_from_genomic.fna"  # Replace with your file path
data = []

try:
    with open(file_path, 'r') as file:
        header = None
        sequence = []

        for line in file:
            if line.startswith(">"):
                # If a new header is encountered, save the previous sequence (if any)
                if header is not None:
                    data.append([header, ''.join(sequence)])
                header = line.strip()
                sequence = []
            else:
                sequence.append(line.strip())

        # Add the last sequence (if any)
        if header is not None:
            data.append([header, ''.join(sequence)])

    # Step 2: Create a DataFrame
    df = pd.DataFrame(data, columns=['col1', 'col2'])

    # Step 3: Display the DataFrame
    # print(df.head())

except FileNotFoundError:
    print(f"File '{file_path}' not found.")
except IOError:
    print(f"An error occurred while reading the file '{file_path}'.")


In [3]:
df.head()

Unnamed: 0,col1,col2
0,>lcl|CP000819.1_cds_ACT37694.1_1 [gene=thrL] [...,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...
1,>lcl|CP000819.1_cds_ACT37695.1_2 [gene=thrA] [...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...
2,>lcl|CP000819.1_cds_ACT37696.1_3 [gene=thrB] [...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...
3,>lcl|CP000819.1_cds_ACT37697.1_4 [gene=thrC] [...,ATGAAACTCTACAATCTGAAAGATCACAATGAGCAGGTCAGCTTTG...
4,>lcl|CP000819.1_cds_ACT37698.1_5 [gene=yaaX] [...,ATGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...


In [4]:
# Define a regular expression pattern to match attributes in square brackets
pattern = r'\[([^]]+)\]'

# Split the "col2" column by the attribute pattern and expand it into multiple columns
split_columns = df['col1'].str.split(pattern, expand=True)

# Rename the new columns for clarity (optional)
split_columns.columns = [f'Attribute_{i}' for i in range(len(split_columns.columns))]

# Concatenate the split_columns DataFrame with the original DataFrame
result_df = pd.concat([df, split_columns], axis=1)

# Drop the original "col2" column if needed
# result_df = result_df.drop(columns='col2')

# Print the result
# print(result_df)


In [5]:
result_df

Unnamed: 0,col1,col2,Attribute_0,Attribute_1,Attribute_2,Attribute_3,Attribute_4,Attribute_5,Attribute_6,Attribute_7,Attribute_8,Attribute_9,Attribute_10,Attribute_11,Attribute_12,Attribute_13,Attribute_14
0,>lcl|CP000819.1_cds_ACT37694.1_1 [gene=thrL] [...,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,>lcl|CP000819.1_cds_ACT37694.1_1,gene=thrL,,locus_tag=ECB_00001,,protein=thr operon leader peptide,,protein_id=ACT37694.1,,location=190..255,,gbkey=CDS,,,
1,>lcl|CP000819.1_cds_ACT37695.1_2 [gene=thrA] [...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,>lcl|CP000819.1_cds_ACT37695.1_2,gene=thrA,,locus_tag=ECB_00002,,protein=bifunctional aspartokinase I/homeserin...,,protein_id=ACT37695.1,,location=336..2798,,gbkey=CDS,,,
2,>lcl|CP000819.1_cds_ACT37696.1_3 [gene=thrB] [...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,>lcl|CP000819.1_cds_ACT37696.1_3,gene=thrB,,locus_tag=ECB_00003,,protein=homoserine kinase,,protein_id=ACT37696.1,,location=2800..3732,,gbkey=CDS,,,
3,>lcl|CP000819.1_cds_ACT37697.1_4 [gene=thrC] [...,ATGAAACTCTACAATCTGAAAGATCACAATGAGCAGGTCAGCTTTG...,>lcl|CP000819.1_cds_ACT37697.1_4,gene=thrC,,locus_tag=ECB_00004,,protein=threonine synthase,,protein_id=ACT37697.1,,location=3733..5019,,gbkey=CDS,,,
4,>lcl|CP000819.1_cds_ACT37698.1_5 [gene=yaaX] [...,ATGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,>lcl|CP000819.1_cds_ACT37698.1_5,gene=yaaX,,locus_tag=ECB_00005,,protein=hypothetical protein,,protein_id=ACT37698.1,,location=5232..5528,,gbkey=CDS,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,>lcl|CP000819.1_cds_ACT41898.1_4205 [gene=creB...,ATGCAACGGGAAACGGTCTGGTTAGTGGAAGATGAGCAAGGGATAG...,>lcl|CP000819.1_cds_ACT41898.1_4205,gene=creB,,locus_tag=ECB_04274,,protein=DNA-binding response regulator in two-...,,protein_id=ACT41898.1,,location=4624167..4624856,,gbkey=CDS,,,
4205,>lcl|CP000819.1_cds_ACT41899.1_4206 [gene=creC...,ATGCGTATCGGCATGCGGTTGCTGCTGGGCTATTTTTTACTGGTGG...,>lcl|CP000819.1_cds_ACT41899.1_4206,gene=creC,,locus_tag=ECB_04275,,protein=sensory histidine kinase in two-compon...,,protein_id=ACT41899.1,,location=4624856..4626280,,gbkey=CDS,,,
4206,>lcl|CP000819.1_cds_ACT41900.1_4207 [gene=arcA...,ATGCAGACCCCGCACATTCTTATCGTTGAAAACGAGTTGGTAACAC...,>lcl|CP000819.1_cds_ACT41900.1_4207,gene=arcA,,locus_tag=ECB_04277,,protein=DNA-binding response regulator in two-...,,protein_id=ACT41900.1,,location=complement(4627750..4628466),,gbkey=CDS,,,
4207,>lcl|CP000819.1_cds_ACT41901.1_4208 [gene=yjjY...,ATGACTAAAGTACGTAATTGCGTTCTTGATGCACTTTCCATCAACG...,>lcl|CP000819.1_cds_ACT41901.1_4208,gene=yjjY,,locus_tag=ECB_04278,,protein=hypothetical protein,,protein_id=ACT41901.1,,location=4628562..4628702,,gbkey=CDS,,,


In [6]:
# Rename columns
# Assuming you want to rename "col2" to "new_column_name"
result_df.columns = ['Info' if col == 'col1' else col for col in result_df.columns]
# # Assuming you want to rename "col2" to "new_column_name"
result_df.columns = ['DNA_Seq' if col == 'col2' else col for col in result_df.columns]
result_df.columns = ['Function' if col == 'Attribute_5' else col for col in result_df.columns]
result_df.columns = ['protein_id' if col == 'Attribute_7' else col for col in result_df.columns]
result_df.columns = ['Location' if col == 'Attribute_9' else col for col in result_df.columns]
result_df.columns = ['locus_tag' if col == 'Attribute_3' else col for col in result_df.columns]

In [7]:
# Split "col2" based on "=" and create new columns C and D
result_df[['C', 'D']] = result_df['Attribute_1'].str.split('=', n=1, expand=True)

In [8]:
result_df.columns = ['Gene' if col == 'D' else col for col in result_df.columns]

In [9]:
result_df

Unnamed: 0,Info,DNA_Seq,Attribute_0,Attribute_1,Attribute_2,locus_tag,Attribute_4,Function,Attribute_6,protein_id,Attribute_8,Location,Attribute_10,Attribute_11,Attribute_12,Attribute_13,Attribute_14,C,Gene
0,>lcl|CP000819.1_cds_ACT37694.1_1 [gene=thrL] [...,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,>lcl|CP000819.1_cds_ACT37694.1_1,gene=thrL,,locus_tag=ECB_00001,,protein=thr operon leader peptide,,protein_id=ACT37694.1,,location=190..255,,gbkey=CDS,,,,gene,thrL
1,>lcl|CP000819.1_cds_ACT37695.1_2 [gene=thrA] [...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,>lcl|CP000819.1_cds_ACT37695.1_2,gene=thrA,,locus_tag=ECB_00002,,protein=bifunctional aspartokinase I/homeserin...,,protein_id=ACT37695.1,,location=336..2798,,gbkey=CDS,,,,gene,thrA
2,>lcl|CP000819.1_cds_ACT37696.1_3 [gene=thrB] [...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,>lcl|CP000819.1_cds_ACT37696.1_3,gene=thrB,,locus_tag=ECB_00003,,protein=homoserine kinase,,protein_id=ACT37696.1,,location=2800..3732,,gbkey=CDS,,,,gene,thrB
3,>lcl|CP000819.1_cds_ACT37697.1_4 [gene=thrC] [...,ATGAAACTCTACAATCTGAAAGATCACAATGAGCAGGTCAGCTTTG...,>lcl|CP000819.1_cds_ACT37697.1_4,gene=thrC,,locus_tag=ECB_00004,,protein=threonine synthase,,protein_id=ACT37697.1,,location=3733..5019,,gbkey=CDS,,,,gene,thrC
4,>lcl|CP000819.1_cds_ACT37698.1_5 [gene=yaaX] [...,ATGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,>lcl|CP000819.1_cds_ACT37698.1_5,gene=yaaX,,locus_tag=ECB_00005,,protein=hypothetical protein,,protein_id=ACT37698.1,,location=5232..5528,,gbkey=CDS,,,,gene,yaaX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,>lcl|CP000819.1_cds_ACT41898.1_4205 [gene=creB...,ATGCAACGGGAAACGGTCTGGTTAGTGGAAGATGAGCAAGGGATAG...,>lcl|CP000819.1_cds_ACT41898.1_4205,gene=creB,,locus_tag=ECB_04274,,protein=DNA-binding response regulator in two-...,,protein_id=ACT41898.1,,location=4624167..4624856,,gbkey=CDS,,,,gene,creB
4205,>lcl|CP000819.1_cds_ACT41899.1_4206 [gene=creC...,ATGCGTATCGGCATGCGGTTGCTGCTGGGCTATTTTTTACTGGTGG...,>lcl|CP000819.1_cds_ACT41899.1_4206,gene=creC,,locus_tag=ECB_04275,,protein=sensory histidine kinase in two-compon...,,protein_id=ACT41899.1,,location=4624856..4626280,,gbkey=CDS,,,,gene,creC
4206,>lcl|CP000819.1_cds_ACT41900.1_4207 [gene=arcA...,ATGCAGACCCCGCACATTCTTATCGTTGAAAACGAGTTGGTAACAC...,>lcl|CP000819.1_cds_ACT41900.1_4207,gene=arcA,,locus_tag=ECB_04277,,protein=DNA-binding response regulator in two-...,,protein_id=ACT41900.1,,location=complement(4627750..4628466),,gbkey=CDS,,,,gene,arcA
4207,>lcl|CP000819.1_cds_ACT41901.1_4208 [gene=yjjY...,ATGACTAAAGTACGTAATTGCGTTCTTGATGCACTTTCCATCAACG...,>lcl|CP000819.1_cds_ACT41901.1_4208,gene=yjjY,,locus_tag=ECB_04278,,protein=hypothetical protein,,protein_id=ACT41901.1,,location=4628562..4628702,,gbkey=CDS,,,,gene,yjjY


In [10]:
# Check for spaces between characters in each row of "Gene"
result_df['contains_space'] = result_df['Gene'].str.contains(r'\s')

# Replace True with "Yes" and False with "No" in the "contains_
# " column
result_df['contains_space'] = result_df['contains_space'].replace({True: 'Yes', False: 'No'})

# Print the DataFrame
# print(result_df)

In [11]:
result_df

Unnamed: 0,Info,DNA_Seq,Attribute_0,Attribute_1,Attribute_2,locus_tag,Attribute_4,Function,Attribute_6,protein_id,Attribute_8,Location,Attribute_10,Attribute_11,Attribute_12,Attribute_13,Attribute_14,C,Gene,contains_space
0,>lcl|CP000819.1_cds_ACT37694.1_1 [gene=thrL] [...,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,>lcl|CP000819.1_cds_ACT37694.1_1,gene=thrL,,locus_tag=ECB_00001,,protein=thr operon leader peptide,,protein_id=ACT37694.1,,location=190..255,,gbkey=CDS,,,,gene,thrL,No
1,>lcl|CP000819.1_cds_ACT37695.1_2 [gene=thrA] [...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,>lcl|CP000819.1_cds_ACT37695.1_2,gene=thrA,,locus_tag=ECB_00002,,protein=bifunctional aspartokinase I/homeserin...,,protein_id=ACT37695.1,,location=336..2798,,gbkey=CDS,,,,gene,thrA,No
2,>lcl|CP000819.1_cds_ACT37696.1_3 [gene=thrB] [...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,>lcl|CP000819.1_cds_ACT37696.1_3,gene=thrB,,locus_tag=ECB_00003,,protein=homoserine kinase,,protein_id=ACT37696.1,,location=2800..3732,,gbkey=CDS,,,,gene,thrB,No
3,>lcl|CP000819.1_cds_ACT37697.1_4 [gene=thrC] [...,ATGAAACTCTACAATCTGAAAGATCACAATGAGCAGGTCAGCTTTG...,>lcl|CP000819.1_cds_ACT37697.1_4,gene=thrC,,locus_tag=ECB_00004,,protein=threonine synthase,,protein_id=ACT37697.1,,location=3733..5019,,gbkey=CDS,,,,gene,thrC,No
4,>lcl|CP000819.1_cds_ACT37698.1_5 [gene=yaaX] [...,ATGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,>lcl|CP000819.1_cds_ACT37698.1_5,gene=yaaX,,locus_tag=ECB_00005,,protein=hypothetical protein,,protein_id=ACT37698.1,,location=5232..5528,,gbkey=CDS,,,,gene,yaaX,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,>lcl|CP000819.1_cds_ACT41898.1_4205 [gene=creB...,ATGCAACGGGAAACGGTCTGGTTAGTGGAAGATGAGCAAGGGATAG...,>lcl|CP000819.1_cds_ACT41898.1_4205,gene=creB,,locus_tag=ECB_04274,,protein=DNA-binding response regulator in two-...,,protein_id=ACT41898.1,,location=4624167..4624856,,gbkey=CDS,,,,gene,creB,No
4205,>lcl|CP000819.1_cds_ACT41899.1_4206 [gene=creC...,ATGCGTATCGGCATGCGGTTGCTGCTGGGCTATTTTTTACTGGTGG...,>lcl|CP000819.1_cds_ACT41899.1_4206,gene=creC,,locus_tag=ECB_04275,,protein=sensory histidine kinase in two-compon...,,protein_id=ACT41899.1,,location=4624856..4626280,,gbkey=CDS,,,,gene,creC,No
4206,>lcl|CP000819.1_cds_ACT41900.1_4207 [gene=arcA...,ATGCAGACCCCGCACATTCTTATCGTTGAAAACGAGTTGGTAACAC...,>lcl|CP000819.1_cds_ACT41900.1_4207,gene=arcA,,locus_tag=ECB_04277,,protein=DNA-binding response regulator in two-...,,protein_id=ACT41900.1,,location=complement(4627750..4628466),,gbkey=CDS,,,,gene,arcA,No
4207,>lcl|CP000819.1_cds_ACT41901.1_4208 [gene=yjjY...,ATGACTAAAGTACGTAATTGCGTTCTTGATGCACTTTCCATCAACG...,>lcl|CP000819.1_cds_ACT41901.1_4208,gene=yjjY,,locus_tag=ECB_04278,,protein=hypothetical protein,,protein_id=ACT41901.1,,location=4628562..4628702,,gbkey=CDS,,,,gene,yjjY,No


In [13]:
# Check if any of the character contains spaces 
result_df['contains_space'].unique()

array(['No'], dtype=object)

In [14]:
result_df.columns

Index(['Info', 'DNA_Seq', 'Attribute_0', 'Attribute_1', 'Attribute_2',
       'locus_tag', 'Attribute_4', 'Function', 'Attribute_6', 'protein_id',
       'Attribute_8', 'Location', 'Attribute_10', 'Attribute_11',
       'Attribute_12', 'Attribute_13', 'Attribute_14', 'C', 'Gene',
       'contains_space'],
      dtype='object')

In [15]:
# Select the columns you want to keep
DNA = result_df[['Info', 'DNA_Seq', 'Attribute_0', 'Attribute_1', 'Attribute_2',
       'locus_tag', 'Attribute_4', 'Function', 'Attribute_6', 'protein_id',
       'Attribute_8', 'Location', 'Attribute_10', 'Attribute_11',
       'Attribute_12', 'Attribute_13', 'Attribute_14', 'C', 'Gene',]]

In [16]:
# Write in local 
# Specify the file path where you want to save the Excel file
excel_file_path = '../Results/DNA_seq.xlsx'

# Write the DataFrame to an Excel file
DNA.to_excel(excel_file_path, index=False)

print(f'DataFrame saved to {excel_file_path}')

DataFrame saved to ../Results/DNA_seq.xlsx


#### To retrive the Protein sequence for each variants, we need to do the following steps.

In [17]:
# Step 1: Open and read the text file
file_path = "../data/GCA_000017985.1_ASM1798v1_translated_cds.faa"  # Replace with your file path
data = []

try:
    with open(file_path, 'r') as file:
        header = None
        sequence = []

        for line in file:
            if line.startswith(">"):
                # If a new header is encountered, save the previous sequence (if any)
                if header is not None:
                    data.append([header, ''.join(sequence)])
                header = line.strip()
                sequence = []
            else:
                sequence.append(line.strip())

        # Add the last sequence (if any)
        if header is not None:
            data.append([header, ''.join(sequence)])

    # Step 2: Create a DataFrame
    df = pd.DataFrame(data, columns=['col1', 'col2'])

    # Step 3: Display the DataFrame
    # print(df.head())

except FileNotFoundError:
    print(f"File '{file_path}' not found.")
except IOError:
    print(f"An error occurred while reading the file '{file_path}'.")



In [18]:
df

Unnamed: 0,col1,col2
0,>lcl|CP000819.1_prot_ACT37694.1_1 [gene=thrL] ...,MKRISTTITTTITITTGNGAG
1,>lcl|CP000819.1_prot_ACT37695.1_2 [gene=thrA] ...,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...
2,>lcl|CP000819.1_prot_ACT37696.1_3 [gene=thrB] ...,MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSL...
3,>lcl|CP000819.1_prot_ACT37697.1_4 [gene=thrC] ...,MKLYNLKDHNEQVSFAQAVTQGLGKNQGLFFPHDLPEFSLTEIDEM...
4,>lcl|CP000819.1_prot_ACT37698.1_5 [gene=yaaX] ...,MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYW...
...,...,...
4204,>lcl|CP000819.1_prot_ACT41898.1_4205 [gene=cre...,MQRETVWLVEDEQGIADTLVYMLQQEGFAVEVFERGLPVLDKARQQ...
4205,>lcl|CP000819.1_prot_ACT41899.1_4206 [gene=cre...,MRIGMRLLLGYFLLVAVAAWFVLAIFVKEVKPGVRRATEGTLIDTA...
4206,>lcl|CP000819.1_prot_ACT41900.1_4207 [gene=arc...,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...
4207,>lcl|CP000819.1_prot_ACT41901.1_4208 [gene=yjj...,MTKVRNCVLDALSINVNNIISLVVGTFPQDPTVSKTAVILTILTAT


In [19]:
# Define a regular expression pattern to match attributes in square brackets
pattern = r'\[([^]]+)\]'

# Split the "col2" column by the attribute pattern and expand it into multiple columns
split_columns = df['col1'].str.split(pattern, expand=True)

# Rename the new columns for clarity (optional)
split_columns.columns = [f'Attribute_{i}' for i in range(len(split_columns.columns))]

# Concatenate the split_columns DataFrame with the original DataFrame
result_df = pd.concat([df, split_columns], axis=1)

# Drop the original "col2" column if needed
# result_df = result_df.drop(columns='col2')

# Print the result
# print(result_df)


In [20]:
result_df

Unnamed: 0,col1,col2,Attribute_0,Attribute_1,Attribute_2,Attribute_3,Attribute_4,Attribute_5,Attribute_6,Attribute_7,Attribute_8,Attribute_9,Attribute_10,Attribute_11,Attribute_12,Attribute_13,Attribute_14
0,>lcl|CP000819.1_prot_ACT37694.1_1 [gene=thrL] ...,MKRISTTITTTITITTGNGAG,>lcl|CP000819.1_prot_ACT37694.1_1,gene=thrL,,locus_tag=ECB_00001,,protein=thr operon leader peptide,,protein_id=ACT37694.1,,location=190..255,,gbkey=CDS,,,
1,>lcl|CP000819.1_prot_ACT37695.1_2 [gene=thrA] ...,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,>lcl|CP000819.1_prot_ACT37695.1_2,gene=thrA,,locus_tag=ECB_00002,,protein=bifunctional aspartokinase I/homeserin...,,protein_id=ACT37695.1,,location=336..2798,,gbkey=CDS,,,
2,>lcl|CP000819.1_prot_ACT37696.1_3 [gene=thrB] ...,MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSL...,>lcl|CP000819.1_prot_ACT37696.1_3,gene=thrB,,locus_tag=ECB_00003,,protein=homoserine kinase,,protein_id=ACT37696.1,,location=2800..3732,,gbkey=CDS,,,
3,>lcl|CP000819.1_prot_ACT37697.1_4 [gene=thrC] ...,MKLYNLKDHNEQVSFAQAVTQGLGKNQGLFFPHDLPEFSLTEIDEM...,>lcl|CP000819.1_prot_ACT37697.1_4,gene=thrC,,locus_tag=ECB_00004,,protein=threonine synthase,,protein_id=ACT37697.1,,location=3733..5019,,gbkey=CDS,,,
4,>lcl|CP000819.1_prot_ACT37698.1_5 [gene=yaaX] ...,MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYW...,>lcl|CP000819.1_prot_ACT37698.1_5,gene=yaaX,,locus_tag=ECB_00005,,protein=hypothetical protein,,protein_id=ACT37698.1,,location=5232..5528,,gbkey=CDS,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,>lcl|CP000819.1_prot_ACT41898.1_4205 [gene=cre...,MQRETVWLVEDEQGIADTLVYMLQQEGFAVEVFERGLPVLDKARQQ...,>lcl|CP000819.1_prot_ACT41898.1_4205,gene=creB,,locus_tag=ECB_04274,,protein=DNA-binding response regulator in two-...,,protein_id=ACT41898.1,,location=4624167..4624856,,gbkey=CDS,,,
4205,>lcl|CP000819.1_prot_ACT41899.1_4206 [gene=cre...,MRIGMRLLLGYFLLVAVAAWFVLAIFVKEVKPGVRRATEGTLIDTA...,>lcl|CP000819.1_prot_ACT41899.1_4206,gene=creC,,locus_tag=ECB_04275,,protein=sensory histidine kinase in two-compon...,,protein_id=ACT41899.1,,location=4624856..4626280,,gbkey=CDS,,,
4206,>lcl|CP000819.1_prot_ACT41900.1_4207 [gene=arc...,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,>lcl|CP000819.1_prot_ACT41900.1_4207,gene=arcA,,locus_tag=ECB_04277,,protein=DNA-binding response regulator in two-...,,protein_id=ACT41900.1,,location=complement(4627750..4628466),,gbkey=CDS,,,
4207,>lcl|CP000819.1_prot_ACT41901.1_4208 [gene=yjj...,MTKVRNCVLDALSINVNNIISLVVGTFPQDPTVSKTAVILTILTAT,>lcl|CP000819.1_prot_ACT41901.1_4208,gene=yjjY,,locus_tag=ECB_04278,,protein=hypothetical protein,,protein_id=ACT41901.1,,location=4628562..4628702,,gbkey=CDS,,,


In [21]:
# Rename the rows 
# Assuming you want to rename "col2" to "new_column_name"
result_df.columns = ['Info' if col == 'col1' else col for col in result_df.columns]
# # Assuming you want to rename "col2" to "new_column_name"
result_df.columns = ['Prt_Seq' if col == 'col2' else col for col in result_df.columns]
result_df.columns = ['Function' if col == 'Attribute_5' else col for col in result_df.columns]
result_df.columns = ['protein_id' if col == 'Attribute_7' else col for col in result_df.columns]
result_df.columns = ['Location' if col == 'Attribute_9' else col for col in result_df.columns]
result_df.columns = ['locus_tag' if col == 'Attribute_3' else col for col in result_df.columns]

In [22]:
# Split "col2" based on "=" and create new columns C and D
result_df[['C', 'D']] = result_df['Attribute_1'].str.split('=', n=1, expand=True)

In [23]:
result_df.columns = ['Gene' if col == 'D' else col for col in result_df.columns]

In [24]:
result_df

Unnamed: 0,Info,Prt_Seq,Attribute_0,Attribute_1,Attribute_2,locus_tag,Attribute_4,Function,Attribute_6,protein_id,Attribute_8,Location,Attribute_10,Attribute_11,Attribute_12,Attribute_13,Attribute_14,C,Gene
0,>lcl|CP000819.1_prot_ACT37694.1_1 [gene=thrL] ...,MKRISTTITTTITITTGNGAG,>lcl|CP000819.1_prot_ACT37694.1_1,gene=thrL,,locus_tag=ECB_00001,,protein=thr operon leader peptide,,protein_id=ACT37694.1,,location=190..255,,gbkey=CDS,,,,gene,thrL
1,>lcl|CP000819.1_prot_ACT37695.1_2 [gene=thrA] ...,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,>lcl|CP000819.1_prot_ACT37695.1_2,gene=thrA,,locus_tag=ECB_00002,,protein=bifunctional aspartokinase I/homeserin...,,protein_id=ACT37695.1,,location=336..2798,,gbkey=CDS,,,,gene,thrA
2,>lcl|CP000819.1_prot_ACT37696.1_3 [gene=thrB] ...,MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSL...,>lcl|CP000819.1_prot_ACT37696.1_3,gene=thrB,,locus_tag=ECB_00003,,protein=homoserine kinase,,protein_id=ACT37696.1,,location=2800..3732,,gbkey=CDS,,,,gene,thrB
3,>lcl|CP000819.1_prot_ACT37697.1_4 [gene=thrC] ...,MKLYNLKDHNEQVSFAQAVTQGLGKNQGLFFPHDLPEFSLTEIDEM...,>lcl|CP000819.1_prot_ACT37697.1_4,gene=thrC,,locus_tag=ECB_00004,,protein=threonine synthase,,protein_id=ACT37697.1,,location=3733..5019,,gbkey=CDS,,,,gene,thrC
4,>lcl|CP000819.1_prot_ACT37698.1_5 [gene=yaaX] ...,MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYW...,>lcl|CP000819.1_prot_ACT37698.1_5,gene=yaaX,,locus_tag=ECB_00005,,protein=hypothetical protein,,protein_id=ACT37698.1,,location=5232..5528,,gbkey=CDS,,,,gene,yaaX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,>lcl|CP000819.1_prot_ACT41898.1_4205 [gene=cre...,MQRETVWLVEDEQGIADTLVYMLQQEGFAVEVFERGLPVLDKARQQ...,>lcl|CP000819.1_prot_ACT41898.1_4205,gene=creB,,locus_tag=ECB_04274,,protein=DNA-binding response regulator in two-...,,protein_id=ACT41898.1,,location=4624167..4624856,,gbkey=CDS,,,,gene,creB
4205,>lcl|CP000819.1_prot_ACT41899.1_4206 [gene=cre...,MRIGMRLLLGYFLLVAVAAWFVLAIFVKEVKPGVRRATEGTLIDTA...,>lcl|CP000819.1_prot_ACT41899.1_4206,gene=creC,,locus_tag=ECB_04275,,protein=sensory histidine kinase in two-compon...,,protein_id=ACT41899.1,,location=4624856..4626280,,gbkey=CDS,,,,gene,creC
4206,>lcl|CP000819.1_prot_ACT41900.1_4207 [gene=arc...,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,>lcl|CP000819.1_prot_ACT41900.1_4207,gene=arcA,,locus_tag=ECB_04277,,protein=DNA-binding response regulator in two-...,,protein_id=ACT41900.1,,location=complement(4627750..4628466),,gbkey=CDS,,,,gene,arcA
4207,>lcl|CP000819.1_prot_ACT41901.1_4208 [gene=yjj...,MTKVRNCVLDALSINVNNIISLVVGTFPQDPTVSKTAVILTILTAT,>lcl|CP000819.1_prot_ACT41901.1_4208,gene=yjjY,,locus_tag=ECB_04278,,protein=hypothetical protein,,protein_id=ACT41901.1,,location=4628562..4628702,,gbkey=CDS,,,,gene,yjjY


In [25]:
# Check for spaces between characters in each row of "Gene"
result_df['contains_space'] = result_df['Gene'].str.contains(r'\s')

# Replace True with "Yes" and False with "No" in the "contains_space" column
result_df['contains_space'] = result_df['contains_space'].replace({True: 'Yes', False: 'No'})

# Print the DataFrame
# print(result_df)

In [26]:
result_df['contains_space'].unique()

array(['No'], dtype=object)

In [29]:
# Check for duplicate values in the "col2" column
duplicates = result_df[result_df['Gene'].duplicated(keep=False)]
duplicates
# Print the duplicate rows
# print(duplicates)

Unnamed: 0,Info,Prt_Seq,Attribute_0,Attribute_1,Attribute_2,locus_tag,Attribute_4,Function,Attribute_6,protein_id,Attribute_8,Location,Attribute_10,Attribute_11,Attribute_12,Attribute_13,Attribute_14,C,Gene,contains_space


In [30]:
# Select the columns you want to keep
Prt = result_df[['Info', 'Prt_Seq', 'Attribute_0', 'Attribute_1', 'Attribute_2',
       'locus_tag', 'Attribute_4', 'Function', 'Attribute_6', 'protein_id',
       'Attribute_8', 'Location', 'Attribute_10', 'Attribute_11',
       'Attribute_12', 'Attribute_13', 'Attribute_14', 'C', 'Gene',]]

In [31]:
# Write in local 
# Specify the file path where you want to save the Excel file
excel_file_path = '../Results/Prt_seq.xlsx'

# Write the DataFrame to an Excel file
result_df.to_excel(excel_file_path, index=False)

print(f'DataFrame saved to {excel_file_path}')

DataFrame saved to ../Results/Prt_seq.xlsx


#### Now we have DNA and protien sequence for each gene. So merge it with variant file.

In [32]:
import pandas as pd
df = pd.read_excel('../data/Mastersheet.xlsx',sheet_name='missense') #Variant sheet 
dna = pd.read_excel("../Results/DNA_seq.xlsx") # DNA sheet 
prt = pd.read_excel("../Results/Prt_seq.xlsx") # Protein sheet 

In [34]:
dna.head()

Unnamed: 0,Info,DNA_Seq,Attribute_0,Attribute_1,Attribute_2,locus_tag,Attribute_4,Function,Attribute_6,protein_id,Attribute_8,Location,Attribute_10,Attribute_11,Attribute_12,Attribute_13,Attribute_14,C,Gene
0,>lcl|CP000819.1_cds_ACT37694.1_1 [gene=thrL] [...,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,>lcl|CP000819.1_cds_ACT37694.1_1,gene=thrL,,locus_tag=ECB_00001,,protein=thr operon leader peptide,,protein_id=ACT37694.1,,location=190..255,,gbkey=CDS,,,,gene,thrL
1,>lcl|CP000819.1_cds_ACT37695.1_2 [gene=thrA] [...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,>lcl|CP000819.1_cds_ACT37695.1_2,gene=thrA,,locus_tag=ECB_00002,,protein=bifunctional aspartokinase I/homeserin...,,protein_id=ACT37695.1,,location=336..2798,,gbkey=CDS,,,,gene,thrA
2,>lcl|CP000819.1_cds_ACT37696.1_3 [gene=thrB] [...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,>lcl|CP000819.1_cds_ACT37696.1_3,gene=thrB,,locus_tag=ECB_00003,,protein=homoserine kinase,,protein_id=ACT37696.1,,location=2800..3732,,gbkey=CDS,,,,gene,thrB
3,>lcl|CP000819.1_cds_ACT37697.1_4 [gene=thrC] [...,ATGAAACTCTACAATCTGAAAGATCACAATGAGCAGGTCAGCTTTG...,>lcl|CP000819.1_cds_ACT37697.1_4,gene=thrC,,locus_tag=ECB_00004,,protein=threonine synthase,,protein_id=ACT37697.1,,location=3733..5019,,gbkey=CDS,,,,gene,thrC
4,>lcl|CP000819.1_cds_ACT37698.1_5 [gene=yaaX] [...,ATGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,>lcl|CP000819.1_cds_ACT37698.1_5,gene=yaaX,,locus_tag=ECB_00005,,protein=hypothetical protein,,protein_id=ACT37698.1,,location=5232..5528,,gbkey=CDS,,,,gene,yaaX


In [35]:
prt.head()

Unnamed: 0,Info,Prt_Seq,Attribute_0,Attribute_1,Attribute_2,locus_tag,Attribute_4,Function,Attribute_6,protein_id,Attribute_8,Location,Attribute_10,Attribute_11,Attribute_12,Attribute_13,Attribute_14,C,Gene,contains_space
0,>lcl|CP000819.1_prot_ACT37694.1_1 [gene=thrL] ...,MKRISTTITTTITITTGNGAG,>lcl|CP000819.1_prot_ACT37694.1_1,gene=thrL,,locus_tag=ECB_00001,,protein=thr operon leader peptide,,protein_id=ACT37694.1,,location=190..255,,gbkey=CDS,,,,gene,thrL,No
1,>lcl|CP000819.1_prot_ACT37695.1_2 [gene=thrA] ...,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,>lcl|CP000819.1_prot_ACT37695.1_2,gene=thrA,,locus_tag=ECB_00002,,protein=bifunctional aspartokinase I/homeserin...,,protein_id=ACT37695.1,,location=336..2798,,gbkey=CDS,,,,gene,thrA,No
2,>lcl|CP000819.1_prot_ACT37696.1_3 [gene=thrB] ...,MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSL...,>lcl|CP000819.1_prot_ACT37696.1_3,gene=thrB,,locus_tag=ECB_00003,,protein=homoserine kinase,,protein_id=ACT37696.1,,location=2800..3732,,gbkey=CDS,,,,gene,thrB,No
3,>lcl|CP000819.1_prot_ACT37697.1_4 [gene=thrC] ...,MKLYNLKDHNEQVSFAQAVTQGLGKNQGLFFPHDLPEFSLTEIDEM...,>lcl|CP000819.1_prot_ACT37697.1_4,gene=thrC,,locus_tag=ECB_00004,,protein=threonine synthase,,protein_id=ACT37697.1,,location=3733..5019,,gbkey=CDS,,,,gene,thrC,No
4,>lcl|CP000819.1_prot_ACT37698.1_5 [gene=yaaX] ...,MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYW...,>lcl|CP000819.1_prot_ACT37698.1_5,gene=yaaX,,locus_tag=ECB_00005,,protein=hypothetical protein,,protein_id=ACT37698.1,,location=5232..5528,,gbkey=CDS,,,,gene,yaaX,No


In [36]:
# Merge DNA with prt firstly then merge thus df with the original sheet 
DNA_prt = dna.merge(prt, on='Gene', how='left')

In [37]:
# Merge DNA with prt firstly then merge thus df with the original sheet 
Final_df= df.merge(DNA_prt, on='Gene', how='left')

In [42]:
Final_df.head()

Unnamed: 0,Position,Gene,Allele,Annotation,Test.statistic,P-value,Ref_allele,Alt_allele,label,Info_x,...,protein_id_y,Attribute_8_y,Location_y,Attribute_10_y,Attribute_11_y,Attribute_12_y,Attribute_13_y,Attribute_14_y,C_y,contains_space
0,241,thrL,A->C,missense,2.88705,0.0001,A,C,m1,>lcl|CP000819.1_cds_ACT37694.1_1 [gene=thrL] [...,...,protein_id=ACT37694.1,,location=190..255,,gbkey=CDS,,,,gene,No
1,1615,thrA,T->G,missense,0.986369,0.0001,T,G,m1,>lcl|CP000819.1_cds_ACT37695.1_2 [gene=thrA] [...,...,protein_id=ACT37695.1,,location=336..2798,,gbkey=CDS,,,,gene,No
2,6046,yaaA,C->A,missense,1.68194,0.077982,C,A,m1,>lcl|CP000819.1_cds_ACT37699.1_6 [gene=yaaA] [...,...,protein_id=ACT37699.1,,location=complement(5681..6457),,gbkey=CDS,,,,gene,No
3,6220,yaaA,T->G,missense,2.47871,0.0001,T,G,m1,>lcl|CP000819.1_cds_ACT37699.1_6 [gene=yaaA] [...,...,protein_id=ACT37699.1,,location=complement(5681..6457),,gbkey=CDS,,,,gene,No
4,7406,yaaJ,A->C,missense,3.43714,0.0023,A,C,m1,>lcl|CP000819.1_cds_ACT37700.1_7 [gene=yaaJ] [...,...,protein_id=ACT37700.1,,location=complement(6527..7957),,gbkey=CDS,,,,gene,No


In [43]:
# Write in local 
# Specify the file path where you want to save the Excel file
excel_file_path = '../Results/DNA_Prt_variant.xlsx'

# Write the DataFrame to an Excel file
Final_df.to_excel(excel_file_path, index=False)

print(f'DataFrame saved to {excel_file_path}')

DataFrame saved to ../Results/DNA_Prt_variant.xlsx


# Step2 

#### Prepare this data as per input format for esm1b

In [44]:
# Check if 'U' is present in any row of 'Prt_Seq'
contains_U = Final_df['Prt_Seq'].str.contains('U')

# Print the result
if contains_U.any():
    print("The letter 'U' is present in at least one row of 'Prt_Seq'.")
else:
    print("The letter 'U' is not present in any row of 'Prt_Seq'.")

The letter 'U' is present in at least one row of 'Prt_Seq'.


In [46]:
# Drop nan from Prt_Seq column
Final_df = Final_df.dropna(subset=['Prt_Seq'])
Final_df.reset_index(inplace=True, drop=True)

In [48]:
contains_U = Final_df['Prt_Seq'].str.contains('U')

In [49]:
# Check if 'U' is present in any row of 'Prt_Seq'
contains_U = Final_df['Prt_Seq'].str.contains('U')

# Find the rows where 'U' is present
rows_with_U = Final_df.loc[contains_U]
# rows_with_U
# Print the rows containing 'U'
# print()

In [50]:
# Check if 'U' is present in any row of 'Prt_Seq'
contains_U = Final_df['Prt_Seq'].str.contains('U')

# Create a new DataFrame without rows containing 'U'
Final_df = Final_df[~contains_U]

# Display the filtered DataFrame
Final_df

Unnamed: 0,Position,Gene,Allele,Annotation,Test.statistic,P-value,Ref_allele,Alt_allele,label,Info_x,...,protein_id_y,Attribute_8_y,Location_y,Attribute_10_y,Attribute_11_y,Attribute_12_y,Attribute_13_y,Attribute_14_y,C_y,contains_space
0,241,thrL,A->C,missense,2.887050,0.000100,A,C,m1,>lcl|CP000819.1_cds_ACT37694.1_1 [gene=thrL] [...,...,protein_id=ACT37694.1,,location=190..255,,gbkey=CDS,,,,gene,No
1,1615,thrA,T->G,missense,0.986369,0.000100,T,G,m1,>lcl|CP000819.1_cds_ACT37695.1_2 [gene=thrA] [...,...,protein_id=ACT37695.1,,location=336..2798,,gbkey=CDS,,,,gene,No
2,6046,yaaA,C->A,missense,1.681940,0.077982,C,A,m1,>lcl|CP000819.1_cds_ACT37699.1_6 [gene=yaaA] [...,...,protein_id=ACT37699.1,,location=complement(5681..6457),,gbkey=CDS,,,,gene,No
3,6220,yaaA,T->G,missense,2.478710,0.000100,T,G,m1,>lcl|CP000819.1_cds_ACT37699.1_6 [gene=yaaA] [...,...,protein_id=ACT37699.1,,location=complement(5681..6457),,gbkey=CDS,,,,gene,No
4,7406,yaaJ,A->C,missense,3.437140,0.002300,A,C,m1,>lcl|CP000819.1_cds_ACT37700.1_7 [gene=yaaJ] [...,...,protein_id=ACT37700.1,,location=complement(6527..7957),,gbkey=CDS,,,,gene,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22799,4627830,arcA,C->T,missense,1.390190,0.012235,C,T,p6,>lcl|CP000819.1_cds_ACT41900.1_4207 [gene=arcA...,...,protein_id=ACT41900.1,,location=complement(4627750..4628466),,gbkey=CDS,,,,gene,No
22800,4627933,arcA,T->G,missense,2.990930,0.005099,T,G,p6,>lcl|CP000819.1_cds_ACT41900.1_4207 [gene=arcA...,...,protein_id=ACT41900.1,,location=complement(4627750..4628466),,gbkey=CDS,,,,gene,No
22801,4628381,arcA,T->G,missense,1.179320,0.000100,T,G,p6,>lcl|CP000819.1_cds_ACT41900.1_4207 [gene=arcA...,...,protein_id=ACT41900.1,,location=complement(4627750..4628466),,gbkey=CDS,,,,gene,No
22802,4628701,yjjY,A->C,missense,5.263680,0.000100,A,C,p6,>lcl|CP000819.1_cds_ACT41901.1_4208 [gene=yjjY...,...,protein_id=ACT41901.1,,location=4628562..4628702,,gbkey=CDS,,,,gene,No


In [53]:
# List of columns you want to keep esm1b input 
columns_to_keep = ['Position', 'Gene','Prt_Seq']

# Select the desired columns and assign them back to the DataFrame
Final_df = Final_df[columns_to_keep]

In [54]:
# Merge 'Position' and 'Gene' into a new column 'seq_id' with an underscore separator
Final_df['seq_id'] = Final_df['Position'].astype(str) + '_' + df['Gene'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Final_df['seq_id'] = Final_df['Position'].astype(str) + '_' + df['Gene'].astype(str)


In [55]:
Final_df

Unnamed: 0,Position,Gene,Prt_Seq,seq_id
0,241,thrL,MKRISTTITTTITITTGNGAG,241_thrL
1,1615,thrA,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,1615_thrA
2,6046,yaaA,MLILISPAKTLDYQSPLTTTRYTLPELLDNSQQLIHEARKLTPPQI...,6046_yaaA
3,6220,yaaA,MLILISPAKTLDYQSPLTTTRYTLPELLDNSQQLIHEARKLTPPQI...,6220_yaaA
4,7406,yaaJ,MPDFFSFINSVLWGSVMIYLLFGAGCWFTFRTGFVQFRYIRQFGKS...,7406_yaaJ
...,...,...,...,...
22799,4627830,arcA,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,4627830_yjjG
22800,4627933,arcA,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,4627933_prfC
22801,4628381,arcA,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,4628381_prfC
22802,4628701,yjjY,MTKVRNCVLDALSINVNNIISLVVGTFPQDPTVSKTAVILTILTAT,4628701_prfC


In [56]:
# List of columns you want to keep
columns_to_keep = ['seq_id', 'Prt_Seq']

# Select the desired columns and assign them back to the DataFrame
Final_df = Final_df[columns_to_keep]

In [57]:
Final_df.columns = ['Prt_Seq_WT' if col == 'Prt_Seq' else col for col in Final_df.columns]
Final_df

Unnamed: 0,seq_id,Prt_Seq_WT
0,241_thrL,MKRISTTITTTITITTGNGAG
1,1615_thrA,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...
2,6046_yaaA,MLILISPAKTLDYQSPLTTTRYTLPELLDNSQQLIHEARKLTPPQI...
3,6220_yaaA,MLILISPAKTLDYQSPLTTTRYTLPELLDNSQQLIHEARKLTPPQI...
4,7406_yaaJ,MPDFFSFINSVLWGSVMIYLLFGAGCWFTFRTGFVQFRYIRQFGKS...
...,...,...
22799,4627830_yjjG,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...
22800,4627933_prfC,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...
22801,4628381_prfC,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...
22802,4628701_prfC,MTKVRNCVLDALSINVNNIISLVVGTFPQDPTVSKTAVILTILTAT


In [58]:
# Read the Excel file
# all_data = pd.read_excel("/content/drive/MyDrive/compiled_final.xlsx")

# Output file name
output_file = '../Results/esm1b_input_WT.fasta'

# Write to the text file
with open(output_file, 'w') as f:
    for index, row in Final_df.iterrows():
        #f.write(f'>{row["seq_id"]}\n{row["aa_seq_Mut"]}\n')
        f.write(f'>{row["seq_id"]}\n{row["Prt_Seq_WT"]}\n')

print('Data written to', output_file)

Data written to ../Results/esm1b_input_WT.fasta


#### This was the source of the ESM1b model -> https://github.com/ntranoslab/esm-variants and this was the command line python3 esm_score_missense_mutations.py --input-fasta-file /path/to/input.fasta --output-csv-file /path/to/output.csv


# Step 3

In [59]:
# Load the sheet before transforming it to esm1b input 
# read the sheet 
Final_df = pd.read_excel('../Results/DNA_Prt_variant.xlsx')

#### This sheet was used to get the location GCA_000017985.1_ASM1798v1_feature_table.txt and was put in the DNA_Prt_variant in excel by vlookup and all the unwanted columns were removed.Also those variants were removed which did not have gene start and stop and genes which had upstream variants.

In [61]:
import pandas as pd
Final_df = pd.read_excel('../data/DNA_Prt_Strand_variant.xlsx')

#### Now take this file to script2.R script to generate the mutant DNA sequence.

### After generating Mut DNA seq from script2.R get the file here.
### Now convert the WT DNA to WT prt to check if it matches the NCBI submitted Prt seq

In [63]:
import pandas as pd 
data = pd.read_excel("../data/DNA_Prt_Strand_Variant_MutDNA.xlsx")

In [64]:
from Bio.Seq import Seq
from Bio.Data import CodonTable
import pandas as pd

# Define the custom codon table
custom_codon_table = CodonTable.unambiguous_dna_by_id[11]  # Change the ID as needed
data['WT_aa_made'] = data['DNA_Seq'].apply(lambda cds_sequence: str(Seq(cds_sequence).translate(table=custom_codon_table)).replace('*', ' '))
data

Unnamed: 0,Position,Gene,Allele,Ref_allele,Alt_allele,Annotation,label,Start,End,Strand,DNA_Seq,Prt_Seq,Sequence_Mut,Diff_Count,Differences,Diff_Pos,has_space,WT_aa_made
0,241,thrL,A->C,A,C,missense,m1,190,255,+,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,MKRISTTITTTITITTGNGAG,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,1,AC,241,False,MKRISTTITTTITITTGNGAG
1,1615,thrA,T->G,T,G,missense,m1,336,2798,+,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,1,TG,1615,False,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...
2,8430,talB,T->G,T,G,missense,m1,8236,9189,+,ATGACGGACAAATTGACCTCCCTTCGTCAGTACACCACCGTAGTGG...,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...,ATGACGGACAAATTGACCTCCCTTCGTCAGTACACCACCGTAGTGG...,1,TG,8430,False,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...
3,12527,dnaK,A->C,A,C,missense,m1,12161,14077,+,ATGGGTAAAATAATTGGTATCGACCTGGGTACTACCAACTCTTGTG...,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,ATGGGTAAAATAATTGGTATCGACCTGGGTACTACCAACTCTTGTG...,1,AC,12527,False,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...
4,13974,dnaK,A->C,A,C,missense,m1,12161,14077,+,ATGGGTAAAATAATTGGTATCGACCTGGGTACTACCAACTCTTGTG...,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,ATGGGTAAAATAATTGGTATCGACCTGGGTACTACCAACTCTTGTG...,1,AC,13974,False,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21765,4621500,yjjX,T->G,T,C,missense,p6,4621384,4621905,-,ATGCACCAAGTTGTCTGTGCGACCACCAATCCCGCTAAAATTCAGG...,MHQVVCATTNPAKIQAILQAFHEIFGEGSCHIASVAVESGVPEQPF...,ATGCACCAAGTTGTCTGTGCGACCACCAATCCCGCTAAAATTCAGG...,1,AC,4621789,False,MHQVVCATTNPAKIQAILQAFHEIFGEGSCHIASVAVESGVPEQPF...
21766,4623143,rob,T->G,T,C,missense,p6,4622601,4623470,-,ATGGATCAGGCCGGCATTATTCGCGACCTTTTAATCTGGCTGGAAG...,MDQAGIIRDLLIWLEGHLDQPLSLDNVAAKAGYSKWHLQRMFKDVT...,ATGGATCAGGCCGGCATTATTCGCGACCTTTTAATCTGGCTGGAAG...,1,AC,4622928,False,MDQAGIIRDLLIWLEGHLDQPLSLDNVAAKAGYSKWHLQRMFKDVT...
21767,4627830,arcA,C->T,C,A,missense,p6,4627750,4628466,-,ATGCAGACCCCGCACATTCTTATCGTTGAAAACGAGTTGGTAACAC...,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,ATGCAGACCCCGCACATTCTTATCGTTGAAAACGAGTTGGTAACAC...,1,GA,4628386,False,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...
21768,4627933,arcA,T->G,T,C,missense,p6,4627750,4628466,-,ATGCAGACCCCGCACATTCTTATCGTTGAAAACGAGTTGGTAACAC...,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,ATGCAGACCCCGCACATTCTTATCGTTGAAAACGAGTTGGTAACAC...,1,AC,4628283,False,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...


In [66]:
# remove any empty spcae 
data['WT_aa_made'] = data['WT_aa_made'].str.replace(' ', '')

In [67]:
# remove any empty spcae 
data['Prt_Seq'] = data['Prt_Seq'].str.replace(' ', '')

In [68]:
# Compare sequences and create a new column
data['comparison'] = data.apply(lambda row: row['Prt_Seq'] == row['WT_aa_made'], axis=1)

In [69]:
data['comparison'].unique()

array([ True, False])

In [70]:
# Count the number of 'True' values in 'comparison'
false_count = (data['comparison'] == False).sum()

print("Number of 'False' values in comparison:", false_count)

Number of 'False' values in comparison: 2216


In [71]:
# Count the number of 'True' values in 'comparison'
true_count = (data['comparison'] == True).sum()

print("Number of 'True' values in comparison:", true_count)

Number of 'True' values in comparison: 19554


In [72]:
# Subset rows with 'True' values into a new DataFrame
true_df = data[data['comparison'] == True]

In [73]:
true_df

Unnamed: 0,Position,Gene,Allele,Ref_allele,Alt_allele,Annotation,label,Start,End,Strand,DNA_Seq,Prt_Seq,Sequence_Mut,Diff_Count,Differences,Diff_Pos,has_space,WT_aa_made,comparison
0,241,thrL,A->C,A,C,missense,m1,190,255,+,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,MKRISTTITTTITITTGNGAG,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,1,AC,241,False,MKRISTTITTTITITTGNGAG,True
1,1615,thrA,T->G,T,G,missense,m1,336,2798,+,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,1,TG,1615,False,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,True
2,8430,talB,T->G,T,G,missense,m1,8236,9189,+,ATGACGGACAAATTGACCTCCCTTCGTCAGTACACCACCGTAGTGG...,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...,ATGACGGACAAATTGACCTCCCTTCGTCAGTACACCACCGTAGTGG...,1,TG,8430,False,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...,True
3,12527,dnaK,A->C,A,C,missense,m1,12161,14077,+,ATGGGTAAAATAATTGGTATCGACCTGGGTACTACCAACTCTTGTG...,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,ATGGGTAAAATAATTGGTATCGACCTGGGTACTACCAACTCTTGTG...,1,AC,12527,False,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,True
4,13974,dnaK,A->C,A,C,missense,m1,12161,14077,+,ATGGGTAAAATAATTGGTATCGACCTGGGTACTACCAACTCTTGTG...,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,ATGGGTAAAATAATTGGTATCGACCTGGGTACTACCAACTCTTGTG...,1,AC,13974,False,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21765,4621500,yjjX,T->G,T,C,missense,p6,4621384,4621905,-,ATGCACCAAGTTGTCTGTGCGACCACCAATCCCGCTAAAATTCAGG...,MHQVVCATTNPAKIQAILQAFHEIFGEGSCHIASVAVESGVPEQPF...,ATGCACCAAGTTGTCTGTGCGACCACCAATCCCGCTAAAATTCAGG...,1,AC,4621789,False,MHQVVCATTNPAKIQAILQAFHEIFGEGSCHIASVAVESGVPEQPF...,True
21766,4623143,rob,T->G,T,C,missense,p6,4622601,4623470,-,ATGGATCAGGCCGGCATTATTCGCGACCTTTTAATCTGGCTGGAAG...,MDQAGIIRDLLIWLEGHLDQPLSLDNVAAKAGYSKWHLQRMFKDVT...,ATGGATCAGGCCGGCATTATTCGCGACCTTTTAATCTGGCTGGAAG...,1,AC,4622928,False,MDQAGIIRDLLIWLEGHLDQPLSLDNVAAKAGYSKWHLQRMFKDVT...,True
21767,4627830,arcA,C->T,C,A,missense,p6,4627750,4628466,-,ATGCAGACCCCGCACATTCTTATCGTTGAAAACGAGTTGGTAACAC...,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,ATGCAGACCCCGCACATTCTTATCGTTGAAAACGAGTTGGTAACAC...,1,GA,4628386,False,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,True
21768,4627933,arcA,T->G,T,C,missense,p6,4627750,4628466,-,ATGCAGACCCCGCACATTCTTATCGTTGAAAACGAGTTGGTAACAC...,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,ATGCAGACCCCGCACATTCTTATCGTTGAAAACGAGTTGGTAACAC...,1,AC,4628283,False,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,True


### Some seq after conversion did not match the NCBI submitted seq , Therefore remove those variants 

In [74]:
# Write in local 
# Specify the file path where you want to save the Excel file
excel_file_path = '../Results/DNA_Prt_Strand_Variant_MutDNA_True.xlsx'

# Write the DataFrame to an Excel file
true_df.to_excel(excel_file_path, index=False)

print(f'DataFrame saved to {excel_file_path}')

DataFrame saved to ../Results/DNA_Prt_Strand_Variant_MutDNA_True.xlsx


### Take this sheet to R to get the SNP between WT prt and Mut prt 

#### All elements in place we can pull the ESM1b score to the variant mastersheet 

In [75]:
# Read the esm1b output score 
import pandas as pd
score = pd.read_csv('../data/esm1b_output.csv')

In [76]:
# Make universal ID 
score['universal_id'] = score['seq_id'].astype(str) + '_' + score['mut_name'].astype(str)
score['universal_id'] = score['universal_id'].str.replace(' ', '')

In [78]:
score.head()

Unnamed: 0,seq_id,mut_name,esm_score,universal_id
0,241_thrL,M1K,-7.278638,241_thrL_M1K
1,241_thrL,M1R,-7.132564,241_thrL_M1R
2,241_thrL,M1H,-8.481606,241_thrL_M1H
3,241_thrL,M1E,-7.796053,241_thrL_M1E
4,241_thrL,M1D,-8.461714,241_thrL_M1D


##### Read the dataframe with SNP preporcesd in r 

In [79]:
true_df = pd.read_excel('../data/DNA_Prt_Strand_Variant_MutDNA_True_SNP.xlsx')

In [80]:
true_df

Unnamed: 0,Position,Gene,Allele,Ref_allele,Alt_allele,Annotation,label,Start,End,Strand,...,Prt_Seq,Sequence_Mut,Diff_Count,Differences,Diff_Pos,has_space,WT_aa_made,comparison,Mut_aa_made,SNP
0,241,thrL,A->C,A,C,missense,m1,190,255,+,...,MKRISTTITTTITITTGNGAG,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,1,AC,241,False,MKRISTTITTTITITTGNGAG,True,MKRISTTITTTITITTGHGAG,N18H
1,1615,thrA,T->G,T,G,missense,m1,336,2798,+,...,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,1,TG,1615,False,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,True,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,I427S
2,8430,talB,T->G,T,G,missense,m1,8236,9189,+,...,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...,ATGACGGACAAATTGACCTCCCTTCGTCAGTACACCACCGTAGTGG...,1,TG,8430,False,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...,True,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...,D65E
3,12527,dnaK,A->C,A,C,missense,m1,12161,14077,+,...,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,ATGGGTAAAATAATTGGTATCGACCTGGGTACTACCAACTCTTGTG...,1,AC,12527,False,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,True,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,M123L
4,13974,dnaK,A->C,A,C,missense,m1,12161,14077,+,...,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,ATGGGTAAAATAATTGGTATCGACCTGGGTACTACCAACTCTTGTG...,1,AC,13974,False,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,True,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,Q605P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19549,4621500,yjjX,T->G,T,C,missense,p6,4621384,4621905,-,...,MHQVVCATTNPAKIQAILQAFHEIFGEGSCHIASVAVESGVPEQPF...,ATGCACCAAGTTGTCTGTGCGACCACCAATCCCGCTAAAATTCAGG...,1,AC,4621789,False,MHQVVCATTNPAKIQAILQAFHEIFGEGSCHIASVAVESGVPEQPF...,True,MHQVVCATTNPAKIQAILQAFHEIFGEGSCHIASVAVESGVPEQPF...,K136Q
19550,4623143,rob,T->G,T,C,missense,p6,4622601,4623470,-,...,MDQAGIIRDLLIWLEGHLDQPLSLDNVAAKAGYSKWHLQRMFKDVT...,ATGGATCAGGCCGGCATTATTCGCGACCTTTTAATCTGGCTGGAAG...,1,AC,4622928,False,MDQAGIIRDLLIWLEGHLDQPLSLDNVAAKAGYSKWHLQRMFKDVT...,True,MDQAGIIRDLLIWLEGHLDQPLSLDNVAAKAGYSKWHLQRMFKDVT...,S110R
19551,4627830,arcA,C->T,C,A,missense,p6,4627750,4628466,-,...,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,ATGCAGACCCCGCACATTCTTATCGTTGAAAACGAGTTGGTAACAC...,1,GA,4628386,False,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,True,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,E213K
19552,4627933,arcA,T->G,T,C,missense,p6,4627750,4628466,-,...,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,ATGCAGACCCCGCACATTCTTATCGTTGAAAACGAGTTGGTAACAC...,1,AC,4628283,False,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,True,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,K178N


In [81]:
# Make Seq Id in mastersheet to pull scores 
true_df['seq_id'] = true_df['Position'].astype(str) + '_' + true_df['Gene'].astype(str)
true_df['universal_id'] = true_df['seq_id'].astype(str) + '_' + true_df['SNP'].astype(str)
true_df['universal_id'] = true_df['universal_id'].str.replace(' ', '')

In [84]:
true_df

Unnamed: 0,Position,Gene,Allele,Ref_allele,Alt_allele,Annotation,label,Start,End,Strand,...,Diff_Count,Differences,Diff_Pos,has_space,WT_aa_made,comparison,Mut_aa_made,SNP,seq_id,universal_id
0,241,thrL,A->C,A,C,missense,m1,190,255,+,...,1,AC,241,False,MKRISTTITTTITITTGNGAG,True,MKRISTTITTTITITTGHGAG,N18H,241_thrL,241_thrL_N18H
1,1615,thrA,T->G,T,G,missense,m1,336,2798,+,...,1,TG,1615,False,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,True,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,I427S,1615_thrA,1615_thrA_I427S
2,8430,talB,T->G,T,G,missense,m1,8236,9189,+,...,1,TG,8430,False,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...,True,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...,D65E,8430_talB,8430_talB_D65E
3,12527,dnaK,A->C,A,C,missense,m1,12161,14077,+,...,1,AC,12527,False,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,True,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,M123L,12527_dnaK,12527_dnaK_M123L
4,13974,dnaK,A->C,A,C,missense,m1,12161,14077,+,...,1,AC,13974,False,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,True,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,Q605P,13974_dnaK,13974_dnaK_Q605P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19549,4621500,yjjX,T->G,T,C,missense,p6,4621384,4621905,-,...,1,AC,4621789,False,MHQVVCATTNPAKIQAILQAFHEIFGEGSCHIASVAVESGVPEQPF...,True,MHQVVCATTNPAKIQAILQAFHEIFGEGSCHIASVAVESGVPEQPF...,K136Q,4621500_yjjX,4621500_yjjX_K136Q
19550,4623143,rob,T->G,T,C,missense,p6,4622601,4623470,-,...,1,AC,4622928,False,MDQAGIIRDLLIWLEGHLDQPLSLDNVAAKAGYSKWHLQRMFKDVT...,True,MDQAGIIRDLLIWLEGHLDQPLSLDNVAAKAGYSKWHLQRMFKDVT...,S110R,4623143_rob,4623143_rob_S110R
19551,4627830,arcA,C->T,C,A,missense,p6,4627750,4628466,-,...,1,GA,4628386,False,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,True,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,E213K,4627830_arcA,4627830_arcA_E213K
19552,4627933,arcA,T->G,T,C,missense,p6,4627750,4628466,-,...,1,AC,4628283,False,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,True,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,K178N,4627933_arcA,4627933_arcA_K178N


In [85]:
# Merge 
final_df = true_df.merge(score, on='universal_id', how='left')

In [89]:
# Count duplicate rows based on 'key_col'
duplicate_count = final_df.duplicated(subset=['universal_id']).sum()

print("Number of duplicate rows based on 'universal_id':", duplicate_count)

Number of duplicate rows based on 'universal_id': 5765


In [90]:
# Count duplicate rows based on 'universal_id'
duplicate_count = final_df.duplicated(subset=['universal_id']).sum()

# Drop duplicate rows based on 'universal_id'
final_df = final_df.drop_duplicates(subset=['universal_id'])

print("Number of duplicate rows based on 'universal_id':", duplicate_count)
print("DataFrame after dropping duplicates:")
final_df

Number of duplicate rows based on 'universal_id': 5765
DataFrame after dropping duplicates:


Unnamed: 0,Position,Gene,Allele,Ref_allele,Alt_allele,Annotation,label,Start,End,Strand,...,has_space,WT_aa_made,comparison,Mut_aa_made,SNP,seq_id_x,universal_id,seq_id_y,mut_name,esm_score
0,241,thrL,A->C,A,C,missense,m1,190,255,+,...,False,MKRISTTITTTITITTGNGAG,True,MKRISTTITTTITITTGHGAG,N18H,241_thrL,241_thrL_N18H,241_thrL,N18H,-4.279933
1,1615,thrA,T->G,T,G,missense,m1,336,2798,+,...,False,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,True,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,I427S,1615_thrA,1615_thrA_I427S,1615_thrA,I427S,-11.530375
2,8430,talB,T->G,T,G,missense,m1,8236,9189,+,...,False,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...,True,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...,D65E,8430_talB,8430_talB_D65E,8430_talB,D65E,-5.331029
3,12527,dnaK,A->C,A,C,missense,m1,12161,14077,+,...,False,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,True,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,M123L,12527_dnaK,12527_dnaK_M123L,12527_dnaK,M123L,-4.669276
4,13974,dnaK,A->C,A,C,missense,m1,12161,14077,+,...,False,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,True,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,Q605P,13974_dnaK,13974_dnaK_Q605P,13974_dnaK,Q605P,-7.992450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24076,4612040,lplA,T->G,T,C,missense,p6,4611315,4612331,-,...,False,MSTLRLLISDSYDPWFNLAVEECIFRQMPATQRVLFLWRNADTVVI...,True,MSTLRLLISDSYDPWFNLAVEECIFRQMPATQRVLFLWRNADTVVI...,K98Q,4612040_lplA,4612040_lplA_K98Q,4612040_lplA,K98Q,-8.282178
24077,4621500,yjjX,T->G,T,C,missense,p6,4621384,4621905,-,...,False,MHQVVCATTNPAKIQAILQAFHEIFGEGSCHIASVAVESGVPEQPF...,True,MHQVVCATTNPAKIQAILQAFHEIFGEGSCHIASVAVESGVPEQPF...,K136Q,4621500_yjjX,4621500_yjjX_K136Q,4621500_yjjX,K136Q,-5.321062
24078,4623143,rob,T->G,T,C,missense,p6,4622601,4623470,-,...,False,MDQAGIIRDLLIWLEGHLDQPLSLDNVAAKAGYSKWHLQRMFKDVT...,True,MDQAGIIRDLLIWLEGHLDQPLSLDNVAAKAGYSKWHLQRMFKDVT...,S110R,4623143_rob,4623143_rob_S110R,4623143_rob,S110R,-5.109406
24081,4627933,arcA,T->G,T,C,missense,p6,4627750,4628466,-,...,False,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,True,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,K178N,4627933_arcA,4627933_arcA_K178N,4627933_arcA,K178N,-4.694204


In [91]:
# Count NaN values in the 'score' column
nan_count = final_df['esm_score'].isna().sum()

print("Number of NaN values in 'esm-score' column:", nan_count)

Number of NaN values in 'esm-score' column: 259


In [92]:
# Drop rows with NaN values in the 'score' column
final_df = final_df.dropna(subset=['esm_score'])

In [93]:
# Select the columns you want to keep
Score_missense = final_df[['Position', 'Gene', 'Allele', 'Ref_allele', 'Alt_allele', 'Annotation',
       'label', 'Start', 'End', 'Strand', 'DNA_Seq', 'Prt_Seq', 'Sequence_Mut', 'WT_aa_made','Mut_aa_made', 'SNP', 'seq_id_x', 'universal_id',
       'seq_id_y', 'mut_name', 'esm_score']]

In [94]:
Score_missense

Unnamed: 0,Position,Gene,Allele,Ref_allele,Alt_allele,Annotation,label,Start,End,Strand,...,Prt_Seq,Sequence_Mut,WT_aa_made,Mut_aa_made,SNP,seq_id_x,universal_id,seq_id_y,mut_name,esm_score
0,241,thrL,A->C,A,C,missense,m1,190,255,+,...,MKRISTTITTTITITTGNGAG,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,MKRISTTITTTITITTGNGAG,MKRISTTITTTITITTGHGAG,N18H,241_thrL,241_thrL_N18H,241_thrL,N18H,-4.279933
1,1615,thrA,T->G,T,G,missense,m1,336,2798,+,...,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,I427S,1615_thrA,1615_thrA_I427S,1615_thrA,I427S,-11.530375
2,8430,talB,T->G,T,G,missense,m1,8236,9189,+,...,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...,ATGACGGACAAATTGACCTCCCTTCGTCAGTACACCACCGTAGTGG...,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...,D65E,8430_talB,8430_talB_D65E,8430_talB,D65E,-5.331029
3,12527,dnaK,A->C,A,C,missense,m1,12161,14077,+,...,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,ATGGGTAAAATAATTGGTATCGACCTGGGTACTACCAACTCTTGTG...,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,M123L,12527_dnaK,12527_dnaK_M123L,12527_dnaK,M123L,-4.669276
4,13974,dnaK,A->C,A,C,missense,m1,12161,14077,+,...,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,ATGGGTAAAATAATTGGTATCGACCTGGGTACTACCAACTCTTGTG...,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,Q605P,13974_dnaK,13974_dnaK_Q605P,13974_dnaK,Q605P,-7.992450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24076,4612040,lplA,T->G,T,C,missense,p6,4611315,4612331,-,...,MSTLRLLISDSYDPWFNLAVEECIFRQMPATQRVLFLWRNADTVVI...,ATGTCCACATTACGCCTGCTCATCTCTGACTCTTACGACCCGTGGT...,MSTLRLLISDSYDPWFNLAVEECIFRQMPATQRVLFLWRNADTVVI...,MSTLRLLISDSYDPWFNLAVEECIFRQMPATQRVLFLWRNADTVVI...,K98Q,4612040_lplA,4612040_lplA_K98Q,4612040_lplA,K98Q,-8.282178
24077,4621500,yjjX,T->G,T,C,missense,p6,4621384,4621905,-,...,MHQVVCATTNPAKIQAILQAFHEIFGEGSCHIASVAVESGVPEQPF...,ATGCACCAAGTTGTCTGTGCGACCACCAATCCCGCTAAAATTCAGG...,MHQVVCATTNPAKIQAILQAFHEIFGEGSCHIASVAVESGVPEQPF...,MHQVVCATTNPAKIQAILQAFHEIFGEGSCHIASVAVESGVPEQPF...,K136Q,4621500_yjjX,4621500_yjjX_K136Q,4621500_yjjX,K136Q,-5.321062
24078,4623143,rob,T->G,T,C,missense,p6,4622601,4623470,-,...,MDQAGIIRDLLIWLEGHLDQPLSLDNVAAKAGYSKWHLQRMFKDVT...,ATGGATCAGGCCGGCATTATTCGCGACCTTTTAATCTGGCTGGAAG...,MDQAGIIRDLLIWLEGHLDQPLSLDNVAAKAGYSKWHLQRMFKDVT...,MDQAGIIRDLLIWLEGHLDQPLSLDNVAAKAGYSKWHLQRMFKDVT...,S110R,4623143_rob,4623143_rob_S110R,4623143_rob,S110R,-5.109406
24081,4627933,arcA,T->G,T,C,missense,p6,4627750,4628466,-,...,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,ATGCAGACCCCGCACATTCTTATCGTTGAAAACGAGTTGGTAACAC...,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,K178N,4627933_arcA,4627933_arcA_K178N,4627933_arcA,K178N,-4.694204


In [96]:
Score_missense.reset_index(inplace=True, drop=True)

In [97]:
for i in range (Score_missense.shape[0]):

    try:

        a = zip(Score_missense.loc[i, "WT_aa_made"], Score_missense.loc[i, "Mut_aa_made"])
        b = [ (pos, char1, char2) for pos, (char1, char2) in enumerate(a) if char1!=char2]
        c =  [ f"{char1}{pos}{char2}" for (pos, char1, char2) in b]
        Score_missense.loc[i, "mutant"] = c[0]

        # break
        print(c)
    except:
        continue


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Score_missense.loc[i, "mutant"] = c[0]


['N17H']
['I426S']
['D64E']
['M122L']
['Q604P']
['E141A']
['F188L']
['N81H']
['N56K']
['V79G']
['K320N']
['S801R']
['A16V']
['H260Q']
['D419E']
['I87L']
['P160Q']
['A494V']
['T72P']
['L225F']
['K280Q']
['V534G']
['E97A']
['F302C']
['K538Q']
['F205L']
['I109L']
['H334Q']
['S16Y']
['V259M']
['E304A']
['F148C']
['N302H']
['K18Q']
['A190V']
['S230A']
['Y279S']
['K123Q']
['S83A']
['I564M']
['A614T']
['D12E']
['I159L']
['A189T']
['P199A']
['A232D']
['D37E']
['N45H']
['D67E']
['V82G']
['D234A']
['L77F']
['D296E']
['F351L']
['Y213D']
['F18C']
['D184E']
['R83C']
['E112A']
['G180S']
['I206L']
['K306Q']
['E193A']
['S290R']
['K212N']
['S550Y']
['E128A']
['F211C']
['E279A']
['I5S']
['K628Q']
['N50H']
['C120G']
['L47R']
['Q33P']
['K148Q']
['A27E']
['K38N']
['F133C']
['L23M']
['N44H']
['I3M']
['I75M']
['R78S']
['K104Q']
['D21E']
['K239Q']
['T241P']
['E210A']
['E50D']
['K372T']
['S412R']
['K504T']
['R573L']
['K676T']
['T717P']
['Q131P']
['E166A']
['D160E']
['T219P']
['K355T']
['E190D']
['Q85P']
['I340

In [98]:
Score_missense

Unnamed: 0,Position,Gene,Allele,Ref_allele,Alt_allele,Annotation,label,Start,End,Strand,...,Sequence_Mut,WT_aa_made,Mut_aa_made,SNP,seq_id_x,universal_id,seq_id_y,mut_name,esm_score,mutant
0,241,thrL,A->C,A,C,missense,m1,190,255,+,...,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,MKRISTTITTTITITTGNGAG,MKRISTTITTTITITTGHGAG,N18H,241_thrL,241_thrL_N18H,241_thrL,N18H,-4.279933,N17H
1,1615,thrA,T->G,T,G,missense,m1,336,2798,+,...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,I427S,1615_thrA,1615_thrA_I427S,1615_thrA,I427S,-11.530375,I426S
2,8430,talB,T->G,T,G,missense,m1,8236,9189,+,...,ATGACGGACAAATTGACCTCCCTTCGTCAGTACACCACCGTAGTGG...,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...,D65E,8430_talB,8430_talB_D65E,8430_talB,D65E,-5.331029,D64E
3,12527,dnaK,A->C,A,C,missense,m1,12161,14077,+,...,ATGGGTAAAATAATTGGTATCGACCTGGGTACTACCAACTCTTGTG...,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,M123L,12527_dnaK,12527_dnaK_M123L,12527_dnaK,M123L,-4.669276,M122L
4,13974,dnaK,A->C,A,C,missense,m1,12161,14077,+,...,ATGGGTAAAATAATTGGTATCGACCTGGGTACTACCAACTCTTGTG...,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,Q605P,13974_dnaK,13974_dnaK_Q605P,13974_dnaK,Q605P,-7.992450,Q604P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18054,4612040,lplA,T->G,T,C,missense,p6,4611315,4612331,-,...,ATGTCCACATTACGCCTGCTCATCTCTGACTCTTACGACCCGTGGT...,MSTLRLLISDSYDPWFNLAVEECIFRQMPATQRVLFLWRNADTVVI...,MSTLRLLISDSYDPWFNLAVEECIFRQMPATQRVLFLWRNADTVVI...,K98Q,4612040_lplA,4612040_lplA_K98Q,4612040_lplA,K98Q,-8.282178,K97Q
18055,4621500,yjjX,T->G,T,C,missense,p6,4621384,4621905,-,...,ATGCACCAAGTTGTCTGTGCGACCACCAATCCCGCTAAAATTCAGG...,MHQVVCATTNPAKIQAILQAFHEIFGEGSCHIASVAVESGVPEQPF...,MHQVVCATTNPAKIQAILQAFHEIFGEGSCHIASVAVESGVPEQPF...,K136Q,4621500_yjjX,4621500_yjjX_K136Q,4621500_yjjX,K136Q,-5.321062,K135Q
18056,4623143,rob,T->G,T,C,missense,p6,4622601,4623470,-,...,ATGGATCAGGCCGGCATTATTCGCGACCTTTTAATCTGGCTGGAAG...,MDQAGIIRDLLIWLEGHLDQPLSLDNVAAKAGYSKWHLQRMFKDVT...,MDQAGIIRDLLIWLEGHLDQPLSLDNVAAKAGYSKWHLQRMFKDVT...,S110R,4623143_rob,4623143_rob_S110R,4623143_rob,S110R,-5.109406,S109R
18057,4627933,arcA,T->G,T,C,missense,p6,4627750,4628466,-,...,ATGCAGACCCCGCACATTCTTATCGTTGAAAACGAGTTGGTAACAC...,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,MQTPHILIVENELVTRNTLKSIFEAEGYDVFEATDGAEMHQILSEY...,K178N,4627933_arcA,4627933_arcA_K178N,4627933_arcA,K178N,-4.694204,K177N


In [99]:
# Write in local 
# Specify the file path where you want to save the Excel file
excel_file_path = '../Results/Final_esm1b_variant.xlsx'

# Write the DataFrame to an Excel file
Score_missense.to_excel(excel_file_path, index=False)

print(f'DataFrame saved to {excel_file_path}')

DataFrame saved to ../Results/Final_esm1b_variant.xlsx


##### Few of them got droped out becuase of the stop codon introduced in mutant protein

#### This is the sheet having all the variants and respective esm1b score

#### The script for plot generation is in script 3.ipynb file 