In [2]:
# Imports
import os
import pandas as pd



In [24]:
import os
import pandas as pd

def get_all_comparisons_for_sample(sample_path):
    """Return all comparison directories for a given sample path."""
    return [d for d in os.listdir(sample_path) if os.path.isdir(os.path.join(sample_path, d))]

def get_all_xlsx_files_in_directory(directory_path):
    """Return all .xlsx files in the given directory."""
    return [f for f in os.listdir(directory_path) if f.endswith('.xlsx')]

def extract_mutations_from_file(file_path, ignore_mutations):
    """Extract unique mutations from the given .xlsx file."""
    df = pd.read_excel(file_path)
    mutations = set(df['AMINO ACID CHANGE'].dropna().unique())
    return mutations.difference(ignore_mutations)

def main():
    initial_path = "data/comparisons/"
    k_lengths = [15, 21, 27]
    samples = ["04.B1.W14.01_04.M1.W09.02"]
    ignore_mutations = {"No CDS", "Silent mutation"}

    mutation_files = {}  # Dictionary to store mutations and corresponding files

    for sample in samples:
        comparisons_path = os.path.join(initial_path, sample)
        comparisons = get_all_comparisons_for_sample(comparisons_path)
        
        for comparison in comparisons:
            for k_length in k_lengths:
                folder_path = os.path.join(comparisons_path, comparison, str(k_length))
                xlsx_files = get_all_xlsx_files_in_directory(folder_path)
                
                print(sample, comparison, "k =", k_length, "ouput =", len(xlsx_files))
                
                for xlsx_file in xlsx_files:
                    file_path = os.path.join(folder_path, xlsx_file)
                    mutations = extract_mutations_from_file(file_path, ignore_mutations)
                    
                    # Update the mutation_files dictionary
                    for mutation in mutations:
                        if mutation not in mutation_files:
                            mutation_files[mutation] = []
                        mutation_files[mutation].append(file_path)

    # Display mutations and corresponding files
    for mutation, files in mutation_files.items():
        print(f"Mutation: {mutation}")
        for file in files:
            print(f"   - {file}")

main()


04.B1.W14.01_04.M1.W09.02 Comparison_3 k = 15 ouput = 101
04.B1.W14.01_04.M1.W09.02 Comparison_3 k = 21 ouput = 90
04.B1.W14.01_04.M1.W09.02 Comparison_3 k = 27 ouput = 98
04.B1.W14.01_04.M1.W09.02 Comparison_4 k = 15 ouput = 178
04.B1.W14.01_04.M1.W09.02 Comparison_4 k = 21 ouput = 131
04.B1.W14.01_04.M1.W09.02 Comparison_4 k = 27 ouput = 122
04.B1.W14.01_04.M1.W09.02 Comparison_2 k = 15 ouput = 150
04.B1.W14.01_04.M1.W09.02 Comparison_2 k = 21 ouput = 118
04.B1.W14.01_04.M1.W09.02 Comparison_2 k = 27 ouput = 111
04.B1.W14.01_04.M1.W09.02 Comparison_1 k = 15 ouput = 365
04.B1.W14.01_04.M1.W09.02 Comparison_1 k = 21 ouput = 322
04.B1.W14.01_04.M1.W09.02 Comparison_1 k = 27 ouput = 240
Mutation: I111V
   - data/comparisons/04.B1.W14.01_04.M1.W09.02/Comparison_3/15/ATCGGTGATCTACTA.xlsx
   - data/comparisons/04.B1.W14.01_04.M1.W09.02/Comparison_3/15/TATCGGTGATCTACT.xlsx
   - data/comparisons/04.B1.W14.01_04.M1.W09.02/Comparison_3/15/CTATCGGTGATCTAC.xlsx
   - data/comparisons/04.B1.W14.01_

In [22]:
all_mutations

{'A103S',
 'A103S/I111V',
 'A121T/V123E/L125V',
 'D277Q',
 'Del212/A214G/A215R',
 'Del214/A215P/V217R/I218D',
 'Del293/V294A',
 'I111V',
 'I193V',
 'I193V/D195E',
 'I24M',
 'I24M/E26G',
 'I263L',
 'I38L',
 'I38L/F41L',
 'I55V',
 'Ins127 (A)/K137S/Del138',
 'L293V',
 'M163V',
 'N183T',
 'S101G',
 'S262W/I263V/N267Y',
 'S291A',
 'Silent mutation',
 'T34Q/E35H/A36I/V37H/I38H',
 'T40A',
 'T40A/F41L',
 'V123E/L125V',
 'V123E/L125V/A127T'}

In [19]:
initial_path = "data/comparisons/"
# Define the list of k-mer lengths to be explored
k_lengths = [15, 21, 27]
# Define the list of sample names/identifiers to process
samples = ["04.B1.W14.01_04.M1.W09.02"]

all_mutations = set()
ignore_mutations = {'No CDS'}

for sample in samples:
    comparisons_path = initial_path + sample
    comparisons = [d for d in os.listdir(comparisons_path) if os.path.isdir(os.path.join(comparisons_path, d))]
    
    for comparison in comparisons:
        for k_length in k_lengths:
            folder_path = os.path.join(comparisons_path, comparison, str(k_length))
            
            # List all .xlsx files in the current directory
            xlsx_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx')]
            print(sample, comparison, "k =", k_length, "ouput =", len(xlsx_files))
            
            # Print the .xlsx files
            for xlsx_file in xlsx_files: 
                file_path = os.path.join(comparisons_path, comparison, str(k_length), xlsx_file)
                df = pd.read_excel(file_path)
                mutations = set(df['AMINO ACID CHANGE'].dropna().unique())
                if not any(mutation in ignore_mutations for mutation in mutations):
                    all_mutations.update(mutations)

04.B1.W14.01_04.M1.W09.02 Comparison_3 k = 15 ouput = 101
04.B1.W14.01_04.M1.W09.02 Comparison_3 k = 21 ouput = 90
04.B1.W14.01_04.M1.W09.02 Comparison_3 k = 27 ouput = 98
04.B1.W14.01_04.M1.W09.02 Comparison_4 k = 15 ouput = 178
04.B1.W14.01_04.M1.W09.02 Comparison_4 k = 21 ouput = 131
04.B1.W14.01_04.M1.W09.02 Comparison_4 k = 27 ouput = 122
04.B1.W14.01_04.M1.W09.02 Comparison_2 k = 15 ouput = 150
04.B1.W14.01_04.M1.W09.02 Comparison_2 k = 21 ouput = 118
04.B1.W14.01_04.M1.W09.02 Comparison_2 k = 27 ouput = 111
04.B1.W14.01_04.M1.W09.02 Comparison_1 k = 15 ouput = 365
04.B1.W14.01_04.M1.W09.02 Comparison_1 k = 21 ouput = 322
04.B1.W14.01_04.M1.W09.02 Comparison_1 k = 27 ouput = 240


In [20]:
all_mutations

{'A103S',
 'A103S/I111V',
 'A121T/V123E/L125V',
 'D277Q',
 'Del212/A214G/A215R',
 'Del214/A215P/V217R/I218D',
 'Del293/V294A',
 'I111V',
 'I193V',
 'I193V/D195E',
 'I24M',
 'I24M/E26G',
 'I263L',
 'I38L',
 'I38L/F41L',
 'I55V',
 'Ins127 (A)/K137S/Del138',
 'L293V',
 'M163V',
 'N183T',
 'S101G',
 'S262W/I263V/N267Y',
 'S291A',
 'Silent mutation',
 'T34Q/E35H/A36I/V37H/I38H',
 'T40A',
 'T40A/F41L',
 'V123E/L125V',
 'V123E/L125V/A127T'}

In [16]:
import sys
!{sys.executable} -m pip install openpyxl


Defaulting to user installation because normal site-packages is not writeable
Collecting openpyxl
  Using cached openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
Collecting et-xmlfile
  Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2


In [6]:
# Define the directory path for the current sample
samples_path = "data/comparisons/" 
samples = [d for d in os.listdir(samples_path) if os.path.isdir(os.path.join(samples_path, d))]

for sample in samples:
    comparisons_path = samples_path + sample
    comparisons = [d for d in os.listdir(comparisons_path) if os.path.isdir(os.path.join(comparisons_path, d))]
    print(comparisons)


['Comparison_3', 'Comparison_2', 'Comparison_1']
['Comparison_1']
['Comparison_3', 'Comparison_2', 'Comparison_1']
['Comparison_3', 'Comparison_4', 'Comparison_2', 'Comparison_1']
['Comparison_3', 'Comparison_4', 'Comparison_2', 'Comparison_1', 'Comparison_5', 'Comparison_6']
['Comparison_3', 'Comparison_4', 'Comparison_2', 'Comparison_1']
['Comparison_3', 'Comparison_2', 'Comparison_1']
['Comparison_3', 'Comparison_2', 'Comparison_1']
['Comparison_3', 'Comparison_4', 'Comparison_2', 'Comparison_1']
['Comparison_1']


In [7]:
# Get the list of subdirectories to process
subdirectories = [d for d in os.listdir(directory) if os.path.isdir(os.path.join(directory, d))]

NameError: name 'directory' is not defined

In [None]:
all_mutations = set()
folder_paths = ["15", "18", "21"]
ignore_mutations = {'Silent mutation', 'Not identified', 'No CDS'}

for folder_path in folder_paths:
    for filename in os.listdir(folder_path):
        if filename.endswith('.xlsx'):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_excel(file_path)
            mutations = set(df['AMINO ACID CHANGE'].dropna().unique())
            if not any(mutation in ignore_mutations for mutation in mutations):
                all_mutations.update(mutations)

In [79]:
# p2
for m in  all_mutations: print(m)

['Substitution: M163V']
['Substitution: A234T']
['Deletion: 8', 'Substitution: T9S', 'Substitution: N10V', 'Substitution: R11S']
['Substitution: N10V', 'Substitution: R11S']
['Substitution: I149M']
['Substitution: I149M', 'Substitution: F156L']
['Substitution: I38L']
['Substitution: I263L']
['Substitution: T8S', 'Substitution: T9A']


In [6]:
 # P
for m in  all_mutations: print(m)

D277E
M163V
I38L
A272T
I193V
T8S
I263L
I24M/E26G/S29F
I193V/D195E
S362N
L64F
T8S/T9V
I149M/F156L
E26G/S29F
Ins1 (T)/Del8
S29F
Ins413 (A)
I149M
L210M
A234T
C366Y
A272T/D277H
Del100/S101G
T8S/T9A
S362N/C366Y
Ins1 (S)/Ins2 (T)/Del8/Del9
F156L
S101G
F156L/M163V
L210I
L293V
S291A
D277Q


In [4]:
 # P
for m in  all_mutations: print(m)

D277E
M163V
I38L
A272T
I193V
T8S
I263L
I24M/E26G/S29F
I193V/D195E
S362N
L64F
T8S/T9V
I149M/F156L
E26G/S29F
Ins1 (T)/Del8
S29F
Ins413 (A)
I149M
L210M
A234T
C366Y
A272T/D277H
Del100/S101G
T8S/T9A
S362N/C366Y
Ins1 (S)/Ins2 (T)/Del8/Del9
F156L
S101G
F156L/M163V
L210I
L293V
S291A
D277Q


In [6]:
 # P"
for m in  all_mutations: print(m)

['Substitution: I263L']
['Substitution: I149M', 'Substitution: F156L']
['Substitution: N10V', 'Substitution: R11S']
['Substitution: T8S', 'Substitution: T9A']
['Substitution: A234T']
['Substitution: M163V']
['Substitution: T8S', 'Substitution: T9V', 'Substitution: N10S', 'Substitution: R11A']
['Substitution: I38L']
['Substitution: I149M']


In [5]:
print(len("CAAAAACAAAGGAGTCGCAC"))
print(len("CAAAAACAAAGGAGTCGCAC"))
print(len("CAGAAACAAAGGAGTCGTAC"))

20
20
20
