In [None]:
# @title Extract 3-9-mers from CDR3 column of CSV
import pandas as pd
from collections import Counter
import itertools
from google.colab import files

# Upload the CSV file
# uploaded = files.upload()  # pick your file when prompted
# filename = next(iter(uploaded))  # get the uploaded file name

# Read the CDR3 table from the working directory
df = pd.read_csv('A_H_T.tsv_cdr3_only.csv')

# Fail fast when the column the rest of the cell relies on is absent
has_cdr3_column = 'cdr3_aa' in df.columns
if not has_cdr3_column:
    raise ValueError("The CSV must contain a column named 'cdr3_aa'.")

# Keep only usable entries: drop missing values, coerce to str, discard empty strings
cdr3_values = df['cdr3_aa'].dropna().astype(str)
sequences = cdr3_values[cdr3_values != ""]

# Function to get k-mers from a sequence
def extract_kmers(seq, ks=range(3, 10)):
    """Return every contiguous substring of ``seq`` whose length is in ``ks``.

    Args:
        seq: The sequence to slice (anything supporting ``len`` and slicing).
        ks: Iterable of substring lengths to extract; defaults to 3 through 9.

    Returns:
        A list of substrings, grouped by length in the order ``ks`` yields
        them and, within each length, ordered by start position. Lengths
        greater than ``len(seq)`` contribute nothing.
    """
    seq_len = len(seq)
    # For a length larger than the sequence, the inner range is empty,
    # so no explicit length guard is needed.
    return [
        seq[start:start + size]
        for size in ks
        for start in range(seq_len - size + 1)
    ]

# Count all k-mers across all sequences.
# The Counter consumes the chained iterator directly, so the full list of
# every k-mer occurrence is never held in memory at once; only the distinct
# k-mers and their counts are kept. Insertion order (first occurrence) is
# the same as when counting from a pre-built list.
kmer_counts = Counter(
    itertools.chain.from_iterable(extract_kmers(seq) for seq in sequences)
)

# Convert to DataFrame, most frequent k-mers first
kmer_df = pd.DataFrame(kmer_counts.items(), columns=["kmer", "count"])
kmer_df = kmer_df.sort_values("count", ascending=False).reset_index(drop=True)

# Save to CSV and hand the file to the browser
output_filename = "cdr3_kmer_counts.csv"
kmer_df.to_csv(output_filename, index=False)
files.download(output_filename)


KeyboardInterrupt: 

In [None]:
# @title Extract 3–9-mers by stage from uploaded files with cdr3_aa
import pandas as pd
from collections import Counter
import itertools
import os
from google.colab import files
from io import BytesIO
import zipfile

# Upload multiple files (e.g., A_H_T.tsv_cdr3_only.csv)
# The returned object is iterated below to obtain each uploaded file's name.
# NOTE(review): presumably a dict keyed by the name the file was saved under
# (Colab may rename duplicates, e.g. "name (1).csv") - confirm against Colab docs.
uploaded = files.upload()

# K-mer extraction helper
def extract_kmers(seq, ks=range(3, 10)):
    """Collect all substrings of ``seq`` for each window width in ``ks``.

    Args:
        seq: Sequence to scan (must support ``len`` and slicing).
        ks: Iterable of window widths; defaults to 3 through 9.

    Returns:
        A list of substrings: widths are visited in the order given by
        ``ks`` and, for each width, windows are taken left to right. Widths
        longer than the sequence are skipped.
    """
    n = len(seq)
    found = []
    for width in ks:
        # A window wider than the sequence cannot fit anywhere
        if width > n:
            continue
        last_start = n - width
        for start in range(last_start + 1):
            found.append(seq[start:start + width])
    return found

# Keep track of all output filenames for zipping
output_files = []

for filename in uploaded:
    print(f"\nProcessing {filename}...")

    # Best-effort: an unreadable upload is reported and skipped so that the
    # remaining files are still processed.
    try:
        df = pd.read_csv(filename)
    except Exception as e:
        print(f"Could not read {filename}: {e}")
        continue

    if 'cdr3_aa' not in df.columns or 'stage' not in df.columns:
        print(f"[!] Skipping {filename} - missing required columns 'cdr3_aa' and/or 'stage'")
        continue

    # Drop NAs and empty strings
    df = df.dropna(subset=['cdr3_aa', 'stage'])
    df['cdr3_aa'] = df['cdr3_aa'].astype(str)
    df = df[df['cdr3_aa'] != ""]

    # Every per-stage output shares the upload's stem, so derive it once per
    # file instead of once per stage.
    base = os.path.splitext(filename)[0]

    for stage in sorted(df['stage'].unique()):
        stage_df = df[df['stage'] == stage]
        sequences = stage_df['cdr3_aa'].tolist()

        # Count straight from the chained iterator: the list of every k-mer
        # occurrence is never materialised, only the distinct k-mers and
        # their counts are kept in memory.
        kmer_counts = Counter(
            itertools.chain.from_iterable(extract_kmers(seq) for seq in sequences)
        )

        # Happens when every sequence in the stage is shorter than the
        # smallest k
        if not kmer_counts:
            print(f"  [!] No valid kmers for stage {stage} in {filename}")
            continue

        # One row per distinct k-mer, most frequent first
        kmer_df = pd.DataFrame(kmer_counts.items(), columns=["kmer", "count"])
        kmer_df = kmer_df.sort_values("count", ascending=False).reset_index(drop=True)

        outname = f"{base}_stage_{stage}_kmer_counts.csv"
        kmer_df.to_csv(outname, index=False)
        output_files.append(outname)

# Zip all outputs.
# files.download() takes a single argument, the name of a file on disk;
# passing an in-memory buffer plus a name raises
# "TypeError: download() takes 1 positional argument but 2 were given".
# The archive is therefore written to disk and downloaded by its filename.
zip_filename = "stage_kmer_counts.zip"
with zipfile.ZipFile(zip_filename, "w") as zipf:
    for f in output_files:
        zipf.write(f)

# Download
files.download(zip_filename)


Saving A_H_T.tsv_cdr3_only.csv to A_H_T.tsv_cdr3_only (1).csv

Processing A_H_T.tsv_cdr3_only (1).csv...


TypeError: download() takes 1 positional argument but 2 were given

In [None]:
import os
import zipfile
from google.colab import files

# Name of the output zip file
zip_filename = "kmer_files.zip"

# Create the zip archive from every entry in the current directory whose
# name contains 'kmer'.
with zipfile.ZipFile(zip_filename, 'w') as zipf:
    for filename in os.listdir():
        # The archive is created in this same directory as soon as it is
        # opened, and its own name contains 'kmer'; skip it so the
        # half-written zip is not added to itself.
        if filename == zip_filename:
            continue
        if 'kmer' in filename:
            print(f"Adding {filename} to zip...")
            zipf.write(filename)

# Download the zip file
files.download(zip_filename)


Adding kmer_files.zip to zip...
Adding E_L_T.tsv_cdr3_only (1)_kmer_counts.csv to zip...
Adding D_K_T.tsv_cdr3_only (1)_kmer_counts.csv to zip...
Adding G_H_T.tsv_cdr3_only (1)_kmer_counts.csv to zip...
Adding G_L_T.tsv_cdr3_only (1)_kmer_counts.csv to zip...
Adding C_H_T.tsv_cdr3_only (1)_kmer_counts.csv to zip...
Adding A_L_T.tsv_cdr3_only (1)_kmer_counts.csv to zip...
Adding A_K_T.tsv_cdr3_only (1)_kmer_counts.csv to zip...
Adding D_L_T.tsv_cdr3_only (1)_kmer_counts.csv to zip...
Adding E_K_T.tsv_cdr3_only (1)_kmer_counts.csv to zip...
Adding E_H_T.tsv_cdr3_only (1)_kmer_counts.csv to zip...
Adding A_H_T.tsv_cdr3_only (1)_kmer_counts.csv to zip...
Adding B_L_T.tsv_cdr3_only (1)_kmer_counts.csv to zip...
Adding B_K_T.tsv_cdr3_only (1)_kmer_counts.csv to zip...
Adding D_H_T.tsv_cdr3_only (1)_kmer_counts.csv to zip...
Adding G_K_T.tsv_cdr3_only (1)_kmer_counts.csv to zip...
Adding C_L_T.tsv_cdr3_only (1)_kmer_counts.csv to zip...
Adding C_K_T.tsv_cdr3_only (1)_kmer_counts.csv to zip...

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Bundle every path recorded in output_files into a single archive on disk
zip_filename = "stage_kmer_counts.zip"
zipf = zipfile.ZipFile(zip_filename, "w")
with zipf:
    for csv_path in output_files:
        zipf.write(csv_path)

# Hand the finished archive to the browser
files.download(zip_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!ls /content/ | rm -rf *.tsv *.csv