# Liver mouse preprocessing

In [19]:
import pandas as pd

In [71]:
# Load the parsed VEP annotation table (tab-separated, gzip-compressed) and display it.
df = pd.read_csv(
    "/workspace/projects/clustering_3d/o3d_analysys/datasets/input/cancer_mouse/mm39/vep/liver_strelka_mm39.vep.tsv.gz",
    sep="\t",
)
df

Unnamed: 0,#Uploaded_variation,Feature,Consequence,Protein_position,Amino_acids,SYMBOL,CANONICAL
0,do9806_93131_N1,-,intergenic_variant,-,-,-,-
1,do9414_91603_N1,-,intergenic_variant,-,-,-,-
2,do9786_91932_N7,-,intergenic_variant,-,-,-,-
3,do9219_90886_N4,-,intergenic_variant,-,-,-,-
4,do9429_91618_N2,-,intergenic_variant,-,-,-,-
...,...,...,...,...,...,...,...
55406524,do9818_93243_N1,-,intergenic_variant,-,-,-,-
55406525,do9878_91943_N1,-,intergenic_variant,-,-,-,-
55406526,do9864_94315_N10,-,intergenic_variant,-,-,-,-
55406527,do9403_91415_N2,-,intergenic_variant,-,-,-,-


In [58]:
# Residue at 1-based position 637 of the first sequence stored for gene "Braf".
# NOTE(review): `seq_df` is not defined in any visible cell — confirm where it is loaded.
seq_df.loc[seq_df["Gene"] == "Braf", "Seq"].iloc[0][637 - 1]

'G'

In [66]:
# Keep only the rows annotated on the canonical transcript.
# NOTE(review): `braf_df` is not created in any visible cell — confirm its origin before relying on Run All.
braf_df = braf_df.loc[braf_df["CANONICAL"].eq("YES")]

In [50]:
# First 60 variants hitting protein position 637 (the column is compared as a string).
braf_df.loc[braf_df["Protein_position"] == "637"].head(60)

Unnamed: 0,#Uploaded_variation,Feature,Consequence,Protein_position,Amino_acids,SYMBOL,CANONICAL
32,do12458_90793_N2,ENSMUST00000002487,missense_variant,637,V/E,Braf,YES
34,do12459_91948_N4,ENSMUST00000002487,missense_variant,637,V/E,Braf,YES
36,do12460_92057_N1,ENSMUST00000002487,missense_variant,637,V/E,Braf,YES
38,do12461_92057_N2,ENSMUST00000002487,missense_variant,637,V/E,Braf,YES
40,do12465_92058_N3,ENSMUST00000002487,missense_variant,637,V/E,Braf,YES
42,do12466_92104_N1,ENSMUST00000002487,missense_variant,637,V/E,Braf,YES
44,do12469_92105_N1,ENSMUST00000002487,missense_variant,637,V/E,Braf,YES
46,do12471_92105_N4,ENSMUST00000002487,missense_variant,637,V/E,Braf,YES
48,do12473_91100_N3,ENSMUST00000002487,missense_variant,637,V/E,Braf,YES
50,do12477_93091_N8,ENSMUST00000002487,missense_variant,637,V/E,Braf,YES


## Parse .txt file to .vcf for VEP (if not using liftover)

In [None]:
def sort_vcf(vcf_file):
    """
    Sort VCF-like records by chromosome and then by position.

    Parameters
    ----------
    vcf_file : pandas.DataFrame or anything accepted by ``pd.DataFrame(...)``
        Must provide a '#CHROM' and a 'POS' column. The input object is
        never modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with '#CHROM' cast to ``str``, rows ordered by
        chromosome rank ('1'..'21', 'X', 'Y', 'M') and 'POS', and a fresh
        0..n-1 index. Chromosome names missing from the rank table get a
        NaN rank and therefore end up after all ranked chromosomes.
    """

    # Rank table used instead of a plain string sort (which would put '10' before '2').
    chromosome_order = {str(i): i for i in range(1, 22)}
    chromosome_order.update({'X': 22, 'Y': 23, 'M': 24})

    # Work on a private copy: depending on the pandas version, `pd.DataFrame(df)`
    # may share storage with `df`, so writing to it could leak into the caller's frame.
    df = pd.DataFrame(vcf_file).copy()
    df['#CHROM'] = df['#CHROM'].astype(str)

    # The rank column only exists for the duration of the sort.
    sorted_df = (
        df.assign(Chromosome_sort=df['#CHROM'].map(chromosome_order))
          .sort_values(by=['Chromosome_sort', 'POS'], ignore_index=True)
          .drop(columns='Chromosome_sort')
    )

    return sorted_df

In [None]:
# Turn the raw mutation table into a minimal, sorted VCF-like TSV for VEP.
# NOTE(review): the input file name says mm10 while the output is named mm39, and this
# cell performs no coordinate conversion — confirm which assembly the input really uses.
path_input = "../Liver_DEN_strelka_July2017_mm10.txt"
path_output = "liver_strelka_mm39.vcf"

mouse_df = pd.read_csv(path_input, sep="\t")
# Keep only the columns needed downstream and give them VCF-style names.
mouse_df = mouse_df[["CHROMOSOME", "POSITION", "SAMPLE", "REF", "ALT"]]
mouse_df = mouse_df.rename(columns={"CHROMOSOME" : "#CHROM",
                                    "POSITION" : "POS",
                                    "SAMPLE" : "ID"})
mouse_df = sort_vcf(mouse_df)
# Fixed: the original wrote to the undefined name `fpath_output` (NameError).
mouse_df.to_csv(path_output, sep="\t", index=False)

## Liftover from mm10 to mm39

In [None]:
import pandas as pd
import numpy as np
import sys
from pyliftover import LiftOver


# Convert coordinates
def liftover_vcf(df, path_chain_file, report_every=1000):
    """
    Lift the coordinates of every row of `df` over with a chain file.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'CHROMOSOME' column (contig name without the "chr" prefix)
        and a 'POSITION' column (1-based). The frame is extended in place.
    path_chain_file : str
        Chain file passed to `LiftOver`.
    report_every : int, optional
        Print a progress line every `report_every` rows; 0/None disables it.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with two extra columns: 'CHROM_mm39' (object,
        e.g. "chr1") and 'POS_mm39' (float, 1-based). Rows without a mapping
        hold NaN in both columns.
    """

    lo = LiftOver(path_chain_file)

    # Collect results in plain lists and attach them once at the end: cheaper than
    # per-cell `df.at` writes and avoids storing strings in a float64 NaN column.
    new_chroms = []
    new_positions = []
    rows = zip(df['CHROMOSOME'], df['POSITION'])
    for n_done, (chrom_name, position) in enumerate(rows, start=1):
        # Keep the original case: lower-casing turned "chrX" into "chrx", which does
        # not match the contig names of the chain file, so X/Y/M were silently lost.
        chrom = f"chr{chrom_name}"
        pos = int(position)

        # Perform liftover
        result = lo.convert_coordinate(chrom, pos - 1)  # convert to 0-based for liftover
        if result:
            # NOTE(review): only the first hit is used and its strand is ignored, so
            # REF/ALT are not reverse-complemented for minus-strand hits — confirm.
            new_chroms.append(result[0][0])
            new_positions.append(result[0][1] + 1)  # convert back to 1-based
        else:
            # None (unknown contig) or [] (no target locus): mark as unmapped.
            new_chroms.append(np.nan)
            new_positions.append(np.nan)

        # Progress report lives inside the loop (it was dedented out of it before,
        # so it ran at most once and raised NameError on an empty frame).
        if report_every and n_done % report_every == 0:
            print(f"Processed {n_done} rows")

    df['CHROM_mm39'] = np.array(new_chroms, dtype=object)
    df['POS_mm39'] = np.array(new_positions, dtype=float)

    return df


def main(input_path, chain_file_path, output_path):
    """
    Read a tab-separated mutation table, lift it over and write a VCF-like TSV.

    Parameters
    ----------
    input_path : str
        Tab-separated file with 'CHROMOSOME', 'POSITION', 'SAMPLE', 'REF'
        and 'ALT' columns.
    chain_file_path : str
        Chain file forwarded to `liftover_vcf`.
    output_path : str
        Destination TSV with columns '#CHROM', 'POS', 'ID', 'REF', 'ALT'.
        Rows that could not be lifted over are dropped.
    """

    # Load VCF
    print("Loading mutations..")
    vcf_df = pd.read_table(input_path, low_memory=False)

    # Liftover
    print("Starting liftover..")
    vcf_mm39 = liftover_vcf(vcf_df, chain_file_path)

    # Filter out unmapped entries
    print("Filtering NAs..")
    len_a = len(vcf_mm39)
    # Explicit copy so the column assignments below never write into a view.
    vcf_mm39 = vcf_mm39.dropna(subset=['CHROM_mm39', 'POS_mm39']).copy()
    len_b = len(vcf_mm39)
    print(f"Filtered {len_a - len_b} NAs")

    # Save to new VCF file
    # Contig names stay strings: casting to int crashed on X/Y/M and on any other
    # non-numeric contig. Numeric names are written to the TSV exactly as before.
    vcf_mm39['#CHROM'] = vcf_mm39['CHROM_mm39'].str.replace(r"^chr", "", regex=True)
    vcf_mm39['POS'] = vcf_mm39['POS_mm39'].astype(int)
    vcf_mm39 = vcf_mm39.rename(columns={"SAMPLE" : "ID"})
    output_columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT']
    print("Saving..")
    vcf_mm39[output_columns].to_csv(output_path, sep='\t', index=False)


# if __name__ == '__main__':
#     input_path = sys.argv[1]
#     chain_file_path = sys.argv[2]
#     output_path = sys.argv[3]
    
#     print("Input:", input_path)
#     print("Chain:", chain_file_path)
#     print("Output:", output_path)
    
#     main(input_path, chain_file_path, output_path)
    
# Run the liftover directly from the notebook.
main(
    input_path="Liver_DEN_strelka_July2017_mm10.txt",
    chain_file_path="mm10ToMm39.over.chain.gz",
    output_path="liver_strelka_mm39.vcf",
)

In [None]:
# Command-line equivalent of the `main(...)` call in the previous cell.
# NOTE(review): assumes the liftover code above was saved as liftover.py — confirm.
# The leading `!` makes this a shell escape; bare shell is a SyntaxError in a Python cell.
!python3 liftover.py Liver_DEN_strelka_July2017_mm10.txt mm10ToMm39.over.chain.gz liver_strelka_mm39.vcf

## Run VEP and parser

In [None]:
# Run the VEP + parsing wrapper on the lifted-over file (shell escape: the cell is not Python).
!/workspace/projects/clustering_3d/clustering_3d/build/preprocessing/vep_and_parse_mouse.sh liver_strelka_mm39.vcf liver_strelka_mm39.vep.tsv

## Convert tsv for VEP to tsv for bgsignature

In [None]:
# Re-label the VCF-style columns with the names used in the raw mutation table and
# write the result gzip-compressed.
path_input = "liver_strelka_mm39.vcf"
path_output = "liver_strelka_mm39.in.tsv.gz"

df = pd.read_csv(path_input, sep="\t").rename(
    columns={"#CHROM" : "CHROMOSOME", "POS" : "POSITION", "ID" : "SAMPLE"}
)
# Move SAMPLE to the last column, keeping the other columns in their current order.
df = df[[col for col in df.columns if col != "SAMPLE"] + ["SAMPLE"]]

df.to_csv(path_output, sep="\t", index=False, compression="gzip")

## Generate mut profile

### Get regions file

In [1]:
import sys
from bgreference import refseq

# Chromosome names (without the "chr" prefix) to emit regions for: 1-19, X, Y, M.
CHR = [*map(str, range(1, 20)), 'X', 'Y', 'M']


def compute_sizes(genome, kmer):
    """
    Build one (chromosome, start, end) triple of strings per entry of `CHR`.

    `start` is ``1 + kmer // 2``; `end` is the length of the sequence that
    `refseq` returns from `start` onwards (``size=None``) minus ``kmer // 2``.

    NOTE(review): if `refseq` reads up to the chromosome end, `end` falls
    ``kmer // 2`` short of the last position that still has a full k-mer
    context — confirm whether that margin is intended.
    """
    flank = kmer // 2
    first = 1 + flank

    def region(chrom):
        # Only the length of the fetched sequence is used.
        seq = refseq(genome, f"chr{chrom}", start=first, size=None)
        return (str(chrom), str(first), str(len(seq) - flank))

    return [region(chrom) for chrom in CHR]


def write(sizes):
    """Print a tab-separated CHROMOSOME/START/END header followed by one line per entry of `sizes`."""
    header = ('CHROMOSOME', 'START', 'END')
    print('\t'.join(header))
    for fields in sizes:
        # Every entry must be an iterable of strings.
        line = '\t'.join(fields)
        print(line)

# if __name__ == '__main__':
#     genome = sys.argv[1]
#     kmer = int(sys.argv[2])
#     write(compute_sizes(genome, kmer))

# Print the regions table for genome "mm39" with kmer=3 to the cell output.
write(compute_sizes(genome="mm39", kmer=3))

In [None]:
# Script version of the cell above, redirected into the regions file used by bgsignature.
# The leading `!` makes this a shell escape; bare shell is a SyntaxError in a Python cell.
!python3 /workspace/projects/clustering_3d/o3d_analysys/datasets/input/cancer_mouse/raw_data/liver_strelka_mm39/get_regions_file.py mm39 3 > mm39_wg_regions.tsv

### Run bgsignature content

In [None]:
# Count k-mers over the whole-genome regions inside the `bgsignature` conda environment.
# `conda env activate` is not a conda subcommand, and an activation would not carry over
# between `!` subshells anyway, so the command is executed through `conda run -n`.
!conda run -n bgsignature bgsignature count -r mm39_wg_regions.tsv -s 3 -g mm39 --exclude-N --collapse -o wg.counts.gz --cores 8

### Run bgsignature normalizer

In [None]:
# Build the mutational profile normalised by the whole-genome k-mer counts.
# NOTE(review): run in the `bgsignature` conda environment on the assumption that it is
# the same one used for `bgsignature count` — confirm.
!conda run -n bgsignature bgsignature normalize -m liver_strelka_mm39.in.tsv.gz -r mm39_wg_regions.tsv --normalize wg.counts.gz -s 3 -g mm39 --collapse --cores 8 -o liver_strelka_mm39.sig.json