# STRspy

This notebook creates the per-STR BED and FASTA input files needed to run STRspy on the Bioliquid Nanopore data, runs STRspy, and post-processes its output.

# Load modules

In [1]:
import numpy as np
import pandas as pd
import os

# Set variables

In [2]:
# Input variables
run_number = "run1"
chrom = "chr11"
dis = "sca"  # short tag combined with `chrom` below; presumably a disease code -- TODO confirm
location = 5227002  # centre of the STR selection window on `chrom`
location_padding = 2000000  # STRs are kept when they lie within location +/- this value

# Setup
chrom_dis = f"{chrom}_{dis}"
rootdir = "/mnt/aretian/genomics/nanopore"
# Derived from rootdir so the two paths cannot drift apart when rootdir is edited
datadir = f"{rootdir}/{run_number}"

# Export the values read by the %%bash cells further down
os.environ.update(
    run_number=run_number,
    chrom=chrom,
    chrom_dis=chrom_dis,
    datadir=datadir,
)

# Individual BED and Fasta files

Create one BED file and one FASTA file per STR.

### Get reference genome

In [4]:
# Read in fasta file: remove line breaks and header
def read_fasta_genome(fasta_file, chromosome_header):
    """Return the sequence of a FASTA file as one string without line breaks.

    Parameters
    ----------
    fasta_file : file-like
        Open text handle; only its ``read()`` method is used.
    chromosome_header : str
        Start of the header line to discard, e.g. ``'>chr11'``.

    Returns
    -------
    str
        All lines joined together, minus every line that starts with
        ``chromosome_header``.

    Notes
    -----
    The whole header *line* is dropped rather than just the
    ``chromosome_header`` substring, so description text after the
    chromosome name (``'>chr11 some description'``) no longer ends up inside
    the returned sequence.
    """
    sequence_lines = [
        line
        for line in fasta_file.read().split("\n")
        # An empty header must not match: every string starts with "".
        if not (chromosome_header and line.startswith(chromosome_header))
    ]
    return "".join(sequence_lines)

fasta_path = f'{rootdir}/{chrom}_selected.fa'  # update path if needed
with open(fasta_path) as fasta_handle:
    ref_genome = read_fasta_genome(fasta_handle, f'>{chrom}')

# Sanity checks: which symbols occur in the sequence, and how long it is
unique_symbols = list(set(ref_genome))
print(f"Unique characters: {unique_symbols}")
print(f"Selected chromosome from reference genome is {len(ref_genome)} BP long")

Unique characters: ['C', 'T', 'G', 'N', 'A']
Selected chromosome from reference genome is 135086622 BP long


### Load Full list of STRs

In [5]:
# Full STR list: tab-separated BED without a header row, so columns are named here
df = pd.read_csv(
    f'{rootdir}/hg38.hipstr_reference.bed',
    sep='\t',
    header=None,
)
df.columns = ['chr', 'start', 'end', 'NA', 'repeats', 'name', 'unit']

# Example BED file from the STRspy test set, displayed as the target format
testdata = pd.read_csv(
    '/home/fer/genomics/strspy/testset/testCustomDB/FGA.bed',
    sep='\t',
    header=None,
)
print('Data should look like this:')
testdata

Data should look like this:


Unnamed: 0,0,1,2,3
0,4,155508887,155508975,FGA


### Prepare data

In [6]:
# Keep only the STRs on the chromosome chosen in the config cell, and only the
# four columns written to the BED files. No start/end filtering happens here;
# the positional window (location +/- location_padding) is applied in a later cell.
df = df.loc[df['chr'] == chrom, ['chr', 'start', 'end', 'name']]

### Save each STR in its own BED and FASTA file

In [7]:
# Keep the STRs lying strictly inside the window location +/- location_padding
window_lo = location - location_padding
window_hi = location + location_padding
inside_window = (df['start'] > window_lo) & (df['end'] < window_hi)
selected_strs = df.loc[inside_window]

In [61]:
# Loop: create one BED file and one FASTA file per selected STR
flank = 500  # reference bases kept on each side of the STR
db_dir = f"{datadir}/strspy/input/db"

for n in range(len(selected_strs)):
    # Double brackets keep a one-row DataFrame, so to_csv writes a single BED line
    str_out = selected_strs.iloc[[n]]
    str_name = str_out['name'].values[0]
    start = int(str_out['start'].values[0])
    end = int(str_out['end'].values[0])

    str_out.to_csv(f"{db_dir}/{str_name}.bed", header=False, index=False, sep='\t')

    # Extract the STR plus flanking reference sequence. The lower bound is
    # clamped at 0: a negative slice start would be interpreted by Python as
    # an offset from the END of the chromosome and return the wrong region.
    padded_str = ref_genome[max(start - flank, 0):end + flank]

    # `with` guarantees the handle is closed even if a write fails
    with open(f"{db_dir}/{str_name}.fa", "w") as myfasta:
        myfasta.write(f">{str_name}\n{padded_str}\n")

# Region BED file (all STRs)

In [62]:
# Single BED file listing every selected STR (tab-separated, no header, no index)
regions_bed = f'{datadir}/strspy/input/regions/all_strs.bed'
selected_strs.to_csv(regions_bed, sep='\t', index=False, header=False)

# Run STRspy

In [11]:
%%bash
# Show the STRspy output directory that the next cell wipes and recreates
printf '%s\n' "${datadir}/strspy/output"

/mnt/aretian/genomics/nanopore/run1/strspy/output


In [50]:
%%bash
# Stop at the first failing command or unset variable, so a failed `cd` can
# never run the STRspy script from the wrong directory.
set -euo pipefail

# ${datadir:?} aborts if datadir is unset or empty; without it the rm below
# would expand to "/strspy/output".
out_dir="${datadir:?datadir is not set; run the config cell first}/strspy/output"

# Start from an empty output directory (-p also creates missing parents)
rm -rf "${out_dir}"
mkdir -p "${out_dir}"

cd /home/fer/genomics/strspy
bash STRspy_run_v1.0.sh "config/${chrom}_InputConfig.txt" config/UserToolsConfig.txt

Process is interrupted.


# Concatenate all output

In [63]:
%%bash
# Abort if the Countings directory is missing; otherwise the files below would
# be written into whatever directory the cell happened to start in.
cd "${datadir}/strspy/output/Countings" || exit 1

for person in person0 person1; do
    raw_file="${run_number}_${chrom}_${person}_strs_raw.txt"
    # Concatenate every per-STR allele frequency file of this person ...
    cat *"${person}"*Allele_freqs.txt > "${raw_file}"
    # ... and keep only the lines that mention Human_STR
    grep Human_STR "${raw_file}" > "${run_number}_${chrom}_${person}_strs.txt"
done

# Add necessary columns

In [8]:
# This cell takes ~2 minutes to run
def complete_str_df(person):
    """Annotate one person's STRspy counts with locus details from the full STR list.

    Reads ``{datadir}/strspy/output/Countings/{run_number}_{chrom}_{person}_strs.txt``
    and left-joins it on ``name`` with
    ``{rootdir}/hg38.hipstr_reference_full_strs.bed``. Both files are read as
    tab-separated text without a header row.

    Parameters
    ----------
    person : str
        Label used in the counts file name, e.g. ``'person0'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``name, count, chr, start, end, motif, str``. Rows follow the
        counts file (left join); names missing from the STR list get NaN in
        the locus columns.
    """
    # Load STRspy output. The file is the result of `grep Human_STR`, so it has
    # no header line; header=None stops pandas from consuming the first STR row
    # as column labels (which silently dropped one STR per person).
    strspy_df = pd.read_csv(
        f'{datadir}/strspy/output/Countings/{run_number}_{chrom}_{person}_strs.txt',
        sep='\t',
        header=None,
    )
    strspy_df.columns = ['name', 'count', 'normcount']

    # Load Full STR list
    df = pd.read_csv(f'{rootdir}/hg38.hipstr_reference_full_strs.bed', sep='\t', header=None)
    df.columns = ['chr', 'start', 'end', 'NA', 'repeats', 'name', 'motif', 'str']

    # Append locus information to the STRspy output
    output = strspy_df.merge(df, how='left', on='name')
    output = output[['name', 'count', 'chr', 'start', 'end', 'motif', 'str']]

    return output
    

output0 = complete_str_df('person0')
output1 = complete_str_df('person1')

# Save one table per person (tab-separated, no header row, no index column)
output0.to_csv(f'{datadir}/{run_number}_{chrom}_person0_full.txt', index=False, header=False, sep='\t')
output1.to_csv(f'{datadir}/{run_number}_{chrom}_person1_full.txt', index=False, header=False, sep='\t')

# Combine person0 and person1 into a single tab-separated table with one row
# per STR name. Duplicates are dropped BEFORE sorting: drop_duplicates keeps
# the first occurrence in concat order, so a name present for both people
# always keeps its person0 row. Sorting first with the default (unstable)
# sort would make the surviving row arbitrary.
output = (
    pd.concat([output0, output1])
    .drop_duplicates(subset=['name'])
    .sort_values(by='name')
)
output.to_csv(f'{datadir}/{run_number}_{chrom}_person_full.txt', index=False, header=False, sep='\t')

# EXTRA CODE: Remove reads from BAM file

In [72]:
# df.loc[df['name']=='Human_STR_1625686']

Unnamed: 0,chr,start,end,NA,repeats,name,motif,str
579396,chr17,26569632,26569679,2,24.0,Human_STR_1625686,CT,CTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCT


In [None]:
# %%bash
# samtools view -h ~/work/code/strspy/bioliquid-data/bioliquid_run1_chr17.bam | grep 505171f5-1f0b-4bb2-b855-5cd3d9ce7554 | samtools view -bS -o ~/work/code/strspy/bioliquid-data/chr17_filtered.bam -

In [None]:
# 4000000/75