# STRspy Config

This script creates the necessary config files to run STRspy on the Bioliquid Nanopore data.

# Load modules

In [26]:
import pandas as pd

# Individual BED and Fasta files

Create one BED and Fasta file per STR.

### Get reference genome

In [33]:
# Read in fasta file: remove line breaks and header
def read_fasta_genome(fasta_file,chromosome_header):
    clean_data = fasta_file.read().replace("\n", "")
    clean_data = clean_data.replace(chromosome_header,"") # get rid of header

    return clean_data

with open('../data/processed/chr17_selected.fa') as f: # update path if needed
    ref_genome = read_fasta_genome(f,'>chr17')

### Load Full list of STRs

In [34]:
# Load Full STR list
df = pd.read_csv('../data/raw/hg38.hipstr_reference.bed', sep='\t', header=None)
df.columns=['chr','start','end','repeats','NA','name','unit']

# Load STRspy test data
testdata = pd.read_csv('testset/testCustomDB/FGA.bed', sep='\t', header=None)
print('Data should look like this:')
testdata

Data should look like this:


Unnamed: 0,0,1,2,3
0,4,155508887,155508975,FGA


### Prepare data

In [35]:
# Filter to Chr17 and locations 23M to 27M
df['chr']=df['chr'].str[3:]
# Filter chr17
df = df.loc[df['chr']=='17']
# Filter locations 23M to 27M
# df = df.loc[(df['start']>=23000000) & (df['end']<=27000000)]
# Get columns
df = df[['chr','start','end','name']]

### Save each STR in different BED file for first 500

In [36]:
selected_strs = df.loc[(df['start']>22500000)&(df['end']<27500000)]

In [37]:
# Loop: create single STR files
for n in range(len(selected_strs)):
# for n in range(3):
    str_out = selected_strs.iloc[[n]]
    str_name = str_out['name'].values[0]
    str_out.to_csv(f"bioliquid-data/db/{str_name}.bed", header=False, index=False, sep='\t')
    
    myfasta = open(f"bioliquid-data/db/{str_name}.fa","w")
    start = str_out['start'].values[0]
    end = str_out['end'].values[0]
    # Extract reads
    padded_str=ref_genome[start-500:end+500]
    # Write to file
    myfasta.write('>')
    myfasta.write(str_name)
    myfasta.write('\n')
    myfasta.write(padded_str)
    myfasta.write('\n')
    myfasta.close()

# Region BED file (all STRs)

In [38]:
selected_strs.to_csv('bioliquid-data/regions/all_strs.bed', header=False, index=False, sep='\t')

# Remove reads from BAM file

In [23]:
%%bash
samtools view -h ~/work/code/strspy/bioliquid-data/bioliquid_run1_chr17.bam | grep 505171f5-1f0b-4bb2-b855-5cd3d9ce7554 | samtools view -bS -o ~/work/code/strspy/bioliquid-data/chr17_filtered.bam -


[main_samview] fail to read the header from "-".


CalledProcessError: Command 'b'samtools view -h ~/work/code/strspy/bioliquid-data/bioliquid_run1_chr17.bam | grep 505171f5-1f0b-4bb2-b855-5cd3d9ce7554 | samtools view -bS -o ~/work/code/strspy/bioliquid-data/chr17_filtered.bam -\n'' returned non-zero exit status 1.