# STRspy Config

This script creates the necessary config files to run STRspy on the Bioliquid Nanopore data.

# Load modules

In [3]:
import numpy as np
import pandas as pd
import os

# Set variables

In [11]:
# Input variables
run_number="run1"
chrom="chr11"
dis="sca"
location=5227002
windowwidth=2000000

# Setup
chrom_dis=f"{chrom}_{dis}"
rootdir=f"/mnt/aretian/genomics/nanopore/{run_number}"
datadir=f"/mnt/aretian/genomics/nanopore/{run_number}/data"

os.environ["run_number"]=run_number
os.environ["chrom"]=chrom
os.environ["chrom_dis"]=chrom_dis
os.environ["datadir"]=datadir

# Individual BED and Fasta files

Create one BED and Fasta file per STR.

### Get reference genome

In [26]:
# Read in fasta file: remove line breaks and header
def read_fasta_genome(fasta_file,chromosome_header):
    clean_data = fasta_file.read().replace("\n", "")
    clean_data = clean_data.replace(chromosome_header,"") # get rid of header

    return clean_data

with open(f'{datadir}/{chrom}_selected.fa') as f: # update path if needed
    ref_genome = read_fasta_genome(f,f'>{chrom}')
    
print(f"Unique characters: {list(set(ref_genome))}") 

print(f"Selected chromosome from reference genome is {len(ref_genome)} BP long")

Unique characters: ['G', 'N', 'T', 'A', 'C']
Selected chromosome from reference genome is 135086622 BP long


### Load Full list of STRs

In [6]:
# Load Full STR list
df = pd.read_csv(f'{datadir}/hg38.hipstr_reference.bed', sep='\t', header=None)
df.columns=['chr','start','end','NA','repeats','name','unit']

# Load STRspy test data
testdata = pd.read_csv('/home/fer/genomics/strspy/testset/testCustomDB/FGA.bed', sep='\t', header=None)
print('Data should look like this:')
testdata

Data should look like this:


Unnamed: 0,0,1,2,3
0,4,155508887,155508975,FGA


### Prepare data

In [27]:
# df.groupby(['chr']).count()[['start']]

# Ratios
# 80000000/58887

# 93551/181000000

# 3000000000/673984

# 4000000/1358

# import matplotlib.pyplot as plt

# plt.plot(range(181000000),df.start)

In [28]:
# Filter chr17
# selected_chromosome = 'chr17'
# selected_chromosome = chrom
# window_width = 5000000
# start_pos = 70924941 - window_width
# start_pos = 23000000
# end_pos = 70953015 + window_width
# end_pos = 27000000

df = df.loc[df['chr']==chrom]

# Filter locations
# try:
# df = df.loc[(df['start']>=start_pos) & (df['end']<=end_pos)]
# except:
#     None
    
# Get columns
df = df[['chr','start','end','name']]

### Save each STR in different BED file

In [32]:
selected_strs = df.loc[(df['start']>location-windowwidth)&(df['end']<location+windowwidth)]

In [33]:
# Loop: create single STR files
for n in range(len(selected_strs)):
# for n in range(3):
    str_out = selected_strs.iloc[[n]]
    str_name = str_out['name'].values[0]
    str_out.to_csv(f"{datadir}/strspy/input/db/{str_name}.bed", header=False, index=False, sep='\t')
    
    myfasta = open(f"{datadir}/strspy/input/db/{str_name}.fa","w")
    start = str_out['start'].values[0]
    end = str_out['end'].values[0]
    # Extract reads
    padded_str=ref_genome[start-500:end+500]
    # Write to file
    myfasta.write('>')
    myfasta.write(str_name)
    myfasta.write('\n')
    myfasta.write(padded_str)
    myfasta.write('\n')
    myfasta.close()

# Region BED file (all STRs)

In [34]:
selected_strs.to_csv(f'{datadir}/strspy/input/regions/all_strs.bed', header=False, index=False, sep='\t')

# Run STRspy

In [35]:
!whoami

fer


In [38]:
%%bash
docker image ls

REPOSITORY              TAG                 IMAGE ID            CREATED             SIZE
yufernando/bioaretian   latest              4a04dc4c3ae1        2 weeks ago         6.1GB
yufernando/bioaretian   <none>              afd9eaa529e5        5 weeks ago         5.17GB
giesselmann/strique     latest              dbd6910d609e        3 months ago        1.15GB
<none>                  <none>              294970afa8de        11 months ago       91.1MB
ubuntu                  <none>              4e5021d210f6        16 months ago       64.2MB


In [18]:
%%bash
cd /home/fer/genomics/strspy
bash STRspy_run_v1.0.sh "config/${chrom}_InputConfig.txt" config/UserToolsConfig.txt

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Concatenate all output

In [25]:
%%bash
cd "${datadir}/strspy/output/Countings"
pwd
cat *person0*Allele_freqs.txt > "${run_number}_${chrom}_person0_strs_raw.txt"
cat *person1*Allele_freqs.txt > "${run_number}_${chrom}_person1_strs_raw.txt"

grep Human_STR "${run_number}_${chrom}_person0_strs_raw.txt" > "${run_number}_${chrom}_person0_strs.txt"
grep Human_STR "${run_number}_${chrom}_person1_strs_raw.txt" > "${run_number}_${chrom}_person1_strs.txt"

/mnt/aretian/genomics/nanopore/run1/data/strspy/output/Countings


CalledProcessError: Command 'b'cd "${datadir}/strspy/output/Countings"\npwd\ncat *person0*Allele_freqs.txt > "${run_number}_${chrom}_person0_strs_raw.txt"\ncat *person1*Allele_freqs.txt > "${run_number}_${chrom}_person1_strs_raw.txt"\n\ngrep Human_STR "${run_number}_${chrom}_person0_strs_raw.txt" > "${run_number}_${chrom}_person0_strs.txt"\ngrep Human_STR "${run_number}_${chrom}_person1_strs_raw.txt" > "${run_number}_${chrom}_person1_strs.txt"\n'' returned non-zero exit status 1.

# Add necessary columns

In [103]:
# Select person
person = person1

# Load STRspy output
strspy_df = pd.read_csv(f'../strspy/data/run1/output/Countings/run1_{person}_chr17_strs.txt', sep='\t', header=None)
strspy_df.columns = ['name', 'count', 'normcount']

# Load Full STR list
df = pd.read_csv('../data/run1/raw/hg38.hipstr_reference.bed', sep='\t', header=None)
df.columns=['chr','start','end','NA','repeats','name','motif']

### Create STR
def create_str(row):
    motif_len = len(row['motif']) # get length
    # Get Base
    int_repeat = int(np.floor(row['repeats'])) # 9
    base = int_repeat * row['motif']
    # Get Tail and append
    dec_repeat = row['repeats']%1
    nt_to_pull = round(dec_repeat * motif_len)
    tail = row['motif'][:nt_to_pull]
    base = base + tail
    return base

# Drop nans
df = df.loc[df['motif'].notnull()]
df['str'] = df.apply(lambda x: create_str(x), axis = 1)

# Append it to STRspy output
output = strspy_df.merge(df, how='left', on='name')

output = output[['name','count','chr','start','end','motif', 'str']]

# Save
output.to_csv('../strspy/data/run1/output/Countings/run1_person0_chr17_full.txt', index=None, header=None, sep='\t')

In [120]:
output

Unnamed: 0,name,count,chr,start,end,str
0,Human_STR_1625686,240,chr17,26569632,26569679,CTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCT
1,Human_STR_1625687,546,chr17,26592304,26592351,CTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCT
2,Human_STR_1625688,112,chr17,26639844,26639889,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
3,Human_STR_1625689,210,chr17,26649584,26649594,TTTTTTTTTTT
4,Human_STR_1625690,162,chr17,26663827,26663859,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
5,Human_STR_1625691,184,chr17,26665703,26665717,TTTTTTTTTTTTTTT


In [122]:
df.loc[df['name']=='Human_STR_1625686']

Unnamed: 0,chr,start,end,NA,repeats,name,unit,str
579396,chr17,26569632,26569679,2,24.0,Human_STR_1625686,CT,CTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCT


# EXTRA CODE: Remove reads from BAM file

In [None]:
# %%bash
# samtools view -h ~/work/code/strspy/bioliquid-data/bioliquid_run1_chr17.bam | grep 505171f5-1f0b-4bb2-b855-5cd3d9ce7554 | samtools view -bS -o ~/work/code/strspy/bioliquid-data/chr17_filtered.bam -

In [None]:
# 4000000/75