# Strique setup

This script extracts preffixes and suffixes to the input file for STRique.

Steps:

- Import modules
- Load HipSTR reference file
- Load reference genome
- Check STRs
- Attach prefix and suffix
- Reorder columns
- Save file

# Import modules

In [85]:
# Import modules
import pandas as pd

# Load HipSTR reference file

In [87]:
# Read one line of the bed file
df=pd.read_csv('../data/raw/hg38.hipstr_reference.bed', sep='\t', header=None)
df.columns = ['chr', 'start', 'end', 'NA', 'repeats', 'name', 'unit']

# Get Chr17
df = df.loc[df['chr']=='chr17']

# Load reference genome

In [88]:
# Read in fasta file: remove line breaks and header
def read_fasta_genome(fasta_file,chromosome_header):
    clean_data = fasta_file.read().replace("\n", "")
    clean_data = clean_data.replace(chromosome_header,"") # get rid of header

    return clean_data

with open('../data/processed/reference_genome_chr17.fa') as f: # update path if needed
    ref_genome = read_fasta_genome(f,'>chr17')

# See https://www.bioinformatics.org/sms/iupac.html for IUPAC nucleotide codes

print(f"Unique characters: {list(set(ref_genome))}") 

print(f"Selected chromosome from reference genome is {len(ref_genome)} BP long")

Unique characters: ['Y', 'S', 'N', 'G', 'C', 'A', 'K', 'R', 'T', 'W']
Selected chromosome from reference genome is 83257441 BP long


# Check STRs

In [90]:
df.head(20)

Unnamed: 0,chr,start,end,NA,repeats,name,unit
562780,chr17,60845,60867,1,23.0,Human_STR_1625444,T
562781,chr17,61940,61962,6,3.83333,Human_STR_1625445,AGGCCG
562782,chr17,68156,68177,4,5.5,Human_STR_1625446,AAAT
562783,chr17,71566,71585,1,20.0,Human_STR_1625447,A
562784,chr17,76031,76051,5,4.2,Human_STR_1625448,ATTTT
562785,chr17,77395,77443,2,24.5,Human_STR_1625449,AT
562786,chr17,77755,77770,1,16.0,Human_STR_1625450,A
562787,chr17,79916,79932,2,8.5,Human_STR_1625451,AG
562788,chr17,80307,80347,3,13.6667,Human_STR_1625452,AAC
562789,chr17,82551,82570,1,20.0,Human_STR_1625453,A


In [89]:
for read_number in range(20):
    str_start = int(df[['start']].iloc[read_number])
    str_end = int(df[['end']].iloc[read_number])
    print(ref_genome[str_start:str_end])

TTTTTTTTTTTTTTTTTTTTTT
AGGCCGAGGCCGAGGCCGGGCC
AATAAATAAATAAAAAATAAA
AAAAAAAAACAAAATAAAA
TTTGTATTTTATTTTATTTT
ATATATATATATATATATATATATATATATATATATATATATATATAT
AAAAAAAAAAAAAAA
GAGAGAGGGAGAGAGA
ACAACAACAACAATAACAAAAACAAAAACAACAACAACAA
AAAAAAAAAAGAAAAAAAA
TTATTTTATTTTATTAAATTTATTTTTTTTATTTT
TTTTTTTTTTTTTTTTTTTT
AAAAAAAAAGAAAAAAAA
AAAAAAAAAAAA
TTTATTTTATTTTATTTT
TTTTTTTTTTT
AAAAAAAAAAAAAA
CACACACAAACACACACACACACACACACAC
TTTTTTTTTTTTT
AGGAAGGAAAGAAAAAAGGAAGGGAGGAGGGAAGGAGGGAAAAAGGGAAGGAGGGAAGGAAAGGAAGGAAGGGAAAGAAGGAAAGGAAGGAAGG


# Attach prefix and suffix

In [68]:
prefix_length = 150

In [70]:
def create_prefix(str):
    str_start = int(str['start'])
    prefix = ref_genome[str_start-prefix_length:str_start]
    return prefix

In [73]:
def create_suffix(str):
    str_end = int(str['end'])
    suffix = ref_genome[str_end:str_end+prefix_length]
    return suffix

In [71]:
df['prefix'] = df.apply(lambda x: create_prefix(x), axis=1)

In [74]:
df['suffix'] = df.apply(lambda x: create_suffix(x), axis=1)

In [79]:
df.head()

Unnamed: 0,chr,start,end,NA,repeats,name,unit,prefix,suffix
562780,chr17,60845,60867,1,23.0,Human_STR_1625444,T,AGGTGGGCAACCCCAAAGATCCCAGGACTCACAGTACCCCCTGAGA...,GAGACCGAGTCTTGCTCTGTCACCCAGGCTGGAGTGCAGTGGTGCG...
562781,chr17,61940,61962,6,3.83333,Human_STR_1625445,AGGCCG,CTGGAATGGCCGACGTGAGGAATGAGCTGGGCCTAAAGAGGCCACT...,CGTGCAGGCCTTCGAGAGGCAGGAGGCCGGGCCTGCAAAGGCCGCC...
562782,chr17,68156,68177,4,5.5,Human_STR_1625446,AAAT,TAGTGGCAAGCACCTGTAGTCTCAGCCACGTGGGAGGTTGAGGTGG...,ACAGGTTAGAAACTGTGATGAGGTCTGTTGGGCAAAATTCCATATA...
562783,chr17,71566,71585,1,20.0,Human_STR_1625447,A,GTTGGGCATAGTGGCAAGCACCTGTAGTCTCAGCCACGTGGGAGGT...,CAGGTTAGAAATTGTAATGAGGTCTGCTGGGCAAAATTCCATATAA...
562784,chr17,76031,76051,5,4.2,Human_STR_1625448,ATTTT,GAAGTGACAAAAAGACATCTTTTGACATAAAGGGATGACACAGACA...,TTGAGACAGTCTCATTCTGTCACCTAGGCTGGAGTGCAGTGGTGCA...


# Reorder columns

In [81]:
df_out = df[['chr', 'start', 'end', 'name', 'unit', 'prefix', 'suffix']]

In [83]:
df_out.columns=['chr', 'begin', 'end', 'name', 'repeat', 'prefix', 'suffix']

# Save file

In [84]:
df_out.to_csv('../data/processed/strique_repeat_config.tsv', sep='\t', index=None)