The script:
 - selects non-overlapping & single CDS genes,
 - extracts CDSs of non-overlapping & single CDS genes (this is done with seqtk & bedtools),
 - reads fasta files with sequences of genes extracted from the genome fasta files,
 - reverse complements genes,
 - creates multiple alignments with concatenated genes.
 #### @author:aniafijarczyk

In [1]:
from Bio import SeqIO
import glob
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
import pandas as pd
from collections import defaultdict
from urllib.request import urlopen
import gzip
import re
from Bio.Seq import Seq

### Selecting non-overlapping genes
Downloading gff file from SGD database (this is version R64-2-1); 
reading gff file; extracting information for genes, including modified chromosome name, start and stop in bed format and gene ID. Excluding genes which have non-overlapping gene coordinates (it is possible that some genes may be overlapping but their CDS are not, but I'll ignore it here).

In [3]:
# this should be version R64-2-1
url = "http://sgd-archive.yeastgenome.org/curation/chromosomal_feature/saccharomyces_cerevisiae.gff.gz"
fh = gzip.open(urlopen(url),'rt')
linie = fh.readlines()
F = []
for ele in linie:
    if ele != "##FASTA\n":
        if ele.startswith("#"):
            continue
        else:
            F.append(ele.split())
    else:
        break
dF = list(zip(*F))
df = pd.DataFrame({"chrom":dF[0],"source":dF[1],"feature":dF[2],"start":dF[3],"end":dF[4],"score":dF[5],
                  "strand":dF[6],"frame":dF[7],"attribute":dF[8]})
df_genes = df.loc[df['feature'] == "gene",["chrom","start","end","attribute"]]
df_genes["geneID"] = df_genes["attribute"].apply(lambda x: re.split("[=,;]",x)[1])
df_genes['newstart'] = df_genes['start'].apply(lambda x: int(x)-1)
df_genes['newend'] = df_genes['end'].apply(lambda x: int(x))
ref_names = {"chrI":"ref|NC_001133|","chrII":"ref|NC_001134|",
"chrIII":"ref|NC_001135|","chrIV":"ref|NC_001136|",
"chrV":"ref|NC_001137|","chrVI":"ref|NC_001138|",
"chrVII":"ref|NC_001139|","chrVIII":"ref|NC_001140|",
"chrIX":"ref|NC_001141|","chrX":"ref|NC_001142|",
"chrXI":"ref|NC_001143|","chrXII":"ref|NC_001144|",
"chrXIII":"ref|NC_001145|","chrXIV":"ref|NC_001146|",
"chrXV":"ref|NC_001147|","chrXVI":"ref|NC_001148|","chrmt":"ref|NC_001224|"}
df_genes["newchrom"] = df_genes["chrom"].apply(lambda x: ref_names[x])
bed = df_genes[['newchrom','newstart','newend','geneID']].rename(columns={"newchrom":"chrom","newstart":"start","newend":"stop"}).reset_index().drop(columns=["index"])
bed.head()

Unnamed: 0,chrom,start,stop,geneID
0,ref|NC_001133|,334,649,YAL069W
1,ref|NC_001133|,537,792,YAL068W-A
2,ref|NC_001133|,1806,2169,YAL068C
3,ref|NC_001133|,2479,2707,YAL067W-A
4,ref|NC_001133|,7234,9016,YAL067C


#### List of non-overlapping genes

In [4]:
chroms = sorted(list(set(bed['chrom'])))
Ch = {}
C = {}
Genes_nono = []
for chrom in chroms:
    db = bed.loc[bed["chrom"]==chrom,:].reset_index()
    starts = list(db["start"])
    stops = list(db["stop"])
    pairs = zip(starts,stops)
    ranges = [set(range(int(i[0]),int(i[1]))) for i in pairs]
    Ch[chrom] = ranges
    first_start = int(db.at[0,'start'])
    first_stop = int(db.at[0,'stop'])
    geneID = db.at[0,'geneID']
    #firstset = set(range(int(first_start),int(first_stop)))
    F = [(first_start, first_stop, geneID)]
    for index, row in db.iterrows():
        i1 = F[-1][0]
        i2 = F[-1][1]
        lastset = set(range(i1,i2))
        newcoords = int(row['start']), int(row['stop']), row['geneID']
        newrange = set(range(int(row['start']), int(row['stop'])))
        if len(lastset.intersection(newrange))>0:
            continue
        else:
            F.append(newcoords)
            Genes_nono.append(row['geneID'])
    C[chrom] = F

#### Selecting cds of non-overlapping & single CDS genes and writing to bed

In [5]:
df_cds = df.loc[df['feature'] == "CDS",["chrom","start","end","attribute","strand"]]
df_cds["cdsID"] = df_cds["attribute"].apply(lambda x: re.split("[=,;,_]",x)[1])
df_cds['newstart'] = df_cds['start'].apply(lambda x: int(x)-1)
df_cds['newend'] = df_cds['end'].apply(lambda x: int(x))
df_cds["newchrom"] = df_cds["chrom"].apply(lambda x: ref_names[x])
df_cds["name"] = df_cds.apply(lambda x: str(x["cdsID"])+"_"+str(x["newchrom"])+"_"+str(x["newstart"])+"_"+str(x["newend"]), axis=1)
df_cdsf = df_cds.loc[df_cds['cdsID'].isin(Genes_nono),:]

dups = []
alls = []
for ele in list(df_cdsf['cdsID']):
    if ele in alls:
        dups.append(ele)
    else:
        alls.append(ele)
sing = [ele for ele in alls if ele not in list(set(dups))]
df_cdsff = df_cdsf.loc[df_cdsf['cdsID'].isin(sing),:]
bed_cds = df_cdsff[['newchrom','newstart','newend','cdsID']].rename(columns={"newchrom":"chrom","newstart":"start","newend":"stop"}).reset_index().drop(columns=["index"])
bed_cds.to_csv("./output/manipulateFasta_nonoverlappingCDS.bed",sep="\t",header=False, index=False)
bed_cds.head()

Unnamed: 0,chrom,start,stop,cdsID
0,ref|NC_001133|,1806,2169,YAL068C
1,ref|NC_001133|,2479,2707,YAL067W-A
2,ref|NC_001133|,7234,9016,YAL067C
3,ref|NC_001133|,10090,10399,YAL066W
4,ref|NC_001133|,11564,11951,YAL065C


### Fasta files with CDS extracted from strain genomes
#### Getting table with strain genotypes from vcf (relatives_annot_Filtered2.vcf.gz = File S5)
```console
bcftools query -f '%CHROM\t%POS[\t%TGT]\n' relatives_annot_Filtered2.vcf.gz | sed 's/\//\|/g' \
| awk  -F"\t" -v OFS="\t" 'function GSUB(F) {gsub(/[|]/,"\t",$F)} {GSUB(3);GSUB(4);GSUB(5);GSUB(6);GSUB(7);GSUB(8);GSUB(9);GSUB(10);GSUB(11);GSUB(12);GSUB(13)}1' \
| awk '{if (length($3)==1) print $0}' > relatives_annot_Filtered2.tab
sed -i -e 's/\./N/g' relatives_annot_Filtered2.tab
```

#### Generating fasta sequence for each strain genome & extracting CDSs from each strain genome
```console
i=0
while read h
  do
  (( i = i + 1 ))
  echo $i
  awk -v var="$i" '{print $1"\t"$2"\t.\t"$(var+2)}' relatives_annot_Filtered2.tab > genotypes_temp.tab
  seqtk mutfa S288C_reference_sequence_R64-2-1_20150113_N.fasta genotypes_temp.tab > refScer_${h}.fasta
  bedtools getfasta -fi refScer_${h}.fasta -bed manipulateFasta_nonoverlappingCDS.bed -name -fo cds_${h}.fasta
  done<relatives_annot_Filtered2.samples
```
##### relatives_annot_Filtered2.samples - names of strain genomes in relatives_annot_Filtered2.tab file
##### S288C_reference_sequence_R64-2-1_20150113_N.fasta - reference chromosomes masked with N

### Reverse complementing & concatenating CDS

In [127]:
# File with list of sample names from relatives_annot_Filtered2.vcf (FileS5.vcf) file
samp_file = pd.read_csv("./input_files/relatives_annot_Filtered2.samples", sep = "\t",header=None,names=["haplo"])
samples = list(samp_file["haplo"])
group = "all"

###In case of selecting a subset of samples
samp_to_filter = ["A.2565","A.Muntons","A.S-33","A.T-58","BE005","CFI","CFN","CFP","CHK","Jean-Talon"]
samp_to_filter = ["Jean-Talon"]
group = samp_to_filter[0]
samples_to_filter = [ele for ele in samples if ele.split("_")[0] in samp_to_filter]
#samples = samples_to_filter

wh = open("genesConcatenated_pair_"+group+".fasta","w")
for haplotype in samples:
    fasta = glob.glob("./cds/cds_"+haplotype+".fasta")[0]
    records = [rec for rec in list(SeqIO.parse(fasta,"fasta"))]
    R={}
    CON = []
    for gene in records:
        if len(str(gene.seq))/3. == int(len(str(gene.seq))/3.):
            if gene.id.split("-")[0][-1] == 'C':
                revcomp = gene.seq.reverse_complement()
                R[gene.id] = str(revcomp)
                CON.append(str(revcomp))
            else:
                R[gene.id] = str(gene.seq)
                CON.append(str(gene.seq))
        else:
            continue
    concatenated = ''
    for s in CON:
        concatenated += s

    if haplotype.split("_")[0]!="A.Windson":
        wh.write(">"+haplotype+"\n")
        wh.write(concatenated+"\n")
    print(fasta, len(concatenated))
    
wh.flush()
wh.close()  

./cds/cds_A.2565_H1.fasta 8010768
./cds/cds_A.2565_H2.fasta 8010768
./cds/cds_A.2565_H3.fasta 8010768
./cds/cds_A.2565_H4.fasta 8010768
./cds/cds_A.Muntons_H1.fasta 8010768
./cds/cds_A.Muntons_H2.fasta 8010768
./cds/cds_A.Muntons_H3.fasta 8010768
./cds/cds_A.Muntons_H4.fasta 8010768
./cds/cds_A.S-33_H1.fasta 8010768
./cds/cds_A.S-33_H2.fasta 8010768
./cds/cds_A.S-33_H3.fasta 8010768
./cds/cds_A.S-33_H4.fasta 8010768
./cds/cds_A.T-58_H1.fasta 8010768
./cds/cds_A.T-58_H2.fasta 8010768
./cds/cds_A.T-58_H3.fasta 8010768
./cds/cds_A.T-58_H4.fasta 8010768
./cds/cds_A.Windson_H1.fasta 8010768
./cds/cds_A.Windson_H2.fasta 8010768
./cds/cds_A.Windson_H3.fasta 8010768
./cds/cds_A.Windson_H4.fasta 8010768
./cds/cds_BE005_H1.fasta 8010768
./cds/cds_BE005_H2.fasta 8010768
./cds/cds_BE005_H3.fasta 8010768
./cds/cds_BE005_H4.fasta 8010768
./cds/cds_CFI_H1.fasta 8010768
./cds/cds_CFI_H2.fasta 8010768
./cds/cds_CFI_H3.fasta 8010768
./cds/cds_CFI_H4.fasta 8010768
./cds/cds_CFN_H1.fasta 8010768
./cds/cds

In [118]:
len(bed_cds)

5713