In [1]:
import sys
import pandas as pd
from Bio import SeqIO
from Bio import SearchIO
import os
import glob
from pathlib import Path
from Bio import AlignIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import numpy as np

In [2]:
os.chdir('../../data/Au3_deduplicate')

In [3]:
Au3_cds = list(SeqIO.parse('Au3.cds', 'fasta'))
Au3_faa = list(SeqIO.parse('Au3.faa', 'fasta'))

In [55]:
Au3_duplicate_gene_list = pd.read_csv('../Au3_duplicate_gene.txt', header=None)
Au3_duplicate_gene_list = list(Au3_duplicate_gene_list[0])

In [56]:
Au3_cds_deduplicate = []
Au3_faa_deduplicate = []
for i in range(len(Au3_cds)):
    header = Au3_cds[i].id.strip().split('-')[0]
    if header not in Au3_duplicate_gene_list:
        Au3_cds_deduplicate.append(Au3_cds[i])
for i in range(len(Au3_faa)):
    header = Au3_faa[i].id.strip().split('-')[0]
    if header not in Au3_duplicate_gene_list:
        Au3_faa_deduplicate.append(Au3_faa[i])

In [72]:
#Write deduplicated CDS and protein sequences
SeqIO.write(Au3_cds_deduplicate, 'Au3_deduplicate.cds', 'fasta')
SeqIO.write(Au3_faa_deduplicate, 'Au3_deduplicate.faa', 'fasta')

38160

In [58]:
Au3_gff = pd.read_csv('Au3.gff3', sep='\t', header=None, comment='#')
Au3_gff.columns = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']

In [59]:
#If attributes not contains duplicate gene, write to deduplicate gff file
Au3_gff_deduplicate = []
for i in range(len(Au3_gff)):
    gene = Au3_gff['attributes'][i].strip().split(';')[0].split('=')[1].split('-')[0]
    if gene not in Au3_duplicate_gene_list:
        Au3_gff_deduplicate.append(Au3_gff.iloc[i])
Au3_gff_deduplicate = pd.DataFrame(Au3_gff_deduplicate)

In [73]:
#Write deduplicated gff file
Au3_gff_deduplicate.to_csv('Au3_deduplicate.gff3', sep='\t', header=False, index=False)

In [61]:
#hapA_seq list contain HapA in seqid but not contain CHR14ab
hapA_seq_list = Au3_gff[Au3_gff['seqid'].str.contains('HapA') & ~Au3_gff['seqid'].str.contains('CHR14ab')]['seqid'].unique()
hapB_seq_list = Au3_gff[Au3_gff['seqid'].str.contains('HapB') | Au3_gff['seqid'].str.contains('CHR14ab')]['seqid'].unique()

In [62]:
Au3_gff_deduplicate_gene = Au3_gff_deduplicate[Au3_gff_deduplicate['type'] == 'gene']
Au3_gff_deduplicate_gene_hapA = Au3_gff_deduplicate_gene[Au3_gff_deduplicate_gene['seqid'].isin(hapA_seq_list)]
Au3_gff_deduplicate_gene_hapB = Au3_gff_deduplicate_gene[Au3_gff_deduplicate_gene['seqid'].isin(hapB_seq_list)]

In [74]:
Au3_gff_deduplicate_gene_hapA['attributes'] = Au3_gff_deduplicate_gene_hapA['attributes'].str.split(';').str[0] + '-T1;'
Au3_gff_deduplicate_gene_hapB['attributes'] = Au3_gff_deduplicate_gene_hapB['attributes'].str.split(';').str[0] + '-T1;'
Au3_gff_deduplicate_gene_hapA.to_csv('Au3_deduplicate_hapA.gff3', sep='\t', header=False, index=False)
Au3_gff_deduplicate_gene_hapB.to_csv('Au3_deduplicate_hapB.gff3', sep='\t', header=False, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Au3_gff_deduplicate_gene_hapA['attributes'] = Au3_gff_deduplicate_gene_hapA['attributes'].str.split(';').str[0] + '-T1;'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Au3_gff_deduplicate_gene_hapB['attributes'] = Au3_gff_deduplicate_gene_hapB['attributes'].str.split(';').str[0] + '-T1;'


In [66]:
len(Au3_gff_deduplicate_gene_hapA)

19590

In [67]:
len(Au3_gff_deduplicate_gene_hapB)

16913

In [68]:
Au3_gff_deduplicate_gene_hapA_list = list(Au3_gff_deduplicate_gene_hapA['attributes'].str.split(';').str[0].str.split('=').str[1])
Au3_gff_deduplicate_gene_hapB_list = list(Au3_gff_deduplicate_gene_hapB['attributes'].str.split(';').str[0].str.split('=').str[1])

In [69]:
#Extract hapA and hapB CDS and protein sequences
hapA_cds = []
hapA_faa = []
hapB_cds = []
hapB_faa = []
for i in range(len(Au3_cds_deduplicate)):
    header = Au3_cds_deduplicate[i].id.strip()
    if header in Au3_gff_deduplicate_gene_hapA_list:
        hapA_cds.append(Au3_cds_deduplicate[i])
    elif header in Au3_gff_deduplicate_gene_hapB_list:
        hapB_cds.append(Au3_cds_deduplicate[i])
for i in range(len(Au3_faa_deduplicate)):
    header = Au3_faa_deduplicate[i].id.strip()
    if header in Au3_gff_deduplicate_gene_hapA_list:
        hapA_faa.append(Au3_faa_deduplicate[i])
    elif header in Au3_gff_deduplicate_gene_hapB_list:
        hapB_faa.append(Au3_faa_deduplicate[i])

In [75]:
SeqIO.write(hapA_cds, 'Au3_deduplicate_hapA.cds', 'fasta')
SeqIO.write(hapA_faa, 'Au3_deduplicate_hapA.faa', 'fasta')
SeqIO.write(hapB_cds, 'Au3_deduplicate_hapB.cds', 'fasta')
SeqIO.write(hapB_faa, 'Au3_deduplicate_hapB.faa', 'fasta')

16419