In [4]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [24]:
from Bio import SeqIO
import pandas as pd

# path to working dir
BioPlex_dir = '/n/data1/hms/dbmi/farhat/Roger/CCB/BioPlex'

In [8]:
plt.style.use('ggplot')
plt.rcParams['lines.linewidth']=3
plt.rcParams['axes.facecolor']='1.0'
plt.rcParams['xtick.color']='black'
plt.rcParams['axes.grid']=True
plt.rcParams['axes.edgecolor']='black'
plt.rcParams['grid.color']= '1.0'
plt.rcParams.update({'font.size': 12})

## [1] Load ORFeome seqs with Biopython

In [11]:
# load ORFeome seqs used to construct baits from FASTA file
ORFeome_seq_records = list(SeqIO.parse(f'{BioPlex_dir}/data/ORFeome/human_ORFeome8.1_horf81_cloneInfo20120427.fa', 'fasta'))

In [12]:
print("Found %i records" % len(ORFeome_seq_records)) # Number of ORFeome seqs

Found 12692 records


In [15]:
print("The first ORFeome seq")
first_record = ORFeome_seq_records[0]
print(first_record.id)
print(repr(first_record.seq))
print(len(first_record))

The first ORFeome seq
81001@A01|ORF_ID:
Seq('ATGACAGACACCGAAAATCACGACTCAGCCCCCTCCAGCACCTCTACCTGTTGC...TGA')
159


In [20]:
first_record.description

'81001@A01|ORF_ID: 403|ORF_SIZE:159|TEMPLATE_ACCESSION: BC009508.2, BC010491.1|GENE_ID: 112597|PERFECT_MATCH'

In [16]:
print("The last ORFeomo seq")
last_record = ORFeome_seq_records[-1]
print(last_record.id)
print(repr(last_record.seq))
print(len(last_record))

The last ORFeomo seq
81138@H05|ORF_ID:
Seq('ATGCACCTGAAATTGAGCAAGAAAATCGCCCAGCTCACCAAGGTAATATATGCT...TGA')
3306


In [19]:
last_record.description

'81138@H05|ORF_ID: 10772|ORF_SIZE:3306|TEMPLATE_ACCESSION: BC060769.1|GENE_ID: 79632|PERFECT_MATCH'

## [2] Store ORF sequences and relevant annotation as a DataFrame

In [63]:
# store ORF seqs & relevant annot as lists by iterating over records
ORF_ID_list = []
ORF_size_list = []
template_accession_list = []
entrez_gene_ids_list = []
match_type_list = []
seq_list = []

for ORF_seq_record_i in ORFeome_seq_records:
    
    ORF_ID_list.append(ORF_seq_record_i.description.split('|')[1].split(': ')[1])
    ORF_size_list.append(ORF_seq_record_i.description.split('|')[2].split(':')[1])
    template_accession_list.append(ORF_seq_record_i.description.split('|')[3].split(': ')[1])
    entrez_gene_ids_list.append(ORF_seq_record_i.description.split('|')[4].split(': ')[1])
    match_type_list.append(ORF_seq_record_i.description.split('|')[5])
    seq_list.append(ORF_seq_record_i.seq)
    
# convert lists to DataFrame
ORFeome_df = pd.DataFrame()
ORFeome_df.loc[:,'ORF_ID'] = ORF_ID_list
ORFeome_df.loc[:,'ORF_size'] = ORF_size_list
ORFeome_df.loc[:,'template_acc'] = template_accession_list
ORFeome_df.loc[:,'Entrez_gene_ID'] = entrez_gene_ids_list
ORFeome_df.loc[:,'match'] = match_type_list
ORFeome_df.loc[:,'SEQ'] = seq_list

In [70]:
ORFeome_df.head()

Unnamed: 0,ORF_ID,ORF_size,template_acc,Entrez_gene_ID,match,SEQ
0,403,159,"BC009508.2, BC010491.1",112597,PERFECT_MATCH,"(A, T, G, A, C, A, G, A, C, A, C, C, G, A, A, ..."
1,228,135,BC016025.1,9168,PERFECT_MATCH,"(A, T, G, G, C, A, G, A, C, A, A, A, C, C, A, ..."
2,52615,156,"BC031105.1, BC107049.2",222659,PERFECT_MATCH,"(A, T, G, C, A, G, C, T, G, A, G, A, C, A, C, ..."
3,5673,159,BC005269.1,5350,PERFECT_MATCH,"(A, T, G, G, A, G, A, A, A, G, T, C, C, A, A, ..."
4,5236,171,BC007251.1,84837,PERFECT_MATCH,"(A, T, G, C, A, G, G, C, C, C, C, T, G, A, T, ..."


In [71]:
np.shape(ORFeome_df)

(12692, 6)

## [3] Remove ORF seqs that *do not* have a corresponding Entrez gene ID

In [79]:
ORFeome_df = ORFeome_df[ORFeome_df.Entrez_gene_ID != '']
ORFeome_df.reset_index(inplace = True, drop = True)

In [80]:
ORFeome_df.head()

Unnamed: 0,ORF_ID,ORF_size,template_acc,Entrez_gene_ID,match,SEQ
0,403,159,"BC009508.2, BC010491.1",112597,PERFECT_MATCH,"(A, T, G, A, C, A, G, A, C, A, C, C, G, A, A, ..."
1,228,135,BC016025.1,9168,PERFECT_MATCH,"(A, T, G, G, C, A, G, A, C, A, A, A, C, C, A, ..."
2,52615,156,"BC031105.1, BC107049.2",222659,PERFECT_MATCH,"(A, T, G, C, A, G, C, T, G, A, G, A, C, A, C, ..."
3,5673,159,BC005269.1,5350,PERFECT_MATCH,"(A, T, G, G, A, G, A, A, A, G, T, C, C, A, A, ..."
4,5236,171,BC007251.1,84837,PERFECT_MATCH,"(A, T, G, C, A, G, G, C, C, C, C, T, G, A, T, ..."


In [81]:
np.shape(ORFeome_df) # 109 ORF seqs did not have a corresponding Entrez gene ID

(12583, 6)

Convert **Entrez_gene_ID** column to integer type

In [83]:
ORFeome_df.loc[:,'Entrez_gene_ID'] = ORFeome_df.Entrez_gene_ID.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value


save a pickled DataFrame for downstream analysis

In [85]:
ORFeome_df.to_pickle(f'{BioPlex_dir}/data/pickled_files/ORFeome_seqs_df.pkl')