# Scripts for sorting transcript and protein `.fasta` files and expression-level `.csv` files

In [None]:
import pandas as pd
from Bio import SeqIO
import re

## Select transcripts with a significant level of expression
From the averaged expression table, select the identifiers of sequences whose expression level is at least 2 TPM in at least one sample group (`healthy_fem`, `infected_fem`, `healthy_male` or `infected_male`).

In [None]:
# Keep only the rows whose value reaches 2 in at least one of the four
# expression columns, then save them for the later merge step.
expression_columns = ['healthy_fem', 'infected_fem', 'healthy_male', 'infected_male']
data = pd.read_csv('mean_expression_data.csv', sep=',')
reaches_threshold = (data[expression_columns] >= 2).any(axis=1)
data = data[reaches_threshold]
data.to_csv('overexpressed.csv', index=False)

## Select long protein products
Select the identifiers of sequences that encode proteins at least 100 amino acids long.

In [None]:
# Build a table with one row per protein of at least 100 residues:
#   Name      - the protein id with a trailing ".p<digits>" suffix removed
#   Prot_name - the full protein id as it appears in the FASTA file
#   Length    - the protein sequence length
# Rows are gathered in a list and turned into a DataFrame once, instead of
# growing the DataFrame row by row through the private `_append` method.
protein_suffix = re.compile(r'\.p\d+$')
rows = []
for record in SeqIO.parse("cleaned_assembly_proteins.fasta", "fasta"):
    seq_len = len(record.seq)
    if seq_len < 100:
        continue
    rows.append({
        'Name': protein_suffix.split(record.id)[0],
        'Prot_name': record.id,
        'Length': seq_len,
    })

# Passing `columns` keeps the header in the CSV even when no protein qualifies.
df = pd.DataFrame(rows, columns=['Name', 'Prot_name', 'Length'])
df.to_csv('fasta_prot_id.csv', index=False)

## Merge identifiers that encode long protein products with transcripts that have meaningful expression levels

In [None]:
def get_prot_num(x: str) -> str:
    """Return the digits of a trailing ``.p<digits>`` suffix of *x*.

    The digits are returned as a string, not an int. When *x* does not end
    with such a suffix, *x* itself is returned unchanged.
    """
    # Raw string: "\." and "\d" are regex escapes, not Python string escapes.
    match = re.search(r'\.p(\d+)$', x)
    if match is None:
        return x
    return match.group(1)

# Choose one protein per transcript name (the longest; ties go to the lowest
# protein number) and join the result with the expression-filtered table.
data1 = pd.read_csv('fasta_prot_id.csv', delimiter=',')
# Series.map also works on an empty table, unlike a row-wise apply(axis=1).
data1['Prot_num'] = data1['Prot_name'].map(get_prot_num)
names_list = data1.Name.unique()
print(data1.head())

# get_prot_num returns strings, so sorting on Prot_num directly would place
# "10" before "2". Sort on a temporary numeric copy instead; ids without a
# numeric suffix become NaN and are ordered last.
data1['_prot_order'] = pd.to_numeric(data1['Prot_num'], errors='coerce')
data1 = data1.sort_values(
    by=['Name', 'Length', '_prot_order'],
    ascending=[True, False, True],
    kind='stable',
)
data1 = data1.drop(columns='_prot_order').reset_index(drop=True)
print(len(data1.Name.unique()))
# After the sort, the first row of each Name is the preferred protein.
data1 = data1.drop_duplicates(subset=['Name'])

data2 = pd.read_csv('overexpressed.csv', delimiter=',')

# Inner join on the columns the two tables share.
data = pd.merge(data1, data2)
data.to_csv('merged.csv', index=False)

## Make new expression tables

In [None]:
# Assign each row of merged.csv a sequential "Pmin_ref_seq_<n>" name (n starts
# at 1) and rewrite both expression tables so they use the new names.
data = pd.read_csv('merged.csv', delimiter=',')
ids = data.Name
new_names = pd.DataFrame(
    {'New_name': [f'Pmin_ref_seq_{number}' for number in range(1, len(ids) + 1)]}
)
new_names_dict = pd.concat([ids.to_frame(name='Name'), new_names], axis=1)
print(new_names_dict.head(10))
print(len(new_names_dict))

table1 = pd.read_csv('expression_data.csv', delimiter=',')
table2 = pd.read_csv('mean_expression_data.csv', delimiter=',')

# Left-join each table onto the name mapping, drop the old name column and
# let the new name take its place.
new_table1 = (
    pd.merge(new_names_dict, table1, how='left')
    .drop(columns='Name')
    .rename(columns={'New_name': 'Name'})
)
new_table1.to_csv('new_expression_data.csv', index=False)

new_table2 = (
    pd.merge(new_names_dict, table2, how='left')
    .drop(columns='Name')
    .rename(columns={'New_name': 'Name'})
)
new_table2.to_csv('new_mean_expression_data.csv', index=False)

## Make new `.fasta` protein file

In [None]:
# Write the proteins listed in merged.csv to a new FASTA file, replacing each
# record id with its sequential "Pmin_ref_seq_<n>" name (n starts at 1).
data = pd.read_csv('merged.csv', delimiter=',')
ids = data.Name
prot_ids = data.Prot_name
new_names = pd.DataFrame({'New_name': [f'Pmin_ref_seq_{i}' for i in range(1, len(data) + 1)]})
new_names_dict = pd.concat([pd.DataFrame({'Name': ids}), pd.DataFrame({'Prot_name': prot_ids}), new_names], axis=1)

# Build the old-id -> new-id lookup once so each FASTA record costs one dict
# access instead of a scan of the whole table. setdefault keeps the first
# mapping if a Prot_name were ever repeated.
new_id_by_prot_name = {}
for prot_name, new_name in zip(new_names_dict['Prot_name'], new_names_dict['New_name']):
    new_id_by_prot_name.setdefault(prot_name, new_name)

records = []
for record in SeqIO.parse("cleaned_assembly_proteins.fasta", "fasta"):
    new_id = new_id_by_prot_name.get(record.id)
    if new_id is None:
        continue
    # Remove the old id from the description and drop the character left at
    # the front. NOTE(review): str.replace removes every occurrence of the
    # id, not only the leading one - confirm that this is intended.
    record.description = record.description.replace(record.id, '')[1::]
    record.id = new_id
    records.append(record)
SeqIO.write(records, "new_assembly_proteins.fasta", "fasta")

## Make new `.fasta` transcripts file

In [None]:
# Write the transcripts listed in merged.csv to a new FASTA file, replacing
# each record id with its sequential "Pmin_ref_seq_<n>" name (n starts at 1).
data = pd.read_csv('merged.csv', delimiter=',')
ids = data.Name
new_names = pd.DataFrame({'New_name': [f'Pmin_ref_seq_{i}' for i in range(1, len(data) + 1)]})
new_names_dict = pd.concat([pd.DataFrame({'Name': ids}), new_names], axis=1)

# Build the old-id -> new-id lookup once so each FASTA record costs one dict
# access instead of a scan of the whole table. setdefault keeps the first
# mapping if a Name were ever repeated.
new_id_by_name = {}
for old_name, new_name in zip(new_names_dict['Name'], new_names_dict['New_name']):
    new_id_by_name.setdefault(old_name, new_name)

records = []
for record in SeqIO.parse("cleaned_assembly.fasta", "fasta"):
    new_id = new_id_by_name.get(record.id)
    if new_id is None:
        continue
    # Remove the old id from the description and drop the character left at
    # the front. NOTE(review): str.replace removes every occurrence of the
    # id, not only the leading one - confirm that this is intended.
    record.description = record.description.replace(record.id, '')[1::]
    record.id = new_id
    records.append(record)
SeqIO.write(records, "new_assembly.fasta", "fasta")