In [1]:
import pandas as pd
import numpy as np

import os
import itertools
import statistics

# Record the library versions this notebook was run with (one per line).
print(pd.__version__, np.__version__, sep="\n")

1.3.3
1.20.3


In [2]:
def read_fasta(fastafile):
    """
    Input: FASTA file of aligned Olfr (gaps OK)
           entries must start with '>'; the sequence follows on the next
           line(s) and may be wrapped over several lines
           no other lines allowed (anything before the first '>' is ignored)
    Output: dictionary with keys equal identity of entry (header text after '>')
            values are aa sequence associated with entry
            an empty file gives an empty dictionary
    """
    fasta_dict = {}
    header = None       # stays None until the first '>' line has been seen
    seq_parts = []      # sequence lines of the entry currently being read
    with open(fastafile, 'r') as f:
        for line in f:
            line = line.strip('\n')
            # Only a line that *starts* with '>' opens a new entry; a '>' that
            # appears later in a line is ordinary text.
            if line.startswith('>'):
                sequence = ''.join(seq_parts)
                # As before, an entry without any sequence is dropped unless
                # it is the last entry in the file.
                if header is not None and sequence != '':
                    fasta_dict[header] = sequence
                header = line[1:]
                seq_parts = []
            elif header is not None:
                seq_parts.append(line)
    # Flush the final entry, but only if the file contained one at all.
    if header is not None:
        fasta_dict[header] = ''.join(seq_parts)
    return fasta_dict

def parse_fasta(fasta_dict):
    """
    Input: FASTA dict with key:value = gene:sequence
    Output: nested dict with key:key:value = gene:position:amino acid
            positions are 0-based indices into the sequence string
    """
    return {
        gene: dict(enumerate(sequence))
        for gene, sequence in fasta_dict.items()
    }

def fasta_prop_calc(fasta_df, grantham_dict):
    """
    Input: fasta_df = DataFrame with one row per gene and one column per
                      alignment position; cells are single amino-acid letters
                      or '-' for a gap. Column labels must support `label + 1`
                      (they are used to build the 1-offset output names).
           grantham_dict = dict with key:key:value = amino acid:property:value
                      for the properties 'c', 'p' and 'v'
                      (presumably Grantham composition / polarity / volume -
                      confirm against where the dict is built)
    Output: DataFrame indexed like fasta_df with columns
            c_<n>, p_<n>, v_<n> (n = position label + 1) in position order;
            gap cells stay '-' in all three property columns
    Raises: KeyError for a residue that is missing from grantham_dict
    """
    property_keys = ('c', 'p', 'v')
    property_columns = {}
    for position in fasta_df.columns:
        # Work on the plain values so duplicated gene labels cannot turn a
        # single-cell lookup into a Series.
        residues = fasta_df[position].tolist()
        for key in property_keys:
            property_columns[key + '_' + str(position + 1)] = [
                '-' if aa == '-' else grantham_dict[aa][key]
                for aa in residues
            ]
    # Build the frame once instead of concatenating three columns per position.
    return pd.DataFrame(property_columns, index=fasta_df.index)

def impute_colmeans(grantham_df):
    """
    Input: DataFrame whose cells are numbers or non-numeric placeholders
           (e.g. '-')
    Output: new DataFrame with the same columns in which every column is
            numeric; non-numeric cells become NaN and are then replaced by
            the mean of the remaining values of that column
    """
    imputed = pd.DataFrame()
    for label in grantham_df.columns:
        as_numbers = pd.to_numeric(grantham_df[label], errors ='coerce')
        imputed[label] = as_numbers.fillna(as_numbers.mean())
    return imputed

def feature_distances(input_vector):
    """
    Input: 1-D sequence of numbers (list, array or Series)
    Output: pandas Series with the Euclidean distance between every pair of
            elements, in scipy's condensed pdist order
            (0-1, 0-2, ..., 1-2, ...)
    """
    # pdist was never imported at notebook level; import it here so the
    # function works on a fresh kernel.
    from scipy.spatial.distance import pdist

    # pdist expects a 2-D (observations x features) array: one feature per row.
    modified_vector = np.array(input_vector).reshape(-1,1)
    vector_distances = pdist(modified_vector, 'euclidean')
    return pd.Series(vector_distances)

def normalize(x):
    """
    Input: numeric vector exposing .mean() and .std() (e.g. a pandas Series)
    Output: x centred on its mean and divided by the standard deviation of
            the centred values (z-score)
    """
    centered = x - x.mean()
    return centered / centered.std()

def most_frequent(List):
    """
    Input: non-empty sequence that supports indexing and .count()
    Output: the element that occurs most often; on a tie the element that
            appears first in the sequence wins
    Raises: IndexError when the sequence is empty
    """
    # Indexing first keeps the IndexError on empty input.
    first_item = List[0]
    # max() returns the first element reaching the highest count.
    return max(List, key=List.count, default=first_item)

In [3]:
# Read the aligned receptor sequences into a gene x position table of residues.
fasta_60p_aaIdentity = pd.DataFrame.from_dict(
    parse_fasta(read_fasta("./mouseOR_alignment/mouseOR_60p_aaIdentity.fasta")),
    orient = 'index',
)
# Relabel the alignment positions so they run from 1 instead of 0.
fasta_60p_aaIdentity.columns = range(1, fasta_60p_aaIdentity.shape[1]+1, 1)
# One indicator column per (position, residue) combination.
fasta_60p_aaIdentity_dummies = pd.get_dummies(fasta_60p_aaIdentity)

In [4]:
# Directory holding the per-odor pS6 differential-expression CSV files.
ps6_dir = "./olfr_de/"

# Lookup keyed by the first CSV column (used as file name by later cells);
# each value is a dict of the remaining columns, including 'cid'.
filename_cid = pd.read_csv("./cid_info/filename_cid.csv", index_col = 0).to_dict(orient='index')

# Store every cid as a string.
for filename, cid_record in filename_cid.items():
    cid_record['cid'] = str(cid_record['cid'])

In [11]:
# File names are split on '_' : field 2 is the concentration and field 3
# (up to the first '.') is the odor name, e.g. pS6_DE_<conc>_<odor>.csv.

# How many files are listed in filename_cid for each odor.
odor_test_count = {}
for base_name in filename_cid:
    odor_name = base_name.split('_')[3].split('.')[0]
    odor_test_count[odor_name] = odor_test_count.get(odor_name, 0) + 1

# odor -> concentration -> DE table, only for odors listed more than once.
multi_conc_tested = {}
for ps6ip_file in os.listdir(ps6_dir):
    name_fields = ps6ip_file.split('_')
    conc = name_fields[2]
    odor_name = name_fields[3].split('.')[0]
    if odor_test_count[odor_name] <= 1:
        continue
    df = pd.read_csv(os.path.join(ps6_dir, ps6ip_file), index_col = 0)
    multi_conc_tested.setdefault(odor_name, {})[conc] = df

# odor -> concentration -> number of significantly enriched receptors
# (logFC > 0 and FDR < 0.05); concentrations with fewer than 8 are left out.
multi_conc_activation = {}
for odor, df_by_conc in multi_conc_tested.items():
    activation = {}
    for conc, df in df_by_conc.items():
        is_significant = (df.logFC > 0) & (df.FDR < 0.05)
        sig_or_count = df[is_significant].shape[0]
        if sig_or_count >= 8:
            activation[conc] = sig_or_count
    multi_conc_activation[odor] = activation

In [45]:
#tested_resp = {}
#sigOR_dict = {}
#
#for odor in odor_test_count:
#    #Pick out concentration for odors tested at multiple concentrations
#    if odor_test_count[odor] > 1:
#        fewest_or_conc = min(multi_conc_activation[odor], key=multi_conc_activation[odor].get)
#        filename = "pS6_DE_"+fewest_or_conc+"_"+odor+".csv"
#    else:
#    #Rest which are tested at a single concentration
#        for base_file in os.listdir(ps6_dir):
#            odor_name = base_file.split('_')[3].split('.')[0]
#            if odor == odor_name:
#                filename = base_file
#    file_path = os.path.join(ps6_dir, filename)
#    cid = str(filename_cid[filename]['cid'])
#    df = pd.read_csv(file_path, index_col = 0)
#    df = df.loc[:,['symbol','logFC','FDR']]
#    df = df.sort_values(by=['symbol'])
#    df = df.reset_index(drop=True)
#    #Set criteria for determining significant response
#    sigOR_count = df[(df.logFC > 0) & (df.FDR < 0.05)].shape[0]
#    if sigOR_count == 0:
#        continue
#    sigOR_dict[odor] = sigOR_count
#    #Set criteria for determining non-significant response
#    df.columns = df.columns+"_"+cid
#    tested_resp[cid] = df
##Determine mean & stdev of number of responsive receptors per odor
#resp_dist = []
#for odor in sigOR_dict:
#    resp_dist.append(sigOR_dict[odor])
#resp_cutoff = statistics.mean(resp_dist)+(5*statistics.stdev(resp_dist))
##Initialize response df        
#response_df = pd.DataFrame()
##Compile dictionary into logFC df
#for cid in tested_resp:
#    df = tested_resp[cid]
#    if df[(df.iloc[:,1]>0) & (df.iloc[:,2]<0.05)].shape[0] > resp_cutoff:
#        continue
#    df[cid] = 0
#    df.iloc[(df.iloc[:,1]>0) & (df.iloc[:,2]<0.05),3] = 1
#    response_df = pd.concat([response_df, df.iloc[:,3]], axis=1)
#response_df.index = df.iloc[:,0].values
#response_df = response_df.loc[-(response_df.var(axis = 1) == 0),:]
#response_df = response_df.loc[:,-(response_df.sum(axis = 0) < 8)]
#response_df = response_df.transpose()
#response_df.to_csv("./mouseOR_alignment/binary_odor_response.csv")
# Load the cached binary response matrix that the commented-out code above
# wrote (it transposed the table before saving, so it is transposed back here).
response_df = pd.read_csv("./mouseOR_alignment/binary_odor_response.csv", index_col = 0).transpose()

In [47]:
#fasta_60p_aaIdentity_dummies = fasta_60p_aaIdentity_dummies.loc[fasta_60p_aaIdentity_dummies.index.isin(response_df.index),:]
#fasta_60p_aaIdentity_dummies = fasta_60p_aaIdentity_dummies.loc[:,-(fasta_60p_aaIdentity_dummies.var(axis = 0) == 0)]
#fasta_60p_aaIdentity_dummies = fasta_60p_aaIdentity_dummies.reindex(response_df.index)
#fasta_60p_aaIdentity_dummies = fasta_60p_aaIdentity_dummies.transpose()
#fasta_60p_aaIdentity_dummies.to_csv("./mouseOR_alignment/fasta_60p_aaIdentity_dummies.csv")
# Load the cached, filtered indicator matrix written by the commented-out code
# above (saved transposed, so it is transposed back after reading).
# This replaces the unfiltered fasta_60p_aaIdentity_dummies built earlier.
fasta_60p_aaIdentity_dummies = pd.read_csv("./mouseOR_alignment/fasta_60p_aaIdentity_dummies.csv", index_col = 0).transpose()

In [48]:
# Keep only the receptors that appear in the response matrix.
fasta_60p_aaIdentity = fasta_60p_aaIdentity.loc[fasta_60p_aaIdentity.index.isin(response_df.index),:]
# position -> receptor -> residue
fasta_60p_aaIdentity_dict = fasta_60p_aaIdentity.to_dict()

# For every alignment position: the most common non-gap residue ('aa') and the
# percentage of receptors carrying it ('conserv_percent'). Gaps ('-') are never
# reported as the residue but still count in the denominator.
aa_frequency = {}

for position, residue_by_receptor in fasta_60p_aaIdentity_dict.items():
    # Tally residues in receptor order so ties resolve to the first one seen.
    residue_counts = {}
    for residue in residue_by_receptor.values():
        residue_counts[residue] = residue_counts.get(residue, 0) + 1
    residue_counts.pop('-', None)

    # An all-gap position reports an empty residue with a count of 0.
    max_aa, aa_counter = max(residue_counts.items(), key=lambda item: item[1], default=('', 0))

    # Use this position's own receptor count as the denominator rather than a
    # variable left over from the last loop iteration.
    n_receptors = len(residue_by_receptor)
    aa_frequency[position] = {
        'aa': max_aa,
        'conserv_percent': aa_counter/n_receptors*100 if n_receptors else float('nan'),
    }

aa_frequency = pd.DataFrame.from_dict(aa_frequency, orient='index')
#aa_frequency.to_csv("./mouseOR_alignment/aa_frequency_conservation.csv")


# Process logistic regression data for plotting

In [6]:
# Logistic-regression results, one row per model.
log_reg_outcome = pd.read_csv("./mouseOR_alignment/log_reg_auroc.csv", index_col = 0)
# Keep the rows whose 'auc_1se' value is above 0.5.
above_chance = log_reg_outcome['auc_1se'] > 0.5
sig_log_reg = log_reg_outcome.loc[above_chance]

In [7]:
def find_common_aa(input_vector):
    """
    Input: pandas Series of residues found at one alignment position
           ('-' marks a gap)
    Output: the most frequent non-gap residue
            '-' when the Series contains nothing but gaps (or no values),
            instead of raising IndexError
    """
    # value_counts() is sorted with the most frequent value first.
    residue_counts = input_vector.value_counts()
    residue_counts = residue_counts.drop('-', errors='ignore')
    if residue_counts.empty:
        return '-'
    return residue_counts.index[0]

In [14]:
# For every odor whose cid is listed in sig_log_reg: find the most common
# residue at each alignment position among its significantly activated ORs and
# write those ORs' aligned sequences to ./fasta_files/<odor>.fasta.
MIN_SIG_ORS = 8       # odors activating fewer ORs than this are dropped
FDR_CUTOFF = 0.05     # an OR responds when logFC > 0 and FDR < FDR_CUTOFF
CUTOFF_N_STDEV = 5    # broad-responder cutoff = mean + this many stdevs

fasta_out_dir = "./fasta_files/"
os.makedirs(fasta_out_dir, exist_ok=True)

# ---- Pass 1: pick one pS6 file per odor and collect its responders ----------
ps6_files = os.listdir(ps6_dir)
selected_runs = {}   # odor -> (filename, significant responders)

for odor in odor_test_count:
    filename = None
    if odor_test_count[odor] > 1:
        #Pick out concentration for odors tested at multiple concentrations:
        #the one activating the fewest ORs among those that reached MIN_SIG_ORS.
        conc_counts = multi_conc_activation[odor]
        if conc_counts:
            fewest_or_conc = min(conc_counts, key=conc_counts.get)
            filename = "pS6_DE_"+fewest_or_conc+"_"+odor+".csv"
    else:
        #Rest which are tested at a single concentration
        for base_file in ps6_files:
            name_fields = base_file.split('_')
            if len(name_fields) > 3 and name_fields[3].split('.')[0] == odor:
                filename = base_file
    if filename is None:
        # No usable file for this odor; never fall back to the file chosen for
        # a previous odor.
        continue
    input_df = pd.read_csv(os.path.join(ps6_dir, filename), index_col = 0)
    sig_responders = input_df[(input_df.logFC > 0) & (input_df.FDR < FDR_CUTOFF)]
    selected_runs[odor] = (filename, sig_responders)

# resp_cutoff used to be computed only in code that is now commented out in
# this notebook; rebuild it here with the same formula (mean + 5 stdev of the
# responder counts of all odors with at least one responder) so this cell runs
# on a fresh kernel.
# NOTE(review): the old comment in this cell said "> 3 std devs" - confirm that
# 5 is the intended multiplier.
resp_dist = [sig.shape[0] for _, sig in selected_runs.values() if sig.shape[0] > 0]
resp_cutoff = statistics.mean(resp_dist)+(CUTOFF_N_STDEV*statistics.stdev(resp_dist))

# ---- Pass 2: summarise the odors that pass every filter ---------------------
sig_cids = sig_log_reg['cid'].values
per_odor_frames = []

for odor, (filename, sig_responders) in selected_runs.items():
    cid = filename_cid[filename]['cid']
    if int(cid) not in sig_cids:
        continue
    #Drop odors without at least MIN_SIG_ORS activated ORs or above resp_cutoff
    n_sig = sig_responders.shape[0]
    if n_sig < MIN_SIG_ORS or n_sig > resp_cutoff:
        continue
    sig_olfr_aa = fasta_60p_aaIdentity.loc[sig_responders['symbol'].values,:]
    most_common_aa = pd.DataFrame(sig_olfr_aa.apply(find_common_aa))
    most_common_aa.columns = ['text']
    most_common_aa['position'] = most_common_aa.index
    most_common_aa['cid'] = cid
    most_common_aa['odor'] = odor
    most_common_aa['filename'] = filename
    per_odor_frames.append(most_common_aa)
    # The with-block closes the file; no explicit close() is needed.
    with open(fasta_out_dir+odor+".fasta", "w") as fasta_out:
        for sig_olfr in sig_responders['symbol']:
            olfr_string = ''.join(fasta_60p_aaIdentity.loc[sig_olfr,:].values)
            fasta_out.write(f'>{sig_olfr}\n')
            fasta_out.write(f'{olfr_string}\n')

# Concatenate once instead of growing the frame inside the loop.
log_reg_out = pd.concat(per_odor_frames) if per_odor_frames else pd.DataFrame()

In [15]:
   
#log_reg_out = log_reg_out.reset_index(drop=True)
#log_reg_out.to_csv("./mouseOR_alignment/common_aa_from_logReg.csv")
# Load the saved most-common-residue table (the commented-out lines above wrote
# it after resetting the index). This rebinds log_reg_out, replacing whatever
# the previous cell built.
log_reg_out = pd.read_csv("./mouseOR_alignment/common_aa_from_logReg.csv", index_col = 0)

# pS6 data used for logistic regression compiled into a single CSV

In [96]:
#tested_resp = {}
#sigOR_dict = {}
#
#for odor in odor_test_count:
#    #Pick out concentration for odors tested at multiple concentrations
#    if odor_test_count[odor] > 1:
#        fewest_or_conc = min(multi_conc_activation[odor], key=multi_conc_activation[odor].get)
#        filename = "pS6_DE_"+fewest_or_conc+"_"+odor+".csv"
#    else:
#    #Rest which are tested at a single concentration
#        for base_file in os.listdir(ps6_dir):
#            odor_name = base_file.split('_')[3].split('.')[0]
#            if odor == odor_name:
#                filename = base_file
#    file_path = os.path.join(ps6_dir, filename)
#    cid = str(filename_cid[filename]['cid'])
#    df = pd.read_csv(file_path, index_col = 0)
#    #Set criteria for determining significant response
#    sigOR_count = df[(df.logFC > 0) & (df.FDR < 0.05)].shape[0]
#    if sigOR_count == 0:
#        continue
#    sigOR_dict[odor] = sigOR_count
#    #Set criteria for determining non-significant response
#    df['cid'] = cid
#    tested_resp[cid] = df
#    
##Determine mean & stdev of number of responsive receptors per odor
#resp_dist = []
#for odor in sigOR_dict:
#    resp_dist.append(sigOR_dict[odor])
#resp_cutoff = statistics.mean(resp_dist)+(5*statistics.stdev(resp_dist))
#
##Initialize response df        
#response_df = pd.DataFrame()
##Loop through to compile
#for cid in tested_resp:
#    df = tested_resp[cid]
#    if df[(df['logFC'] > 0) & (df['FDR'] < 0.05)].shape[0] > resp_cutoff:
#        continue
#    response_df = pd.concat([response_df, df])
#    
#response_df = response_df.reset_index(drop = True)
#response_df.to_csv("compiled_desc_resp/log_reg_compiled_ps6_data.csv")
# Load the compiled pS6 table written by the commented-out code above (the
# per-odor DE tables stacked row-wise, each with a 'cid' column added).
# NOTE: this rebinds response_df, which earlier held the binary response matrix
# read from binary_odor_response.csv - cells that need that matrix must run
# before this one.
response_df = pd.read_csv("compiled_desc_resp/log_reg_compiled_ps6_data.csv", index_col = 0)