# Assigning functional categories by gene ontological assignment
### To group proteins based on function at various levels of coarseness, the gene ontology identifiers assigned to each protein in the proteome metadata were traced back through their parent terms to the desired categories.
### The gene ontology library was acquired from http://geneontology.org/docs/download-ontology/ and was navigated using the python library pronto. The gene ontology library used is included in the Initial files directory. 
### The gene ontology terms belonging to each group are detailed in the GO_grouping_specific.csv and GO_grouping_general.csv files. Several proteins with known function were not assigned sufficiently specific gene ontology identifiers and had to be assigned manually. This manual assignment is detailed in manual_GO_assignment.csv.

In [1]:
import numpy as np
import pandas as pd

import sys
import os
import math
import re

import pronto

### Load the gene ontology database

In [2]:
# Parse the gene ontology file into `ont`. Later cells look terms up with
# ont[<key>] (e.g. ont[term[0:10]] when building the category dictionary).
ont = pronto.Ontology('../Initial files/go-plus.owl')

### Construct multiple dictionaries to group proteins by their gene ontology

In [3]:
## Build the lookup tables for the specific grouping:
##   sub_cate  - every "<column name>-<row position>" label, column by column
##   ranks     - label -> position of that label in sub_cate
##   cate_dict - ontology term -> label

go_complex = pd.read_csv('../Initial files/GO_grouping_specific.csv')
categories = go_complex.columns.tolist()

# One label per non-empty cell; the label's place in this list is its rank.
sub_cate = []
for category in categories:
    terms = go_complex[category].dropna().tolist()
    sub_cate.extend(category + '-' + str(position) for position in range(len(terms)))
ranks = {label: rank for rank, label in enumerate(sub_cate)}

cate_dict = {}
for category in categories:
    terms = go_complex[category].dropna().tolist()
    # Only the first 10 characters of each cell are used as the ontology key.
    go_terms = [ont[term[0:10]] for term in terms]
    for position, go_term in enumerate(go_terms):
        cate_dict[go_term] = category + '-' + str(position)

## Load the manual assignments: every non-empty cell of the table becomes a
## key of all_manual, mapped to the header of the column it was found in.
manual = pd.read_csv('../Initial files/manual_GO_assignment.csv')

all_manual = {}
for col in manual.columns.tolist():
    prots = manual[col].dropna().tolist()
    all_manual.update(dict.fromkeys(prots, col))

# Key view used for the membership tests on row['Protein_names'].
manual_prots = all_manual.keys()


## General grouping: every non-empty cell of the table becomes a key of
## all_general, mapped to the header of the column it was found in.
general = pd.read_csv('../Initial files/GO_grouping_general.csv')

all_general = {}
for gen in general.columns.tolist():
    names = general[gen].dropna().tolist()
    all_general.update(dict.fromkeys(names, gen))


### Assign each protein to their specific and general functional groups

In [4]:
def split_list(row):
    """Split a row's GO annotation string into a list of identifiers.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide the key 'Gene_ontology_IDs'.

    Returns
    -------
    list of str
        The value split on '; '. An empty list is returned when the value is
        missing (NaN or None). A list is always returned, including for
        numeric non-NaN values, which are converted to their string form.
    """
    GOs = row['Gene_ontology_IDs']
    if GOs is None:
        return []
    try:
        if math.isnan(float(GOs)):
            return []
    except (TypeError, ValueError, OverflowError):
        # Not interpretable as a float: an ordinary annotation string.
        pass
    return str(GOs).split('; ')

def find_all_parents(row):
    """Collect, for every GO id in row['GO_list'], the result of rparents().

    Returns a list with one entry per GO id (the value of
    ont[go].rparents(); presumably the recursive ancestors in the pronto
    API - confirm against the installed pronto version), followed by one
    final entry holding the looked-up terms themselves. Returns [] when the
    list is exactly ['nan'].
    """
    go_ids = row['GO_list']
    if go_ids == ['nan']:
        return []

    lineages = []
    for go_id in go_ids:
        lineages.append(ont[go_id].rparents())

    # The annotated terms are added as a last group so they can be
    # classified alongside their parents.
    own_terms = [ont[go_id] for go_id in go_ids]
    lineages.append(own_terms)
    return lineages

def classify_parents(row):
    """Pick out the terms in row['all_parents'] that have a category label.

    Every term found in cate_dict is paired with its label as
    [term, label]; the pairs are returned ordered by the label's rank in
    `ranks` (ties keep their order of appearance). Returns [] when
    row['all_parents'] is an empty list.
    """
    lineages = row['all_parents']
    if lineages == []:
        return []

    labelled = [
        [term, cate_dict[term]]
        for lineage in lineages
        for term in lineage
        if term in cate_dict
    ]
    # list.sort is stable, like sorted().
    labelled.sort(key=lambda pair: ranks[pair[1]])
    return labelled

def top_GO(row):
    """Choose the GO term that represents this row.

    Returns the string 'Not_listed' when row['classified_parents'] is empty.
    Otherwise, a row whose 'Protein_names' value is in manual_prots gets the
    ontology term looked up from the first 10 characters of its manual
    assignment; any other row gets the term of its first classified pair.
    """
    ranked = row['classified_parents']
    if ranked == []:
        return 'Not_listed'

    protein = row['Protein_names']
    if protein in manual_prots:
        manual_label = all_manual[protein]
        return ont[manual_label[0:10]]

    return ranked[0][0]

def top_class(row):
    """Choose the category label that represents this row.

    Returns the string 'Not listed' when row['classified_parents'] is empty.
    Otherwise, a row whose 'Protein_names' value is in manual_prots gets the
    label that cate_dict holds for row['top_GO']; any other row gets the
    label of its first classified pair.
    """
    ranked = row['classified_parents']
    if ranked == []:
        return 'Not listed'

    if row['Protein_names'] in manual_prots:
        return cate_dict[row['top_GO']]

    return ranked[0][1]

def convert_GO(row):
    """Return row['top_GO'] as text, without an enclosing '<' ... '>' pair.

    The value is converted with str(). When the text is wrapped in angle
    brackets, the brackets are removed. Any other text, such as the
    'Not_listed' placeholder produced by top_GO, is returned unchanged
    rather than losing its first and last characters.
    """
    text = str(row['top_GO'])
    if text.startswith('<') and text.endswith('>'):
        return text[1:-1]
    return text

def extract_category(row):
    """Return the category part of row['top_class'].

    Labels are built as '<category>-<index>', so only the trailing
    '-<digits>' suffix is removed. This keeps category names that themselves
    contain a hyphen intact. A label without such a suffix (for example
    'Not listed') falls back to the text before its first hyphen, which is
    the whole label when it has no hyphen.
    """
    label = row['top_class']
    head, sep, tail = label.rpartition('-')
    if sep and tail.isdigit():
        return head
    return label.split('-')[0]

def general_category(row):
    """Look up row['category'] in all_general and return the mapped value.

    Raises KeyError when the category has no entry in all_general.
    """
    specific = row['category']
    return all_general[specific]

## Load the labelled proteins and derive the GO-based columns.
proteins = pd.read_csv('../Intermediate data/labeled_IRS_proteins_exp.csv')

# Order matters: each function reads a column written by an earlier step
# (GO_list -> all_parents -> classified_parents -> top_GO -> top_class /
# top_GO_str -> category -> general_category).
derived_columns = [
    ('GO_list', split_list),
    ('all_parents', find_all_parents),
    ('classified_parents', classify_parents),
    ('top_GO', top_GO),
    ('top_class', top_class),
    ('top_GO_str', convert_GO),
    ('category', extract_category),
    ('general_category', general_category),
]
for column_name, row_function in derived_columns:
    proteins[column_name] = proteins.apply(row_function, axis=1)

proteins = proteins.sort_values(by=['top_class', 'top_GO_str'])


In [5]:
# Save the annotated, sorted table. No index argument is passed, so pandas'
# default applies and the row index is written as the first column.
proteins.to_csv('../Intermediate data/labeled_IRS_proteins_GO.csv')