## PPI YEAST ANALYSIS

### Load annotations

In [None]:
import matplotlib.pyplot as plt
import pickle as pkl
import pandas as pd

In [None]:
annots_file_yeast = "data/ppi/4932.annotations.tsv"

# Parse the tab-separated annotation file: the first field of every line
# becomes the key and the remaining fields its list of annotations.
# The file is opened in a `with` block so the handle is always closed, and
# blank lines are skipped explicitly (the previous `is not None` filter was
# always true and let blank lines create a bogus '' key).
annots_dict = {}
with open(annots_file_yeast, 'r') as annots_yeast:
    for line in annots_yeast:
        fields = line.strip().split('\t')
        if not fields[0]:
            continue
        annots_dict[fields[0]] = fields[1:]

In [None]:
# Number of distinct keys (first column values) read from the annotation file.
len(annots_dict)

In [None]:
# Tally how often each class occurs across all annotation lists.
goClasses = {}
for classes in annots_dict.values():
    for cls in classes:
        goClasses[cls] = goClasses.get(cls, 0) + 1

# Report the most frequent class together with its count.
max_value = max(goClasses, key=lambda name: goClasses[name])
print(max_value, goClasses[max_value])


In [None]:
#del goClasses["GO:0005575"]
# Recompute the most frequent class so that, if the deletion above is
# enabled, the printed result reflects it.
max_value = max(goClasses, key=goClasses.get)
print(max_value, goClasses[max_value])

# Bar chart with one bar per class, in dictionary insertion order.
class_names = list(goClasses)
class_counts = list(goClasses.values())
plt.figure(figsize=(12, 12))
plt.bar(class_names, class_counts, color='g')
plt.show()

### Look for proteins whose only annotation is _binding_ (GO:0005488)

In [None]:
# Occurrence count of GO:0005488 in the tally; raises KeyError if it never occurs.
goClasses["GO:0005488"]

In [None]:
# Visit every annotation list and emit one line per list of length exactly 1.
# NOTE(review): this prints the length (always 1), not the protein, and does
# not check that the single annotation is GO:0005488 — confirm the intent.
for classes in annots_dict.values():
    if len(classes) != 1:
        continue
    print(len(classes))



### Load Interactions

In [None]:
def loadInteractionsFile(file_name):
    """Load a pickled interactions table, print label statistics, return degrees.

    The pickle is expected to hold a pandas DataFrame with an ``interactions``
    column (a pair of protein identifiers per row) and a ``labels`` column.
    Rows whose label equals 1 are counted as positives; every other row is
    counted as a negative.

    Args:
        file_name: Path of the pickle file to read.

    Returns:
        A dict mapping each protein that appears in at least one positive
        pair to the number of positive pairs it takes part in. Negative
        pairs do not contribute.
    """
    # NOTE(security): pd.read_pickle can execute arbitrary code while
    # unpickling; only load files that come from a trusted source.
    data_frame_train = pd.read_pickle(file_name)

    degrees = {}
    pos = 0
    neg = 0

    for row in data_frame_train.itertuples():
        p1, p2 = row.interactions

        if row.labels == 1:
            pos += 1
            # Both endpoints of a positive pair gain one degree.
            for prot in (p1, p2):
                degrees[prot] = degrees.get(prot, 0) + 1
        else:
            neg += 1

    total = pos + neg

    # Guard the divisions so an empty table or a table without negatives
    # reports a placeholder instead of raising ZeroDivisionError.
    pos_frac = pos / total if total else 0.0
    neg_frac = neg / total if total else 0.0
    if neg:
        pos_neg_ratio = pos / neg
    elif pos:
        pos_neg_ratio = float("inf")
    else:
        pos_neg_ratio = float("nan")

    # The ':.2%' format scales the fraction by 100, so the printed number
    # really is a percentage (previously the raw fraction was shown with '%').
    print(
        f"Positives: {pos} ({pos_frac:.2%}), "
        f"Negatives: {neg} ({neg_frac:.2%}), "
        f"Total: {total}, Pos/Neg: {pos_neg_ratio}"
    )
    return degrees

In [None]:
# Locations of the pickled interaction tables, one per split.
train_interactions_file = "data/ppi/4932.train_interactions.pkl"
val_interactions_file = "data/ppi/4932.valid_interactions.pkl"
test_interactions_file = "data/ppi/4932.test_interactions.pkl"

# Each call prints the split's label statistics and returns its degree table.
degreesTrain = loadInteractionsFile(train_interactions_file)
degreesVal = loadInteractionsFile(val_interactions_file)
degreesTest = loadInteractionsFile(test_interactions_file)

# Highest-degree protein of each split, printed with its degree.
max_value_train = max(degreesTrain, key=lambda prot: degreesTrain[prot])
print(max_value_train, degreesTrain[max_value_train])

max_value_val = max(degreesVal, key=lambda prot: degreesVal[prot])
print(max_value_val, degreesVal[max_value_val])

max_value_test = max(degreesTest, key=lambda prot: degreesTest[prot])
print(max_value_test, degreesTest[max_value_test])

# Optional: inspect the annotations of those proteins.
#print(sorted(annots_dict[max_value_train]))
#print(sorted(annots_dict[max_value_val]))
#print(sorted(annots_dict[max_value_test]))

In [None]:
# Bar chart of protein degrees, one bar per protein.
# The cell used to reference `degrees`, which exists only as a local variable
# inside loadInteractionsFile and therefore raised NameError here. The
# training split is plotted instead; substitute degreesVal or degreesTest to
# inspect the other splits.
plt.figure(figsize=(12,12))
plt.bar(list(degreesTrain.keys()), list(degreesTrain.values()), color='g')
plt.show()

## Information content (IC)

In [None]:
import sys
sys.path.append("../../../")

from mowl.datasets.base import PathDataset
from org.mowl.IC import IC
import pandas as pd

#JPype
from jpype import JObject
from java.util import HashMap
from java.util import ArrayList

def getAnnotsDict(training_prots, data_file="data/ppi/4932.annotations.tsv"):
    """Build a Java map of annotations restricted to the given proteins.

    Every line of ``data_file`` is split on tabs; the first field is the
    protein identifier and the remaining fields are its annotation ids.
    Lines whose protein is not in ``training_prots`` are ignored.

    Args:
        training_prots: Collection of protein identifiers to keep. It is
            only used for membership tests.
        data_file: Path of the tab-separated annotation file. Defaults to
            the path that was previously hard-coded.

    Returns:
        A ``java.util.HashMap`` mapping each kept protein identifier to a
        ``java.util.ArrayList`` of its annotation ids. A kept protein with
        no ids on its line still gets an (empty) list.
    """
    annots_dict = HashMap()

    with open(data_file, 'r') as f:
        for line in f:
            row = line.strip('\n').split('\t')
            prot_id = row[0]

            # One membership test per line is enough; it does not depend on
            # the individual annotation ids.
            if prot_id not in training_prots:
                continue

            if prot_id not in annots_dict:
                annots_dict.put(prot_id, ArrayList())

            # The list stored in the map is mutated in place, so it does not
            # need to be put back after each addition.
            prot_annots = annots_dict[prot_id]
            for go_id in row[1:]:
                prot_annots.add(go_id)

    return annots_dict

In [None]:


# Dataset wrapper around the GO OWL file; only the first path is supplied.
ds = PathDataset("data/ppi/go.owl", None, None)

train_df = pd.read_pickle("data/ppi/4932.train_interactions.pkl")

# Collect every protein that occurs in a training pair; only these proteins'
# annotations are passed to the IC computation.
training_prots = set()
for pair_row in train_df.itertuples():
    first_prot, second_prot = pair_row.interactions
    training_prots.update((first_prot, second_prot))

annots_dict = getAnnotsDict(training_prots)
ics = IC.computeIC(ds.ontology, annots_dict)
# Re-key the result by the string form of each returned key.
ics = {str(term): value for term, value in ics.items()}

In [None]:
# Order the entries by ascending value; the dict keeps that order.
ranked_pairs = sorted(ics.items(), key=lambda term_ic: term_ic[1])
sorted_ic = dict(ranked_pairs)

In [None]:
# Materialise the sorted (key, value) pairs as a list so they can be sliced.
list_ic = list(sorted_ic.items())

In [None]:
# The ten entries with the smallest values.
list_ic[:10]

In [None]:
# Value stored for the GO_0005488 IRI; raises KeyError if that key is absent.
ics["http://purl.obolibrary.org/obo/GO_0005488"]

In [None]:
# The ten entries with the largest values.
list_ic[-10:]