In [0]:
# Mount Google Drive into the Colab filesystem at /content/gdrive/.
# This only works inside a Google Colab runtime (google.colab is not a general package).
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
import pickle

In [0]:
# Load the DataFrame stored in "varianceDF.pickle" (path is relative to the
# current working directory).
# NOTE(review): pickle.load can execute arbitrary code; only load files you
# produced yourself or otherwise trust.
# A context manager is used so the file handle is closed deterministically.
with open("varianceDF.pickle", "rb") as varianceFile:
    df = pickle.load(varianceFile)

In [0]:

# Sanity check on the loaded frame. Per the original author, 2507 is the
# expected size, meaning no non-equally-variant complexes have been eliminated.
print(df.shape)
df.head()  # preview the first rows

In [0]:
# Split the sample columns into healthy and cancer groups, by column POSITION.
# A column whose name contains "_NM" counts as healthy; every other column counts
# as cancer. The last 6 columns of df are skipped
# (presumably non-sample columns -- TODO confirm).

healthyColumnIndices = []
cancerColumnIndices = []
sampleColumnCount = len(df.columns) - 6
for columnIndex in range(sampleColumnCount):
    columnName = df.columns[columnIndex]
    print(columnIndex)
    print(columnName)
    # Pick the destination list first, then append the position to it.
    destination = healthyColumnIndices if "_NM" in columnName else cancerColumnIndices
    destination.append(columnIndex)

In [0]:
def _meanOfNonZero(values):
    """Return the mean of the entries that are neither NaN nor numerically zero.

    Entries for which np.isclose(entry, 0) holds are treated as missing and
    dropped, exactly like NaN entries. If nothing remains, NaN is returned
    (without emitting numpy's "Mean of empty slice" warning).

    Parameters
    ----------
    values : array-like of numbers
        One row's values for a group of sample columns.

    Returns
    -------
    numpy.float64
        Mean of the retained entries, or NaN when every entry was dropped.
    """
    numeric = np.asarray(values, dtype=float)
    retained = numeric[~(np.isnan(numeric) | np.isclose(numeric, 0))]
    if retained.size == 0:
        return np.float64(np.nan)
    return retained.mean()


print(cancerColumnIndices)
print(healthyColumnIndices)
print(df.columns[healthyColumnIndices])
print(df.columns[cancerColumnIndices])

# One entry per row of df, in row order.
healthyMean = []
cancerMean = []
cancerDivByHealthy = []
for index, row in df.iterrows():
    # The index lists hold column POSITIONS, so select with .iloc; plain
    # row[[...ints...]] relies on a deprecated positional fallback in pandas.
    rowHealthyMean = _meanOfNonZero(row.iloc[healthyColumnIndices])
    rowCancerMean = _meanOfNonZero(row.iloc[cancerColumnIndices])

    healthyMean.append(rowHealthyMean)
    cancerMean.append(rowCancerMean)
    # Ratio of cancer mean to healthy mean; NaN if either group had no usable values.
    cancerDivByHealthy.append(rowCancerMean / rowHealthyMean)
  

In [0]:
# Show the three per-row result lists, then attach each one to df as a new column.
for computedValues in (healthyMean, cancerMean, cancerDivByHealthy):
    print(computedValues)

newColumns = {
    "abundance_healthy_mean": healthyMean,
    "abundance_cancer_mean": cancerMean,
    "abundance_cancer_div_by_healthy": cancerDivByHealthy,
}
# dict preserves insertion order, so the columns are added in the order listed above.
for newColumnName, newColumnValues in newColumns.items():
    df[newColumnName] = newColumnValues

[677440000.0, 145313333.33333334, 174042777.7777778, 93899294.11764705, 211230000.0, 52258090.90909091, 1278043500.0, 6371700000.0, 227502000.0, 141622125.0, 15675728571.428572, 40497433333.333336, 112191619047.61905, 12228401636.363636, 29702013500.0, 1439933333333.3333, 16285233809.52381, 14714838095.238094, 13787752380.952381, 659436000.0, 1919876666.6666667, 6668200000.0, 7275019047.619047, 8985827647.058823, 268650500.0, 766820000.0, 811188571.4285715, 168603333.33333334, 1665895789.4736843, 118776000.0, 4659599047.619047, 237011333.33333334, 317640000.0, 53576500.0, 254378750.0, 139055666.66666666, 159978000.0, 286740000.0, 615681578.9473684, 245899090.9090909, 257485000.0, 507088461.53846157, 132185333.33333333, 229266666.66666666, 132501375.0, 3302845000.0, 349516000.0, 220073800.0, 533135769.2307692, 102645666.66666667, 741795047.6190476, 611913238.0952381, 359763846.15384614, 76265333.33333333, 614804705.882353, 347092000.0, 151823833.33333334, 657750000.0, 177888600.0, 30045

### Store the ACC coefficients

In [0]:
# Build a mapping from each df index label to its cancer/healthy abundance ratio
# and persist it to "proteinToACC.pickle".
proteins = list(df.index)
ACC = list(df["abundance_cancer_div_by_healthy"])

# If the index contains duplicate labels, later rows overwrite earlier ones here.
proteinToACC = dict(zip(proteins, ACC))
print(proteinToACC)

# Use a context manager so the pickle is flushed and the handle closed;
# the original left the file open, which can leave a truncated file on disk.
with open("proteinToACC.pickle", "wb") as outFile:
    pickle.dump(proteinToACC, outFile)

{'A0AVT1': 2.3997600098202523, 'A0M8Q6': 0.861070330779465, 'A0MZ66': 1.6134141418146342, 'A1L4H1': 0.4052226947768824, 'A1X283': 0.93299963789951, 'A2RRP1': 1.23375153738699, 'A4UGR9': 0.2397990887373291, 'A5YKK6': 0.37552689813426643, 'A6NDG6': 0.6853293338856232, 'A6NHR9': 1.2371307172596167, 'A6NMB1': 0.33827244024795544, 'B9A064;P0CG04': 0.4087595631466703, 'CON__P00761': 0.2596173848287132, 'CON__P02533;P02533;CON__Q9D312;CON__O76015;CON__O76014;O76014;O76015': 0.04372474674988525, 'CON__P02538;P02538': 0.008951184684275136, 'CON__P02768-1;P02768': 0.4309382622489833, 'CON__P05787;P05787;CON__H-INV:HIT000292931;CON__Q9H552;CON__H-INV:HIT000016045;CON__REFSEQ:XP_092267': 2.8030833916769806, 'CON__P07477;Q8NHM4;P07478;P07477': 0.2483463054111535, 'CON__P08727': 2.2607459433503987, 'CON__P08729;CON__Q9DCV7': 0.9994717410921206, 'CON__P08779;P08779': 0.04186857489109543, 'CON__P19012': 0.047907552691452394, 'CON__Q3KNV1;P08729': 2.6300421282346162, 'CON__Q3SX28': 0.18470666113493478,

Load a pickle file that maps each complex to the proteins it contains.

In [0]:
# Load the object stored in "BrentsComplexToProteins.pickle"; later cells iterate
# over it as a mapping from complex to a collection of proteins.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# A context manager is used so the file handle is closed deterministically.
with open("BrentsComplexToProteins.pickle", "rb") as complexFile:
    complex_to_protein = pickle.load(complexFile)

In [0]:
# For every complex, collect the cancer/healthy ratios of its member proteins that
# appear in df's index, then record:
#   complexToSize[complex] -> how many member proteins were found in df
#   rangeACC[complex]      -> max - min of those ratios (0 when fewer than two found)
complexToSize = {}
rangeACC = {}

# Hoisted out of the loops: the ratio column and the index used for membership tests.
accByProtein = df["abundance_cancer_div_by_healthy"]
knownProteins = df.index

for complexe in complex_to_protein:
    coefficients = []
    for protein in complex_to_protein[complexe]:
        # Hash-based index membership instead of comparing against every label
        # ((df.index == protein).any() scans the whole index for each protein).
        if protein in knownProteins:
            newCoefficient = accByProtein.loc[protein]
            coefficients.append(newCoefficient)

    complexToSize[complexe] = len(coefficients)

    # NOTE(review): ratios can be NaN, and the built-in max/min give
    # order-dependent results when NaN is present. Left unchanged to keep
    # existing numbers; consider np.nanmax/np.nanmin after confirming intent.
    if len(coefficients) <= 1:
        range1 = 0
    else:
        range1 = max(coefficients) - min(coefficients)

    rangeACC[complexe] = range1
 


In [0]:
# Summary of the per-complex ranges: NaN-aware standard deviation and mean,
# the built-in maximum, and finally the raw list of values.
rangeValues = list(rangeACC.values())
print(np.nanstd(rangeValues))
print(np.nanmean(rangeValues))
print(max(rangeValues))
print(rangeValues)

0.7959297329428595
0.33602139208226506
5.419222273484943
[0, 0, 0, 0, 0, 0, 0, 0.0660681802923555, 0, 0.47173372667562075, 0.8168214731362058, 0.8168214731362058, 0.2502208761855904, 0.2502208761855904, 0, 0.3132258852076417, 0.2502208761855904, 0.8168214731362058, 0.8168214731362058, 0.8168214731362058, 0.9261410678752803, 0.8168214731362058, 1.0649431939196345, 0.05647883252103947, 0.29952041724199097, 0.29952041724199097, 1.4406464424586471, 0, 0.2699912836849647, 1.2532539012599813, 0.6898799840990083, 0.6898799840990083, 0.47173372667562075, 0, 0.47173372667562075, 0.8168214731362058, 0.8168214731362058, 1.0649431939196345, 0.11946411668390278, 0, 0, 0.621774653586338, 0.621774653586338, 0.621774653586338, 0.15933559035700295, 0.21351245116392525, 0, 0, 0, 0, 0, 0, 0.959871267783973, 0.959871267783973, 0.4396591079134179, 0, 0.406155804993392, 0.406155804993392, 0, 0, 1.0207230484984755, 0.2700857642480252, 0.8168214731362058, 0.8168214731362058, 1.1724538021223454, 0, 0.270085764

In [0]:
# Persist both per-complex dictionaries. Context managers guarantee the pickles
# are flushed and the handles closed (the original never closed either file).
with open("rangeACC.pickle", "wb") as acv:
    pickle.dump(rangeACC, acv)

with open("complexToSize.pickle", "wb") as cts:
    pickle.dump(complexToSize, cts)

# Total number of complexes.
print(len(rangeACC))

# Number of complexes whose range is not exactly 0. NaN compares unequal to 0,
# so NaN ranges are counted here as well.
nonZero = [entry for entry in rangeACC.values() if entry != 0]
print(len(nonZero))

3698
1256
