In [1]:
import numpy as np
import pandas as pd

In [38]:
# Load the peptide table directly from the project repository.
# The `github.com/<user>/<repo>/blob/...` address serves the HTML viewer page, not
# the file, so the CSV has to be read from the raw-content host instead.
df = pd.read_csv('https://raw.githubusercontent.com/Zebreu/cyspresso/main/CDPs.csv')

In [39]:
# Show the loaded table (long frames are rendered as their first and last rows).
df

Unnamed: 0.1,Unnamed: 0,Uniprot,Sequence,Source Organism (common name),Expressibility,Organism Class,Is Knottin? Knoter1D,Is Knottin? Uniprot,Unnamed: 8,Unnamed: 9
0,,P01030,AKRCCQDGLTRLPMARTCEQRAARVQQPACREPFLSCCQFA,Bos taurus (Bovine),-,Mammalia,N,N,,
1,,P46162,PQSCRWNMGVCIPFLCRVGMRQIGTCFGPRVPCCRR,Bos taurus (Bovine),-,Mammalia,N,N,,
2,,P46163,PQSCRWNMGVCIPISCPGNMRQIGTCFGPRVPCCRRW,Bos taurus (Bovine),-,Mammalia,N,N,1228.0,166.0
3,,P46167,FVTCRINRGFCVPIRCPGHRRQIGTCLAPQIKCCR,Bos taurus (Bovine),-,Mammalia,N,N,,1.0
4,,P01223,GLACGQAMSFCIPTEYMMHVERKECAYCLTINTTVCAGYCMTR,Bos taurus (Bovine),-,Mammalia,N,N,,187.0
...,...,...,...,...,...,...,...,...,...,...
1244,,B6UHE2,ADLCVTRSRTFKGWCHQSENCITVCKSEGNTGGFCKLGACMCTKECVRS,Zea mays (Indian corn),+,Plants,N,N,,
1245,,P0C1Y5,GGGCGYKDVNKAPFNSMGACGNVPIFKDGLGCGSCFEIKCDKPAECSGK,Zea mays (Indian corn),-,Plants,N,N,,
1246,,B6SJ49,ARTCQSQSHRFRGPCLRRSNCANVCRTEGFPGGRCRGFRRRCFCTTHCH,Zea mays (Indian corn),-,Plants,N,N,,
1247,,B6SQK6,AQICYSRSKTFKGWCYHSTNCISVCITEGEISGFCQHGICMCTYECLTG,Zea mays (Indian corn),-,Plants,N,N,,


In [40]:
# Lookup table from the raw expressibility labels to booleans:
# '+' and '+-PR' are mapped to True, '-' to False.
express = {
    '+': True,
    '+-PR': True,
    '-': False,
}

In [41]:
# Recode the raw expressibility labels as booleans using the `express` mapping.
# `infer_objects()` makes the resulting dtype explicit: recent pandas versions no
# longer downcast the object column returned by `replace` on their own, and a
# column of Python bools stored as `object` may be rejected by scikit-learn as a
# classification target. On older pandas the call is a no-op.
df['Expressibility'] = df['Expressibility'].replace(express).infer_objects()

In [43]:
# Keep only the columns used downstream: accession, sequence, the boolean
# expressibility label and the UniProt knottin flag.
kept_columns = ['Uniprot', 'Sequence', 'Expressibility', 'Is Knottin? Uniprot']
small = df[kept_columns]
small

Unnamed: 0,Uniprot,Sequence,Expressibility,Is Knottin? Uniprot
0,P01030,AKRCCQDGLTRLPMARTCEQRAARVQQPACREPFLSCCQFA,False,N
1,P46162,PQSCRWNMGVCIPFLCRVGMRQIGTCFGPRVPCCRR,False,N
2,P46163,PQSCRWNMGVCIPISCPGNMRQIGTCFGPRVPCCRRW,False,N
3,P46167,FVTCRINRGFCVPIRCPGHRRQIGTCLAPQIKCCR,False,N
4,P01223,GLACGQAMSFCIPTEYMMHVERKECAYCLTINTTVCAGYCMTR,False,N
...,...,...,...,...
1244,B6UHE2,ADLCVTRSRTFKGWCHQSENCITVCKSEGNTGGFCKLGACMCTKECVRS,True,N
1245,P0C1Y5,GGGCGYKDVNKAPFNSMGACGNVPIFKDGLGCGSCFEIKCDKPAECSGK,False,N
1246,B6SJ49,ARTCQSQSHRFRGPCLRRSNCANVCRTEGFPGGRCRGFRRRCFCTTHCH,False,N
1247,B6SQK6,AQICYSRSKTFKGWCYHSTNCISVCITEGEISGFCQHGICMCTYECLTG,False,N


### Bring your own embeddings in `.npy` format, one representation per peptide, shaped (number of residues × number of dimensions), e.g. 38 × 256 for AlphaFold2's MSA embedding

In [1]:
import glob

# Paths of the MSA first-row representation files, in sorted path order.
cembs1 = glob.glob('msa_first_row_reps/*.npy')
cembs1.sort()

In [20]:
# Paths of the structure-module representation files, in sorted path order.
cembs2 = glob.glob('struct_mod_reps/*.npy')
cembs2.sort()

In [21]:
# Paths of the pair representation files, in sorted path order.
cembs3 = glob.glob('pair_reps/*.npy')
cembs3.sort()

In [22]:
# Paths of the single-representation embedding files, in sorted path order.
cembs4 = glob.glob('single_embeddings/*.npy')
cembs4.sort()

In [23]:
# Group the four per-representation file lists (MSA, structure module, pair, single).
cembs = [cembs1, cembs2, cembs3, cembs4]

In [187]:
# Build one matrix per peptide by concatenating the four representations along
# the feature axis (axis 1), in the order: MSA first row, structure module,
# pair, single.
#
# The four file lists are walked in lockstep. zip() stops at the shortest list
# without complaint, so a missing file in any directory would silently drop
# peptides (and may shift the pairing); fail loudly instead.
file_counts = [len(cembs1), len(cembs2), len(cembs3), len(cembs4)]
if len(set(file_counts)) != 1:
    raise ValueError(
        f'Embedding directories hold different numbers of files: {file_counts}'
    )

cembs_array = []
names = []
for c, c1, c2, c3 in zip(cembs1, cembs2, cembs3, cembs4):
    array = np.load(c)
    array1 = np.load(c1)
    # The pair representation is averaged over its axis 1 so that it can be
    # concatenated with the other per-residue matrices.
    # NOTE(review): presumably (residues, residues, channels) on disk — confirm.
    array2 = np.load(c2).mean(axis=1)
    array3 = np.load(c3)
    array = np.concatenate([array, array1, array2, array3], axis=1)
    # The peptide identifier is the six characters preceding the '.npy'
    # extension of the MSA file path.
    names.append(c[-10:-4])
    cembs_array.append(array)
# Shallow copy: a new list that shares the same arrays.
raw_array = cembs_array.copy()

In [14]:
# Shape of the first peptide's concatenated embedding matrix.
raw_array[0].shape

(38, 1152)

### Embedding preprocessing

In [26]:
# Rocket works best when zero-padded

# Every peptide matrix is zero-padded at its end (along the residue axis) to a
# common length, then stored column by column: features[i] collects, for
# feature channel i, one pd.Series per peptide.
MAX_LEN = 50                        # common padded length, in residues
N_FEATURES = 384 + 256 + 128 + 384  # total width of the concatenated embeddings

features = [[] for _ in range(N_FEATURES)]
new_names = []
for name, a in zip(names, raw_array):
    missing = MAX_LEN - len(a)
    if missing < 0:
        # np.pad cannot shrink an array; report which peptide is too long.
        raise ValueError(
            f'{name} has {len(a)} residues; at most {MAX_LEN} are supported'
        )
    # Pad once per peptide (rows only) rather than once per feature column.
    padded = np.pad(a, ((0, missing), (0, 0)))
    for i in range(N_FEATURES):
        features[i].append(pd.Series(padded[:, i]))
    new_names.append(name)

In [45]:
# Number of peptides that went through the padding step.
len(new_names)

1249

In [16]:
# Number of feature channels (one list of per-peptide series per channel).
len(features)

1152

In [27]:
# Start the feature table with one row per peptide, keyed by its identifier.
cc = pd.DataFrame({'Uniprot': new_names})

In [None]:
# Attach the nested columns: column `channel` holds, for every peptide, the
# padded pd.Series of that feature channel.
for channel, per_peptide_series in enumerate(features):
    cc[channel] = pd.Series(per_peptide_series)

In [46]:
# Inner-join the embedding features with the labels on the accession.
combined = cc.merge(small, on='Uniprot')

In [47]:
# Keep the first row of every accession, order by accession, then show the
# class balance of the target.
combined = combined.drop_duplicates(subset='Uniprot')
combined = combined.sort_values(by='Uniprot')
combined['Expressibility'].value_counts()

True     678
False    549
Name: Expressibility, dtype: int64

In [31]:
from sklearn import preprocessing
from sklearn.utils import shuffle

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LogisticRegressionCV, RidgeClassifierCV
from sklearn.svm import SVC

In [33]:
#import lightgbm
from sklearn.metrics import auc, roc_auc_score
from sklearn import model_selection
from sklearn import metrics

In [34]:
from sktime.transformations.panel.rocket import MiniRocketMultivariate, Rocket, MultiRocketMultivariate

  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.Index)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)


### Create a series of experiments where we split the dataset into knottins and non-knottins and try out several methods

In [99]:
# Results store for later analysis: experiment name -> list of per-split scores.
mega = dict()

In [94]:
# Knottin subset (UniProt flag 'Y'), ordered by accession.
is_knottin = combined['Is Knottin? Uniprot'] == 'Y'
knot = combined.loc[is_knottin].sort_values(by='Uniprot')

In [98]:
# Column labels of the nested embedding channels (MSA + structure + pair + single).
n_channels = 384 + 256 + 128 + 384
feat_cols = list(range(n_channels))

# Fit the Rocket kernels on the knottin panel and transform that same panel.
panel = knot[feat_cols]
rocket = Rocket(num_kernels=10000, random_state=42)
rocket.fit(panel)
transformed = rocket.transform(panel)

In [100]:
# Rocket features, knottin subset: 50 random 90/10 splits. For each split the
# features are standardised on the training part only, a strongly regularised
# logistic regression is fitted, and ROC AUC is measured on the held-out part.
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
for train_index, test_index in skf.split(X_all, y_all):
    X_train, y_train = X_all[train_index], y_all[train_index]
    X_test, y_test = X_all[test_index], y_all[test_index]

    scaler = preprocessing.StandardScaler()
    scaler.fit(X_train)

    model = LogisticRegression(C=0.0001)
    model.fit(scaler.transform(X_train), y_train)

    test_probabilities = model.predict_proba(scaler.transform(X_test))[:, 1]
    scores.append(roc_auc_score(y_test, test_probabilities))
mega['knottin_rocket'] = list(scores)

#### The following cells follow the same methodology as above. The notebook currently suffers from repetition; we may clean it up at a later time.

In [105]:
# ProteInfer embeddings, computable by proteinfer as directed by
# https://github.com/google-research/proteinfer
st = pd.read_parquet('peptide-data.parquet')
stt = pd.read_parquet('proteInfer_embeddings_peptides.parquet')

# NOTE(review): accessions are copied over positionally, which presumes both
# parquet files list the peptides in the same row order — confirm.
stt['Uniprot'] = st["('Uniprot',)"].values

# Attach labels, keep knottins only, one row per accession, ordered by accession.
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(stt, on='Uniprot')
knot = knot.loc[knot['Is Knottin? Uniprot'] == 'Y']
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')

# The embedding columns are labelled with the strings '0' .. '1099'.
transformed = knot[list(map(str, range(1100)))]

#### Proteinfer

In [109]:
# ProteInfer embeddings, knottin subset: ROC AUC of a random forest over 50
# random 90/10 splits. The forest is seeded (the splits already were) so that
# re-running the cell reproduces the stored scores.
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
for train_index, test_index in skf.split(X_all, y_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]

    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    scores.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
mega['knottin_proteinfer'] = list(scores)

#### Averaged representations from Alphafold2

In [127]:
# Mean-pool the full concatenated embedding over axis 0 (the residue axis),
# giving one fixed-length vector per peptide.
parts = [embedding.mean(axis=0) for embedding in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [128]:
# Attach labels to the pooled vectors, keep knottins only, one row per
# accession, ordered by accession.
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot.loc[knot['Is Knottin? Uniprot'] == 'Y']
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(1152))]

In [129]:
# Mean-pooled combined embedding, knottin subset: ROC AUC of a random forest
# over 50 random 90/10 splits. The forest is seeded (the splits already were)
# so that re-running the cell reproduces the stored scores.
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
for train_index, test_index in skf.split(X_all, y_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]

    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    scores.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
mega['knottin_averagedfour'] = list(scores)

In [135]:
# Mean-pool the first 256 feature columns (the MSA block) over axis 0,
# giving one vector per peptide.
parts = [embedding[:, 0:256].mean(axis=0) for embedding in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [136]:
# Attach labels to the pooled vectors, keep knottins only, one row per
# accession, ordered by accession.
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot.loc[knot['Is Knottin? Uniprot'] == 'Y']
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(256))]

In [137]:
# Mean-pooled MSA block, knottin subset: ROC AUC of a random forest over 50
# random 90/10 splits. The forest is seeded (the splits already were) so that
# re-running the cell reproduces the stored scores.
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
for train_index, test_index in skf.split(X_all, y_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]

    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    scores.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
mega['knottin_averagedmsa'] = list(scores)

In [139]:
# Mean-pool feature columns 256..639 (the structure-module block) over axis 0,
# giving one vector per peptide.
parts = [embedding[:, 256:256+384].mean(axis=0) for embedding in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [140]:
# Attach labels to the pooled vectors, keep knottins only, one row per
# accession, ordered by accession.
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot.loc[knot['Is Knottin? Uniprot'] == 'Y']
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(384))]

In [141]:
# Mean-pooled structure-module block, knottin subset: ROC AUC of a random
# forest over 50 random 90/10 splits. The forest is seeded (the splits already
# were) so that re-running the cell reproduces the stored scores.
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
for train_index, test_index in skf.split(X_all, y_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]

    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    scores.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
mega['knottin_averagedstruct'] = list(scores)

In [143]:
# Mean-pool feature columns 640..767 (the pair block) over axis 0,
# giving one vector per peptide.
parts = [embedding[:, 256+384:256+384+128].mean(axis=0) for embedding in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [144]:
# Attach labels to the pooled vectors, keep knottins only, one row per
# accession, ordered by accession.
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot.loc[knot['Is Knottin? Uniprot'] == 'Y']
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(128))]

In [145]:
# Mean-pooled pair block, knottin subset: ROC AUC of a random forest over 50
# random 90/10 splits. The forest is seeded (the splits already were) so that
# re-running the cell reproduces the stored scores.
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
for train_index, test_index in skf.split(X_all, y_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]

    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    scores.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
mega['knottin_averagedpair'] = list(scores)

In [147]:
# Mean-pool feature columns 768..1151 (the single-representation block) over
# axis 0, giving one vector per peptide.
parts = [
    embedding[:, 256+384+128:256+384+128+384].mean(axis=0)
    for embedding in raw_array
]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [148]:
# Attach labels to the pooled vectors, keep knottins only, one row per
# accession, ordered by accession.
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot.loc[knot['Is Knottin? Uniprot'] == 'Y']
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(384))]

In [149]:
# Mean-pooled single-representation block, knottin subset: ROC AUC of a random
# forest over 50 random 90/10 splits. The forest is seeded (the splits already
# were) so that re-running the cell reproduces the stored scores.
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
for train_index, test_index in skf.split(X_all, y_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]

    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    scores.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
mega['knottin_averagedsingle'] = list(scores)

#### Non-knottins experiments

In [168]:
# Non-knottin subset (UniProt flag 'N'), ordered by accession.
is_non_knottin = combined['Is Knottin? Uniprot'] == 'N'
knot = combined.loc[is_non_knottin].sort_values(by='Uniprot')

In [169]:
# Column labels of the nested embedding channels (MSA + structure + pair + single).
n_channels = 384 + 256 + 128 + 384
feat_cols = list(range(n_channels))

# Fit the Rocket kernels on the non-knottin panel and transform that same panel.
panel = knot[feat_cols]
rocket = Rocket(num_kernels=10000, random_state=42)
rocket.fit(panel)
transformed = rocket.transform(panel)

In [170]:
# Rocket features, non-knottin subset: 50 random 90/10 splits. For each split
# the features are standardised on the training part only, a strongly
# regularised logistic regression is fitted, and ROC AUC is measured on the
# held-out part.
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
for train_index, test_index in skf.split(X_all, y_all):
    X_train, y_train = X_all[train_index], y_all[train_index]
    X_test, y_test = X_all[test_index], y_all[test_index]

    scaler = preprocessing.StandardScaler()
    scaler.fit(X_train)

    model = LogisticRegression(C=0.0001)
    model.fit(scaler.transform(X_train), y_train)

    test_probabilities = model.predict_proba(scaler.transform(X_test))[:, 1]
    scores.append(roc_auc_score(y_test, test_probabilities))
mega['nonknottin_rocket'] = list(scores)

In [171]:
# Reload the peptide table and its ProteInfer embeddings.
st = pd.read_parquet('peptide-data.parquet')
stt = pd.read_parquet('proteInfer_embeddings_peptides.parquet')

# NOTE(review): accessions are copied over positionally, which presumes both
# parquet files list the peptides in the same row order — confirm.
stt['Uniprot'] = st["('Uniprot',)"].values

# Attach labels, keep non-knottins only, one row per accession, ordered by accession.
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(stt, on='Uniprot')
knot = knot.loc[knot['Is Knottin? Uniprot'] == 'N']
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')

# The embedding columns are labelled with the strings '0' .. '1099'.
transformed = knot[list(map(str, range(1100)))]

In [172]:
# ProteInfer embeddings, non-knottin subset: ROC AUC of a random forest over 50
# random 90/10 splits. The forest is seeded (the splits already were) so that
# re-running the cell reproduces the stored scores.
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
for train_index, test_index in skf.split(X_all, y_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]

    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    scores.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
mega['nonknottin_proteinfer'] = list(scores)

In [188]:
# Mean-pool the full concatenated embedding over axis 0 (the residue axis),
# giving one fixed-length vector per peptide.
parts = [embedding.mean(axis=0) for embedding in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [189]:
# Attach labels to the pooled vectors, keep non-knottins only, one row per
# accession, ordered by accession.
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot.loc[knot['Is Knottin? Uniprot'] == 'N']
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(1152))]

In [190]:
# Mean-pooled combined embedding, non-knottin subset: ROC AUC of a random
# forest over 50 random 90/10 splits. The forest is seeded (the splits already
# were) so that re-running the cell reproduces the stored scores.
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
for train_index, test_index in skf.split(X_all, y_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]

    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    scores.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
mega['nonknottin_averagedfour'] = list(scores)

In [192]:
# Mean-pool the first 256 feature columns (the MSA block) over axis 0,
# giving one vector per peptide.
parts = [embedding[:, 0:256].mean(axis=0) for embedding in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [193]:
# Attach labels to the pooled vectors, keep non-knottins only, one row per
# accession, ordered by accession.
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot.loc[knot['Is Knottin? Uniprot'] == 'N']
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(256))]

In [194]:
# Mean-pooled MSA block, non-knottin subset: ROC AUC of a random forest over 50
# random 90/10 splits. The forest is seeded (the splits already were) so that
# re-running the cell reproduces the stored scores.
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
for train_index, test_index in skf.split(X_all, y_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]

    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    scores.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
mega['nonknottin_averagedmsa'] = list(scores)

In [196]:
# Mean-pool feature columns 256..639 (the structure-module block) over axis 0,
# giving one vector per peptide.
parts = [embedding[:, 256:256+384].mean(axis=0) for embedding in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [197]:
# Attach labels to the pooled vectors, keep non-knottins only, one row per
# accession, ordered by accession.
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot.loc[knot['Is Knottin? Uniprot'] == 'N']
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(384))]

In [198]:
# Mean-pooled structure-module block, non-knottin subset: ROC AUC of a random
# forest over 50 random 90/10 splits. The forest is seeded (the splits already
# were) so that re-running the cell reproduces the stored scores.
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
for train_index, test_index in skf.split(X_all, y_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]

    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    scores.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
mega['nonknottin_averagedstruct'] = list(scores)

In [200]:
# Mean-pool feature columns 640..767 (the pair block) over axis 0,
# giving one vector per peptide.
parts = [embedding[:, 256+384:256+384+128].mean(axis=0) for embedding in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [201]:
# Attach labels to the pooled vectors, keep non-knottins only, one row per
# accession, ordered by accession.
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot.loc[knot['Is Knottin? Uniprot'] == 'N']
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(128))]

In [202]:
# Mean-pooled pair block, non-knottin subset: ROC AUC of a random forest over
# 50 random 90/10 splits. The forest is seeded (the splits already were) so
# that re-running the cell reproduces the stored scores.
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
for train_index, test_index in skf.split(X_all, y_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]

    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    scores.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
mega['nonknottin_averagedpair'] = list(scores)

In [204]:
# Mean-pool feature columns 768..1151 (the single-representation block) over
# axis 0, giving one vector per peptide.
parts = [
    embedding[:, 256+384+128:256+384+128+384].mean(axis=0)
    for embedding in raw_array
]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [205]:
# Attach labels to the pooled vectors, keep non-knottins only, one row per
# accession, ordered by accession.
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot.loc[knot['Is Knottin? Uniprot'] == 'N']
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(384))]

In [206]:
# Mean-pooled single-representation block, non-knottin subset: ROC AUC of a
# random forest over 50 random 90/10 splits. The forest is seeded (the splits
# already were) so that re-running the cell reproduces the stored scores.
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
for train_index, test_index in skf.split(X_all, y_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]

    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    scores.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
mega['nonknottin_averagedsingle'] = list(scores)

### Preparing the data to draw a critical difference diagram according to https://github.com/hfawaz/cd-diagram 

In [23]:
# Flatten the knottin experiments into three parallel lists in the long format
# used for the critical-difference diagram: method name, split index (as the
# "dataset" name) and score.
classifiers, datasets, accuracies = [], [], []
for key, split_scores in mega.items():
    # Keys of the non-knottin experiments start with 'nonknottin' and are skipped.
    if not key.startswith('knottin'):
        continue
    # 'knottin_averagedmsa' -> 'msa'; the all-blocks variant 'four' is shown as 'combined'.
    classifier = key.split('_')[1].replace('averaged', '')
    label = classifier.replace('four', 'combined')
    for e, value in enumerate(split_scores):
        classifiers.append(label)
        datasets.append(str(e))
        accuracies.append(value)
    

In [24]:
# Assemble the long-format table; the 'accuracy' column carries the per-split
# ROC AUC values collected above.
cdddata = pd.DataFrame({
    'classifier_name': classifiers,
    'dataset_name': datasets,
    'accuracy': accuracies,
})

In [12]:
# Show the long-format results table.
cdddata

Unnamed: 0,classifier_name,dataset_name,accuracy
0,rocket,0,0.862500
1,rocket,1,0.745553
2,rocket,2,0.828526
3,rocket,3,0.786963
4,rocket,4,0.801675
...,...,...,...
345,single,45,0.714052
346,single,46,0.754427
347,single,47,0.775054
348,single,48,0.787854


In [25]:
# Write the table to disk without the row index.
cdddata.to_csv('peptidecdd.csv', index=False)