### Load the CDP dataset and embeddings

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Fetch the cysteine-dense peptide (CDP) table, CDPs.csv, directly from the
# CysPresso GitHub repository (needs network access).
CDP_CSV_URL = 'https://raw.githubusercontent.com/Zebreu/cyspresso/main/CDPs.csv'
df = pd.read_csv(CDP_CSV_URL)

In [None]:
# Recode the "Expressibility" labels as booleans: '+' and '+-PR' become True,
# '-' becomes False. Values that are not keys of the mapping are left as they are.
express = {
    '+': True,
    '+-PR': True,
    '-': False,
}
df['Expressibility'] = df['Expressibility'].replace(to_replace=express)

In [None]:
# Keep only the accession, sequence, label and knottin-flag columns, then display them.
small = df.loc[:, ['Uniprot', 'Sequence', 'Expressibility', 'Is Knottin? Uniprot']]
small

Unnamed: 0,Uniprot,Sequence,Expressibility,Is Knottin? Uniprot
0,P01030,AKRCCQDGLTRLPMARTCEQRAARVQQPACREPFLSCCQFA,False,N
1,P46162,PQSCRWNMGVCIPFLCRVGMRQIGTCFGPRVPCCRR,False,N
2,P46163,PQSCRWNMGVCIPISCPGNMRQIGTCFGPRVPCCRRW,False,N
3,P46167,FVTCRINRGFCVPIRCPGHRRQIGTCLAPQIKCCR,False,N
4,P01223,GLACGQAMSFCIPTEYMMHVERKECAYCLTINTTVCAGYCMTR,False,N
...,...,...,...,...
1244,B6UHE2,ADLCVTRSRTFKGWCHQSENCITVCKSEGNTGGFCKLGACMCTKECVRS,True,N
1245,P0C1Y5,GGGCGYKDVNKAPFNSMGACGNVPIFKDGLGCGSCFEIKCDKPAECSGK,False,N
1246,B6SJ49,ARTCQSQSHRFRGPCLRRSNCANVCRTEGFPGGRCRGFRRRCFCTTHCH,False,N
1247,B6SQK6,AQICYSRSKTFKGWCYHSTNCISVCITEGEISGFCQHGICMCTYECLTG,False,N


Download AlphaFold2 embeddings from Hugging Face (https://huggingface.co/datasets/TonyKYLim/CysPresso/tree/main). 

Alternatively, you can use your own embeddings in '.npy' format, with one file per peptide holding an array of shape (number of residues, number of dimensions), e.g. (38, 256) for AlphaFold2's MSA embedding.

In [None]:
import glob

In [None]:
# Path to the folder containing the AlphaFold2 MSA embeddings (sorted list of .npy files)
cembs1 = glob.glob('msa_first_row_reps/*.npy')
cembs1.sort()

In [None]:
# Path to the folder containing the AlphaFold2 structure embeddings (sorted list of .npy files)
cembs2 = glob.glob('struct_mod_reps/*.npy')
cembs2.sort()

In [None]:
# Path to the folder containing the AlphaFold2 pair embeddings (sorted list of .npy files)
cembs3 = glob.glob('pair_reps/*.npy')
cembs3.sort()

In [None]:
# Path to the folder containing the AlphaFold2 single embeddings (sorted list of .npy files)
cembs4 = glob.glob('single_embeddings/*.npy')
cembs4.sort()

In [None]:
# The four sorted path lists gathered in one list: MSA, structure, pair, single
cembs = [cembs1, cembs2, cembs3, cembs4]

In [None]:
# Concatenate the four AlphaFold2 embeddings (MSA, structure, pair, single) for each peptide
# to generate the combined AlphaFold2 representation (combined embeddings, 'cembs').

# The four sorted path lists are paired up purely by position. zip() stops at the
# shortest list, so a missing file in one folder would silently drop peptides and
# could pair embeddings of different peptides; fail loudly instead.
if not (len(cembs1) == len(cembs2) == len(cembs3) == len(cembs4)):
    raise ValueError(
        'Embedding folders contain different numbers of .npy files: '
        f'{len(cembs1)} MSA, {len(cembs2)} structure, {len(cembs3)} pair, {len(cembs4)} single'
    )

cembs_array = []
names = []
for c, c1, c2, c3 in zip(cembs1, cembs2, cembs3, cembs4):
    array = np.load(c)                 # MSA embedding
    array1 = np.load(c1)               # structure embedding
    # Pair embedding reduced with a mean over axis 1 so it can be concatenated
    # row-wise with the others (presumably residues x residues x channels; TODO confirm)
    array2 = np.load(c2).mean(axis=1)
    array3 = np.load(c3)               # single embedding
    array = np.concatenate([array, array1, array2, array3], axis=1)
    # The peptide name is the six characters before '.npy' in the MSA file path
    # (assumes 6-character UniProt accessions as file names)
    names.append(c[-10:-4])
    cembs_array.append(array)
raw_array = cembs_array.copy()  # shallow copy: a new list referencing the same arrays

In [None]:
# Sanity check on the first peptide's combined embedding
raw_array[0].shape #Output: (number of residues, number of dimensions)

### Embedding preprocessing

In [None]:
# Rocket works best when zero-padded: every per-residue feature track is padded
# with trailing zeros to a common length (50, in this case).
n_features = 384 + 256 + 128 + 384  # width of the combined embedding
pad_length = 50

# features[k] collects, peptide by peptide, the padded series of embedding dimension k
features = [[] for _ in range(n_features)]
new_names = []
for name, a in zip(names, raw_array):
    for column, bucket in enumerate(features):
        track = a[:, column]
        # np.pad raises ValueError if a peptide has more than pad_length residues
        padded = np.pad(track, (0, pad_length - len(track)))
        bucket.append(pd.Series(padded))
    new_names.append(name)

In [None]:
# Number of peptides for which a padded embedding was built
len(new_names)

1249

In [None]:
# Number of embedding dimensions (one list of padded series per dimension)
len(features)

1152

In [None]:
# Start the feature table with one row per peptide name
cc = pd.DataFrame({'Uniprot': new_names})

In [None]:
# Add one column per embedding dimension (integer column labels); each cell
# holds that peptide's padded series for the dimension.
for column, per_peptide_series in enumerate(features):
    cc[column] = pd.Series(per_peptide_series)

In [None]:
# Inner-join the embedding columns with the sequence/label columns on the accession
combined = cc.merge(small, on='Uniprot')

In [None]:
# Keep the first row of every accession, order by accession,
# then show the class balance of the label.
combined = combined.drop_duplicates(subset='Uniprot')
combined = combined.sort_values(by='Uniprot')
combined['Expressibility'].value_counts()

True     678
False    549
Name: Expressibility, dtype: int64

### Import required modules

In [None]:
from sklearn import preprocessing
from sklearn.utils import shuffle

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LogisticRegressionCV, RidgeClassifierCV
from sklearn.svm import SVC

In [None]:
#import lightgbm
from sklearn.metrics import auc, roc_auc_score
from sklearn import model_selection
from sklearn import metrics

In [None]:
!pip install sktime

In [None]:
from sktime.transformations.panel.rocket import MiniRocketMultivariate, Rocket, MultiRocketMultivariate

  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.Index)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)


### Split the dataset between knottins and non-knottins

In [None]:
# Keep the rows flagged 'Y' in the UniProt knottin column, ordered by accession
is_knottin = combined['Is Knottin? Uniprot'] == 'Y'
knot = combined.loc[is_knottin].sort_values('Uniprot')

### ROCKET transformation and model training

In [None]:
mega = {} # Dictionary holding all results for later analysis: '<subset>_<method>' -> list of per-split ROC AUC scores

In [None]:
# ROCKET transformation on the knottin dataset

# Integer labels of the embedding-dimension columns
feat_cols = list(range(384 + 256 + 128 + 384))

knot_panel = knot[feat_cols]
rocket = Rocket(num_kernels=10000, random_state=42)
rocket.fit(knot_panel)
transformed = rocket.transform(knot_panel)

In [None]:
# Train a logistic regression model on the ROCKET features and score it by
# ROC AUC over 50 random 90/10 shuffle splits.
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
for train_index, test_index in skf.split(X_all, y_all):
    # Standardise using statistics of the training portion only
    scaler = preprocessing.StandardScaler()
    tr = scaler.fit_transform(X_all[train_index])
    tt = scaler.transform(X_all[test_index])

    model = LogisticRegression(C=0.0001)
    model.fit(tr, y_all[train_index])

    # Column 1 of predict_proba is the probability of the second class (True)
    positive_proba = model.predict_proba(tt)[:, 1]
    scores.append(roc_auc_score(y_all[test_index], positive_proba))
mega['knottin_rocket'] = list(scores)

### Alternative methods
#### The following cells apply the same methodology as above with different embeddings and/or preprocessing, first to the knottins and then to the non-knottin partition of the dataset


#### Proteinfer embeddings

In [None]:
# Load Proteinfer embeddings
st = pd.read_parquet('peptide-data.parquet')
# Computable by proteinfer as directed by https://github.com/google-research/proteinfer
stt = pd.read_parquet('proteInfer_embeddings_peptides.parquet')

# Accessions are copied over by position; presumably both files list the
# peptides in the same order (TODO confirm)
stt['Uniprot'] = st["('Uniprot',)"].values

labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(stt, on='Uniprot')
knot = knot[knot['Is Knottin? Uniprot'] == 'Y']
# Accessions are unique after drop_duplicates, so a single sort fixes the row order
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[[str(i) for i in range(1100)]]

In [None]:
# Train a Random Forest classifier on the current `transformed` features and score
# it by ROC AUC over 50 random 90/10 shuffle splits; stored as mega['knottin_proteinfer'].
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
for train_index, test_index in skf.split(X_all, y_all):
    # Seeded so the forest, and therefore every AUC, is identical on each re-run
    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_all[train_index], y_all[train_index])

    # Column 1 of predict_proba is the probability of the second class (True)
    positive_proba = model.predict_proba(X_all[test_index])[:, 1]
    scores.append(roc_auc_score(y_all[test_index], positive_proba))
mega['knottin_proteinfer'] = list(scores)

#### Averaged representations from Alphafold2

##### All representations are averaged

In [None]:
# Collapse every peptide to a single vector: the mean of its combined embedding over axis 0 (residues)
parts = [a.mean(axis=0) for a in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [None]:
# Join labels with the averaged vectors, keep knottins ('Y'), one row per accession
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot[knot['Is Knottin? Uniprot'] == 'Y']
# Accessions are unique after drop_duplicates, so a single sort fixes the row order
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(1152))]

In [None]:
# Random forest on the current `transformed` features, scored by ROC AUC over
# 50 random 90/10 shuffle splits; stored as mega['knottin_averagedfour'].
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
for train_index, test_index in skf.split(X_all, y_all):
    # Seeded so the forest, and therefore every AUC, is identical on each re-run
    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_all[train_index], y_all[train_index])

    # Column 1 of predict_proba is the probability of the second class (True)
    positive_proba = model.predict_proba(X_all[test_index])[:, 1]
    scores.append(roc_auc_score(y_all[test_index], positive_proba))
mega['knottin_averagedfour'] = list(scores)

##### Averaging MSA representations only

In [None]:
# Average only the MSA block (columns 0-255 of the combined embedding) over axis 0 (residues)
parts = [a[:, :256].mean(axis=0) for a in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [None]:
# Join labels with the averaged vectors, keep knottins ('Y'), one row per accession
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot[knot['Is Knottin? Uniprot'] == 'Y']
# Accessions are unique after drop_duplicates, so a single sort fixes the row order
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(256))]

In [None]:
# Random forest on the current `transformed` features, scored by ROC AUC over
# 50 random 90/10 shuffle splits; stored as mega['knottin_averagedmsa'].
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
for train_index, test_index in skf.split(X_all, y_all):
    # Seeded so the forest, and therefore every AUC, is identical on each re-run
    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_all[train_index], y_all[train_index])

    # Column 1 of predict_proba is the probability of the second class (True)
    positive_proba = model.predict_proba(X_all[test_index])[:, 1]
    scores.append(roc_auc_score(y_all[test_index], positive_proba))
mega['knottin_averagedmsa'] = list(scores)

##### Averaging structure module representations only

In [None]:
# Average only the structure-module block (columns 256-639 of the combined embedding) over axis 0
struct_start = 256
struct_stop = struct_start + 384
parts = [a[:, struct_start:struct_stop].mean(axis=0) for a in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [None]:
# Join labels with the averaged vectors, keep knottins ('Y'), one row per accession
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot[knot['Is Knottin? Uniprot'] == 'Y']
# Accessions are unique after drop_duplicates, so a single sort fixes the row order
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(384))]

In [None]:
# Random forest on the current `transformed` features, scored by ROC AUC over
# 50 random 90/10 shuffle splits; stored as mega['knottin_averagedstruct'].
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
for train_index, test_index in skf.split(X_all, y_all):
    # Seeded so the forest, and therefore every AUC, is identical on each re-run
    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_all[train_index], y_all[train_index])

    # Column 1 of predict_proba is the probability of the second class (True)
    positive_proba = model.predict_proba(X_all[test_index])[:, 1]
    scores.append(roc_auc_score(y_all[test_index], positive_proba))
mega['knottin_averagedstruct'] = list(scores)

##### Averaging pair representations only

In [None]:
# Average only the pair block (columns 640-767 of the combined embedding) over axis 0
pair_start = 256 + 384
pair_stop = pair_start + 128
parts = [a[:, pair_start:pair_stop].mean(axis=0) for a in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [None]:
# Join labels with the averaged vectors, keep knottins ('Y'), one row per accession
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot[knot['Is Knottin? Uniprot'] == 'Y']
# Accessions are unique after drop_duplicates, so a single sort fixes the row order
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(128))]

In [None]:
# Random forest on the current `transformed` features, scored by ROC AUC over
# 50 random 90/10 shuffle splits; stored as mega['knottin_averagedpair'].
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
for train_index, test_index in skf.split(X_all, y_all):
    # Seeded so the forest, and therefore every AUC, is identical on each re-run
    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_all[train_index], y_all[train_index])

    # Column 1 of predict_proba is the probability of the second class (True)
    positive_proba = model.predict_proba(X_all[test_index])[:, 1]
    scores.append(roc_auc_score(y_all[test_index], positive_proba))
mega['knottin_averagedpair'] = list(scores)

##### Averaging single representations only

In [None]:
# Average only the single block (columns 768-1151 of the combined embedding) over axis 0
single_start = 256 + 384 + 128
single_stop = single_start + 384
parts = [a[:, single_start:single_stop].mean(axis=0) for a in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [None]:
# Join labels with the averaged vectors, keep knottins ('Y'), one row per accession
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot[knot['Is Knottin? Uniprot'] == 'Y']
# Accessions are unique after drop_duplicates, so a single sort fixes the row order
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(384))]

In [None]:
# Random forest on the current `transformed` features, scored by ROC AUC over
# 50 random 90/10 shuffle splits; stored as mega['knottin_averagedsingle'].
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
for train_index, test_index in skf.split(X_all, y_all):
    # Seeded so the forest, and therefore every AUC, is identical on each re-run
    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_all[train_index], y_all[train_index])

    # Column 1 of predict_proba is the probability of the second class (True)
    positive_proba = model.predict_proba(X_all[test_index])[:, 1]
    scores.append(roc_auc_score(y_all[test_index], positive_proba))
mega['knottin_averagedsingle'] = list(scores)

#### Non-knottins experiments

In [None]:
# Select the non-knottins (flag 'N') from the dataset, ordered by accession
is_non_knottin = combined['Is Knottin? Uniprot'] == 'N'
knot = combined.loc[is_non_knottin].sort_values('Uniprot')

In [None]:
# Rocket transformation on the non-knottin dataset

# Integer labels of the embedding-dimension columns
feat_cols = list(range(384 + 256 + 128 + 384))

knot_panel = knot[feat_cols]
rocket = Rocket(num_kernels=10000, random_state=42)
rocket.fit(knot_panel)
transformed = rocket.transform(knot_panel)

In [None]:
# Train a logistic regression model on the ROCKET features and score it by
# ROC AUC over 50 random 90/10 shuffle splits.
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
for train_index, test_index in skf.split(X_all, y_all):
    # Standardise using statistics of the training portion only
    scaler = preprocessing.StandardScaler()
    tr = scaler.fit_transform(X_all[train_index])
    tt = scaler.transform(X_all[test_index])

    model = LogisticRegression(C=0.0001)
    model.fit(tr, y_all[train_index])

    # Column 1 of predict_proba is the probability of the second class (True)
    positive_proba = model.predict_proba(tt)[:, 1]
    scores.append(roc_auc_score(y_all[test_index], positive_proba))
mega['nonknottin_rocket'] = list(scores)

##### Proteinfer embeddings for non-knottins

In [None]:
# Load Proteinfer embeddings
st = pd.read_parquet('peptide-data.parquet')
stt = pd.read_parquet('proteInfer_embeddings_peptides.parquet')

# Accessions are copied over by position; presumably both files list the
# peptides in the same order (TODO confirm)
stt['Uniprot'] = st["('Uniprot',)"].values

labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(stt, on='Uniprot')
knot = knot[knot['Is Knottin? Uniprot'] == 'N']  # Select non-knottins from dataset
# Accessions are unique after drop_duplicates, so a single sort fixes the row order
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[[str(i) for i in range(1100)]]

In [None]:
# Random forest on the current `transformed` features, scored by ROC AUC over
# 50 random 90/10 shuffle splits; stored as mega['nonknottin_proteinfer'].
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
for train_index, test_index in skf.split(X_all, y_all):
    # Seeded so the forest, and therefore every AUC, is identical on each re-run
    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_all[train_index], y_all[train_index])

    # Column 1 of predict_proba is the probability of the second class (True)
    positive_proba = model.predict_proba(X_all[test_index])[:, 1]
    scores.append(roc_auc_score(y_all[test_index], positive_proba))
mega['nonknottin_proteinfer'] = list(scores)

##### Averaged representations from AlphaFold2

###### All representations are averaged

In [None]:
# Collapse every peptide to a single vector: the mean of its combined embedding over axis 0 (residues)
parts = [a.mean(axis=0) for a in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [None]:
# Join labels with the averaged vectors, keep non-knottins ('N'), one row per accession
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot[knot['Is Knottin? Uniprot'] == 'N']
# Accessions are unique after drop_duplicates, so a single sort fixes the row order
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(1152))]

In [None]:
# Random forest on the current `transformed` features, scored by ROC AUC over
# 50 random 90/10 shuffle splits; stored as mega['nonknottin_averagedfour'].
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
for train_index, test_index in skf.split(X_all, y_all):
    # Seeded so the forest, and therefore every AUC, is identical on each re-run
    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_all[train_index], y_all[train_index])

    # Column 1 of predict_proba is the probability of the second class (True)
    positive_proba = model.predict_proba(X_all[test_index])[:, 1]
    scores.append(roc_auc_score(y_all[test_index], positive_proba))
mega['nonknottin_averagedfour'] = list(scores)

###### Averaging MSA representations only

In [None]:
# Average only the MSA block (columns 0-255 of the combined embedding) over axis 0 (residues)
parts = [a[:, :256].mean(axis=0) for a in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [None]:
# Join labels with the averaged vectors, keep non-knottins ('N'), one row per accession
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot[knot['Is Knottin? Uniprot'] == 'N']
# Accessions are unique after drop_duplicates, so a single sort fixes the row order
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(256))]

In [None]:
# Random forest on the current `transformed` features, scored by ROC AUC over
# 50 random 90/10 shuffle splits; stored as mega['nonknottin_averagedmsa'].
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
for train_index, test_index in skf.split(X_all, y_all):
    # Seeded so the forest, and therefore every AUC, is identical on each re-run
    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_all[train_index], y_all[train_index])

    # Column 1 of predict_proba is the probability of the second class (True)
    positive_proba = model.predict_proba(X_all[test_index])[:, 1]
    scores.append(roc_auc_score(y_all[test_index], positive_proba))
mega['nonknottin_averagedmsa'] = list(scores)

###### Averaging structure module representations only

In [None]:
# Average only the structure-module block (columns 256-639 of the combined embedding) over axis 0
struct_start = 256
struct_stop = struct_start + 384
parts = [a[:, struct_start:struct_stop].mean(axis=0) for a in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [None]:
# Join labels with the averaged vectors, keep non-knottins ('N'), one row per accession
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot[knot['Is Knottin? Uniprot'] == 'N']
# Accessions are unique after drop_duplicates, so a single sort fixes the row order
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(384))]

In [None]:
# Random forest on the current `transformed` features, scored by ROC AUC over
# 50 random 90/10 shuffle splits; stored as mega['nonknottin_averagedstruct'].
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
for train_index, test_index in skf.split(X_all, y_all):
    # Seeded so the forest, and therefore every AUC, is identical on each re-run
    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_all[train_index], y_all[train_index])

    # Column 1 of predict_proba is the probability of the second class (True)
    positive_proba = model.predict_proba(X_all[test_index])[:, 1]
    scores.append(roc_auc_score(y_all[test_index], positive_proba))
mega['nonknottin_averagedstruct'] = list(scores)

###### Averaging pair representations only

In [None]:
# Average only the pair block (columns 640-767 of the combined embedding) over axis 0
pair_start = 256 + 384
pair_stop = pair_start + 128
parts = [a[:, pair_start:pair_stop].mean(axis=0) for a in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [None]:
# Join labels with the averaged vectors, keep non-knottins ('N'), one row per accession
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot[knot['Is Knottin? Uniprot'] == 'N']
# Accessions are unique after drop_duplicates, so a single sort fixes the row order
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(128))]

In [None]:
# Random forest on the current `transformed` features, scored by ROC AUC over
# 50 random 90/10 shuffle splits; stored as mega['nonknottin_averagedpair'].
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
for train_index, test_index in skf.split(X_all, y_all):
    # Seeded so the forest, and therefore every AUC, is identical on each re-run
    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_all[train_index], y_all[train_index])

    # Column 1 of predict_proba is the probability of the second class (True)
    positive_proba = model.predict_proba(X_all[test_index])[:, 1]
    scores.append(roc_auc_score(y_all[test_index], positive_proba))
mega['nonknottin_averagedpair'] = list(scores)

###### Averaging single representations only

In [None]:
# Average only the single block (columns 768-1151 of the combined embedding) over axis 0
single_start = 256 + 384 + 128
single_stop = single_start + 384
parts = [a[:, single_start:single_stop].mean(axis=0) for a in raw_array]
averaged = pd.DataFrame(parts)
averaged['Uniprot'] = names

In [None]:
# Join labels with the averaged vectors, keep non-knottins ('N'), one row per accession
labels = combined[['Uniprot', 'Expressibility', 'Is Knottin? Uniprot']]
knot = labels.merge(averaged, on='Uniprot')
knot = knot[knot['Is Knottin? Uniprot'] == 'N']
# Accessions are unique after drop_duplicates, so a single sort fixes the row order
knot = knot.drop_duplicates('Uniprot').sort_values('Uniprot')
transformed = knot[list(range(384))]

In [None]:
# Random forest on the current `transformed` features, scored by ROC AUC over
# 50 random 90/10 shuffle splits; stored as mega['nonknottin_averagedsingle'].
X_all = transformed.values
y_all = knot['Expressibility'].values

scores = []
skf = model_selection.ShuffleSplit(n_splits=50, test_size=0.1, random_state=10)
for train_index, test_index in skf.split(X_all, y_all):
    # Seeded so the forest, and therefore every AUC, is identical on each re-run
    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_all[train_index], y_all[train_index])

    # Column 1 of predict_proba is the probability of the second class (True)
    positive_proba = model.predict_proba(X_all[test_index])[:, 1]
    scores.append(roc_auc_score(y_all[test_index], positive_proba))
mega['nonknottin_averagedsingle'] = list(scores)

### Prepare the data to draw a critical difference diagram according to https://github.com/hfawaz/cd-diagram 

In [None]:
# Flatten the knottin results into three parallel lists for the critical
# difference diagram: method label, split number (as a string) and ROC AUC.
classifiers, datasets, accuracies = [], [], []
for key, split_scores in mega.items():
    # 'nonknottin_*' keys do not start with 'knottin' and are skipped
    if not key.startswith('knottin'):
        continue
    # 'knottin_averagedfour' -> 'four' -> 'combined'; other keys only lose 'averaged'
    label = key.split('_')[1].replace('averaged', '').replace('four', 'combined')
    classifiers.extend([label] * len(split_scores))
    datasets.extend(str(split_number) for split_number in range(len(split_scores)))
    accuracies.extend(split_scores)
    

In [None]:
# Long-format table with one row per (method, split) pair
cdddata = pd.DataFrame({
    'classifier_name': classifiers,
    'dataset_name': datasets,
    'accuracy': accuracies,
})

In [None]:
# Display the table (one row per classifier name and split)
cdddata

Unnamed: 0,classifier_name,dataset_name,accuracy
0,rocket,0,0.862500
1,rocket,1,0.745553
2,rocket,2,0.828526
3,rocket,3,0.786963
4,rocket,4,0.801675
...,...,...,...
345,single,45,0.714052
346,single,46,0.754427
347,single,47,0.775054
348,single,48,0.787854


In [None]:
# Write the table to peptidecdd.csv without the index column
cdddata.to_csv('peptidecdd.csv', index=False)