# PSSM Feature evaluation

# Imports

In [1]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test,
    nested_loocv
)
from subpred.pssm import calculate_pssms_notebook

# Dataset

In [2]:
# Build the dataset from the reviewed SwissProt dump: transmembrane transport
# proteins annotated with amino-acid or sugar transport, restricted to a single
# taxonomy ID (presumably yeast, judging by the log file name).
df = create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="integrate",
    verbose=True,
    tax_ids_filter=[559292],
    output_log="../logs/yeast_amino_sugar_dataset.log",
    # cd-hit clustering threshold, see the log line printed below this cell
    sequence_clustering=70,
)

cd-hit: clustered 64 sequences into 51 clusters at threshold 70


# Feature generation

In [3]:
# Class labels: the transport keyword of every protein in the dataset.
labels = df["keywords_transport"]
# Show how many samples each class has.
labels.value_counts()

Amino-acid transport    34
Sugar transport         17
Name: keywords_transport, dtype: int64

In [4]:
# Compute the PSSM-based features for all sequences; the column names in the
# table below end in suffixes such as _50_1 and _90_3.
df_pssm = calculate_pssms_notebook(df["sequence"])
df_pssm

Unnamed: 0_level_0,AA_50_1,AR_50_1,AN_50_1,AD_50_1,AC_50_1,AQ_50_1,AE_50_1,AG_50_1,AH_50_1,AI_50_1,...,VL_90_3,VK_90_3,VM_90_3,VF_90_3,VP_90_3,VS_90_3,VT_90_3,VW_90_3,VY_90_3,VV_90_3
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q08986,0.734091,0.259091,0.313636,0.220455,0.393182,0.295455,0.234091,0.529545,0.265909,0.415909,...,0.425047,0.345351,0.402277,0.58444,0.282732,0.351044,0.351044,0.605313,0.759013,0.387097
P38967,0.734914,0.284483,0.334052,0.211207,0.43319,0.3125,0.25431,0.461207,0.282328,0.465517,...,0.462547,0.419476,0.458801,0.580524,0.376404,0.419476,0.426966,0.599251,0.709738,0.43633
P38085,0.809745,0.308585,0.419954,0.264501,0.468677,0.345708,0.322506,0.582367,0.301624,0.443155,...,0.441296,0.354251,0.437247,0.714575,0.301619,0.374494,0.392713,0.611336,0.878543,0.425101
P15380,0.803644,0.244939,0.348178,0.230769,0.449393,0.299595,0.267206,0.495951,0.253036,0.370445,...,0.419483,0.33996,0.405567,0.632207,0.300199,0.369781,0.359841,0.606362,0.819085,0.39165
P38206,0.685039,0.452756,0.476378,0.438976,0.570866,0.478346,0.450787,0.566929,0.448819,0.503937,...,0.603916,0.510542,0.590361,0.671687,0.50753,0.554217,0.564759,0.637048,0.774096,0.578313
Q12300,0.733981,0.215534,0.258252,0.157282,0.52233,0.267961,0.213592,0.47767,0.219417,0.374757,...,0.430147,0.345588,0.415441,0.645221,0.338235,0.382353,0.398897,0.5625,0.849265,0.391544
Q12010,0.661376,0.42328,0.42328,0.386243,0.513228,0.444444,0.417989,0.502646,0.444444,0.502646,...,0.424658,0.373288,0.407534,0.578767,0.34589,0.414384,0.40411,0.64726,0.85274,0.424658
Q03697,0.646154,0.427692,0.436923,0.406154,0.56,0.464615,0.44,0.587692,0.412308,0.535385,...,0.571429,0.440729,0.550152,0.677812,0.398176,0.483283,0.510638,0.574468,0.817629,0.531915
Q04602,0.738095,0.406746,0.474206,0.39881,0.515873,0.46627,0.422619,0.583333,0.396825,0.490079,...,0.604555,0.559006,0.590062,0.627329,0.542443,0.627329,0.627329,0.552795,0.786749,0.606625
P10870,0.72028,0.232517,0.286713,0.204545,0.437063,0.277972,0.23951,0.480769,0.230769,0.365385,...,0.456343,0.378913,0.456343,0.652389,0.370675,0.400329,0.405272,0.560132,0.830313,0.421746


## Independent test set

In [5]:
# Convert the feature table and labels into arrays, keeping the feature and
# sample names so that they can be passed on to later steps.
X, y, feature_names, sample_names = preprocess_pandas(df_pssm, labels, return_names=True)

# Set aside an independent test set (test_size=0.2); everything up to the
# "Validation" section only uses the training part.
split = get_independent_test_set(X, y, sample_names=sample_names, test_size=0.2)
X_train, X_test, y_train, y_test, sample_names_train, sample_names_test = split



## Model comparison

Linear SVC looks the best again (highest mean score across the folds), and in the worst fold the performance is a bit better than with PAAC.

In [6]:
# Compare several classifiers on the training split only; the table below has
# one score column per fold (0-4) plus the mean and standard deviation.
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.5,0.855,0.619,0.385,0.619,0.595,0.175
KNeighborsClassifier(),1.0,0.564,0.855,0.385,0.855,0.731,0.25
"LinearSVC(class_weight='balanced', max_iter=1000000.0, random_state=0)",1.0,0.855,0.873,0.667,0.619,0.803,0.157
"LinearSVC(max_iter=1000000.0, random_state=0)",1.0,0.855,0.873,0.667,0.619,0.803,0.157
"RandomForestClassifier(class_weight='balanced', random_state=0)",1.0,1.0,0.667,0.385,0.667,0.744,0.261
RandomForestClassifier(random_state=0),0.855,1.0,0.855,0.385,0.667,0.752,0.237
SGDClassifier(random_state=0),0.855,0.855,0.75,0.667,0.5,0.725,0.149
"SVC(class_weight='balanced', random_state=0)",0.855,0.855,0.855,0.385,0.855,0.761,0.21
SVC(random_state=0),1.0,1.0,0.667,0.385,0.855,0.781,0.261


## Parameter tuning

#### Custom transformer

Here, we try the multi-pssm feature, which tries all combinations of feature generation parameters, and selects the best ones based on the training set. First without the transformer:

In [25]:
# Baseline: linear kernel, no dimensionality reduction and no feature
# transformer; the candidate values for C are given explicitly.
gsearch = optimize_hyperparams(
    X_train, y_train, kernel="linear", dim_reduction=None, C=[0.01, 0.1, 1, 10]
)

{'linearsvc__C': 0.1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0}
0.803


The linear kernel already leads to good results. With the transformer:

In [11]:
# Linear kernel again, now with the "pssm" feature transformer (feature_names
# is passed along with it) and a C grid shifted towards smaller values.
# The result is kept under a dedicated name for the validation section.
gsearch = best_estimator_linear = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    dim_reduction=None,
    feature_transformer="pssm",
    feature_names=feature_names,
    C=[0.001, 0.01, 0.1, 1],
)

{'linearsvc__C': 0.01, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pssmselector__iterations': 1, 'pssmselector__uniref_threshold': 90}
0.853


The best C is smaller (0.01 instead of 0.1) and the score is better (0.853 vs. 0.803). Does RBF improve anything?

In [27]:
# RBF kernel without dimensionality reduction and without feature transformer.
gsearch = optimize_hyperparams(
    X_train, y_train, kernel="rbf", dim_reduction=None, C=[1, 10, 100]
)

{'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.794


No, the score is a bit lower. The pssmselector improves it a bit, with default svm parameters:

In [29]:
# RBF kernel with the "pssm" feature transformer, still without PCA.
rbf_pssm_options = {
    "kernel": "rbf",
    "dim_reduction": None,
    "C": [0.1, 1, 10],
    "feature_transformer": "pssm",
    "feature_names": feature_names,
}
gsearch = optimize_hyperparams(X_train, y_train, **rbf_pssm_options)

{'pssmselector__iterations': 3, 'pssmselector__uniref_threshold': 50, 'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.848


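RBF with the feature selector (0.848) is clearly better than RBF without it (0.794), and roughly on par with the linear kernel with the feature selector (0.853), which remains the best one so far.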

## Dimensionality reduction

With PCA and the linear kernel, the result (0.798) is about the same as for RBF with no feature selection (0.794) and slightly worse than for the linear kernel without PCA (0.803):

In [30]:
# Linear kernel with PCA; no C grid is passed, so the search uses whatever
# candidates optimize_hyperparams falls back to.
gsearch = optimize_hyperparams(X_train, y_train, kernel="linear", dim_reduction="pca")

{'linearsvc__C': 1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pca__n_components': 0.8}
0.798


In [31]:
# Linear kernel with PCA and the "pssm" feature transformer. The result is
# kept under a dedicated name for the validation section.
gsearch = best_estimator_linearsvc_pca = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    dim_reduction="pca",
    C=[10, 1, 0.1, 0.01],
    feature_transformer="pssm",
    feature_names=feature_names,
)

{'linearsvc__C': 10, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pca__n_components': 0.86, 'pssmselector__iterations': 1, 'pssmselector__uniref_threshold': 50}
0.879


Without the PSSM selection (the cell before this one), we got a lower result (0.798 vs. 0.879).

PCA seems to perform the best and is faster. How about the RBF kernel?

In [33]:
# RBF kernel with PCA, without the feature transformer. gamma is not passed,
# so it is left to the defaults of optimize_hyperparams.
gsearch = optimize_hyperparams(
    X_train, y_train, kernel="rbf", dim_reduction="pca", C=[0.1, 1, 10, 100]
)

{'pca__n_components': 0.81, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.918


That already looks good, now with the PSSMSelector:

In [35]:
# RBF kernel with PCA and the "pssm" feature transformer. Neither C nor gamma
# is passed, so both are left to the defaults of optimize_hyperparams.
# The result is kept under a dedicated name for the validation section.
gsearch = best_estimator_svc_pca = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction="pca",
    feature_transformer="pssm",
    feature_names=feature_names,
)

{'pca__n_components': 0.99, 'pssmselector__iterations': 3, 'pssmselector__uniref_threshold': 50, 'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.933


A good score with the default SVM parameters (C=1, gamma="scale"), using PCA components that retain 99% of the variance.

## Validation


### Linear kernel without PCA

With just the linear kernel and the pssmselector, one wrong classification.

In [36]:
# Confusion matrix of the tuned linear-kernel estimator (PSSM selector, no PCA)
# on the held-out test split.
get_confusion_matrix(X_test, y_test, best_estimator_linear, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,7,0
Sugar transport,1,3


In [19]:
# Per-class precision, recall, F1 and support of the same linear-kernel
# estimator on the held-out test split.
get_classification_report(X_test, y_test, best_estimator_linear, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.875,1.0,0.933,7
Sugar transport,1.0,0.75,0.857,4
macro avg,0.938,0.875,0.895,11
weighted avg,0.92,0.909,0.906,11


### Linear kernel with PCA

Adding the PCA to the linear model seems to overfit on the AA transporters:

In [20]:
# Confusion matrix of the tuned linear-kernel + PCA estimator (with PSSM
# selector) on the held-out test split.
get_confusion_matrix(X_test, y_test, best_estimator_linearsvc_pca, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,7,0
Sugar transport,3,1


In [21]:
# Per-class precision, recall, F1 and support of the linear-kernel + PCA
# estimator on the held-out test split.
get_classification_report(X_test, y_test, best_estimator_linearsvc_pca, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.7,1.0,0.824,7
Sugar transport,1.0,0.25,0.4,4
macro avg,0.85,0.625,0.612,11
weighted avg,0.809,0.727,0.67,11


### RBF + PCA

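The RBF kernel with PCA does not do better on the test set: as with the linear kernel and PCA, three of the four sugar transporters are classified as amino-acid transporters.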


In [37]:
# Confusion matrix of the tuned RBF-kernel + PCA estimator (with PSSM selector)
# on the held-out test split.
get_confusion_matrix(X_test, y_test, best_estimator_svc_pca, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,7,0
Sugar transport,3,1


In [23]:
# Per-class precision, recall, F1 and support of the RBF-kernel + PCA
# estimator on the held-out test split.
get_classification_report(X_test, y_test, best_estimator_svc_pca, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.7,1.0,0.824,7
Sugar transport,1.0,0.25,0.4,4
macro avg,0.85,0.625,0.612,11
weighted avg,0.809,0.727,0.67,11


### Conclusion

The evaluation is consistent with that of PAAC. The bad scores for sugar were caused by the fact that three of the four sugar transport proteins in the test set were GDP/UDP transporters, and there are only three of these transporters in the dataset.

## Alternative evaluation with nested LOOCV

In [None]:
# Nested LOOCV on the full feature table: RBF kernel with the "pssm" feature
# transformer and no dimensionality reduction.
nested_loocv(df_features=df_pssm, labels=labels, dim_reduction=None, kernel="rbf", feature_transformer="pssm")

In [None]:
# Same nested LOOCV as in the previous cell, but with PCA as dimensionality
# reduction step.
nested_loocv(df_features=df_pssm, labels=labels, dim_reduction="pca", kernel="rbf", feature_transformer="pssm")

### Without Nucleotide sugars

What would the model look like if we removed the three nucleotide sugar transporters? Let's remove them and perform the same test:

In [52]:
# Proteins whose name mentions UDP or GDP are treated as nucleotide-sugar
# transporters. na=False counts a missing protein name as "no match" instead
# of producing NaN, which the negation below could not handle.
is_nuc_sugar = df.protein_names.str.contains("UDP|GDP", na=False)
proteins_non_nuc_sugar = df[~is_nuc_sugar].index.tolist()

# All intermediate results of this experiment get a *_non_nuc name. Reusing
# X_train, X_test, best_estimator_svc_pca, ... would overwrite the objects of
# the full dataset that the validation cells further up read, so re-running
# those cells afterwards would silently show different results.
X_non_nuc, y_non_nuc, feature_names_non_nuc, sample_names_non_nuc = preprocess_pandas(
    df_pssm.loc[proteins_non_nuc_sugar],
    labels.loc[proteins_non_nuc_sugar],
    return_names=True,
)
(
    X_train_non_nuc,
    X_test_non_nuc,
    y_train_non_nuc,
    y_test_non_nuc,
    sample_names_train_non_nuc,
    sample_names_test_non_nuc,
) = get_independent_test_set(
    X_non_nuc, y_non_nuc, sample_names=sample_names_non_nuc, test_size=0.2
)

# Same search as for the full dataset: RBF kernel, PCA and PSSM selector, with
# C and gamma left to the defaults of optimize_hyperparams.
best_estimator_svc_pca_non_nuc = optimize_hyperparams(
    X_train_non_nuc,
    y_train_non_nuc,
    kernel="rbf",
    dim_reduction="pca",
    feature_transformer="pssm",
    feature_names=feature_names_non_nuc,
)

get_confusion_matrix(
    X_test_non_nuc, y_test_non_nuc, best_estimator_svc_pca_non_nuc, labels=labels
)

{'pca__n_components': 0.89, 'pssmselector__iterations': 3, 'pssmselector__uniref_threshold': 'all', 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.917


predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,7,0
Sugar transport,0,3


Now we get a perfect score! The fact that we had all three nucleotide sugars in the test set was bad luck, or means that they should be removed. What about other random seeds?

## Estimating validation variance

How much did the result depend on choosing the training and test sets?

Mean and standard deviation for randomly selected training and validation sets.

### RBF + PCA with nucleotide sugars

In [38]:
# Evaluate RBF kernel + PCA + PSSM selector on the full dataset with
# repetitions=10; per the section text, each repetition uses a randomly
# selected training and test set.
df_scores, df_params = full_test(
    df_pssm,
    labels,
    dim_reduction="pca",
    kernel="rbf",
    repetitions=10,
    feature_transformer="pssm",
)
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
# Aggregate only the "F1 score" column, which is the one that gets pivoted.
# Aggregating the whole frame raises on pandas >= 2.0 as soon as df_scores
# contains any non-numeric column besides the grouping keys.
f1_by_group = df_scores_gr["F1 score"]
print("Mean F1")
display(f1_by_group.mean().pivot(index="label", columns="dataset", values="F1 score"))
print("Sdev F1")
display(f1_by_group.std().pivot(index="label", columns="dataset", values="F1 score"))
# Best parameters, one column per repetition (see the table below).
print("Parameters")
display(df_params)

Mean F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8483,0.965
Sugar transport,0.6362,0.925


Sdev F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.075507,0.02241
Sugar transport,0.215649,0.0466


Parameters


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.88,0.93,0.94,0.92,0.8,0.93,0.82,0.88,0.82,0.94
pssmselector__iterations,1,1,1,3,3,1,all,3,3,1
pssmselector__uniref_threshold,90,90,all,50,50,all,50,50,all,50
svc__C,1,1,1,1,10,10,10,1,10,1
svc__class_weight,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced
svc__gamma,0.1,0.01,0.01,0.01,0.1,0.1,scale,scale,scale,0.1


Very good cross validation scores, but no good results for the four sugar transporters in test set. Can we reduce overfitting by using a linear kernel with fixed parameters? 

### Linear + PCA with nucleotide sugars

In [60]:
# Evaluate linear kernel + PCA + PSSM selector on the full dataset with
# repetitions=10; C is fixed to a single candidate value.
df_scores, df_params = full_test(
    df_pssm,
    labels,
    dim_reduction="pca",
    kernel="linear",
    repetitions=10,
    feature_transformer="pssm",
    C=[1],
)
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
# Aggregate only the "F1 score" column, which is the one that gets pivoted.
# Aggregating the whole frame raises on pandas >= 2.0 as soon as df_scores
# contains any non-numeric column besides the grouping keys.
f1_by_group = df_scores_gr["F1 score"]
print("Mean F1")
display(f1_by_group.mean().pivot(index="label", columns="dataset", values="F1 score"))
print("Sdev F1")
display(f1_by_group.std().pivot(index="label", columns="dataset", values="F1 score"))
# Best parameters, one column per repetition (see the table below).
print("Parameters")
display(df_params)

Mean F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8839,0.9461
Sugar transport,0.7597,0.8883


Sdev F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.065203,0.025895
Sugar transport,0.139947,0.051099


Parameters


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
linearsvc__C,1,1,1,1,1,1,1,1,1,1
linearsvc__class_weight,balanced,balanced,balanced,balanced,balanced,balanced,,balanced,balanced,balanced
linearsvc__dual,True,True,True,True,True,True,True,True,True,True
linearsvc__max_iter,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0
pca__n_components,0.99,0.93,0.96,0.95,0.89,0.97,0.86,0.93,0.99,0.98
pssmselector__iterations,1,1,1,3,1,1,3,3,3,1
pssmselector__uniref_threshold,50,90,50,50,all,all,90,50,50,90


The CV scores go down, but the test scores are closer. 

### RBF + PCA without nucleotide sugars

In [53]:
# Evaluate RBF kernel + PCA + PSSM selector with repetitions=10, restricted to
# the proteins that are not nucleotide-sugar transporters.
df_scores, df_params = full_test(
    df_pssm.loc[proteins_non_nuc_sugar],
    labels.loc[proteins_non_nuc_sugar],
    dim_reduction="pca",
    kernel="rbf",
    repetitions=10,
    feature_transformer="pssm",
)
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
# Aggregate only the "F1 score" column, which is the one that gets pivoted.
# Aggregating the whole frame raises on pandas >= 2.0 as soon as df_scores
# contains any non-numeric column besides the grouping keys.
f1_by_group = df_scores_gr["F1 score"]
print("Mean F1")
display(f1_by_group.mean().pivot(index="label", columns="dataset", values="F1 score"))
print("Sdev F1")
display(f1_by_group.std().pivot(index="label", columns="dataset", values="F1 score"))
# Best parameters, one column per repetition (see the table below).
print("Parameters")
display(df_params)

Mean F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8574,0.97
Sugar transport,0.6718,0.9287


Sdev F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.123905,0.027133
Sugar transport,0.201611,0.062464


Parameters


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.97,0.97,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.86
pssmselector__iterations,all,1,3,3,all,1,3,3,3,3
pssmselector__uniref_threshold,90,all,50,50,all,all,50,50,all,90
svc__C,1,1,10,1,10,10,10,10,1,10
svc__class_weight,balanced,balanced,balanced,balanced,,balanced,balanced,balanced,balanced,balanced
svc__gamma,0.01,0.01,scale,1,0.1,0.1,scale,1,scale,scale


After removing the Nucl. sugars, the scores become a bit better, but the test score for sugar is still a problem. What about a linear kernel?

In [61]:
# Evaluate linear kernel + PCA + PSSM selector with repetitions=10, restricted
# to the proteins that are not nucleotide-sugar transporters.
df_scores, df_params = full_test(
    df_pssm.loc[proteins_non_nuc_sugar],
    labels.loc[proteins_non_nuc_sugar],
    dim_reduction="pca",
    kernel="linear",
    repetitions=10,
    feature_transformer="pssm",
)
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
# Aggregate only the "F1 score" column, which is the one that gets pivoted.
# Aggregating the whole frame raises on pandas >= 2.0 as soon as df_scores
# contains any non-numeric column besides the grouping keys.
f1_by_group = df_scores_gr["F1 score"]
print("Mean F1")
display(f1_by_group.mean().pivot(index="label", columns="dataset", values="F1 score"))
print("Sdev F1")
display(f1_by_group.std().pivot(index="label", columns="dataset", values="F1 score"))
# Best parameters, one column per repetition (see the table below).
print("Parameters")
display(df_params)

Mean F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.887,0.9665
Sugar transport,0.7119,0.919


Sdev F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.069146,0.026709
Sugar transport,0.184602,0.059725


Parameters


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
linearsvc__C,10,1,1,10,1,1,1,1,1,1
linearsvc__class_weight,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,
linearsvc__dual,True,True,True,True,True,True,True,True,True,True
linearsvc__max_iter,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0
pca__n_components,0.83,0.97,0.99,0.93,0.9,0.96,0.83,0.89,0.99,0.8
pssmselector__iterations,1,1,3,1,all,1,1,1,3,all
pssmselector__uniref_threshold,all,all,50,50,all,50,50,50,50,50


Very good CV scores, adequate test scores.

### Just the linear kernel and PSSMselector

Can we reduce overfitting further by not performing a PCA?

In [63]:
# Evaluate linear kernel + PSSM selector on the full dataset with
# repetitions=10 and cross_val_method="5fold". dim_reduction is not passed;
# the parameter table below contains no PCA entry for this run.
df_scores, df_params = full_test(
    df_pssm,
    labels,
    kernel="linear",
    repetitions=10,
    feature_transformer="pssm",
    cross_val_method="5fold",
)
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
# Aggregate only the "F1 score" column, which is the one that gets pivoted.
# Aggregating the whole frame raises on pandas >= 2.0 as soon as df_scores
# contains any non-numeric column besides the grouping keys.
f1_by_group = df_scores_gr["F1 score"]
print("Mean F1")
display(f1_by_group.mean().pivot(index="label", columns="dataset", values="F1 score"))
print("Sdev F1")
display(f1_by_group.std().pivot(index="label", columns="dataset", values="F1 score"))
# Best parameters, one column per repetition (see the table below).
print("Parameters")
display(df_params)

Mean F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8904,0.9239
Sugar transport,0.8064,0.8542


Sdev F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.078745,0.027
Sugar transport,0.126112,0.051027


Parameters


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
linearsvc__C,10,1,1,10,10,1,10,10,1,0.1
linearsvc__class_weight,balanced,balanced,balanced,balanced,,balanced,balanced,balanced,balanced,balanced
linearsvc__dual,False,True,True,False,False,True,False,False,True,True
linearsvc__max_iter,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0
pssmselector__iterations,3,1,3,3,1,all,all,1,1,1
pssmselector__uniref_threshold,50,90,50,50,all,all,90,50,90,90


Yes, the training and test scores are now closer together!

How does the higher number of training samples in LOOCV change the evaluation results?

In [23]:
# Same evaluation as the 5-fold run (linear kernel + PSSM selector,
# repetitions=10), but with cross_val_method="loocv".
df_scores, df_params = full_test(
    df_pssm,
    labels,
    kernel="linear",
    repetitions=10,
    feature_transformer="pssm",
    cross_val_method="loocv",
)
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
# Aggregate only the "F1 score" column, which is the one that gets pivoted.
# Aggregating the whole frame raises on pandas >= 2.0 as soon as df_scores
# contains any non-numeric column besides the grouping keys.
f1_by_group = df_scores_gr["F1 score"]
print("Mean F1")
display(f1_by_group.mean().pivot(index="label", columns="dataset", values="F1 score"))
print("Sdev F1")
display(f1_by_group.std().pivot(index="label", columns="dataset", values="F1 score"))
# Best parameters, one column per repetition (see the table below).
print("Parameters")
display(df_params)

Mean F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8553,0.8979
Sugar transport,0.8013,0.8582


Sdev F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.036179,0.021388
Sugar transport,0.059922,0.029484


Parameters


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
linearsvc__C,1,1,1,10,1,10,1,10,1,1
linearsvc__class_weight,balanced,balanced,balanced,balanced,balanced,balanced,balanced,,balanced,balanced
linearsvc__dual,True,True,True,False,True,False,True,False,True,True
linearsvc__max_iter,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0
pssmselector__iterations,3,all,3,all,3,all,all,3,3,3
pssmselector__uniref_threshold,all,all,50,all,all,90,50,50,50,all


It reduces both scores for AA transporter a bit, but not by much. On the other hand, it brings the standard deviations to acceptable values.

## Conclusion

Our test set contained all three nucleotide sugar transporters in the dataset, which show some differences to the remaining dataset in terms of sequence according to the PCA that we performed in previous notebooks. 

Removing these transporters gave us a perfect score. When looking at multiple random seeds for choosing the test set, a simple linear kernel without PCA worked the best. 

LOOCV reduced the standard deviation for the test scores, and brought them to acceptable values.