In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [4]:
train_X = pd.read_csv('train_X.csv') 
train_y = pd.read_csv('train_y.csv')
test_X = pd.read_csv('test_X.csv')

print(train_X.shape)
print(train_y.shape)
print(test_X.shape)

(31029, 3)
(31029, 3)
(20686, 2)


In [None]:
train_X = train_X.drop('Unnamed: 0', axis=1)

ids = test_X['Id']
test_data = test_X.drop(columns=['Id'])

train_y = train_y.drop(['Unnamed: 0', 'Id'], axis=1)
train_y = np.ravel(train_y)

In [54]:
unique, counts = np.unique(train_y, return_counts=True)
class_distribution = dict(zip(unique, counts))
print("Class distribution:", class_distribution)

Class distribution: {0: 18948, 1: 12081}


# Feature Engineering

In [None]:
# making a dataframe of all the descriptors
dpps = pd.read_csv('./descriptors/DPPS.csv', skiprows=[0, 1])
ms_whim = pd.read_csv('./descriptors/MS-WHIM.csv', skiprows=[0, 1])
physical = pd.read_csv('./descriptors/Physical.csv', skiprows=[0, 1])
st_scale = pd.read_csv('./descriptors/ST-scale.csv', skiprows=[0, 1])
t_scale = pd.read_csv('./descriptors/T-scale.csv', skiprows=[0, 1])
vhse_scale = pd.read_csv('./descriptors/VHSE-scale.csv', skiprows=[0, 1])
z_scale = pd.read_csv('./descriptors/Z-scale.csv', skiprows=[0, 1])

descriptors_df = dpps.merge(ms_whim, on=['AA_3', 'AA_1']).merge(physical, on=['AA_3', 'AA_1']).merge(st_scale, on=['AA_3', 'AA_1']).merge(
    t_scale, on=['AA_3', 'AA_1']).merge(vhse_scale, on=['AA_3', 'AA_1']).merge(z_scale, on=['AA_3', 'AA_1'])

In [None]:
# mapping each amino acid sequence to the dpps features

dpps_dict = dpps.drop(columns='AA_3').set_index("AA_1")[[
    "D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9", "D10"]].to_dict(orient="index")

def map_sequence(sequence, dpps_dict):
    mapped_descriptors = []
    for aa in sequence:
        if aa in dpps_dict:
            mapped_descriptors.append(list(dpps_dict[aa].values()))
    return np.array(mapped_descriptors)

mapped_sequences = []
for seq in train_X["ConstructedAASeq_cln"]:
    sequence_descriptors = map_sequence(seq, dpps_dict)
    mapped_sequences.append(sequence_descriptors)

train_X["Mapped_Descriptors"] = mapped_sequences

max_sequence_length = max(len(seq) for seq in train_X["ConstructedAASeq_cln"]) 
n_features = 10  

column_names = [
    f"AA_{aa_idx}_Feature_{feature_idx}"
    for aa_idx in range(max_sequence_length)
    for feature_idx in range(n_features)
]

expanded_columns = []
for descriptors in train_X["Mapped_Descriptors"]:
    padded_descriptors = np.pad(
        descriptors, 
        ((0, max_sequence_length - len(descriptors)), (0, 0)), 
        constant_values=np.nan
    )
    expanded_columns.append(padded_descriptors.flatten())  

expanded_df = pd.DataFrame(expanded_columns, columns=column_names)
train_X_dpps = pd.concat([train_X.drop(columns=["Mapped_Descriptors"]), expanded_df], axis=1)

train_X_dpps.head()

Unnamed: 0,ConstructedAASeq_cln,Id,AA_0_Feature_0,AA_0_Feature_1,AA_0_Feature_2,AA_0_Feature_3,AA_0_Feature_4,AA_0_Feature_5,AA_0_Feature_6,AA_0_Feature_7,...,AA_236_Feature_0,AA_236_Feature_1,AA_236_Feature_2,AA_236_Feature_3,AA_236_Feature_4,AA_236_Feature_5,AA_236_Feature_6,AA_236_Feature_7,AA_236_Feature_8,AA_236_Feature_9
0,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,11328,-1.76,-0.19,1.06,-0.69,-5.72,0.14,-4.14,-2.42,...,2.47,1.54,-4.28,-0.86,2.77,2.06,-6.18,2.05,2.19,-1.65
1,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,5781,-1.76,-0.19,1.06,-0.69,-5.72,0.14,-4.14,-2.42,...,2.47,1.54,-4.28,-0.86,2.77,2.06,-6.18,2.05,2.19,-1.65
2,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,13681,-1.76,-0.19,1.06,-0.69,-5.72,0.14,-4.14,-2.42,...,2.47,1.54,-4.28,-0.86,2.77,2.06,-6.18,2.05,2.19,-1.65
3,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,30804,-1.76,-0.19,1.06,-0.69,-5.72,0.14,-4.14,-2.42,...,2.47,1.54,-4.28,-0.86,2.77,2.06,-6.18,2.05,2.19,-1.65
4,SKGEELFTGVVPILVELDGDVNGHTFSVSGEGEGDATYGELTLKFI...,30813,-1.76,-0.19,1.06,-0.69,-5.72,0.14,-4.14,-2.42,...,2.47,1.54,-4.28,-0.86,2.77,2.06,-6.18,2.05,2.19,-1.65


In [57]:
dpps_dict = dpps.drop(columns='AA_3').set_index("AA_1")[[
    "D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9", "D10"]].to_dict(orient="index")

def map_sequence(sequence, dpps_dict):
    mapped_descriptors = []
    for aa in sequence:
        if aa in dpps_dict:
            mapped_descriptors.append(list(dpps_dict[aa].values()))
    return np.array(mapped_descriptors)

mapped_sequences = []
for seq in test_X["ConstructedAASeq_cln"]:
    sequence_descriptors = map_sequence(seq, dpps_dict)
    mapped_sequences.append(sequence_descriptors)

test_X["Mapped_Descriptors"] = mapped_sequences

max_sequence_length = max(len(seq) for seq in test_X["ConstructedAASeq_cln"]) 
n_features = 10  

column_names = [
    f"AA_{aa_idx}_Feature_{feature_idx}"
    for aa_idx in range(max_sequence_length)
    for feature_idx in range(n_features)
]

expanded_columns = []
for descriptors in test_X["Mapped_Descriptors"]:
    padded_descriptors = np.pad(
        descriptors, 
        ((0, max_sequence_length - len(descriptors)), (0, 0)), 
        constant_values=np.nan
    )
    expanded_columns.append(padded_descriptors.flatten())  

expanded_df = pd.DataFrame(expanded_columns, columns=column_names)
test_X_dpps = pd.concat([test_X.drop(columns=["Mapped_Descriptors"]), expanded_df], axis=1)

test_X_dpps.head()

Unnamed: 0,ConstructedAASeq_cln,Id,AA_0_Feature_0,AA_0_Feature_1,AA_0_Feature_2,AA_0_Feature_3,AA_0_Feature_4,AA_0_Feature_5,AA_0_Feature_6,AA_0_Feature_7,...,AA_236_Feature_0,AA_236_Feature_1,AA_236_Feature_2,AA_236_Feature_3,AA_236_Feature_4,AA_236_Feature_5,AA_236_Feature_6,AA_236_Feature_7,AA_236_Feature_8,AA_236_Feature_9
0,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,50579,-1.76,-0.19,1.06,-0.69,-5.72,0.14,-4.14,-2.42,...,2.47,1.54,-4.28,-0.86,2.77,2.06,-6.18,2.05,2.19,-1.65
1,SKGEELFTGVVPILVELDGDVSGHKFSVSGEGEGDATYGKLTLKFI...,37987,-1.76,-0.19,1.06,-0.69,-5.72,0.14,-4.14,-2.42,...,2.47,1.54,-4.28,-0.86,2.77,2.06,-6.18,2.05,2.19,-1.65
2,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,53977,-1.76,-0.19,1.06,-0.69,-5.72,0.14,-4.14,-2.42,...,2.47,1.54,-4.28,-0.86,2.77,2.06,-6.18,2.05,2.19,-1.65
3,SKGEELFTGVVPILVELDGDVNGHKLSVSGEGEGDATYGKLTLKFI...,10677,-1.76,-0.19,1.06,-0.69,-5.72,0.14,-4.14,-2.42,...,2.47,1.54,-4.28,-0.86,2.77,2.06,-6.18,2.05,2.19,-1.65
4,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,35653,-1.76,-0.19,1.06,-0.69,-5.72,0.14,-4.14,-2.42,...,2.47,1.54,-4.28,-0.86,2.77,2.06,-6.18,2.05,2.19,-1.65


In [58]:
# performing one-hot encoding on the amino acid sequences
combined_sequences = pd.concat([train_X_dpps['ConstructedAASeq_cln'], test_X_dpps['ConstructedAASeq_cln']], axis=0)
combined_split = combined_sequences.apply(list).tolist()

enc = OneHotEncoder(sparse_output=False)
enc.fit(combined_split)

train_encoded = enc.transform(train_X_dpps['ConstructedAASeq_cln'].apply(list).tolist())
test_encoded = enc.transform(test_X_dpps['ConstructedAASeq_cln'].apply(list).tolist())

train_encoded_df = pd.DataFrame(train_encoded, columns=enc.get_feature_names_out())
test_encoded_df = pd.DataFrame(test_encoded, columns=enc.get_feature_names_out())

train_X_with_dpps = pd.concat([train_encoded_df, train_X_dpps.drop(columns=['ConstructedAASeq_cln'])], axis=1)
test_X_with_dpps = pd.concat([test_encoded_df, test_X_dpps.drop(columns=['ConstructedAASeq_cln'])], axis=1)

print(train_X_with_dpps.shape)
print(test_X_with_dpps.shape)

(31029, 4418)
(20686, 4418)


In [59]:
train_X_with_dpps.head()

Unnamed: 0,x0_S,x1_E,x1_K,x1_M,x1_N,x1_Q,x1_R,x1_T,x2_A,x2_C,...,AA_236_Feature_0,AA_236_Feature_1,AA_236_Feature_2,AA_236_Feature_3,AA_236_Feature_4,AA_236_Feature_5,AA_236_Feature_6,AA_236_Feature_7,AA_236_Feature_8,AA_236_Feature_9
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.47,1.54,-4.28,-0.86,2.77,2.06,-6.18,2.05,2.19,-1.65
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.47,1.54,-4.28,-0.86,2.77,2.06,-6.18,2.05,2.19,-1.65
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.47,1.54,-4.28,-0.86,2.77,2.06,-6.18,2.05,2.19,-1.65
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.47,1.54,-4.28,-0.86,2.77,2.06,-6.18,2.05,2.19,-1.65
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.47,1.54,-4.28,-0.86,2.77,2.06,-6.18,2.05,2.19,-1.65


In [38]:
scaler = StandardScaler()
train_X_with_dpps = scaler.fit_transform(train_X_with_dpps)
test_X_with_dpps = scaler.fit_transform(test_X_with_dpps)

# Making a new train-test split to evaluate different models

In [40]:
x_train, x_test, y_train, y_test = train_test_split(train_X_with_dpps, train_y, test_size=0.3, random_state=42)

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

y_train = np.ravel(y_train)

x_train shape: (21720, 4418)
x_test shape: (9309, 4418)
y_train shape: (21720,)
y_test shape: (9309,)


In [131]:
lor = LogisticRegression(max_iter=10000)
model = lor.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred)*100, "%")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 88.86024277580836 %


In [132]:
lor = LogisticRegression(penalty='l1', solver="liblinear", max_iter=10000)
model = lor.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred)*100, "%")

Accuracy: 88.66688151251478 %


In [188]:
lor = LogisticRegression(penalty="l2", class_weight='balanced', max_iter=1000)
model = lor.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred)*100, "%")
print("F1 Score:", f1_score(y_test, y_pred)*100, "%")

Accuracy: 88.18347835428081 %
F1 Score: 85.5907780979827 %


In [189]:
lor = LogisticRegression(C=0.1, penalty='l2', max_iter=1000)
model = lor.fit(x_train, y_train)
y_pred = model.predict(x_test)

print("Accuracy:", accuracy_score(y_test, y_pred)*100, "%")
print("F1 Score:", f1_score(y_test, y_pred)*100, "%")

Accuracy: 88.83875819099796 %
F1 Score: 85.88123386329664 %


In [41]:
lor = LogisticRegression(C=0.1, penalty='l1', solver='liblinear', max_iter=1000)
model = lor.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred)*100, "%")
print("F1 Score:", f1_score(y_test, y_pred)*100, "%")

Accuracy: 88.88172736061875 %
F1 Score: 86.00027052617341 %


In [10]:
clf = svm.SVC()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred)*100, "%")

Accuracy: 84.0369534858739 %


In [127]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(x_train, y_train)
y_pred = dt_model.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred)*100, "%")

Accuracy: 81.29766892254807 %


In [143]:
rc = RidgeClassifier()
model = rc.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred)*100, "%")

Accuracy: 85.2078633580406 %


# Making predictions on the actual data

In [148]:
# stacking classifier
base_learners = [
    ('rc', RidgeClassifier()),                    
]

meta_learner = LogisticRegression()

stacked_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner, 
    passthrough=False 
)

stacked_clf.fit(train_X, train_y)
pred_y = stacked_clf.predict(test_X)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(stacked_clf, train_X, train_y, cv=skf, scoring='accuracy')
print(f"Cross-validation Accuracy: {cross_val_scores.mean():.5f}")

Cross-validation Accuracy: 0.86513


In [151]:
lor = LogisticRegression(C=0.1, solver='saga', penalty='elasticnet', l1_ratio=0.5, max_iter=10000)
model = lor.fit(train_X, train_y)
pred_y = model.predict(test_X)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(lor, train_X, train_y, cv=skf, scoring='accuracy')
print(f"Cross-validation Accuracy: {cross_val_scores.mean():.5f}")

Cross-validation Accuracy: 0.86651


In [None]:
lor = LogisticRegression(C=0.1, penalty='l1', solver='liblinear', max_iter=10000)
model = lor.fit(train_X_with_dpps, train_y)
pred_y_lor1 = model.predict(test_X_with_dpps)

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(lor, train_X_with_dpps, train_y, cv=skf, scoring='accuracy')
print(f"Cross-validation Accuracy: {cross_val_scores.mean():.5f}")
# only running this model
# 0.89294- c=0.1, l1, lib

Cross-validation Accuracy: 0.89294


In [49]:
lor = LogisticRegression(C=0.1, penalty='l1', solver='saga', max_iter=10000)
model = lor.fit(train_X_with_dpps, train_y)
pred_y_lor3 = model.predict(test_X_with_dpps)

In [24]:
lor = LogisticRegression(C=0.1, penalty='l2', max_iter=10000)
model = lor.fit(train_X, train_y)
pred_y = model.predict(test_X)

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(lor, train_X, train_y, cv=skf, scoring='accuracy')
print(f"Cross-validation Accuracy: {cross_val_scores.mean():.5f}")

Cross-validation Accuracy: 0.87766


In [60]:
lor = LogisticRegression(C=0.1, penalty='l2', max_iter=10000)
model = lor.fit(train_X, train_y)
pred_y = model.predict(test_X)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(lor, train_X, train_y, cv=skf, scoring='accuracy')
print(f"Cross-validation Accuracy: {cross_val_scores.mean():.5f}")

Cross-validation Accuracy: 0.87963


In [None]:
lor = LogisticRegression(penalty='l2', max_iter=1000)
model = lor.fit(train_X, train_y)
pred_y = model.predict(test_X)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(lor, train_X, train_y, cv=skf, scoring='accuracy')
print(f"Cross-validation Accuracy: {cross_val_scores.mean():.5f}")

Cross-validation Accuracy: 0.89342


In [None]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(train_X, train_y)
pred_y = dt.predict(test_X)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(dt, train_X, train_y, cv=skf, scoring='accuracy')
print(f"Cross-validation Accuracy: {cross_val_scores.mean():.5f}")

Cross-validation Accuracy: 0.80870


In [None]:
clf = svm.SVC()
model = clf.fit(train_X, train_y)
pred_y = clf.predict(test_X)

In [145]:
rc = RidgeClassifier()
model = rc.fit(train_X, train_y)
pred_y = model.predict(test_X)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(rc, train_X, train_y, cv=skf, scoring='accuracy')
print(f"Cross-validation Accuracy: {cross_val_scores.mean():.5f}")

Cross-validation Accuracy: 0.85278


In [50]:
pd.DataFrame({
    'Id': ids, 
    'Brightness Class': pred_y_lor3
}).to_csv('test_y_lorsaga_with_dpps.csv', index=False)