In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [None]:
def load_dataset(folder_path: str) -> pd.DataFrame:
    """Load every CSV in a folder into one frame and add a 3-state DSSP label.

    Rows with a missing ``aa`` or ``dssp`` value are dropped, then the
    ``dssp`` code is collapsed into a new ``dssp3`` column:
    ``H/G/I/P -> 'H'``, ``B/E -> 'B'``, ``./T/S -> '.'``.
    Any other code maps to NaN.

    Args:
        folder_path: Directory that directly contains the ``*.csv`` files.

    Returns:
        The concatenated, filtered frame with the extra ``dssp3`` column.
        The index comes from the concatenation, so it has gaps where rows
        were dropped.

    Raises:
        ValueError: If the folder contains no ``*.csv`` files.
    """
    # Sort the paths: glob order is filesystem-dependent, and row order feeds
    # the seeded train/test split, so an unsorted load is not reproducible.
    csv_paths = sorted(Path(folder_path).glob("*.csv"))
    if not csv_paths:
        raise ValueError(f"No CSV files found in {folder_path!r}")

    combined_df = pd.concat(
        [pd.read_csv(csv_path) for csv_path in csv_paths],
        ignore_index=True,
    )

    has_labels = combined_df["aa"].notna() & combined_df["dssp"].notna()
    # .copy() gives an independent frame, so adding a column below does not
    # write into a filtered slice (which triggers SettingWithCopyWarning).
    combined_df = combined_df[has_labels].copy()

    combined_df['dssp3'] = combined_df['dssp'].map({
        'H': 'H',
        'G': 'H',
        'I': 'H',
        'P': 'H',
        'B': 'B',
        'E': 'B',
        '.': '.',
        'T': '.',
        'S': '.'
    })

    return combined_df

In [None]:
# Load the three feature sets; judging by the folder names these are the
# C-alpha, C-beta and "com" variants of the same features.
df_alpha, df_beta, df_com = map(
    load_dataset,
    ("data/ca-features", "data/cb-features", "data/com-features"),
)

In [None]:
# Group feature columns by name prefix. The lists are derived from df_alpha
# only and are reused to index df_beta and df_com below.
dist_cols = [name for name in df_alpha.columns if name.startswith('dist_')]
angle_cols = [name for name in df_alpha.columns if name.startswith(('angle_', 'dihedral_'))]
neighbor_cols = [name for name in df_alpha.columns if name.startswith('neighbor_')]

In [None]:
# Class balance of the 3-state label.
sns.countplot(data=df_alpha, x='dssp3')

In [None]:
# Distance-feature distributions for the data/cb-features set.
df_beta.loc[:, dist_cols].hist(bins=30, figsize=(15, 10))

In [None]:
# Distance-feature distributions for the data/com-features set.
df_com.loc[:, dist_cols].hist(bins=30, figsize=(15, 10))

In [None]:
# Distance-feature distributions for the data/ca-features set.
df_alpha.loc[:, dist_cols].hist(bins=30, figsize=(15, 10))

In [None]:
# Summary statistics of the angle/dihedral features.
df_alpha.loc[:, angle_cols].describe()

In [None]:
# Rescale the angle/dihedral features by 1/180
# (assumes they are stored in degrees — TODO confirm).
# The flag in DataFrame.attrs makes this cell idempotent: without it, every
# re-run divides the already-scaled columns by 180 again. The flag lives on
# the frame itself, so reloading df_alpha resets it.
if not df_alpha.attrs.get("angles_scaled", False):
    df_alpha[angle_cols] = df_alpha[angle_cols] / 180
    df_alpha.attrs["angles_scaled"] = True

In [None]:
# Angle/dihedral distributions after the 1/180 rescaling.
df_alpha.loc[:, angle_cols].hist(bins=30, figsize=(15, 10))

In [None]:
# Summary statistics of the neighbor_* features.
df_alpha.loc[:, neighbor_cols].describe()

In [None]:
# Distributions of the neighbor_* features (default bin count).
df_alpha.loc[:, neighbor_cols].hist(figsize=(15, 10))

In [None]:
# Feature matrix: every angle, distance and neighbor column plus the raw
# residue identity column "aa"; target: the 3-state label.
X = df_alpha.loc[:, [*angle_cols, *dist_cols, *neighbor_cols, "aa"]]
y = df_alpha.loc[:, "dssp3"]

In [None]:
# train_test_split returns (X_train, X_test, y_train, y_test), in that order.
# The previous unpacking bound the 80% partition to the *_test names and the
# 20% partition to the *_train names, so models were fit on the small slice.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
def preproc_random_forest(X, y):
    """Replace the ``aa`` column with one-hot indicator columns.

    A fresh ``OneHotEncoder`` is fit on the ``X`` passed in, so the indicator
    columns reflect only the ``aa`` values present in that frame.

    Args:
        X: Feature frame containing an ``aa`` column.
        y: Labels; returned unchanged.

    Returns:
        ``(X_processed, y)`` where ``X_processed`` has a fresh 0..n-1 index,
        the non-``aa`` columns first and the ``aa_*`` indicator columns last.
    """
    encoder = OneHotEncoder(sparse_output=False)
    onehot = encoder.fit_transform(X[["aa"]])
    onehot_df = pd.DataFrame(onehot, columns=encoder.get_feature_names_out(["aa"]))

    # Drop the positional index so the two frames align row-by-row in concat.
    other_features = X.drop(columns=["aa"]).reset_index(drop=True)
    X_processed = pd.concat([other_features, onehot_df], axis=1)

    return X_processed, y

def train_random_forest(X_train, y_train):
    """Grid-search a random forest with 5-fold stratified cross-validation.

    The inputs are first passed through ``preproc_random_forest``. The search
    covers ``n_estimators`` x ``max_depth`` and is scored on accuracy.

    Args:
        X_train: Raw training features (including the ``aa`` column).
        y_train: Training labels.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    features, labels = preproc_random_forest(X_train, y_train)

    search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid={
            'n_estimators': [50, 100, 200],
            'max_depth': [10, 20, 50, 100],
        },
        scoring='accuracy',
        cv=StratifiedKFold(n_splits=5),
        return_train_score=True,
        n_jobs=-1,
    )

    # GridSearchCV.fit returns the search object itself.
    return search.fit(features, labels)

In [None]:
# Check how the residue column is typed before it is encoded per model.
X.dtypes['aa']

In [None]:
# Integer class ids for the three dssp3 states.
label_mapping = {'H': 0, 'B': 1, '.': 2}

def preproc_xgboost(X, y):
    """Prepare features and labels for XGBoost.

    Args:
        X: Feature frame containing an ``aa`` column; not modified.
        y: Series of ``dssp3`` labels.

    Returns:
        ``(X_processed, y_processed)``: a copy of ``X`` with ``aa`` cast to
        pandas ``category`` dtype, and ``y`` mapped through ``label_mapping``
        (labels missing from the mapping become NaN).
    """
    X_processed = X.astype({'aa': 'category'})
    y_processed = y.map(label_mapping)
    return X_processed, y_processed

def train_xgboost(X_train, y_train):
    """Grid-search an XGBoost classifier with 5-fold stratified CV.

    The inputs are first passed through ``preproc_xgboost``. The search
    covers ``n_estimators`` x ``max_depth`` x ``learning_rate`` and is scored
    on accuracy.

    Args:
        X_train: Raw training features (including the ``aa`` column).
        y_train: Training ``dssp3`` labels.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    features, labels = preproc_xgboost(X_train, y_train)

    search_space = {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, 50, 100],
        'learning_rate': [0.1, 0.2, 0.3]
    }

    booster = XGBClassifier(
        objective='multi:softmax',
        num_class=3,
        enable_categorical=True,  # `aa` is passed as a pandas category column
    )

    search = GridSearchCV(
        estimator=booster,
        param_grid=search_space,
        scoring='accuracy',
        cv=StratifiedKFold(n_splits=5),
        return_train_score=True,
        n_jobs=-1,
    )

    # GridSearchCV.fit returns the search object itself.
    return search.fit(features, labels)

In [None]:
def preproc_cb(X, y):
    """Prepare features and labels for CatBoost.

    Args:
        X: Feature frame containing an ``aa`` column; not modified.
        y: Series of labels.

    Returns:
        ``(X_processed, y_processed)``: a copy of ``X`` with ``aa`` cast to
        pandas ``category`` dtype, and ``y`` cast to ``category`` dtype.
    """
    X_processed = X.assign(aa=X['aa'].astype('category'))
    y_processed = y.astype('category')
    return X_processed, y_processed

def train_cb(X_train, y_train):
    """Run CatBoost's built-in grid search with 5-fold stratified CV.

    The inputs are first passed through ``preproc_cb``. The search covers
    ``iterations`` x ``depth`` x ``learning_rate``.

    Args:
        X_train: Raw training features (including the ``aa`` column).
        y_train: Training labels.

    Returns:
        The ``CatBoostClassifier`` that ``grid_search`` was called on. The
        dictionary returned by ``grid_search`` itself is discarded.
    """
    features, labels = preproc_cb(X_train, y_train)

    search_space = {
        'iterations': [100, 200],
        'depth': [4, 6, 10],
        'learning_rate': [0.01, 0.1, 0.2]
    }

    model = CatBoostClassifier(
        loss_function='MultiClass',
        cat_features=['aa'],
    )
    # NOTE(review): relies on grid_search leaving `model` fitted with the best
    # parameters — confirm against the CatBoost documentation.
    model.grid_search(
        param_grid=search_space,
        X=features,
        y=labels,
        cv=StratifiedKFold(n_splits=5),
    )

    return model

In [None]:
# Random-forest grid search on the training split.
rf_model = train_random_forest(X_train=X_train, y_train=y_train)

In [None]:
# Best mean cross-validated accuracy found by the random-forest grid search.
rf_model.best_score_

In [None]:
# XGBoost grid search on the training split.
xgb_model = train_xgboost(X_train=X_train, y_train=y_train)

In [None]:
# Best mean cross-validated accuracy found by the XGBoost grid search.
xgb_model.best_score_

In [None]:
# CatBoost grid search on the training split.
cb = train_cb(X_train=X_train, y_train=y_train)

In [None]:
# NOTE(review): `cb` is the CatBoostClassifier itself (train_cb returns the
# model, not a GridSearchCV object), so this attribute is the model's own and
# may not be comparable to the GridSearchCV best_score_ values — confirm.
cb.best_score_

In [None]:
# Held-out score for the random forest. The one-hot encoder is re-fit on the
# test frame inside preproc_random_forest, so the columns match training only
# if both splits contain the same set of `aa` values.
rf_X_test, rf_y_test = preproc_random_forest(X=X_test, y=y_test)

rf_model.score(rf_X_test, rf_y_test)

In [None]:
# Held-out score for XGBoost; labels go through the same integer mapping
# that was used for training.
xgb_X_test, xgb_y_test = preproc_xgboost(X=X_test, y=y_test)

xgb_model.score(xgb_X_test, xgb_y_test)

In [None]:
# Held-out score for CatBoost.
cb_X_test, cb_y_test = preproc_cb(X=X_test, y=y_test)

cb.score(cb_X_test, cb_y_test)

In [None]:
# train_test_split returns two partitions per input array (four objects for
# two inputs), so unpacking into two names raised ValueError.
# NOTE(review): this splits the existing X_train/y_train; the `com_` prefix
# suggests the df_com features may have been intended — confirm.
com_X_train, com_X_test, com_y_train, com_y_test = train_test_split(X_train, y_train)