In [58]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import xgboost as xgb

In [73]:
#Reading the data
df_train_raw = pd.read_csv("../data/af2_dataset_training_labeled.csv.gz",compression='gzip')
df_test_raw = pd.read_csv("../data/af2_dataset_testset_unlabeled.csv.gz",compression='gzip')

In [101]:
df_train = df_train_raw.drop(['Unnamed: 0', 'annotation_sequence', 'entry'], axis=1)
df_test = df_test_raw.drop(['Unnamed: 0', 'annotation_sequence', 'entry'], axis=1)

In [102]:
# Apply one hot encoding
columns_to_encode= cols_to_encode = ['annotation_atomrec',
            'feat_A', 'feat_C', 'feat_D', 'feat_E', 'feat_F', 'feat_G', 'feat_H', 'feat_I', 'feat_K', 'feat_L','feat_N','feat_M','feat_P',	'feat_Q',	'feat_R','feat_S','feat_T','feat_V','feat_W','feat_Y','feat_DSSP_H','feat_DSSP_B',	'feat_DSSP_E','feat_DSSP_G','feat_DSSP_I','feat_DSSP_T','feat_DSSP_S']
df_train_encoded = pd.get_dummies(df_train, columns=columns_to_encode)
df_test_encoded = pd.get_dummies(df_test, columns=columns_to_encode)

In [103]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(df_train_encoded.drop(columns= ['y_Ligand']), df_train_encoded['y_Ligand'], test_size=0.2, random_state=42)

In [104]:
# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.fit_transform(X_val)
X_test_scaled = scaler.fit_transform(df_test_encoded)

In [105]:
print(X_train_scaled.shape)
print(X_test_scaled.shape)

(397732, 93)
(107624, 93)


## Model Training

In [106]:
# Train XGBoost model
class_weight = sum(y_train==0) / sum(y_train==1)
model = xgb.XGBClassifier(scale_pos_weights=class_weight)
model.fit(X_train, y_train)

Parameters: { "scale_pos_weights" } are not used.



In [107]:
# Make predictions on test data
y_pred = model.predict(X_val)

# Get classification report
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

       False       0.97      1.00      0.98     96038
        True       0.79      0.15      0.25      3396

    accuracy                           0.97     99434
   macro avg       0.88      0.57      0.62     99434
weighted avg       0.96      0.97      0.96     99434



## Final Prediction

In [108]:
final_pred = model.predict(X_test_scaled)
final_pred

array([0, 0, 0, ..., 0, 1, 0])

In [110]:
prediction_df = pd.DataFrame(columns=['Predicted'], data=final_pred)
prediction_df.to_csv('predictions.csv')