In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
# Hand-entered seed table: 16 columns, each a list of 12 values (one list position per record).
# 'response' is the column later split off as the classification target.
# Only the record at position 6 has response == 1; the other eleven are class 0.
data = {
    'age_normalized': [4, 4, 4, 2, 2, 3, 3, 2, 2, 3, 4, 2],
    'sex': [0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1],
    'smoking': [2, 2, 1, 0, 1, 0, 2, 0, 0, 0, 2, 1],
    'survival': [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1],
    'sur_time_mon': [37.03, 9.69, 42.15, 28.78, 18.69, 5.06, 2, 39.62, 9.2, 6.21, 37.83, 41],
    'pro_time_mon': [18.56, 9.69, 21.52, 11.89, 11.56, 5.06, 1.667, 24.61, 10, 6.21, 19.56, 25],
    'VDW': [-51.914, -51.914, -51.914, -50.5251, -50.5251, -51.914, -45.7159, -50.5251, -51.914, -51.914, -54.914, -51],
    'EEL': [-26.6512, -26.6512, -26.6512, -22.0131, -22.0131, -26.6512, -12.8403, -22.0131, -26.6512, -26.6512, -26.6512, -20],
    'EPB': [38.5811, 38.5811, 38.5811, 46.8522, 46.8522, 38.5811, 40.2891, 46.8522, 38.5811, 38.5811, 38.5811, 47],
    'ENPOLAR': [-36.8522, -36.8522, -36.8522, -34.3009, -34.3009, -36.8522, -32.0103, -34.3009, -36.8522, -36.8522, -36.8522, -35],
    'matching_rates': [0.4136, 0.4136, 0.4136, 0.4135, 0.4135, 0.4136, 0.2634, 0.4135, 0.4136, 0.4136, 0.4136, 0.42],
    'centroid_distance': [48.25, 48.25, 48.25, 48.55, 48.55, 48.25, 49.45, 48.55, 48.25, 48.25, 48.25, 49],
    'connectivity': [17.186, 22.186, 21.186, 21.019, 21.019, 22.186, 14.464, 21.019, 22.186, 22.186, 17.186, 22],
    'convex_atoms': [25, 25, 24, 29, 29, 21, 21, 27, 23, 21, 25, 30],
    # 910 appears only at position 6, i.e. only on the single response == 1 record.
    'hydrogen_bonds': [1450, 1450, 1450, 1650, 1650, 1450, 910, 1650, 1450, 1450, 1450, 1600],
    'response': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
}

In [4]:
# Materialise the column-oriented seed dict as a 12-row DataFrame.
df = pd.DataFrame(data=data)

In [5]:
# Rows of the positive class (response == 1) from the seed table.
positive_rows = df[df['response'] == 1]

# Number of whole copies needed so the replicated pool exceeds 4000 rows.
replication_factor = 4000 // len(positive_rows) + 1

# Stack that many copies of the positive rows under a fresh 0..n-1 index.
df_positive = pd.concat([positive_rows] * replication_factor, ignore_index=True)

In [6]:
# Draw exactly 4000 positive rows (with replacement) from the replicated pool,
# then renumber them from zero.
df_positive = df_positive.sample(n=4000, replace=True, random_state=42)
df_positive = df_positive.reset_index(drop=True)

# Build the negative pool by stacking the response == 0 rows, reusing the
# replication factor that was computed from the positive class.
negative_rows = df[df['response'] == 0]
df_negative = pd.concat([negative_rows] * replication_factor, ignore_index=True)

In [7]:
# Take 6000 negative rows (without replacement) and append them below the
# 4000 positives, giving one frame with a fresh 0..n-1 index.
negative_sample = df_negative.sample(n=6000, random_state=42)
df_combined = pd.concat([df_positive, negative_sample], ignore_index=True)

In [8]:
np.random.seed(42)

# Columns that must not receive noise; every other column gets an
# independent N(0, 0.1) perturbation per row, drawn in column order.
noise_free_columns = ('age', 'age_normalized', 'sex', 'smoking', 'survival', 'convex_atoms', 'hydrogen_bonds', 'response')
n_rows = len(df_combined)

for col in df_combined.columns:
    if col in noise_free_columns:
        continue
    noise = np.random.normal(0, 0.1, n_rows)
    df_combined[col] = df_combined[col] + noise

In [9]:
# Columns to snap back to whole numbers after augmentation.
columns_to_round = ['age', 'age_normalized', 'sex', 'smoking', 'survival', 'convex_atoms', 'hydrogen_bonds']

# Selecting a list that contains an absent label raises KeyError, and the frame
# built from the seed dict has no 'age' column, so only touch columns that exist.
present_to_round = [name for name in columns_to_round if name in df_combined.columns]
if present_to_round:
    df_combined[present_to_round] = df_combined[present_to_round].round()

In [10]:
# Shuffle every row (frac=1) with a fixed seed, then renumber the index so it
# no longer reflects the pre-shuffle order.
shuffled_rows = df_combined.sample(frac=1, random_state=42)
df_combined = shuffled_rows.reset_index(drop=True)

In [11]:
# NOTE(review): the saved output below lists columns (e.g. 'Unnamed: 0', 'age', 'EGB',
# 'EEL.1', 'TOTAL.1') that the dict in cell 3 never defines, so it appears to have been
# recorded from a different kernel state; re-run from a fresh kernel to refresh it.
df_combined

Unnamed: 0,age,age_normalized,sex,smoking,survival,sur_time_mon,pro_time_mon,VDW,EEL,EGB,EEL.1,EPB,ENPOLAR,TOTAL.1,matching_rates,centroid_distance,connectivity,convex_atoms,hydrogen_bonds,response
0,69,3,0,0,0,5.323788,5.084729,-51.758687,-26.687743,39.425343,-26.746387,38.508305,-36.771643,-76.903658,0.339449,48.416192,22.198832,21,1450,0
1,79,4,0,2,0,37.726293,19.485278,-54.805068,-26.603018,39.496446,-26.623829,38.621394,-36.955171,-76.922010,0.392508,48.196003,17.192832,25,1450,0
2,70,3,1,2,0,1.965068,1.911577,-45.544292,-12.832093,30.288116,-12.028468,40.350020,-32.079003,-35.175607,0.482337,49.317534,14.314332,21,910,1
3,79,4,0,2,0,37.830162,19.608293,-54.935980,-26.532441,39.484720,-26.649605,38.453741,-36.813454,-76.990433,0.322538,48.317945,17.187585,25,1450,0
4,51,2,1,0,0,39.727433,24.625365,-50.692809,-22.090339,41.801330,-22.056993,46.855461,-34.114970,-60.057331,0.378843,48.674093,20.953460,27,1650,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,52,2,1,1,1,40.942592,25.157314,-50.991700,-19.844499,39.880410,-21.159558,47.016273,-34.969305,-60.185921,0.465152,48.998315,21.908139,30,1600,0
9996,69,3,0,0,0,5.055871,4.841882,-51.966745,-26.617251,39.445179,-26.776282,38.664779,-36.964288,-76.814888,0.544202,48.265580,22.216543,21,1450,0
9997,69,3,0,0,0,4.990059,4.996548,-51.695773,-26.616828,39.480315,-26.808987,38.600340,-36.993205,-76.867765,0.372103,48.181578,22.239480,21,1450,0
9998,70,3,1,2,0,2.020292,1.652036,-45.718698,-12.846476,30.220048,-12.061650,40.406347,-31.791279,-35.003624,0.467065,49.472030,14.519264,21,910,1


In [12]:
# Persist the shuffled, augmented frame; the index is not written as a column.
df_combined.to_csv(path_or_buf='final_dataset.csv', index=False)

In [13]:
# Feature scaler shared by the cells below.
ss = StandardScaler()

# Four classifiers, each seeded with 42 for repeatable fits.
svm = SVC(kernel='linear', random_state=42)
mlp = MLPClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
xb = xgb.XGBClassifier(random_state=42)

In [14]:
X = df_combined.drop(columns=['response'])
y = df_combined['response']

X_scaled = ss.fit_transform(X)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [16]:
# Fit the linear-kernel SVM on the scaled training split.
svm.fit(X_train, y_train)

In [17]:
# Fit the multilayer perceptron on the same training split.
mlp.fit(X_train, y_train)

In [18]:
# Fit the random forest on the same training split.
rf.fit(X_train, y_train)

In [19]:
# Fit the XGBoost classifier on the same training split.
xb.fit(X_train, y_train)

In [20]:
# Hold-out predictions, in the order SVM, MLP, random forest, XGBoost.
y_pred1, y_pred2, y_pred3, y_pred4 = (
    estimator.predict(X_test) for estimator in (svm, mlp, rf, xb)
)

In [21]:
# Report hold-out accuracy, one line per classifier.
print(
    f"SVM Accuracy: {accuracy_score(y_test, y_pred1)}",
    f"MLP Accuracy: {accuracy_score(y_test, y_pred2)}",
    f"RF Accuracy: {accuracy_score(y_test, y_pred3)}",
    f"XGB Accuracy: {accuracy_score(y_test, y_pred4)}",
    sep="\n",
)

SVM Accuracy: 1.0
MLP Accuracy: 1.0
RF Accuracy: 1.0
XGB Accuracy: 1.0


In [22]:
# Per-class precision / recall / F1 for the XGBoost hold-out predictions.
# The "XGB: " prefix is printed on the same line as the report's header row.
print(f"XGB: {classification_report(y_test, y_pred4)}")

XGB:               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1192
           1       1.00      1.00      1.00       808

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



In [26]:
# NOTE(review): execution counts jump from In [22] to In [26], and no cell shown here
# writes 'dataset_1.csv' (cell 12 writes 'final_dataset.csv') — confirm where this file
# comes from and that it survives Restart & Run All.
df2 = pd.read_csv('dataset_1.csv')

In [27]:
# Separate the label from the features of the second dataset.
y2 = df2['response']
X2 = df2.drop('response', axis=1)

# Apply the scaler fitted earlier in the notebook (it is not refit here), then
# hold out 20% of the rows with the same seed used for the first split.
X2_scaled = ss.transform(X2)
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2_scaled,
    y2,
    test_size=0.2,
    random_state=42,
)

In [28]:
# NOTE(review): xb was already fitted on X_train in cell 19. This call fits the same
# object again on the dataset_1 training split — presumably replacing the earlier fit
# (confirm XGBoost's refit semantics) — so the metrics below describe a model trained
# on dataset_1, not the one scored in cell 21.
xb.fit(X2_train, y2_train)

In [29]:
# Predict labels for the dataset_1 hold-out rows.
y_pred_x = xb.predict(X2_test)

In [30]:
# Fraction of dataset_1 hold-out rows predicted correctly.
accuracy_score(y2_test, y_pred_x)

1.0

In [31]:
# Per-class precision / recall / F1 on the dataset_1 hold-out split.
print(classification_report(y2_test, y_pred_x))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       226
           1       1.00      1.00      1.00        54

    accuracy                           1.00       280
   macro avg       1.00      1.00      1.00       280
weighted avg       1.00      1.00      1.00       280



In [32]:
# Confusion matrix for the dataset_1 hold-out split (true labels passed first,
# predictions second).
confusion_matrix(y2_test, y_pred_x)

array([[226,   0],
       [  0,  54]])