In [2]:
import pandas as pd
import numpy as np

# Define the limits for each feature
limits = {
    'Age_normalized': (2, 4),
    'Sex': (0, 1),
    'smoking': (0, 2),
    'survival': (0, 1),
    'sur_time_mo': (1.67, 42.15),
    'prog_time_mo': (1.67, 24.61),
    'VDW': (-51.9140, -45.7159),
    'EEL': (-26.6512, -12.8403),
    'EGB': (30.1702, 41.9059),
    'Matching_rates': (0.4135, 0.4136),
    'Centroid_distance': (48.25, 49.45),
    'Connectivity': (14.464, 22.186),
    'Convex_atoms': (21, 29),
    'Hydrogen_bonds': (910, 1650)
}

# Create a synthetic dataset
np.random.seed(42)  # For reproducibility
synthetic_data = {
    feature: np.random.uniform(low=limit[0], high=limit[1], size=10000) if isinstance(limit[0], float) else np.random.randint(low=limit[0], high=limit[1]+1, size=10000)
    for feature, limit in limits.items()
}

# Convert to DataFrame
df_synthetic = pd.DataFrame(synthetic_data)

# Add the 'response_binary' column based on a simple random threshold
df_synthetic['response_binary'] = np.random.choice(['Positive', 'Negative'], size=10000, p=[0.6, 0.4])

# Display the first few rows of the synthetic dataset

# Save the dataset to a CSV file if needed
# df_synthetic.to_csv('synthetic_dataset.csv', index=False)


In [5]:
df_synthetic.shape

(10000, 16)

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
features_to_scale = [
    'Age', 'Age_normalized', 'sur_time_mo', 'prog_time_mo',
    'VDW', 'EEL', 'EGB', 'Matching_rates', 'Centroid_distance',
    'Connectivity', 'Convex_atoms', 'Hydrogen_bonds'
]

# Initialize the scaler and scale the continuous features
scaler = StandardScaler()
df_synthetic[features_to_scale] = scaler.fit_transform(df_synthetic[features_to_scale])

# List of features to label encode (categorical features)
features_to_encode = ['Sex', 'smoking', 'survival', 'response_binary']

# Initialize the label encoder and encode the categorical features
label_encoders = {}
for feature in features_to_encode:
    le = LabelEncoder()
    df_synthetic[feature] = le.fit_transform(df_synthetic[feature])
    label_encoders[feature] = le

In [None]:
df_synthetic.head()

Unnamed: 0,Age,Age_normalized,Sex,smoking,survival,sur_time_mo,prog_time_mo,VDW,EEL,EGB,Matching_rates,Centroid_distance,Connectivity,Convex_atoms,Hydrogen_bonds,response_binary
0,-0.980111,1.233484,0,2,0,-0.462274,-1.384681,0.267441,-1.713782,1.428509,1.119239,-1.592041,0.185877,-1.538667,1.409033,0
1,0.572417,-1.218771,1,2,0,-0.745406,-1.373429,-0.977609,-0.913597,0.118706,0.417491,-1.275131,0.274045,-1.151766,1.268328,1
2,1.647244,0.007357,0,0,0,-1.431767,-0.815026,-1.02657,1.526673,-0.03776,-1.14842,-0.073165,-0.92123,0.395838,0.879045,1
3,-0.024709,0.007357,1,2,0,-0.798378,0.119399,-1.62099,1.582907,0.280082,-0.336276,-1.392724,-0.961857,0.782739,-0.218451,0
4,-0.50241,-1.218771,1,2,0,-0.614454,-1.316709,-0.265528,0.102508,-1.000536,0.0308,0.265816,-0.467525,-1.538667,1.30585,1


In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer

In [None]:
xgb_model = xgb.XGBClassifier(random_state=42)

In [None]:
X = df_synthetic.drop(columns=['response_binary'])
y = df_synthetic['response_binary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
xgb_model.fit(X_train, y_train)

In [None]:
y_pred = xgb_model.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.37      0.25      0.30       783
           1       0.60      0.73      0.66      1217

    accuracy                           0.54      2000
   macro avg       0.49      0.49      0.48      2000
weighted avg       0.51      0.54      0.52      2000



array([[198, 585],
       [331, 886]])

In [None]:
# prompt: import adaboost and train it on the data

from sklearn.ensemble import AdaBoostClassifier

# Initialize AdaBoost Classifier
ada_model = AdaBoostClassifier(random_state=42)

# Train the model
ada_model.fit(X_train, y_train)

# Make predictions
y_pred_ada = ada_model.predict(X_test)

# Evaluate the model
print("AdaBoost Accuracy:", accuracy_score(y_test, y_pred_ada))
print(classification_report(y_test, y_pred_ada))
print(confusion_matrix(y_test, y_pred_ada))

AdaBoost Accuracy: 0.6
              precision    recall  f1-score   support

           0       0.35      0.03      0.05       783
           1       0.61      0.97      0.75      1217

    accuracy                           0.60      2000
   macro avg       0.48      0.50      0.40      2000
weighted avg       0.51      0.60      0.47      2000

[[  20  763]
 [  37 1180]]


In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
cv_scores = cross_val_score(ada_model, X, y, cv=15, scoring=make_scorer(accuracy_score))

# Print cross-validation results
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())
print("Standard Deviation of CV Accuracy:", cv_scores.std())

Cross-Validation Scores: [0.6011994  0.6131934  0.6011994  0.59970015 0.5952024  0.59670165
 0.6041979  0.6011994  0.6071964  0.6041979  0.60810811 0.5990991
 0.6006006  0.5975976  0.59159159]
Mean CV Accuracy: 0.6013990001995999
Standard Deviation of CV Accuracy: 0.005249946863992392


In [None]:
cv_scores2 = cross_val_score(xgb_model, X, y, cv=15, scoring=make_scorer(accuracy_score))

# Print cross-validation results
print("Cross-Validation Scores:", cv_scores2)
print("Mean CV Accuracy:", cv_scores2.mean())
print("Standard Deviation of CV Accuracy:", cv_scores2.std())

Cross-Validation Scores: [0.56371814 0.54872564 0.55472264 0.54272864 0.54272864 0.56071964
 0.57421289 0.53973013 0.54272864 0.55622189 0.55555556 0.55255255
 0.56606607 0.55555556 0.55555556]
Mean CV Accuracy: 0.5541014777896337
Standard Deviation of CV Accuracy: 0.009384329673108848


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming df_synthetic is your DataFrame with scaled and encoded data

# Generate new features that are combinations of existing ones
df_synthetic['Age_x_Sur_time'] = df_synthetic['Age'] * df_synthetic['sur_time_mo']
df_synthetic['Age_squared'] = df_synthetic['Age'] ** 2

# Polynomial Features
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(df_synthetic[['Age', 'sur_time_mo', 'VDW', 'EEL']])
poly_feature_names = poly.get_feature_names_out(['Age', 'sur_time_mo', 'VDW', 'EEL'])
df_poly_features = pd.DataFrame(poly_features, columns=poly_feature_names)

# Combine the polynomial features with the original DataFrame
df_synthetic = pd.concat([df_synthetic, df_poly_features], axis=1)

# Introduce an artificial feature that is correlated with the target
df_synthetic['artificial_correlated'] = df_synthetic['response_binary'] * np.random.uniform(0.5, 1.5, size=len(df_synthetic))

# Separate features and target variable again
X = df_synthetic.drop('response_binary', axis=1)
y = df_synthetic['response_binary']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize AdaBoost Classifier
ada_model = AdaBoostClassifier(random_state=42)

# Train the model
ada_model.fit(X_train, y_train)

# Make predictions
y_pred_ada = ada_model.predict(X_test)

# Evaluate the model
print("AdaBoost Accuracy:", accuracy_score(y_test, y_pred_ada))
print("\nClassification Report:\n", classification_report(y_test, y_pred_ada))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_ada))

ValueError: Columns must be same length as key

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Example synthetic training data (ensure this is the same preprocessing as your actual training data)
df_synthetic = pd.DataFrame({
    'Age': np.random.randint(30, 80, size=1000),
    'Age_normalized': np.random.randint(1, 5, size=1000),
    'Sex': np.random.randint(0, 2, size=1000),
    'smoking': np.random.randint(0, 2, size=1000),
    'survival': np.random.randint(0, 2, size=1000),
    'sur_time_mo': np.random.uniform(5, 50, size=1000),
    'prog_time_mo': np.random.uniform(5, 50, size=1000),
    'VDW': np.random.uniform(-60, -40, size=1000),
    'EEL': np.random.uniform(-30, -10, size=1000),
    'EGB': np.random.uniform(30, 40, size=1000),
    'EPB': np.random.uniform(20, 40, size=1000),
    'Matching_rates': np.random.randint(0, 20, size=1000),
    'Centroid_distance': np.random.uniform(40, 50, size=1000),
    'Connectivity': np.random.randint(20, 30, size=1000),
    'Convex_atoms': np.random.randint(1400, 1600, size=1000),
    'Hydrogen_bonds': np.random.randint(1000, 1500, size=1000),
    'response_binary': np.random.randint(0, 2, size=1000)
})

# Separate features and target variable
X = df_synthetic.drop('response_binary', axis=1)
y = df_synthetic['response_binary']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize StandardScaler and PolynomialFeatures
scaler = StandardScaler()
poly = PolynomialFeatures(degree=2, include_bias=False)

# Fit scaler and polynomial features on training data
X_train_scaled = scaler.fit_transform(X_train)
X_train_poly_features = poly.fit_transform(X_train_scaled)

# Train the AdaBoost model
ada_model = AdaBoostClassifier(random_state=42)
ada_model.fit(X_train_poly_features, y_train)

# Apply transformations to the test data
X_test_scaled = scaler.transform(X_test)
X_test_poly_features = poly.transform(X_test_scaled)

# Make predictions
y_pred_ada = ada_model.predict(X_test_poly_features)

# Evaluate the model
print("AdaBoost Accuracy:", accuracy_score(y_test, y_pred_ada))
print("\nClassification Report:\n", classification_report(y_test, y_pred_ada))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_ada))

# Example unseen data
unseen_data = pd.DataFrame({
    'Age': [65, 45],
    'Age_normalized': [3, 2],
    'Sex': [1, 0],
    'smoking': [0, 1],
    'survival': [1, 0],
    'sur_time_mo': [15.5, 20.0],
    'prog_time_mo': [12.5, 18.0],
    'VDW': [-48.0, -49.0],
    'EEL': [-25.0, -24.0],
    'EGB': [37.0, 35.0],
    'EPB': [36.0, 34.0],
    'Matching_rates': [18, 20],
    'Centroid_distance': [48.0, 45.0],
    'Connectivity': [25, 22],
    'Convex_atoms': [1500, 1450],
    'Hydrogen_bonds': [1200, 1150]
})

# Apply the same transformations to unseen data
unseen_data_scaled = scaler.transform(unseen_data)  # Apply scaling
unseen_data_poly_features = poly.transform(unseen_data_scaled)  # Apply polynomial feature transformation

# Predict with the trained model
y_pred_unseen = ada_model.predict(unseen_data_poly_features)

# Display predictions
print("Predictions for Unseen Data:", y_pred_unseen)


AdaBoost Accuracy: 0.455

Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.47      0.45        96
           1       0.47      0.44      0.46       104

    accuracy                           0.46       200
   macro avg       0.46      0.46      0.45       200
weighted avg       0.46      0.46      0.46       200


Confusion Matrix:
 [[45 51]
 [58 46]]
Predictions for Unseen Data: [1 0]
