In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import DMatrix, train

def optimize_datatypes(data):
    """Downcast the numeric columns of ``data`` to reduce memory usage.

    Columns selected by ``select_dtypes(include=['int'])`` are passed through
    ``pd.to_numeric(..., downcast='integer')`` and those selected by
    ``include=['float']`` through ``downcast='float'``. Other columns are left
    untouched.

    Args:
        data: DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # (dtype family used for selection, downcast mode handed to pd.to_numeric)
    downcast_plan = (('int', 'integer'), ('float', 'float'))
    for dtype_family, target in downcast_plan:
        selected = data.select_dtypes(include=[dtype_family]).columns
        for name in selected:
            data[name] = pd.to_numeric(data[name], downcast=target)

    return data

def mainBareBones(file_path="diabetes_dataset.csv"):
    """Train and evaluate a baseline XGBoost classifier on a diabetes CSV.

    Steps: load the CSV, replace NaNs in numeric columns with the column
    mean, downcast numeric dtypes, split 80/20 with a fixed seed, train with
    the native XGBoost API on ``cuda:0`` and print the accuracy and the
    classification report for the held-out split.

    Args:
        file_path: Path of the CSV file to load. The file must contain an
            ``Outcome`` column, which is used as the binary target.
            Defaults to ``"diabetes_dataset.csv"``.

    Raises:
        SystemExit: With exit status 1 when the dataset cannot be loaded.
    """
    # Load the dataset
    print("Loading dataset...")
    try:
        data = pd.read_csv(file_path)
        print(f"Dataset loaded successfully! Shape: {data.shape}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # `exit()` is an interactive helper installed by `site`/IPython and,
        # called without arguments, reports success. Raise SystemExit with a
        # non-zero status so the failure stops execution and is visible.
        raise SystemExit(1)

    # Data Cleaning and Preprocessing
    print("\nCleaning and preprocessing data...")
    missing_values_before = data.isnull().sum().sum()
    print(f"Missing values before cleaning: {missing_values_before}")
    # Replace NaNs in numeric columns with the column mean; non-numeric
    # columns are not imputed.
    data = data.fillna(data.mean(numeric_only=True))
    missing_values_after = data.isnull().sum().sum()
    print(f"Missing values after cleaning: {missing_values_after}")

    # Optimize datatypes
    print("\nOptimizing datatypes...")
    data = optimize_datatypes(data)

    # Convert categorical columns (e.g., Hypertension) to integers
    print("Converting categorical columns to integers...")
    if 'Hypertension' in data.columns:
        data['Hypertension'] = data['Hypertension'].astype(int)

    # Separate features (X) and target (y)
    print("\nSeparating features and target variable...")
    X = data.drop('Outcome', axis=1).values
    y = data['Outcome'].values
    print(f"Features shape: {X.shape}, Target shape: {y.shape}")

    # Split the data into training and testing sets
    print("\nSplitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}")

    print("\nConverting data to DMatrix format for XGBoost...")
    dtrain = DMatrix(X_train, label=y_train)
    dtest = DMatrix(X_test, label=y_test)

    # Parameters for XGBoost
    params = {
        'objective': 'binary:logistic',
        'tree_method': 'hist',  # Histogram-based tree construction
        'device': 'cuda:0',  # Run on the first CUDA GPU
        'random_state': 42
    }

    # Train the model (number of boosting rounds is left at the library default)
    print("\nTraining the XGBoost model...")
    model = train(params, dtrain)
    print("Model training completed!")

    # Make predictions: binary:logistic yields probabilities, thresholded at 0.5
    print("\nMaking predictions on the test set...")
    y_pred = (model.predict(dtest) > 0.5).astype(int)

    # Evaluate the model
    print("\nEvaluating the model...")
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy (Optimized): {accuracy * 100:.2f}%")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# Entry point: run the baseline pipeline only when this code executes as the
# main module, not when it is imported.
if __name__ == "__main__":
    mainBareBones()

Loading dataset...
Dataset loaded successfully! Shape: (9538, 17)

Cleaning and preprocessing data...
Missing values before cleaning: 0
Missing values after cleaning: 0

Optimizing datatypes...
Converting categorical columns to integers...

Separating features and target variable...
Features shape: (9538, 16), Target shape: (9538,)

Splitting data into training and testing sets...
Training set shape: (7630, 16), Testing set shape: (1908, 16)

Converting data to DMatrix format for XGBoost...

Training the XGBoost model...
Model training completed!

Making predictions on the test set...

Evaluating the model...
Accuracy (Optimized): 99.90%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1275
           1       1.00      1.00      1.00       633

    accuracy                           1.00      1908
   macro avg       1.00      1.00      1.00      1908
weighted avg       1.00      1.00      1.00      1908



In [2]:
import pandas as pd
import cupy as cp
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import DMatrix, train

def optimize_datatypes(data):
    """Reduce the memory footprint of ``data`` by downcasting numeric columns.

    Integer columns (as selected by ``include=['int']``) are downcast with
    ``downcast='integer'``; float columns (``include=['float']``) with
    ``downcast='float'``. The frame is changed in place.

    Args:
        data: DataFrame whose numeric columns should be downcast.

    Returns:
        The DataFrame that was passed in, after the in-place downcast.
    """
    def _shrink(dtype_family, mode):
        # Re-assign every matching column with its downcast equivalent.
        for label in data.select_dtypes(include=[dtype_family]).columns:
            data[label] = pd.to_numeric(data[label], downcast=mode)

    _shrink('int', 'integer')
    _shrink('float', 'float')

    return data

def mainCuPY(file_path="diabetes_dataset.csv"):
    """Train and evaluate an XGBoost classifier fed from CuPy (GPU) arrays.

    Steps: load the CSV, replace NaNs in numeric columns with the column
    mean, downcast numeric dtypes, split 80/20 with a fixed seed, copy the
    splits to the GPU as CuPy arrays, train with the native XGBoost API on
    ``cuda:0`` and print the accuracy and the classification report for the
    held-out split.

    Args:
        file_path: Path of the CSV file to load. The file must contain an
            ``Outcome`` column, which is used as the binary target.
            Defaults to ``"diabetes_dataset.csv"``.

    Raises:
        SystemExit: With exit status 1 when the dataset cannot be loaded.
    """
    # Load the dataset
    print("Loading dataset...")
    try:
        data = pd.read_csv(file_path)
        print(f"Dataset loaded successfully! Shape: {data.shape}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # `exit()` is an interactive helper installed by `site`/IPython and,
        # called without arguments, reports success. Raise SystemExit with a
        # non-zero status so the failure stops execution and is visible.
        raise SystemExit(1)

    # Data Cleaning and Preprocessing
    print("\nCleaning and preprocessing data...")
    print(f"Missing values before cleaning: {data.isnull().sum().sum()}")
    # Replace NaNs in numeric columns with the column mean; non-numeric
    # columns are not imputed.
    data = data.fillna(data.mean(numeric_only=True))
    print(f"Missing values after cleaning: {data.isnull().sum().sum()}")

    # Optimize datatypes
    print("\nOptimizing datatypes...")
    data = optimize_datatypes(data)

    # Convert categorical columns (e.g., Hypertension) to integers
    if 'Hypertension' in data.columns:
        data['Hypertension'] = data['Hypertension'].astype(int)

    # Separate features (X) and target (y)
    print("\nSeparating features and target variable...")
    X = data.drop('Outcome', axis=1).values
    y = data['Outcome'].values
    print(f"Features shape: {X.shape}, Target shape: {y.shape}")

    # Split the data into training and testing sets
    print("\nSplitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}")

    # Convert to CuPy arrays so the DMatrix objects are built from GPU memory
    print("\nTransferring data to GPU...")
    X_train_cp = cp.asarray(X_train, dtype=cp.float32)
    X_test_cp = cp.asarray(X_test, dtype=cp.float32)
    y_train_cp = cp.asarray(y_train, dtype=cp.int32)
    y_test_cp = cp.asarray(y_test, dtype=cp.int32)

    print("\nConverting data to DMatrix format for XGBoost...")
    dtrain = DMatrix(X_train_cp, label=y_train_cp)
    dtest = DMatrix(X_test_cp, label=y_test_cp)

    # Parameters for XGBoost
    params = {
        'objective': 'binary:logistic',
        'tree_method': 'hist',  # Histogram-based tree construction
        'device': 'cuda:0',  # Run on the first CUDA GPU
        'random_state': 42
    }

    # Train the model (number of boosting rounds is left at the library default)
    print("\nTraining the XGBoost model...")
    model = train(params, dtrain)
    print("Model training completed!")

    # Make predictions. Booster.predict already returns a host (NumPy) array,
    # so thresholding happens on the CPU instead of copying the probabilities
    # to the GPU and straight back.
    print("\nMaking predictions on the test set...")
    y_pred = (model.predict(dtest) > 0.5).astype(int)

    # Evaluate the model against the host copy of the labels; fetching
    # y_test_cp back from the GPU would only repeat a transfer.
    print("\nEvaluating the model...")
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy (CuPy Optimized): {accuracy * 100:.2f}%")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# Entry point: run the CuPy-backed pipeline only when this code executes as the
# main module, not when it is imported.
if __name__ == "__main__":
    mainCuPY()


Loading dataset...
Dataset loaded successfully! Shape: (9538, 17)

Cleaning and preprocessing data...
Missing values before cleaning: 0
Missing values after cleaning: 0

Optimizing datatypes...

Separating features and target variable...
Features shape: (9538, 16), Target shape: (9538,)

Splitting data into training and testing sets...
Training set shape: (7630, 16), Testing set shape: (1908, 16)

Transferring data to GPU...

Converting data to DMatrix format for XGBoost...

Training the XGBoost model...
Model training completed!

Making predictions on the test set...

Evaluating the model...
Accuracy (CuPy Optimized): 99.90%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1275
           1       1.00      1.00      1.00       633

    accuracy                           1.00      1908
   macro avg       1.00      1.00      1.00      1908
weighted avg       1.00      1.00      1.00      1908



In [3]:
# Apply the Intel patch BEFORE importing anything from scikit-learn: names bound
# by an earlier `from sklearn... import ...` keep pointing at the stock
# implementations, so patching afterwards would leave this cell unaccelerated.
from sklearnex import patch_sklearn
patch_sklearn()  # Apply Intel optimizations

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import DMatrix, train

def optimize_datatypes(data):
    """Downcast numeric columns in place to cut the frame's memory usage.

    Each column matched by ``select_dtypes(include=['int'])`` is replaced by
    ``pd.to_numeric(column, downcast='integer')``; each column matched by
    ``include=['float']`` by ``pd.to_numeric(column, downcast='float')``.

    Args:
        data: DataFrame to optimize; it is mutated.

    Returns:
        The same DataFrame instance, for call chaining.
    """
    # Selection keyword -> downcast mode; insertion order keeps integers first.
    modes = {'int': 'integer', 'float': 'float'}
    for family in modes:
        matching = list(data.select_dtypes(include=[family]).columns)
        for column_name in matching:
            data[column_name] = pd.to_numeric(data[column_name], downcast=modes[family])

    return data

def mainIntelex(file_path="diabetes_dataset.csv"):
    """Train and evaluate an XGBoost classifier in the Intel-patched cell.

    Steps: load the CSV, replace NaNs in numeric columns with the column
    mean, downcast numeric dtypes, split 80/20 with a fixed seed, train with
    the native XGBoost API on ``cuda:0`` and print the accuracy and the
    classification report for the held-out split. The scikit-learn helpers
    used here (``train_test_split``, ``accuracy_score``,
    ``classification_report``) are whichever versions the enclosing cell
    imported.

    Args:
        file_path: Path of the CSV file to load. The file must contain an
            ``Outcome`` column, which is used as the binary target.
            Defaults to ``"diabetes_dataset.csv"``.

    Raises:
        SystemExit: With exit status 1 when the dataset cannot be loaded.
    """
    # Load the dataset
    print("Loading dataset...")
    try:
        data = pd.read_csv(file_path)
        print(f"Dataset loaded successfully! Shape: {data.shape}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # `exit()` is an interactive helper installed by `site`/IPython and,
        # called without arguments, reports success. Raise SystemExit with a
        # non-zero status so the failure stops execution and is visible.
        raise SystemExit(1)

    # Data Cleaning and Preprocessing
    print("\nCleaning and preprocessing data...")
    missing_values_before = data.isnull().sum().sum()
    print(f"Missing values before cleaning: {missing_values_before}")
    # Replace NaNs in numeric columns with the column mean; non-numeric
    # columns are not imputed.
    data = data.fillna(data.mean(numeric_only=True))
    missing_values_after = data.isnull().sum().sum()
    print(f"Missing values after cleaning: {missing_values_after}")

    # Optimize datatypes
    print("\nOptimizing datatypes...")
    data = optimize_datatypes(data)

    # Convert categorical columns (e.g., Hypertension) to integers
    print("Converting categorical columns to integers...")
    if 'Hypertension' in data.columns:
        data['Hypertension'] = data['Hypertension'].astype(int)

    # Separate features (X) and target (y)
    print("\nSeparating features and target variable...")
    X = data.drop('Outcome', axis=1).values
    y = data['Outcome'].values
    print(f"Features shape: {X.shape}, Target shape: {y.shape}")

    # Split the data into training and testing sets
    print("\nSplitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}")

    print("\nConverting data to DMatrix format for XGBoost...")
    dtrain = DMatrix(X_train, label=y_train)
    dtest = DMatrix(X_test, label=y_test)

    # Parameters for XGBoost
    params = {
        'objective': 'binary:logistic',
        'tree_method': 'hist',  # Histogram-based tree construction
        'device': 'cuda:0',  # Run on the first CUDA GPU
        'random_state': 42
    }

    # Train the model (number of boosting rounds is left at the library default)
    print("\nTraining the XGBoost model...")
    model = train(params, dtrain)
    print("Model training completed!")

    # Make predictions: binary:logistic yields probabilities, thresholded at 0.5
    print("\nMaking predictions on the test set...")
    y_pred = (model.predict(dtest) > 0.5).astype(int)

    # Evaluate the model
    print("\nEvaluating the model...")
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy (Optimized): {accuracy * 100:.2f}%")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# Entry point: run the Intel-patched pipeline only when this code executes as
# the main module, not when it is imported.
if __name__ == "__main__":
    mainIntelex()

Loading dataset...
Dataset loaded successfully! Shape: (9538, 17)

Cleaning and preprocessing data...
Missing values before cleaning: 0
Missing values after cleaning: 0

Optimizing datatypes...
Converting categorical columns to integers...

Separating features and target variable...
Features shape: (9538, 16), Target shape: (9538,)

Splitting data into training and testing sets...
Training set shape: (7630, 16), Testing set shape: (1908, 16)

Converting data to DMatrix format for XGBoost...

Training the XGBoost model...
Model training completed!

Making predictions on the test set...

Evaluating the model...
Accuracy (Optimized): 99.90%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1275
           1       1.00      1.00      1.00       633

    accuracy                           1.00      1908
   macro avg       1.00      1.00      1.00      1908
weighted avg       1.00      1.00      1.00      1908



Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
