In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

def mainBareBones(file_path="Loan Prediction.csv"):
    """Train and evaluate a GPU XGBoost classifier on the loan-risk CSV.

    Steps: load the CSV, fill missing values, one-hot encode the categorical
    columns, split 80/20 (seed 42), train a ``binary:logistic`` booster for
    100 rounds with ``device='cuda'``, then print the accuracy and the
    classification report for the held-out set.

    Args:
        file_path: Path to the dataset CSV. The file must provide the columns
            ``Id`` and ``Risk_Flag`` plus the categorical columns listed in
            ``categorical_columns`` below.

    Raises:
        SystemExit: With exit code 1 when the dataset cannot be loaded.
    """
    # Load the dataset
    print("Loading dataset...")
    try:
        data = pd.read_csv(file_path)
        print(f"Dataset loaded successfully! Shape: {data.shape}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # Raise explicitly instead of calling exit(): inside IPython/Jupyter
        # exit() only requests a kernel shutdown and does not stop this
        # function, so the code below would run with `data` unbound. In a
        # plain script exit() would also report success (status 0).
        raise SystemExit(1) from e

    # Data Cleaning
    print("\nCleaning and preprocessing data...")
    missing_values_before = data.isnull().sum().sum()
    print(f"Missing values before cleaning: {missing_values_before}")

    # Fill missing values: median for numeric columns, mode for everything else.
    # NOTE(review): the fill statistics are computed on the full dataset, i.e.
    # before the train/test split below.
    for col in data.columns:
        column = data[col]
        if not column.isnull().any():
            continue  # nothing to impute in this column
        # Dtype-agnostic check: text columns are not always `object` dtype.
        if pd.api.types.is_numeric_dtype(column):
            data[col] = column.fillna(column.median())
        else:
            modes = column.mode()
            if not modes.empty:  # an all-NaN column has no mode to fill with
                data[col] = column.fillna(modes.iloc[0])

    missing_values_after = data.isnull().sum().sum()
    print(f"Missing values after cleaning: {missing_values_after}")

    # Convert categorical variables to numeric
    print("Converting categorical variables to numeric with one-hot encoding...")
    categorical_columns = ['Married/Single', 'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE']
    data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)
    print(f"Data shape after encoding: {data.shape}")

    # Feature-target split
    print("\nSeparating features and target variable...")
    X = data.drop(columns=['Risk_Flag', 'Id'])  # Dropping Id and target column
    y = data['Risk_Flag']
    print(f"Features shape: {X.shape}, Target shape: {y.shape}")

    # Sanitize column names for compatibility with XGBoost
    # (strips '[', ']', '<', '>' and turns spaces into underscores).
    print("\nSanitizing column names for XGBoost compatibility...")
    X.columns = [col.replace('[', '').replace(']', '').replace('<', '').replace('>', '').replace(' ', '_') for col in X.columns]

    # Convert data to optimized types
    print("\nConverting data to optimized types...")
    X = X.astype(np.float32)
    y = y.astype(np.int32)

    # Train-test split
    print("\nSplitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}")

    # Convert to DMatrix
    print("\nConverting data to DMatrix format for XGBoost...")
    train_dmatrix = xgb.DMatrix(X_train, label=y_train)
    test_dmatrix = xgb.DMatrix(X_test, label=y_test)

    # Parameters for XGBoost
    params = {
        'objective': 'binary:logistic',
        'tree_method': 'hist',
        'device': 'cuda',  # Use GPU
        'random_state': 42
    }

    # Train the XGBoost model
    print("\nTraining the XGBoost model...")
    model = xgb.train(params, train_dmatrix, num_boost_round=100)
    print("Model training completed!")

    # Make predictions: binary:logistic yields probabilities, cut at 0.5.
    print("\nMaking predictions on the test set...")
    y_pred = model.predict(test_dmatrix)
    y_pred_binary = (y_pred > 0.5).astype(np.int32)

    # Evaluate the model
    print("\nEvaluating the model...")
    accuracy = accuracy_score(y_test, y_pred_binary)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_binary))

# Entry point: run the pipeline when this cell/script is executed directly
# (__name__ is "__main__"), but not when the code is imported as a module.
if __name__ == "__main__":
    mainBareBones()

Loading dataset...
Dataset loaded successfully! Shape: (252000, 13)

Cleaning and preprocessing data...
Missing values before cleaning: 0
Missing values after cleaning: 0
Converting categorical variables to numeric with one-hot encoding...
Data shape after encoding: (252000, 405)

Separating features and target variable...
Features shape: (252000, 403), Target shape: (252000,)

Sanitizing column names for XGBoost compatibility...

Converting data to optimized types...

Splitting data into training and testing sets...
Training set shape: (201600, 403), Testing set shape: (50400, 403)

Converting data to DMatrix format for XGBoost...

Training the XGBoost model...
Model training completed!

Making predictions on the test set...

Evaluating the model...
Accuracy: 88.06%

Classification Report:
              precision    recall  f1-score   support

           0       0.88      1.00      0.94     44147
           1       0.68      0.07      0.13      6253

    accuracy                      

In [None]:
import pandas as pd
import numpy as np
import cupy as cp
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

def mainCuPY(file_path="Loan Prediction.csv"):
    """Train and evaluate a GPU XGBoost classifier fed from CuPy arrays.

    Steps: load the CSV, fill missing values, one-hot encode the categorical
    columns, split 80/20 (seed 42), copy the feature matrices to the GPU as
    CuPy arrays, train a ``binary:logistic`` booster for 100 rounds with
    ``device='cuda'``, then print the accuracy and the classification report
    for the held-out set.

    Args:
        file_path: Path to the dataset CSV. The file must provide the columns
            ``Id`` and ``Risk_Flag`` plus the categorical columns listed in
            ``categorical_columns`` below.

    Raises:
        SystemExit: With exit code 1 when the dataset cannot be loaded.
    """
    # Load the dataset
    print("Loading dataset...")
    try:
        data = pd.read_csv(file_path)
        print(f"Dataset loaded successfully! Shape: {data.shape}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # Raise explicitly instead of calling exit(): inside IPython/Jupyter
        # exit() only requests a kernel shutdown and does not stop this
        # function, so the code below would run with `data` unbound. In a
        # plain script exit() would also report success (status 0).
        raise SystemExit(1) from e

    # Data Cleaning
    print("\nCleaning and preprocessing data...")
    missing_values_before = data.isnull().sum().sum()
    print(f"Missing values before cleaning: {missing_values_before}")

    # Fill missing values: median for numeric columns, mode for everything else.
    # NOTE(review): the fill statistics are computed on the full dataset, i.e.
    # before the train/test split below.
    for col in data.columns:
        column = data[col]
        if not column.isnull().any():
            continue  # nothing to impute in this column
        # Dtype-agnostic check: text columns are not always `object` dtype.
        if pd.api.types.is_numeric_dtype(column):
            data[col] = column.fillna(column.median())
        else:
            modes = column.mode()
            if not modes.empty:  # an all-NaN column has no mode to fill with
                data[col] = column.fillna(modes.iloc[0])

    missing_values_after = data.isnull().sum().sum()
    print(f"Missing values after cleaning: {missing_values_after}")

    # Convert categorical variables to numeric
    print("Converting categorical variables to numeric with one-hot encoding...")
    categorical_columns = ['Married/Single', 'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE']
    data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)
    print(f"Data shape after encoding: {data.shape}")

    # Feature-target split
    print("\nSeparating features and target variable...")
    X = data.drop(columns=['Risk_Flag', 'Id'])  # Dropping Id and target column
    y = data['Risk_Flag']
    print(f"Features shape: {X.shape}, Target shape: {y.shape}")

    # Sanitize column names for compatibility with XGBoost
    # (strips '[', ']', '<', '>' and turns spaces into underscores).
    print("\nSanitizing column names for XGBoost compatibility...")
    X.columns = [col.replace('[', '').replace(']', '').replace('<', '').replace('>', '').replace(' ', '_') for col in X.columns]

    # Train-test split
    print("\nSplitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}")

    # Ensure all data is numeric before conversion to CuPy arrays.
    # Labels are kept as int32 (matching mainBareBones) so the report shows
    # classes as 0/1 rather than 0.0/1.0.
    print("\nEnsuring all data is numeric...")
    X_train = X_train.to_numpy(dtype=np.float32)
    X_test = X_test.to_numpy(dtype=np.float32)
    y_train = y_train.to_numpy(dtype=np.int32)
    y_test = y_test.to_numpy(dtype=np.int32)

    # Convert the feature matrices to CuPy arrays. The labels stay on the
    # host: they were previously copied to the GPU and straight back again
    # before use, which only added transfers.
    print("\nConverting data to CuPy arrays...")
    X_train_cp = cp.asarray(X_train)
    X_test_cp = cp.asarray(X_test)

    # Convert CuPy arrays to DMatrix for XGBoost
    print("\nConverting data to DMatrix format for XGBoost...")
    train_dmatrix = xgb.DMatrix(X_train_cp, label=y_train)
    test_dmatrix = xgb.DMatrix(X_test_cp, label=y_test)

    # Parameters for XGBoost
    params = {
        'objective': 'binary:logistic',
        'tree_method': 'hist',
        'device': 'cuda',
        'random_state': 42
    }

    # Train the XGBoost model
    print("\nTraining the XGBoost model...")
    model = xgb.train(params, train_dmatrix, num_boost_round=100)
    print("Model training completed!")

    # Make predictions: binary:logistic yields probabilities, cut at 0.5.
    print("\nMaking predictions on the test set...")
    y_pred = model.predict(test_dmatrix)
    y_pred_binary = (y_pred > 0.5).astype(np.int32)

    # Evaluate the model
    print("\nEvaluating the model...")
    accuracy = accuracy_score(y_test, y_pred_binary)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_binary))

# Entry point: run the CuPy-based pipeline when this cell/script is executed
# directly (__name__ is "__main__"), but not when the code is imported.
if __name__ == "__main__":
    mainCuPY()


In [None]:
import pandas as pd
import numpy as np
from sklearnex import patch_sklearn  # Import Intel's extension
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

# Enable Intel's optimizations.
# patch_sklearn() swaps accelerated implementations into the sklearn modules,
# but names that were imported from sklearn *before* patching keep pointing at
# the stock versions. Re-import them after patching so the code below binds to
# the patched functions.
patch_sklearn()
from sklearn.model_selection import train_test_split  # noqa: E402
from sklearn.metrics import accuracy_score, classification_report  # noqa: E402

def mainIntelex(file_path="Loan Prediction.csv"):
    """Train and evaluate a GPU XGBoost classifier with sklearnex patching on.

    Same pipeline as the other variants in this notebook: load the CSV, fill
    missing values, one-hot encode the categorical columns, split 80/20
    (seed 42), train a ``binary:logistic`` booster for 100 rounds with
    ``device='cuda'``, then print the accuracy and the classification report.
    The scikit-learn helpers used here are the ones bound at module level
    after ``patch_sklearn()`` was called.

    Args:
        file_path: Path to the dataset CSV. The file must provide the columns
            ``Id`` and ``Risk_Flag`` plus the categorical columns listed in
            ``categorical_columns`` below.

    Raises:
        SystemExit: With exit code 1 when the dataset cannot be loaded.
    """
    # Load the dataset
    print("Loading dataset...")
    try:
        data = pd.read_csv(file_path)
        print(f"Dataset loaded successfully! Shape: {data.shape}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # Raise explicitly instead of calling exit(): inside IPython/Jupyter
        # exit() only requests a kernel shutdown and does not stop this
        # function, so the code below would run with `data` unbound. In a
        # plain script exit() would also report success (status 0).
        raise SystemExit(1) from e

    # Data Cleaning
    print("\nCleaning and preprocessing data...")
    missing_values_before = data.isnull().sum().sum()
    print(f"Missing values before cleaning: {missing_values_before}")

    # Fill missing values: median for numeric columns, mode for everything else.
    # NOTE(review): the fill statistics are computed on the full dataset, i.e.
    # before the train/test split below.
    for col in data.columns:
        column = data[col]
        if not column.isnull().any():
            continue  # nothing to impute in this column
        # Dtype-agnostic check: text columns are not always `object` dtype.
        if pd.api.types.is_numeric_dtype(column):
            data[col] = column.fillna(column.median())
        else:
            modes = column.mode()
            if not modes.empty:  # an all-NaN column has no mode to fill with
                data[col] = column.fillna(modes.iloc[0])

    missing_values_after = data.isnull().sum().sum()
    print(f"Missing values after cleaning: {missing_values_after}")

    # Convert categorical variables to numeric
    print("Converting categorical variables to numeric with one-hot encoding...")
    categorical_columns = ['Married/Single', 'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE']
    data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)
    print(f"Data shape after encoding: {data.shape}")

    # Feature-target split
    print("\nSeparating features and target variable...")
    X = data.drop(columns=['Risk_Flag', 'Id'])  # Dropping Id and target column
    y = data['Risk_Flag']
    print(f"Features shape: {X.shape}, Target shape: {y.shape}")

    # Sanitize column names for compatibility with XGBoost
    # (strips '[', ']', '<', '>' and turns spaces into underscores).
    print("\nSanitizing column names for XGBoost compatibility...")
    X.columns = [col.replace('[', '').replace(']', '').replace('<', '').replace('>', '').replace(' ', '_') for col in X.columns]

    # Convert data to optimized types.
    # Labels are int32 (matching mainBareBones) so the classification report
    # shows classes as 0/1 rather than 0.0/1.0.
    print("\nConverting data to optimized types...")
    X = X.astype(np.float32)
    y = y.astype(np.int32)

    # Train-test split
    print("\nSplitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}")

    # Convert to DMatrix
    print("\nConverting data to DMatrix format for XGBoost...")
    train_dmatrix = xgb.DMatrix(X_train, label=y_train)
    test_dmatrix = xgb.DMatrix(X_test, label=y_test)

    # Parameters for XGBoost
    params = {
        'objective': 'binary:logistic',
        'tree_method': 'hist',
        'device': 'cuda',  # Use GPU
        'random_state': 42
    }

    # Train the XGBoost model
    print("\nTraining the XGBoost model...")
    model = xgb.train(params, train_dmatrix, num_boost_round=100)
    print("Model training completed!")

    # Make predictions: binary:logistic yields probabilities, cut at 0.5.
    print("\nMaking predictions on the test set...")
    y_pred = model.predict(test_dmatrix)
    y_pred_binary = (y_pred > 0.5).astype(np.int32)

    # Evaluate the model
    print("\nEvaluating the model...")
    accuracy = accuracy_score(y_test, y_pred_binary)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_binary))

# Entry point: run the sklearnex-patched pipeline when this cell/script is
# executed directly (__name__ is "__main__"), but not when it is imported.
if __name__ == "__main__":
    mainIntelex()
