In [1]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import pandas as pd
import warnings
import cupy as cp 
from sklearnex import patch_sklearn

In [2]:
def optimize_datatypes(data):
    """Downcast numeric columns of ``data`` to the smallest dtype that fits.

    The frame is modified in place and also returned, so both
    ``optimize_datatypes(df)`` and ``df = optimize_datatypes(df)`` work.

    Args:
        data: pandas DataFrame whose numeric columns should be shrunk.

    Returns:
        The same DataFrame object with integer and floating columns downcast.
    """
    # 'integer' / 'floating' cover every width (and unsigned ints); the
    # previous 'int' / 'float' selectors skipped e.g. int16 and uint columns.
    for col in data.select_dtypes(include=['integer']).columns:
        data[col] = pd.to_numeric(data[col], downcast='integer')
    for col in data.select_dtypes(include=['floating']).columns:
        data[col] = pd.to_numeric(data[col], downcast='float')
    return data

def clean_column_names(data):
    """Replace every character outside ``[a-zA-Z0-9_]`` in column names with ``_``.

    The frame's columns are renamed in place and the frame is returned.

    Args:
        data: pandas DataFrame whose column labels should be sanitised.

    Returns:
        The same DataFrame object with sanitised, string-typed column names.
    """
    # Cast labels to str first: the .str accessor raises on purely numeric
    # labels (e.g. a header-less CSV) and turns non-string labels into NaN.
    data.columns = data.columns.astype(str).str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)
    print("\nCleaned Column Names:")
    # Actually list the names announced by the header above.
    print(list(data.columns))
    return data

def mainBareBones(file_path="Loan Prediction.csv"):
    """Run the baseline pipeline: load, clean, encode, train LightGBM, evaluate.

    Reads the CSV, sanitises column names, imputes missing values, downcasts
    numeric dtypes, one-hot encodes the known categorical columns, performs an
    80/20 train/test split, fits an ``LGBMClassifier`` on ``Risk_Flag`` and
    prints the accuracy and classification report.

    Args:
        file_path: Path of the CSV to load. Defaults to the path this
            notebook originally hard-coded.

    Raises:
        SystemExit: If the dataset cannot be read (exit status 1).
        KeyError: If the loaded data has no ``Risk_Flag`` column.
    """
    # Load the dataset
    print("Loading dataset...")
    try:
        data = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # exit() is an interactive helper: in a notebook kernel it asks the
        # kernel to shut down instead of reliably stopping this function, and
        # in a script it reports success. Abort explicitly with a failure code.
        raise SystemExit(1) from e
    print(f"Dataset loaded successfully! Shape: {data.shape}")

    # Clean column names to remove unsupported characters
    print("\nCleaning column names...")
    data = clean_column_names(data)

    # Data Cleaning
    print("\nCleaning and preprocessing data...")
    missing_values_before = data.isnull().sum().sum()
    print(f"Missing values before cleaning: {missing_values_before}")

    # Fill missing values: median for numeric columns, mode for everything else
    for col in data.columns:
        column = data[col]
        if not column.isnull().any():
            continue  # nothing to impute; skip the median/mode computation
        if pd.api.types.is_numeric_dtype(column):
            data[col] = column.fillna(column.median())
        else:
            # Covers object, string and category columns alike; median() is
            # undefined for those.
            modes = column.mode()
            if modes.empty:
                continue  # column is entirely missing, so there is no mode
            data[col] = column.fillna(modes.iloc[0])

    missing_values_after = data.isnull().sum().sum()
    print(f"Missing values after cleaning: {missing_values_after}")

    # Optimize datatypes
    print("\nOptimizing datatypes...")
    data = optimize_datatypes(data)

    # Convert categorical variables to numeric
    print("Converting categorical variables to numeric with one-hot encoding...")
    categorical_columns = ['Married_Single', 'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE']
    categorical_columns = [col for col in categorical_columns if col in data.columns]  # Ensure columns exist
    data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)
    print(f"Data shape after encoding: {data.shape}")

    # Clean column names again: the dummy columns embed raw category values,
    # which may contain characters outside [a-zA-Z0-9_]
    print("\nCleaning encoded column names...")
    data.columns = data.columns.str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)

    # Feature-target split ('Id' is dropped from the features when present)
    print("\nSeparating features and target variable...")
    X = data.drop(columns=['Risk_Flag', 'Id'], errors='ignore')
    y = data['Risk_Flag']
    print(f"Features shape: {X.shape}, Target shape: {y.shape}")

    # Train-test split
    print("\nSplitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}")

    # Initialize the LightGBM Classifier
    print("\nInitializing the LightGBM classifier...")
    lgbm = LGBMClassifier(
        objective='binary',
        random_state=42,
        n_estimators=50
    )

    # Train the model
    print("\nTraining the LightGBM model...")
    lgbm.fit(X_train, y_train)
    print("Model training completed!")

    # Make predictions
    print("\nMaking predictions on the test set...")
    y_pred = lgbm.predict(X_test)

    # Evaluate the model
    print("\nEvaluating the model...")
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# Entry point: run the baseline pipeline when this cell/script executes as __main__.
if __name__ == "__main__":
    mainBareBones()


Loading dataset...
Dataset loaded successfully! Shape: (252000, 13)

Cleaning column names...

Cleaned Column Names:

Cleaning and preprocessing data...
Missing values before cleaning: 0
Missing values after cleaning: 0

Optimizing datatypes...
Converting categorical variables to numeric with one-hot encoding...
Data shape after encoding: (252000, 405)

Cleaning encoded column names...

Separating features and target variable...
Features shape: (252000, 403), Target shape: (252000,)

Splitting data into training and testing sets...
Training set shape: (201600, 403), Testing set shape: (50400, 403)

Initializing the LightGBM classifier...

Training the LightGBM model...
[LightGBM] [Info] Number of positive: 24743, number of negative: 176857
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004866 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total 

In [3]:
import cupy as cp
import pandas as pd

def optimize_datatypes(data):
    """Cast numeric columns to 32-bit dtypes via CuPy, when the values fit.

    Each selected column is copied into a CuPy array with the 32-bit dtype
    and converted back to NumPy with ``cp.asnumpy``. The frame is modified in
    place and also returned.

    Args:
        data: pandas DataFrame whose numeric columns should be shrunk.

    Returns:
        The same DataFrame object; integer columns become int32 and float
        columns float32 unless a value would not survive the cast.
    """
    int32_limits = cp.iinfo(cp.int32)
    float32_max = cp.finfo(cp.float32).max

    for col in data.select_dtypes(include=['int']).columns:
        values = data[col]
        # A blind cast silently wraps values outside the int32 range, so
        # leave such columns at their original width.
        if values.min() < int32_limits.min or values.max() > int32_limits.max:
            continue
        data[col] = cp.asnumpy(cp.array(values, dtype=cp.int32))

    for col in data.select_dtypes(include=['float']).columns:
        values = data[col]
        # Magnitudes beyond float32's range would become +/-inf.
        if values.abs().max() > float32_max:
            continue
        # NOTE: float32 keeps roughly 7 significant digits; precision loss is accepted here.
        data[col] = cp.asnumpy(cp.array(values, dtype=cp.float32))
    return data

def clean_column_names(data):
    """Replace every character outside ``[a-zA-Z0-9_]`` in column names with ``_``.

    The frame's columns are renamed in place and the frame is returned.

    Args:
        data: pandas DataFrame whose column labels should be sanitised.

    Returns:
        The same DataFrame object with sanitised, string-typed column names.
    """
    # Cast labels to str first: the .str accessor raises on purely numeric
    # labels (e.g. a header-less CSV) and turns non-string labels into NaN.
    data.columns = data.columns.astype(str).str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)
    print("\nCleaned Column Names:")
    # Actually list the names announced by the header above.
    print(list(data.columns))
    return data

def mainCuPY(file_path="Loan Prediction.csv"):
    """Run the pipeline variant whose dtype optimisation step uses CuPy.

    Reads the CSV, sanitises column names, imputes missing values, shrinks
    numeric dtypes through ``optimize_datatypes``, one-hot encodes the known
    categorical columns, performs an 80/20 train/test split, fits an
    ``LGBMClassifier`` on ``Risk_Flag`` and prints the accuracy and
    classification report. Within this function CuPy is only reached through
    ``optimize_datatypes``; loading, encoding and training use pandas,
    scikit-learn and LightGBM directly.

    Args:
        file_path: Path of the CSV to load. Defaults to the path this
            notebook originally hard-coded.

    Raises:
        SystemExit: If the dataset cannot be read (exit status 1).
        KeyError: If the loaded data has no ``Risk_Flag`` column.
    """
    # Load the dataset
    print("Loading dataset...")
    try:
        data = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # exit() is an interactive helper: in a notebook kernel it asks the
        # kernel to shut down instead of reliably stopping this function, and
        # in a script it reports success. Abort explicitly with a failure code.
        raise SystemExit(1) from e
    print(f"Dataset loaded successfully! Shape: {data.shape}")

    # Clean column names to remove unsupported characters
    print("\nCleaning column names...")
    data = clean_column_names(data)

    # Data Cleaning
    print("\nCleaning and preprocessing data...")
    missing_values_before = data.isnull().sum().sum()
    print(f"Missing values before cleaning: {missing_values_before}")

    # Fill missing values: median for numeric columns, mode for everything else
    for col in data.columns:
        column = data[col]
        if not column.isnull().any():
            continue  # nothing to impute; skip the median/mode computation
        if pd.api.types.is_numeric_dtype(column):
            data[col] = column.fillna(column.median())
        else:
            # Covers object, string and category columns alike; median() is
            # undefined for those.
            modes = column.mode()
            if modes.empty:
                continue  # column is entirely missing, so there is no mode
            data[col] = column.fillna(modes.iloc[0])

    missing_values_after = data.isnull().sum().sum()
    print(f"Missing values after cleaning: {missing_values_after}")

    # Optimize datatypes
    print("\nOptimizing datatypes...")
    data = optimize_datatypes(data)

    # Convert categorical variables to numeric
    print("Converting categorical variables to numeric with one-hot encoding...")
    categorical_columns = ['Married_Single', 'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE']
    categorical_columns = [col for col in categorical_columns if col in data.columns]  # Ensure columns exist
    data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)
    print(f"Data shape after encoding: {data.shape}")

    # Clean column names again: the dummy columns embed raw category values,
    # which may contain characters outside [a-zA-Z0-9_]
    print("\nCleaning encoded column names...")
    data.columns = data.columns.str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)

    # Feature-target split ('Id' is dropped from the features when present)
    print("\nSeparating features and target variable...")
    X = data.drop(columns=['Risk_Flag', 'Id'], errors='ignore')
    y = data['Risk_Flag']
    print(f"Features shape: {X.shape}, Target shape: {y.shape}")

    # Train-test split
    print("\nSplitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}")

    # Initialize the LightGBM Classifier
    print("\nInitializing the LightGBM classifier...")
    lgbm = LGBMClassifier(
        objective='binary',
        random_state=42,
        n_estimators=50
    )

    # Train the model
    print("\nTraining the LightGBM model...")
    lgbm.fit(X_train, y_train)
    print("Model training completed!")

    # Make predictions
    print("\nMaking predictions on the test set...")
    y_pred = lgbm.predict(X_test)

    # Evaluate the model
    print("\nEvaluating the model...")
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# Entry point: run the CuPy variant of the pipeline when this cell/script executes as __main__.
if __name__ == "__main__":
    mainCuPY()


Loading dataset...
Dataset loaded successfully! Shape: (252000, 13)

Cleaning column names...

Cleaned Column Names:

Cleaning and preprocessing data...
Missing values before cleaning: 0
Missing values after cleaning: 0

Optimizing datatypes...
Converting categorical variables to numeric with one-hot encoding...
Data shape after encoding: (252000, 405)

Cleaning encoded column names...

Separating features and target variable...
Features shape: (252000, 403), Target shape: (252000,)

Splitting data into training and testing sets...
Training set shape: (201600, 403), Testing set shape: (50400, 403)

Initializing the LightGBM classifier...

Training the LightGBM model...
[LightGBM] [Info] Number of positive: 24743, number of negative: 176857
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005033 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total 

In [4]:
patch_sklearn() # Apply Intel optimizations

# The scikit-learn names used below were imported in the first cell, i.e.
# before patching, so those bindings still point at the stock implementations.
# Re-import them now so the following cells pick up the patched versions.
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [5]:
def optimize_datatypes(data):
    """Downcast numeric columns of ``data`` to the smallest dtype that fits.

    The frame is modified in place and also returned, so both
    ``optimize_datatypes(df)`` and ``df = optimize_datatypes(df)`` work.

    Args:
        data: pandas DataFrame whose numeric columns should be shrunk.

    Returns:
        The same DataFrame object with integer and floating columns downcast.
    """
    # 'integer' / 'floating' cover every width (and unsigned ints); the
    # previous 'int' / 'float' selectors skipped e.g. int16 and uint columns.
    for col in data.select_dtypes(include=['integer']).columns:
        data[col] = pd.to_numeric(data[col], downcast='integer')
    for col in data.select_dtypes(include=['floating']).columns:
        data[col] = pd.to_numeric(data[col], downcast='float')
    return data

def clean_column_names(data):
    """Replace every character outside ``[a-zA-Z0-9_]`` in column names with ``_``.

    The frame's columns are renamed in place and the frame is returned.

    Args:
        data: pandas DataFrame whose column labels should be sanitised.

    Returns:
        The same DataFrame object with sanitised, string-typed column names.
    """
    # Cast labels to str first: the .str accessor raises on purely numeric
    # labels (e.g. a header-less CSV) and turns non-string labels into NaN.
    data.columns = data.columns.astype(str).str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)
    print("\nCleaned Column Names:")
    # Actually list the names announced by the header above.
    print(list(data.columns))
    return data

def mainIntelex(file_path="Loan Prediction.csv"):
    """Run the pipeline variant meant to execute after ``patch_sklearn()``.

    Reads the CSV, sanitises column names, imputes missing values, downcasts
    numeric dtypes, one-hot encodes the known categorical columns, performs an
    80/20 train/test split, fits an ``LGBMClassifier`` on ``Risk_Flag`` and
    prints the accuracy and classification report.

    NOTE(review): the only scikit-learn calls in this function are
    ``train_test_split``, ``accuracy_score`` and ``classification_report``;
    the model itself is LightGBM's ``LGBMClassifier``, so presumably the
    Intel patch does not touch training time here - confirm before drawing
    benchmark conclusions.

    Args:
        file_path: Path of the CSV to load. Defaults to the path this
            notebook originally hard-coded.

    Raises:
        SystemExit: If the dataset cannot be read (exit status 1).
        KeyError: If the loaded data has no ``Risk_Flag`` column.
    """
    # Load the dataset
    print("Loading dataset...")
    try:
        data = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # exit() is an interactive helper: in a notebook kernel it asks the
        # kernel to shut down instead of reliably stopping this function, and
        # in a script it reports success. Abort explicitly with a failure code.
        raise SystemExit(1) from e
    print(f"Dataset loaded successfully! Shape: {data.shape}")

    # Clean column names to remove unsupported characters
    print("\nCleaning column names...")
    data = clean_column_names(data)

    # Data Cleaning
    print("\nCleaning and preprocessing data...")
    missing_values_before = data.isnull().sum().sum()
    print(f"Missing values before cleaning: {missing_values_before}")

    # Fill missing values: median for numeric columns, mode for everything else
    for col in data.columns:
        column = data[col]
        if not column.isnull().any():
            continue  # nothing to impute; skip the median/mode computation
        if pd.api.types.is_numeric_dtype(column):
            data[col] = column.fillna(column.median())
        else:
            # Covers object, string and category columns alike; median() is
            # undefined for those.
            modes = column.mode()
            if modes.empty:
                continue  # column is entirely missing, so there is no mode
            data[col] = column.fillna(modes.iloc[0])

    missing_values_after = data.isnull().sum().sum()
    print(f"Missing values after cleaning: {missing_values_after}")

    # Optimize datatypes
    print("\nOptimizing datatypes...")
    data = optimize_datatypes(data)

    # Convert categorical variables to numeric
    print("Converting categorical variables to numeric with one-hot encoding...")
    categorical_columns = ['Married_Single', 'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE']
    categorical_columns = [col for col in categorical_columns if col in data.columns]  # Ensure columns exist
    data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)
    print(f"Data shape after encoding: {data.shape}")

    # Clean column names again: the dummy columns embed raw category values,
    # which may contain characters outside [a-zA-Z0-9_]
    print("\nCleaning encoded column names...")
    data.columns = data.columns.str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)

    # Feature-target split ('Id' is dropped from the features when present)
    print("\nSeparating features and target variable...")
    X = data.drop(columns=['Risk_Flag', 'Id'], errors='ignore')
    y = data['Risk_Flag']
    print(f"Features shape: {X.shape}, Target shape: {y.shape}")

    # Train-test split
    print("\nSplitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}")

    # Initialize the LightGBM Classifier
    print("\nInitializing the LightGBM classifier...")
    lgbm = LGBMClassifier(
        objective='binary',
        random_state=42,
        n_estimators=50
    )

    # Train the model
    print("\nTraining the LightGBM model...")
    lgbm.fit(X_train, y_train)
    print("Model training completed!")

    # Make predictions
    print("\nMaking predictions on the test set...")
    y_pred = lgbm.predict(X_test)

    # Evaluate the model
    print("\nEvaluating the model...")
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# Entry point: run the Intel-extension variant of the pipeline when this cell/script executes as __main__.
if __name__ == "__main__":
    mainIntelex()

Loading dataset...
Dataset loaded successfully! Shape: (252000, 13)

Cleaning column names...

Cleaned Column Names:

Cleaning and preprocessing data...
Missing values before cleaning: 0
Missing values after cleaning: 0

Optimizing datatypes...
Converting categorical variables to numeric with one-hot encoding...
Data shape after encoding: (252000, 405)

Cleaning encoded column names...

Separating features and target variable...
Features shape: (252000, 403), Target shape: (252000,)

Splitting data into training and testing sets...
Training set shape: (201600, 403), Testing set shape: (50400, 403)

Initializing the LightGBM classifier...

Training the LightGBM model...
[LightGBM] [Info] Number of positive: 24743, number of negative: 176857
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004346 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total 