In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# Train a Gaussian Naive Bayes classifier incrementally on a CSV that is read
# in fixed-size chunks, printing hold-out metrics after every chunk.
#
# The preprocessing (one-hot encoder and scaler) is fitted ONCE, in a first
# pass over the training rows of every chunk, and is only applied in the
# second (training) pass. Refitting it per chunk would give each chunk its own
# column layout and scaling, so the statistics accumulated by partial_fit
# would mix unrelated features.

# Load data in chunks
chunk_size = 1000
file_path = '/kaggle/input/playground-series-s3e17/train.csv'

# Define categorical features and the target column
cat_features = ['Product ID', 'Type']
target_col = 'Machine failure'

# Initialize encoder and scaler
# handle_unknown='ignore': a category that only shows up in a held-out split
# is encoded as all zeros instead of raising during transform. Combining it
# with drop='first' needs scikit-learn >= 1.0.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
scaler = StandardScaler()

# Gaussian Naive Bayes model
nb_model = GaussianNB()

# Initialize variables to store column names
all_columns = None


def _split_chunk(frame):
    """Forward-fill missing values and split the rows 75/25 with a fixed seed.

    The fixed random_state makes the split reproducible, so both passes over
    the file get exactly the same train/test rows for a given chunk.
    """
    filled = frame.ffill()
    return train_test_split(filled, test_size=0.25, random_state=42)


def _build_features(frame):
    """Return the dense feature matrix: one-hot categoricals, then scaled rest."""
    encoded = encoder.transform(frame[cat_features]).toarray()
    scaled = scaler.transform(frame.drop(columns=cat_features + [target_col]))
    return np.hstack((encoded, scaled))


# ---- Pass 1: fit the preprocessing on the training rows only ---------------
cat_train_parts = []
label_set = set()
for chunk in pd.read_csv(file_path, index_col='id', chunksize=chunk_size):
    # Store column names of the file (every chunk of one CSV shares them)
    if all_columns is None:
        all_columns = chunk.columns.tolist()

    if len(chunk) < 2:
        continue  # a single row cannot be split into train and test parts

    train_part, test_part = _split_chunk(chunk)

    # Running mean/variance over all training rows seen so far
    scaler.partial_fit(train_part.drop(columns=cat_features + [target_col]))

    # Only the categorical columns are kept in memory for the encoder fit
    cat_train_parts.append(train_part[cat_features])

    # Every label present in the file must be announced to partial_fit
    label_set.update(train_part[target_col].dropna().unique())
    label_set.update(test_part[target_col].dropna().unique())

if not cat_train_parts:
    raise ValueError(f"No usable rows could be read from {file_path}")

encoder.fit(pd.concat(cat_train_parts))
classes = np.array(sorted(label_set))

# ---- Pass 2: incremental training and per-chunk evaluation -----------------
chunks = pd.read_csv(file_path, index_col='id', chunksize=chunk_size)

# Iterate over chunks
for chunk in chunks:
    if len(chunk) < 2:
        continue  # same rule as in pass 1

    # Handle missing values and split data into train and test sets
    train_part, test_part = _split_chunk(chunk)
    X_train = _build_features(train_part)
    X_test = _build_features(test_part)
    y_train = train_part[target_col]
    y_test = test_part[target_col]

    # Handling Imbalanced Data: Apply SMOTE
    # SMOTE needs at least two classes and k_neighbors + 1 minority samples;
    # shrink k for sparse chunks and skip resampling when it is impossible.
    class_counts = y_train.value_counts()
    if len(class_counts) > 1 and class_counts.min() > 1:
        smote = SMOTE(random_state=42,
                      k_neighbors=min(5, int(class_counts.min()) - 1))
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    else:
        X_train_resampled, y_train_resampled = X_train, y_train

    # Fit the model
    # The class list comes from the whole file, so a chunk that happens to
    # lack a class cannot lock the model into an incomplete label set.
    nb_model.partial_fit(X_train_resampled, y_train_resampled, classes=classes)

    # Evaluate the model on the test set
    y_pred = nb_model.predict(X_test)
    if y_test.nunique() > 1:
        # ROC AUC ranks scores, so use the probability of the greater label
        # rather than the hard 0/1 predictions.
        y_score = nb_model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_score)
        print(f"ROC AUC score on test set: {roc_auc}")
    else:
        print("ROC AUC score on test set: undefined (only one class in test split)")

    # Confusion Matrix and Classification Report
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    print("Confusion Matrix:")
    print(conf_matrix)
    print("\nClassification Report:")
    print(class_report)
def plot_roc_curve(y_true, y_pred, model_name):
    """Draw the ROC curve for one model and print the area under it.

    Args:
        y_true: Ground-truth labels, forwarded to ``roc_curve``.
        y_pred: Predictions or scores, forwarded to ``roc_curve`` as the
            score argument.
        model_name: Text used in the plot title and in the printed summary.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred)
    roc_auc = auc(false_pos_rate, true_pos_rate)
    curve_label = 'ROC curve (area = %0.2f)' % roc_auc

    plt.figure(figsize=(8, 6))
    plt.plot(
        false_pos_rate,
        true_pos_rate,
        color='darkorange',
        lw=2,
        label=curve_label,
    )
    # Dashed diagonal drawn from (0, 0) to (1, 1) as a visual reference
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    # Leave a little headroom above 1.0 on the y axis
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {model_name}')
    plt.legend(loc='lower right')
    plt.show()

    print(f"ROC AUC Score ({model_name}): {roc_auc:.4f}")

    # Plot ROC Curve
# A ROC curve is traced by sweeping a threshold over continuous scores, so
# pass the predicted probability of the greater class label for the last
# evaluated chunk. Hard 0/1 predictions would collapse the curve to a single
# operating point.
plot_roc_curve(y_test, nb_model.predict_proba(X_test)[:, 1], "Gaussian Naive Bayes")


