<a href="https://colab.research.google.com/github/amirmohammadkalateh/Extrovert-vs.-Introvert-Behavior-/blob/main/ambivert!.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import io
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np # Import numpy for argmax in DL classification report

# --- Step 1: Data Loading and Initial Inspection ---
print("Step 1: Loading Data and Initial Inspection\n")

# Load the dataset into `df`.
# `content_fetcher` is not defined anywhere in this file; it is only used when
# the runtime environment has injected it into the global namespace. Otherwise
# the CSV is read from the current working directory.
try:
    if 'content_fetcher' in globals():
        file_content = content_fetcher.fetch(
            query="personality_datasert.csv",
            source_references=[{"id": "uploaded:personality_datasert.csv", "type": "text/csv"}]
        )
        df = pd.read_csv(io.StringIO(file_content))
        print("Dataset loaded successfully from 'personality_datasert.csv' using content_fetcher.")
    else:
        print("Error: 'content_fetcher' is not defined in this environment.")
        print("Please ensure you are running this code in an environment where 'content_fetcher' is available (e.g., Google's collaborative Canvas).")
        print("Attempting to load from a local file path as a fallback (this might fail if the file is not present locally)...")
        # Fallback for local execution if content_fetcher is not available
        try:
            df = pd.read_csv('personality_datasert.csv')
            print("Dataset loaded successfully from local file 'personality_datasert.csv'.")
        except FileNotFoundError:
            print("Error: 'personality_datasert.csv' not found locally either.")
            print("Please ensure the CSV file is in the same directory as the script or is accessible via the environment's file fetching mechanism.")
            # Every later step needs `df`, so stop here. SystemExit does not
            # derive from Exception, so the outer handler below will not
            # intercept it; a non-zero status signals the failure to callers.
            raise SystemExit(1) from None

except Exception as e:
    print(f"An unexpected error occurred during data loading: {e}")
    # Stop with a failure status, keeping the original error as the cause.
    raise SystemExit(1) from e

# Display the first few rows of the dataframe
print("\nFirst 5 rows of the dataset:")
print(df.head())

# Display concise summary of the dataframe, including data types.
# DataFrame.info() writes the summary itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
print("\nDataset Info:")
df.info()

# --- Step 2: Identify Target Variable and Apply Label Encoding ---
print("\nStep 2: Identifying Target Variable and Applying Label Encoding\n")

# Pick the target column: the first well-known name that exists wins.
preferred_targets = ('Personality (English)', 'Personality', 'type')
target_column = next((name for name in preferred_targets if name in df.columns), None)

if target_column is None:
    # No well-known name present: fall back to the object-typed column with
    # the fewest distinct values, ignoring columns that hold a single value.
    text_columns = df.select_dtypes(include='object').columns.tolist()
    if not text_columns:
        print("No categorical columns found for a classification target. Please specify a target column.")
        exit() # Cannot proceed without a target

    distinct_counts = {name: df[name].nunique() for name in text_columns}
    usable = [name for name in text_columns if distinct_counts[name] > 1]
    # min() keeps the earliest column on ties, like a strict "<" scan would.
    fallback_target = min(usable, key=distinct_counts.get) if usable else None
    if not fallback_target:
        print("No suitable categorical columns found for a classification target. Please specify a target column manually if it's not detected.")
        exit() # Cannot proceed without a target
    target_column = fallback_target

print(f"Automatically identified target column: '{target_column}'")

# Every object-typed column (features and, usually, the target) needs encoding.
categorical_cols = df.select_dtypes(include='object').columns.tolist()

# Fitted encoders are kept by column name so the integer codes can be mapped
# back to the original labels later.
label_encoders = {}

# Encode the categorical feature columns; the target is handled afterwards.
for feature_name in (name for name in categorical_cols if name != target_column):
    feature_encoder = LabelEncoder()
    df[feature_name] = feature_encoder.fit_transform(df[feature_name])
    label_encoders[feature_name] = feature_encoder
    print(f"  - Encoded feature column: '{feature_name}'")

# Encode the target with its own dedicated encoder.
le_target = LabelEncoder()
df[target_column] = le_target.fit_transform(df[target_column])
label_encoders[target_column] = le_target
print(f"  - Encoded target column: '{target_column}'")

# Show the numeric form of the data after encoding.
print("\nFirst 5 rows of the dataset after Label Encoding:")
print(df.head())

# --- Step 3: Separate Features (X) and Target (y) ---
print("\nStep 3: Separating Features (X) and Target (y)\n")

# The target is a single column; the features are everything else.
y = df[target_column]
X = df.drop(columns=[target_column])

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

# --- Step 4: Split Data into Training and Testing Sets ---
print("\nStep 4: Splitting Data into Training and Testing Sets\n")

# 80/20 split with a fixed seed; stratifying on y keeps the class
# proportions the same in the training and testing partitions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

for description, partition in (
    ("Training features (X_train)", X_train),
    ("Testing features (X_test)", X_test),
    ("Training target (y_train)", y_train),
    ("Testing target (y_test)", y_test),
):
    print(f"{description} shape: {partition.shape}")

# --- Step 5: Machine Learning Model (RandomForestClassifier) ---
print("\n--- Step 5: Training and Evaluating Machine Learning Model (RandomForestClassifier) ---\n")

# Build a 100-tree forest with a fixed seed and fit it on the training split.
print("Training RandomForestClassifier...")
ml_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
print("RandomForestClassifier training complete.")

# Predict the held-out test rows and score them.
y_pred_ml = ml_model.predict(X_test)
accuracy_ml = accuracy_score(y_test, y_pred_ml)
print(f"\nRandomForestClassifier Accuracy: {accuracy_ml:.4f}")

print("\nRandomForestClassifier Classification Report:")
# The encoder's classes_ array lists the original labels in code order
# (index i is the label for code i), which is what the report needs.
target_names_ml = le_target.classes_
ml_report = classification_report(
    y_test,
    y_pred_ml,
    target_names=target_names_ml,
    zero_division=0,
)
print(ml_report)

# --- Step 6: Deep Learning Model (Keras Sequential API) ---
print("\n--- Step 6: Training and Evaluating Deep Learning Model (Keras Sequential API) ---\n")

# Standardise the features for the network. The scaler is fitted on the
# training split only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Features scaled using StandardScaler for Deep Learning model.")

# Choose output layer, loss and label dtype from the number of classes.
num_classes = len(le_target.classes_)
if num_classes == 2:
    # Binary classification: one sigmoid unit, float labels.
    dl_output_units = 1
    dl_activation = 'sigmoid'
    dl_loss = 'binary_crossentropy'
    y_train_dl = y_train.astype('float32')
    y_test_dl = y_test.astype('float32')
else:
    # Multi-class classification: one softmax unit per class, integer labels.
    dl_output_units = num_classes
    dl_activation = 'softmax'
    dl_loss = 'sparse_categorical_crossentropy' # Use this when labels are integer encoded
    y_train_dl = y_train.astype('int32')
    y_test_dl = y_test.astype('int32')

# Build the Sequential Deep Learning Model.
# The input width is declared with an explicit Input object rather than the
# `input_shape=` layer argument, which current Keras versions warn about.
dl_model = Sequential([
    tf.keras.Input(shape=(X_train_scaled.shape[1],)),
    # First hidden layer
    Dense(units=128, activation='relu'),
    # Second hidden layer
    Dense(units=64, activation='relu'),
    # Output layer
    Dense(units=dl_output_units, activation=dl_activation)
])
print("Deep Learning model architecture created:")
dl_model.summary()

# Compile the Deep Learning model
print("\nCompiling Deep Learning model...")
dl_model.compile(optimizer='adam',
                 loss=dl_loss,
                 metrics=['accuracy'])
print("Deep Learning model compiled.")

# Train the Deep Learning model
print("\nTraining Deep Learning model (this may take a moment)...")
try:
    history = dl_model.fit(X_train_scaled, y_train_dl,
                           epochs=100,      # Number of training iterations
                           batch_size=32,   # Number of samples per gradient update
                           validation_split=0.1, # Use 10% of training data for validation
                           verbose=0)       # Set to 1 for progress bar, 0 for silent
    print("Deep Learning model training complete.")
except Exception as e:
    # Deliberate best-effort: report the failure and still run the evaluation.
    # NOTE(review): metrics printed after a failed fit describe a model that
    # is untrained or only partially trained.
    print(f"Error during Deep Learning model training: {e}")
    print("Training might have failed or partially completed. Proceeding to evaluation.")


# Evaluate the Deep Learning model on the test set
print("\nEvaluating Deep Learning model on test set...")
loss_dl, accuracy_dl = dl_model.evaluate(X_test_scaled, y_test_dl, verbose=0)
print(f"Deep Learning Model Loss: {loss_dl:.4f}")
print(f"Deep Learning Model Accuracy: {accuracy_dl:.4f}")

print("\nDeep Learning Model Classification Report:")
# Raw network outputs for the test rows
y_pred_proba_dl = dl_model.predict(X_test_scaled, verbose=0)

# Convert network outputs to class codes
if num_classes == 2:
    # Single sigmoid output: threshold at 0.5 and flatten to a 1-D vector.
    y_pred_dl_classes = (y_pred_proba_dl > 0.5).astype(int).flatten()
else:
    # Softmax output: the most probable class per row.
    y_pred_dl_classes = np.argmax(y_pred_proba_dl, axis=1)

# classes_ lists the original labels in code order (index i is code i).
target_names_dl = le_target.classes_
print(classification_report(y_test, y_pred_dl_classes, target_names=target_names_dl, zero_division=0))


Step 1: Loading Data and Initial Inspection

Error: 'content_fetcher' is not defined in this environment.
Please ensure you are running this code in an environment where 'content_fetcher' is available (e.g., Google's collaborative Canvas).
Attempting to load from a local file path as a fallback (this might fail if the file is not present locally)...
Dataset loaded successfully from local file 'personality_datasert.csv'.

First 5 rows of the dataset:
   Time_spent_Alone Stage_fear  Social_event_attendance  Going_outside  \
0               4.0         No                      4.0            6.0   
1               9.0        Yes                      0.0            0.0   
2               9.0        Yes                      1.0            2.0   
3               0.0         No                      6.0            7.0   
4               3.0         No                      9.0            4.0   

  Drained_after_socializing  Friends_circle_size  Post_frequency Personality  
0                     

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Compiling Deep Learning model...
Deep Learning model compiled.

Training Deep Learning model (this may take a moment)...
Deep Learning model training complete.

Evaluating Deep Learning model on test set...
Deep Learning Model Loss: 0.2668
Deep Learning Model Accuracy: 0.9172

Deep Learning Model Classification Report:
              precision    recall  f1-score   support

   Extrovert       0.94      0.89      0.92       298
   Introvert       0.89      0.94      0.92       282

    accuracy                           0.92       580
   macro avg       0.92      0.92      0.92       580
weighted avg       0.92      0.92      0.92       580



# New Section

In [3]:
pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [5]:
pip install --upgrade scikit-learn tensorflow

Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tensorboard~=2.19.0 (from tensorflow)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting ml-dtypes<1.0.0,>=0.5.1 (from tensorflow)
  Downloading ml_dtypes-0.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)
Downloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (644.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m644.9/644.9 MB[0m [31m885.5 kB/s[0m eta [36m0:00:00[0m
[?25hDownload

In [14]:
# Uninstall existing packages to ensure a clean slate
!pip uninstall -y scikeras scikit-learn tensorflow

# Install scikeras, which will install a compatible version of scikit-learn and tensorflow
!pip install scikeras

# Optionally, if you need a specific version of tensorflow, install it afterwards.
# Note: installing a version incompatible with scikeras might reintroduce the error.
# !pip install tensorflow==<your_desired_version>

# Verify the installed versions
!pip show scikeras scikit-learn tensorflow

Found existing installation: scikeras 0.13.0
Uninstalling scikeras-0.13.0:
  Successfully uninstalled scikeras-0.13.0
Found existing installation: scikit-learn 1.7.0
Uninstalling scikit-learn-1.7.0:
  Successfully uninstalled scikit-learn-1.7.0
Found existing installation: tensorflow 2.19.0
Uninstalling tensorflow-2.19.0:
  Successfully uninstalled tensorflow-2.19.0
Collecting scikeras
  Using cached scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Collecting scikit-learn>=1.4.2 (from scikeras)
  Using cached scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Using cached scikeras-0.13.0-py3-none-any.whl (26 kB)
Using cached scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.9 MB)
Installing collected packages: scikit-learn, scikeras
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.

[0mName: scikeras
Version: 0.13.0
Summary: Scikit-Learn API wrapper for Keras.
Home-page: https://github.com/adriangb/scikeras
Author: Adrian Garcia Badaracco
Author-email: 1755071+adriangb@users.noreply.github.com
License: MIT
Location: /usr/local/lib/python3.11/dist-packages
Requires: keras, scikit-learn
Required-by: 
---
Name: scikit-learn
Version: 1.7.0
Summary: A set of python modules for machine learning and data mining
Home-page: https://scikit-learn.org
Author: 
Author-email: 
License: BSD 3-Clause License

 Copyright (c) 2007-2024 The scikit-learn developers.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following d

In [None]:
# Install necessary packages if not already present
!pip install scikeras tensorflow scikit-learn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from scikeras.wrappers import KerasClassifier
from sklearn.preprocessing import StandardScaler # Ensure StandardScaler is imported

# Step 1: Load the dataset
print("Step 1: Loading the dataset...")
try:
    # Only the read itself can raise FileNotFoundError, so it is the only
    # statement guarded by the try. The file is expected in the current
    # working directory.
    df = pd.read_csv('personality_datasert.csv')
except FileNotFoundError:
    print("Error: 'personality_datasert.csv' not found. Please ensure the file is in the correct directory.")
    print("Please upload the 'personality_datasert.csv' file or place it in the correct path.")
    # Execution deliberately continues so the user can read the message and
    # fix the path. A `df` left behind by a previously executed cell must not
    # be mistaken for freshly loaded data by the existence check that
    # follows, so any stale binding is removed here.
    globals().pop('df', None)
else:
    print("Dataset loaded successfully.")
    print("Initial 5 rows of the dataset:")
    display(df.head()) # Using display instead of print for better notebook formatting
    print("\nDataset Info:")
    df.info()


# Everything below needs `df`; if loading failed, only report it.
if 'df' not in locals():
    print("Dataframe not loaded. Exiting script.")
    # Nothing is raised here on purpose: the message is left visible so the
    # user can fix the file location and run the cell again.
else:
    # Step 2: Data Preprocessing
    print("\nStep 2: Data Preprocessing...")

    # Force the expected numeric columns to a numeric dtype. Values that
    # cannot be parsed become NaN and are then replaced by the column median.
    numeric_cols_to_process = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency']
    for column_name in numeric_cols_to_process:
        if column_name not in df.columns:
            print(f"Warning: Column '{column_name}' not found in DataFrame.")
            continue

        coerced = pd.to_numeric(df[column_name], errors='coerce')
        df[column_name] = coerced
        if not coerced.isnull().any():
            # Nothing missing in this column: no imputation needed.
            continue

        fill_value = coerced.median()
        df[column_name] = coerced.fillna(fill_value)
        print(f"Filled NaN in '{column_name}' with median value: {fill_value}")


    # The label column is fixed for this dataset; verify it exists before
    # doing any further work.
    target_column = 'Personality'
    if target_column not in df.columns:
        print(f"Error: Target column '{target_column}' not found in the dataset.")
        print("Please check the column name in your CSV file.")
        # Nothing further runs in this case: all remaining steps live in the
        # else branch below.
    else:
        # Object-typed columns other than the target are the categorical features.
        categorical_features = [
            name
            for name in df.select_dtypes(include='object').columns.tolist()
            if name != target_column
        ]

        print(f"Categorical features identified for encoding: {categorical_features}")

        # Feature encoder; categories unseen at fit time would map to -1.
        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

        if not categorical_features:
            print("No categorical features found for encoding (excluding the target).")
        else:
            # Replace the categorical feature columns with their ordinal codes.
            df[categorical_features] = encoder.fit_transform(df[categorical_features])
            print("Categorical features encoded using OrdinalEncoder.")

        # Split the frame into the label column and the feature matrix.
        y = df[target_column]
        X = df.drop(columns=[target_column])

        # Map the string labels to integer codes.
        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)
        print("Target variable 'Personality' encoded using LabelEncoder.")
        print(f"Original classes: {label_encoder.classes_}")
        print(f"Encoded labels: {np.unique(y_encoded)}")

        # The model-building code branches on the number of classes.
        num_classes = len(label_encoder.classes_)
        if num_classes != 2:
            print(f"Warning: The target variable has {num_classes} classes. The current model setup is for binary classification.")
            print("Consider adjusting the model architecture and loss function for multi-class classification.")

        # Stratified 80/20 split with a fixed seed.
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y_encoded,
            test_size=0.2,
            random_state=42,
            stratify=y_encoded,
        )
        print(f"Data split into training (X_train shape: {X_train.shape}, y_train shape: {y_train.shape})")
        print(f"and testing (X_test shape: {X_test.shape}, y_test shape: {y_test.shape}) sets.")

        # Standardise features: fit on the training split, reuse on the test split.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        print("Features scaled using StandardScaler.")

        # Step 3: Define the Deep Learning Model
        print("\nStep 3: Defining the Deep Learning Model...")

        def create_model(optimizer='adam', learning_rate=0.001, activation='relu', neurons=(64, 32)):
            """Build and compile a fully connected Keras classifier.

            The input width is read from ``X_train_scaled`` and the output
            layer / loss are chosen from ``num_classes``; both names come from
            the enclosing scope.

            Args:
                optimizer: Optimizer name, matched case-insensitively against
                    'adam', 'rmsprop' and 'sgd'. Any other name falls back to
                    Adam.
                learning_rate: Learning rate handed to the optimizer.
                activation: Activation used by every hidden layer.
                neurons: Sequence of hidden-layer widths, one entry per layer.

            Returns:
                The compiled ``Sequential`` model.
            """
            optimizers = {
                'adam': tf.keras.optimizers.Adam,
                'rmsprop': tf.keras.optimizers.RMSprop,
                'sgd': tf.keras.optimizers.SGD # Add SGD option for completeness
            }
            opt_class = optimizers.get(optimizer.lower(), tf.keras.optimizers.Adam) # Default to Adam
            opt = opt_class(learning_rate=learning_rate)

            model = Sequential()
            # Declare the input width with an explicit Input object instead of
            # the `input_shape=` layer argument, which current Keras versions
            # warn about.
            model.add(tf.keras.Input(shape=(X_train_scaled.shape[1],)))
            # Hidden layers, one per entry in `neurons`
            for units in neurons:
                model.add(Dense(units, activation=activation))

            if num_classes == 2:
                # Binary target: single sigmoid unit.
                model.add(Dense(1, activation='sigmoid'))
                loss_func = 'binary_crossentropy'
            else:
                # Multi-class target: one softmax unit per class.
                model.add(Dense(num_classes, activation='softmax'))
                loss_func = 'sparse_categorical_crossentropy' # Use this for integer labels

            model.compile(optimizer=opt, loss=loss_func, metrics=['accuracy'])
            return model

        # Step 4: Hyperparameter Tuning using GridSearchCV
        print("\nStep 4: Performing Hyperparameter Tuning using GridSearchCV...")

        # Step 5: Use EarlyStopping callback
        # Stops a fit once val_loss has not improved for 10 epochs and restores
        # the best weights seen. It relies on the validation split configured
        # on the wrapper below.
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

        # Keep the loss declared on the wrapper in agreement with the loss
        # create_model compiles with, for both binary and multi-class targets.
        wrapper_loss = 'binary_crossentropy' if num_classes == 2 else 'sparse_categorical_crossentropy'

        # Training-time options (callbacks, validation split) are set on the
        # wrapper itself instead of being forwarded through fit() keyword
        # arguments, so every fit started by the grid search and the final
        # refit use the same configuration.
        keras_model = KerasClassifier(model=create_model, verbose=0, # Set verbose=0 for silent training during grid search
                                      loss=wrapper_loss,
                                      metrics=['accuracy'],
                                      callbacks=[early_stopping],
                                      validation_split=0.1) # Use part of the training data for validation

        # Define the parameter grid for GridSearchCV.
        # `model__*` entries are routed to create_model's arguments.
        param_grid = {
            'model__optimizer': ['adam', 'rmsprop'],
            'model__learning_rate': [0.001, 0.01],
            'model__activation': ['relu', 'tanh'],
            'model__neurons': [(32, 16), (64, 32), (128, 64, 32)], # Different network architectures
            'batch_size': [16, 32],
            'epochs': [50, 100] # Set max epochs, early stopping will manage the actual number
        }

        # Initialize GridSearchCV
        # NOTE(review): n_jobs=-1 trains several Keras models in parallel
        # worker processes; confirm the runtime has enough memory for that.
        grid_search = GridSearchCV(estimator=keras_model,
                                   param_grid=param_grid,
                                   scoring='accuracy',
                                   cv=3,
                                   verbose=1,
                                   n_jobs=-1,
                                   error_score='raise'
                                  )

        # Fit GridSearchCV
        print("Fitting GridSearchCV (this may take a while)...")
        try:
            grid_result = grid_search.fit(X_train_scaled, y_train)

            print("\nGridSearchCV completed.")
            print(f"Best: {grid_result.best_score_:.4f} using {grid_result.best_params_}")

            # Step 6: Evaluate the best model
            print("\nStep 6: Evaluating the best model on the test set...")

            best_model = grid_result.best_estimator_
            # predict() yields class labels directly, which is what the
            # metrics below consume; probabilities are not needed here.
            y_pred = best_model.predict(X_test_scaled)

            print(f"\nAccuracy on test set: {accuracy_score(y_test, y_pred):.4f}")
            print("\nClassification Report on test set:")
            # classification_report needs true labels (y_test) and predicted labels (y_pred)
            print(classification_report(y_test, y_pred, target_names=label_encoder.classes_, zero_division=0))

            print("\nCode execution completed successfully.")

        except Exception as e:
            # Top-level boundary of the cell: report the failure with its full
            # traceback instead of letting the notebook abort silently.
            print(f"\nAn error occurred during GridSearchCV or evaluation: {e}")
            import traceback
            traceback.print_exc() # Print full traceback for debugging
            print("Please review the error message and traceback to diagnose the issue.")

Collecting tensorflow
  Using cached tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Using cached tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (644.9 MB)
Installing collected packages: tensorflow
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tf-keras 2.18.0 requires tensorflow<2.19,>=2.18, but you have tensorflow 2.19.0 which is incompatible.
tensorflow-decision-forests 1.11.0 requires tensorflow==2.18.0, but you have tensorflow 2.19.0 which is incompatible.
tensorflow-text 2.18.1 requires tensorflow<2.19,>=2.18.0, but you have tensorflow 2.19.0 which is incompatible.[0m[31m
[0mSuccessfully installed tensorflow-2.19.0
Step 1: Loading the dataset...
Dataset loaded successfully.
Initial 5 rows of the dataset:


Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,No,4.0,6.0,No,13.0,5.0,Extrovert
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,Introvert
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,Introvert
3,0.0,No,6.0,7.0,No,14.0,8.0,Extrovert
4,3.0,No,9.0,4.0,No,8.0,5.0,Extrovert



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2900 entries, 0 to 2899
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Time_spent_Alone           2900 non-null   float64
 1   Stage_fear                 2900 non-null   object 
 2   Social_event_attendance    2900 non-null   float64
 3   Going_outside              2900 non-null   float64
 4   Drained_after_socializing  2900 non-null   object 
 5   Friends_circle_size        2900 non-null   float64
 6   Post_frequency             2900 non-null   float64
 7   Personality                2900 non-null   object 
dtypes: float64(5), object(3)
memory usage: 181.4+ KB

Step 2: Data Preprocessing...
Categorical features identified for encoding: ['Stage_fear', 'Drained_after_socializing']
Categorical features encoded using OrdinalEncoder.
Target variable 'Personality' encoded using LabelEncoder.
Original classes: ['Extrovert' 'Int