In [None]:
import shap
import lightgbm as lgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import lightgbm as lgb
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV # Import RandomizedSearchCV



In [None]:
# Read the CSV at file_path into a DataFrame
file_path = '/content/drive/MyDrive/SAdata_allMeasures.csv'
data = pd.read_csv(file_path)

# Quick sanity check: (rows, columns) followed by the first five rows
print(data.shape, data.head(), sep='\n')

2.2)

In [None]:
# Basic dataset info (column names, non-null counts, dtypes).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()



In [None]:
# Summary statistics as computed by DataFrame.describe()
summary_stats = data.describe()
print(summary_stats)


In [None]:
# Number of missing entries in each column
missing_per_column = data.isna().sum()
print(missing_per_column)


In [None]:
# Correlation heatmap to understand pairwise feature relationships.
# numeric_only=True restricts the matrix to numeric columns; without it,
# pandas >= 2.0 raises as soon as any column holds non-numeric values.
corr_matrix = data.corr(numeric_only=True)

# Explicit figure/axes so the title is attached to this plot's axes
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap of Features')
plt.show()


In [None]:
# Histogram of the 'Y' column with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(data['Y'], kde=True, ax=ax)
ax.set_title('Distribution of Situation Awareness')
plt.show()

2.3)

2.4)

In [None]:
# Column roles: the regression target and the columns treated as categorical
target_column = 'Y'
categorical_features = ['gender', 'temp_decision_made', 'CarPlacedLeft', 'CarPlacedRight']

# Every other column, kept in the DataFrame's column order, is treated as numerical
non_numerical = set(categorical_features) | {target_column}
numerical_features = [name for name in data.columns if name not in non_numerical]


In [None]:
# Split row positions BEFORE scaling so the scaler's mean/variance are learned
# from training rows only; fitting it on the full dataset would leak test-set
# statistics into training. test_size and random_state are unchanged, and the
# shuffle depends only on the number of rows, so the split membership is the
# same as splitting X and y directly.
row_positions = np.arange(len(data))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

# Feature scaling for numeric features: fit on the training rows, then apply
# the same transformation to every row
scaler = StandardScaler()
scaler.fit(data.iloc[train_pos][numerical_features])
scaled_data = data.copy()
scaled_data[numerical_features] = scaler.transform(data[numerical_features])

# Features are every column except the target
X = scaled_data.drop(columns=[target_column])
y = scaled_data[target_column]

# One-hot encoder for the categorical columns; categories not seen during
# fit are encoded as all zeros instead of raising
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# ColumnTransformer one-hot encodes the categorical columns and passes the
# (already scaled) remaining columns through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ], remainder='passthrough')

# Train-test split using the positions chosen above
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

2.5)

# Model Implementation: LightGBM

In [None]:
# Train LightGBM Model
# Every parameter is listed exactly once. A Python dict literal silently keeps
# only the last value of a repeated key, and LightGBM resolves alias conflicts
# with a warning, so duplicates hide which value is really in effect.
lgb_params = {
    'objective': 'regression',
    'metric': ['rmse', 'l2'],
    'learning_rate': 0.2,
    # NOTE(review): with max_depth=3 a tree has at most 2**3 = 8 leaves, so
    # num_leaves=50 is presumably never the binding limit - confirm intent.
    'num_leaves': 50,
    'max_depth': 3,
    'min_child_samples': 10,
    'colsample_bytree': 0.7,
    # 'subsample' is an alias of 'bagging_fraction', so only one may be given.
    # NOTE(review): bagging is only applied when bagging_freq > 0, which is not
    # set here, so this value is currently inert - confirm the intended setup.
    'bagging_fraction': 0.8,
    'reg_alpha': 1,
    'reg_lambda': 0,
    'random_state': 42,
}

# Wrap both splits as LightGBM datasets; the eval set reuses the training
# set's feature binning through `reference`
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
evals_result_lgb = {}  # filled per iteration by the record_evaluation callback

# The number of boosting rounds is given only here ('n_estimators' inside the
# params dict is an alias of the same setting and would trigger a warning)
lgb_model = lgb.train(
    lgb_params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],
    valid_names=['train', 'eval'],
    callbacks=[lgb.record_evaluation(evals_result_lgb)]
)

# Evaluate performance on the held-out split
y_pred_lgb = lgb_model.predict(X_test)
rmse_lgb = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
mae_lgb = mean_absolute_error(y_test, y_pred_lgb)

print(f'RMSE (LightGBM): {rmse_lgb}')
print(f'MAE (LightGBM): {mae_lgb}')


RandomizedSearchCV hyperparameter search for LightGBM

In [None]:
# Candidate values sampled by the randomized search
# NOTE(review): LightGBM only applies 'subsample' when subsample_freq > 0,
# which is not set below - confirm whether bagging is meant to be tuned.
param_dist_lgb = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'num_leaves': [20, 31, 50, 70],
    'max_depth': [3, 5, 7, -1],
    'n_estimators': [50, 100, 200],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'reg_lambda': [0, 0.1, 1],
    'reg_alpha': [0, 0.1, 1],
    'min_child_samples': [10, 20, 30]
}

# Unfitted template estimator for the search. It gets its own name so that
# `lgb_model` keeps referring to the Booster trained with lgb.train(); that
# trained model is what the later SHAP TreeExplainer cell explains.
lgb_regressor = lgb.LGBMRegressor(objective='regression', random_state=42)

# RandomizedSearchCV for LightGBM
random_search_lgb = RandomizedSearchCV(
    estimator=lgb_regressor,
    param_distributions=param_dist_lgb,
    n_iter=50,  # Limit to 50 random combinations
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# One-hot encode the categorical columns: the encoder is fitted on the
# training split and then reused unchanged on the test split
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Run the search on the processed training data
random_search_lgb.fit(X_train_processed, y_train)

# best_score_ is a negated MSE (scoring='neg_mean_squared_error'), so flip the
# sign and take the square root to report an RMSE
print("Best parameters for LightGBM:", random_search_lgb.best_params_)
print("Best RMSE (negative MSE):", (-random_search_lgb.best_score_)**0.5)

#  SHAP for LightGBM model

In [None]:
# SHAP explanation of the model held in `lgb_model`, evaluated on X_test.
# TreeExplainer needs a fitted tree model, so `lgb_model` must be a trained
# model when this cell runs.
explainer_lgb = shap.TreeExplainer(lgb_model)

# Per-row, per-feature SHAP values for the test set
shap_values_lgb = explainer_lgb.shap_values(X_test)

# Summary plot of the SHAP values against the test features
shap.summary_plot(shap_values_lgb, X_test)


In [None]:
# Get the best model from RandomizedSearchCV
best_lgb_model = random_search_lgb.best_estimator_

# Create a new TreeExplainer with the best model
explainer_lgb = shap.TreeExplainer(best_lgb_model)

# Calculate SHAP values using the processed data (X_test_processed), i.e. the
# same representation the search was fitted on
shap_values_lgb = explainer_lgb.shap_values(X_test_processed)

# The preprocessor outputs a plain array without column names, so ask the
# fitted preprocessor for its output names; otherwise the plot can only label
# bars generically as "Feature N"
processed_feature_names = list(preprocessor.get_feature_names_out())

# Plot the SHAP summary plot (bar form: one bar per feature)
shap.summary_plot(
    shap_values_lgb,
    X_test_processed,
    feature_names=processed_feature_names,
    plot_type="bar",
)


# Model Implementation: Linear Regression

In [None]:
# One-hot encode the categorical columns of both splits; the encoder is fitted
# on the training split only and reused as-is for the test split
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Fit the linear regression on the encoded training data
# (fit() returns the estimator itself, so construction and fitting chain)
lr_model = LinearRegression().fit(X_train_processed, y_train)

# Predict on the encoded test data and compute the error metrics
y_pred_lr = lr_model.predict(X_test_processed)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)

print(f'RMSE (Linear Regression): {rmse_lr}')
print(f'MAE (Linear Regression): {mae_lr}')

In [None]:
# SHAP for linear regression using KernelExplainer.
# KernelExplainer evaluates the model against every background row for each
# explained sample, so passing the whole training matrix is very slow. A fixed
# random sample of at most 100 rows is used as background instead; the seed
# keeps the result reproducible across runs.
background_lr = shap.sample(X_train_processed, 100, random_state=42)
explainer_lr = shap.KernelExplainer(lr_model.predict, background_lr)
shap_values_lr = explainer_lr.shap_values(X_test_processed)

# The processed matrix carries no column names, so take them from the fitted
# preprocessor to get readable labels on the plot
processed_feature_names = list(preprocessor.get_feature_names_out())

# Plot SHAP summary plot
shap.summary_plot(shap_values_lr, X_test_processed, feature_names=processed_feature_names)
