In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
import pickle
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from datetime import datetime
import re
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides warnings raised by pandas,
# scikit-learn and xgboost — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Load dataset
# The path is relative, so the CSV is resolved against the notebook's working directory.
df = pd.read_csv('synthetic_access_data_10000.csv')

# Function to convert time_in_position to months
def convert_to_months(time_str):
    """Convert a tenure value to a whole number of months.

    Accepted inputs:
      * text with units, e.g. ``"3 years"``, ``"7 months"``, ``"2 years 3 months"``,
        ``"1.5 years"`` (all year/month parts are summed);
      * a bare number, as ``int``, ``float`` or numeric string (``48``, ``48.0``,
        ``"48"``), which is taken to already be in months;
      * missing values (``None`` / ``NaN``), which map to 0.

    Parameters
    ----------
    time_str : str, int, float or None
        Raw tenure value.

    Returns
    -------
    int
        Tenure in months, rounded to the nearest integer; 0 when the value is
        missing or cannot be parsed.
    """
    if pd.isna(time_str):
        return 0
    text = str(time_str).lower().strip()

    # Sum every "<number> year(s)" / "<number> month(s)" pair so that compound
    # values such as "2 years 3 months" are not truncated to the first part.
    pairs = re.findall(r'(\d+(?:\.\d+)?)\s*(year|month)', text)
    if pairs:
        total = sum(float(amount) * (12 if unit == 'year' else 1) for amount, unit in pairs)
        return int(round(total))

    # No "<number><unit>" pair: fall back to the first number in the text.
    # A unit word without any digits (e.g. "a year") is unparseable -> 0
    # instead of raising AttributeError on a failed regex match.
    first_number = re.search(r'\d+(?:\.\d+)?', text)
    if first_number is None:
        return 0
    if 'year' in text:
        return int(round(float(first_number.group()) * 12))
    if 'month' in text:
        return int(round(float(first_number.group())))

    # Bare number: already months. Going through float() keeps values such as
    # 48.0 / "48.0" (typical for float-typed columns) from collapsing to 0.
    try:
        return int(round(float(text)))
    except (ValueError, OverflowError):
        return 0  # Default to 0 for unparseable values

# Normalise the tenure column in place to a number of months
df['time_in_position'] = df['time_in_position'].apply(convert_to_months)

# Model inputs, grouped by how each column is preprocessed later on.
# Categorical columns are one-hot encoded.
categorical_cols = [
    'user_role',
    'department',
    'employee_status',
    'resource_type',
    'resource_sensitivity',
    'request_reason',
]
# Numeric columns are used as-is (after scaling).
numeric_cols = [
    'time_in_position',
    'past_violations',
]
# Raw date columns; each one is turned into a days_since_<col> feature.
date_cols = [
    'last_security_training',
    'employee_join_date',
]

# Calculate derived features (days since)
def calculate_days_since(date_str, current_date="2025-04-05"):
    """Return the number of days between ``date_str`` and ``current_date``.

    Parameters
    ----------
    date_str : str, datetime or None
        Date to measure from. Strings must be in ``YYYY-MM-DD`` format;
        ``datetime`` instances (including ``pd.Timestamp``) are used directly.
    current_date : str, default "2025-04-05"
        Reference date in ``YYYY-MM-DD`` format. The default is a fixed date,
        not "today", so results stay reproducible across runs.

    Returns
    -------
    int
        ``(current_date - date_str).days``. Falls back to ``365 * 5`` (five
        years) when the value is missing, is one of the sentinels ``"Never"`` /
        ``"invalid_date"``, or cannot be interpreted as a date.
    """
    default_days = 365 * 5  # Default to 5 years if missing or unparseable
    if pd.isna(date_str) or date_str == "Never" or date_str == "invalid_date":
        return default_days
    try:
        if isinstance(date_str, datetime):
            # Already a datetime-like object: no parsing needed.
            date_obj = date_str
        else:
            date_obj = datetime.strptime(date_str, "%Y-%m-%d")
        current_date_obj = datetime.strptime(current_date, "%Y-%m-%d")
        return (current_date_obj - date_obj).days
    except (ValueError, TypeError):
        # ValueError: wrong format. TypeError: non-string input (e.g. an int)
        # or a naive/aware datetime mismatch in the subtraction.
        return default_days

# Derive a days_since_<col> feature from every raw date column
for col in date_cols:
    df[f'days_since_{col}'] = df[col].apply(calculate_days_since)

# Select only the specified features
feature_cols = categorical_cols + numeric_cols + [f'days_since_{col}' for col in date_cols]
X = df[feature_cols]
y = df['is_approved']

# Split data BEFORE fitting the preprocessor, so the encoder categories and
# the scaler min/max are learned from the training rows only. Fitting on the
# full dataset would leak information about the test rows into the features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        # Categories unseen during fit are encoded as all-zero rows instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
        ('num', MinMaxScaler(), numeric_cols + [f'days_since_{col}' for col in date_cols])
    ]
)

# Apply preprocessing: fit on the training split, then reuse the fitted
# transformer for the test split.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full processed matrix, kept under its previous name for any later cell that
# still refers to it. It is not used for training or evaluation below.
X_processed = preprocessor.transform(X)

# Initialize and train XGBoost model
xgb_model = XGBClassifier(
    max_depth=6,
    learning_rate=0.1,
    gamma=0.5,
    reg_lambda=1.0,
    reg_alpha=0.1,
    # NOTE(review): a value below 1 down-weights the positive class
    # (is_approved == 1) — presumably to bias towards denial; confirm intent.
    scale_pos_weight=0.7,
    eval_metric='logloss',
    random_state=42
)

# The eval_set is only used to record the metric on the held-out rows; no early
# stopping is configured here.
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

# Evaluate model
y_pred = xgb_model.predict(X_test)
y_prob = xgb_model.predict_proba(X_test)[:, 1]  # probability of class 1

print("\nTest Set Performance:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"AUC-ROC: {roc_auc_score(y_test, y_prob):.4f}")

# Export model and preprocessor
# NOTE: pickle files can execute arbitrary code when loaded; only load these
# artefacts from a trusted location.
with open('xgb_model.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)
with open('preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)
print("\nModel and preprocessor exported as 'xgb_model.pkl' and 'preprocessor.pkl'")

# Prediction function
def predict_permission(request, preprocessor, model):
    """Classify a single access request with a fitted preprocessor and model.

    The request goes through the same feature engineering as the training
    data: ``time_in_position`` is normalised to months, a ``days_since_<col>``
    feature is derived for every column in ``date_cols``, and the resulting
    row is transformed by ``preprocessor`` before being passed to ``model``.

    Parameters
    ----------
    request : mapping
        Must contain every name in ``categorical_cols``, ``numeric_cols`` and
        ``date_cols``.
    preprocessor : fitted transformer exposing ``transform``
    model : fitted classifier exposing ``predict``

    Returns
    -------
    str
        ``"Approved"`` when the predicted label is truthy, otherwise
        ``"Denied - Flagged for admin review"``.

    Raises
    ------
    KeyError
        If the request lacks any required field.
    """
    derived_cols = [f'days_since_{col}' for col in date_cols]

    # Fail early with one readable message instead of an opaque pandas KeyError.
    required_fields = categorical_cols + numeric_cols + date_cols
    missing_fields = [field for field in required_fields if field not in request]
    if missing_fields:
        raise KeyError(f"request is missing required field(s): {missing_fields}")

    # Convert request to DataFrame
    request_df = pd.DataFrame([request])

    # Mirror the training-time handling of time_in_position: text such as
    # "3 years" is converted to months and a missing value becomes 0.
    # Numeric values are assumed to be months already and are left untouched.
    tenure = request['time_in_position']
    tenure_is_missing = pd.api.types.is_scalar(tenure) and pd.isna(tenure)
    if isinstance(tenure, str) or tenure_is_missing:
        request_df['time_in_position'] = request_df['time_in_position'].apply(convert_to_months)

    # Calculate derived features
    for col in date_cols:
        request_df[f'days_since_{col}'] = request_df[col].apply(calculate_days_since)

    # Select only the features used in training, in the training column order
    X_req = request_df[categorical_cols + numeric_cols + derived_cols]

    # Transform the request
    X_req_processed = preprocessor.transform(X_req)

    # Make prediction
    prediction = model.predict(X_req_processed)[0]
    return "Approved" if prediction else "Denied - Flagged for admin review"

# Smoke-test the prediction helper on one hand-written request
sample_request = dict(
    user_role="Employee",
    department="Sales",
    employee_status="Full-time",
    resource_type="pdf",
    resource_sensitivity="confidential",
    request_reason="Routine check",
    time_in_position=48,  # already expressed in months
    # NOTE(review): 90 past violations looks unusually high — confirm this is intended
    past_violations=90,
    last_security_training="2024-07-01",
    employee_join_date="2019-03-01",
)

print("\nSample prediction:")
print(predict_permission(sample_request, preprocessor, xgb_model))


Test Set Performance:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      1376
           1       0.92      0.95      0.94       624

    accuracy                           0.96      2000
   macro avg       0.95      0.96      0.95      2000
weighted avg       0.96      0.96      0.96      2000

Confusion Matrix:
 [[1327   49]
 [  32  592]]
Accuracy: 0.9595
AUC-ROC: 0.9955

Model and preprocessor exported as 'xgb_model.pkl' and 'preprocessor.pkl'

Sample prediction:
Denied - Flagged for admin review


In [3]:
# NOTE(review): this re-read rebinds `df` to the raw CSV contents, discarding the
# converted time_in_position values and the days_since_* columns added to `df`
# in the earlier cell. The execution count (3) also skips 2, so the notebook was
# not run top to bottom.
df = pd.read_csv('synthetic_access_data_10000.csv')
# Distinct values present in the resource_sensitivity column
df['resource_sensitivity'].unique()

array(['confidential', 'restricted', 'public'], dtype=object)