### Modeling Objectives
- Train and evaluate classification models to predict satisfaction.
- Use SHAP or LIME to explain key satisfaction drivers.
- Monitor performance using recall, F1, and AUC scores.


In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, recall_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Import explainability libraries 
import shap
import lime
from lime.lime_tabular import LimeTabularExplainer 


: 

In [None]:
# Read cleaned.csv into df1 and preview its first rows
df1 = pd.read_csv(filepath_or_buffer="cleaned.csv")
df1.head(n=5)

In [None]:
# Read eda_incl.csv into df2 and preview its first rows
df2 = pd.read_csv(filepath_or_buffer='eda_incl.csv')
df2.head(n=5)

In [None]:
# Report which column names appear in both df1 and df2
df1_cols, df2_cols = set(df1.columns), set(df2.columns)
common_cols = df1_cols & df2_cols
print(f"Common columns: {common_cols}")


In [None]:
# Combine df1 and df2 into a single frame.
# Join keys are taken in df1's column order instead of from a set: an outer
# merge sorts rows by the join keys, so an arbitrary key order would make the
# row order of merged_df (and any seeded split built from it) vary between
# kernel restarts.
common_cols = [col for col in df1.columns if col in df2.columns]
if not common_cols:
    raise ValueError("df1 and df2 share no columns to merge on")

# Columns that exist only in df2
df2_unique_cols = [col for col in df2.columns if col not in df1.columns]

# Outer merge on every shared column keeps rows found in either frame
merged_df = pd.merge(df1, df2[common_cols + df2_unique_cols], on=common_cols, how='outer')

merged_df.head()

In [None]:
# Display the (rows, columns) shape of the merged frame
merged_df.shape

In [None]:
# Build the modelling dataset from merged_df, then train and score three classifiers.

# Step 1: Create target variable (binary classification)
# 'Strongly Agree' / 'Agree' -> 1, every other response -> 0.
responses = merged_df['Satisfaction Response']
merged_df['Satisfied'] = responses.isin(['Strongly Agree', 'Agree']).astype(int)

# Rows without a response carry no label, so they are left out of modelling
# instead of being counted as "not satisfied".
has_response = responses.notna()
print(f"Rows without a satisfaction response (excluded): {(~has_response).sum()}")

# Step 2: Select features (exclude target and ID columns)
features = ['Agency Name', 'Complaint Type', 'Descriptor', 'Borough', 
           'Survey Year', 'Survey Month']
X = merged_df.loc[has_response, features].copy()
y = merged_df.loc[has_response, 'Satisfied']

# Step 3: Handle missing values BEFORE encoding.
# Encoding casts values with astype(str), which turns NaN into the literal
# string 'nan'; imputing afterwards would leave that as its own category.
X = X.fillna(X.mode().iloc[0])  # Fill with most frequent value
# Or drop rows with missing values: X = X.dropna()

# Check for remaining NaN values
print("Missing values:", X.isnull().sum().sum())

# Step 4: Encode categorical variables as integer codes.
# The fitted encoders are kept in le_dict so codes can be mapped back to labels.
le_dict = {}
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    le_dict[col] = le

# Step 5: Split data (stratified so both splits keep the same class balance)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 6: Train models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    # max_iter raised from the default: the features are unscaled integer codes,
    # on which the default iteration budget may stop before convergence.
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# results maps model name -> fitted model plus its test-set recall, F1 and AUC
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # probability of class 1
    
    results[name] = {
        'model': model,
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, y_pred_proba)
    }
    
    print(f'{name}: Recall={results[name]["recall"]:.3f}, F1={results[name]["f1"]:.3f}, AUC={results[name]["auc"]:.3f}')

In [None]:
# Step 7: Select best model and explain with SHAP
best_model = results['Random Forest']['model']  # Or choose based on metrics
explainer = shap.Explainer(best_model, X_train)
shap_values = explainer(X_test)

# A classifier can yield one set of SHAP values per class
# (samples x features x classes). The summary plot needs a 2-D matrix, so keep
# the values for class 1 ("Satisfied") when a class axis is present.
if shap_values.values.ndim == 3:
    shap_values = shap_values[:, :, 1]

shap.summary_plot(shap_values, X_test, feature_names=features)

In [None]:


# Prepare features and target
# X = df.drop('satisfaction_column', axis=1)  # Replace with your feature columns
# y = df['satisfaction_column']  # Replace with your target column

# Handle categorical variables
# categorical_cols = X.select_dtypes(include=['object']).columns
# for col in categorical_cols:
#     le = LabelEncoder()
#     X[col] = le.fit_transform(X[col])

# Split data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Fresh, unfitted estimators keyed by display name.
# NOTE: this cell rebinds `models` and `results`; anything stored under those
# names by an earlier cell is no longer reachable through them afterwards.
models = {}
models['Logistic Regression'] = LogisticRegression(random_state=42)
models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42)
models['Gradient Boosting'] = GradientBoostingClassifier(random_state=42)

# Empty container for per-model metrics
results = dict()

# for name, model in models.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     y_pred_proba = model.predict_proba(X_test)[:, 1]
#     
#     results[name] = {
#         'model': model,
#         'recall': recall_score(y_test, y_pred),
#         'f1': f1_score(y_test, y_pred),
#         'auc': roc_auc_score(y_test, y_pred_proba)
#     }
#     
#     print(f'{name} Performance:')
#     print(f'Recall: {results[name]["recall"]:.3f}')
#     print(f'F1 Score: {results[name]["f1"]:.3f}')
#     print(f'AUC Score: {results[name]["auc"]:.3f}')
#     print('\n')

In [None]:
# Compare model performance
# performance_df = pd.DataFrame({
#     'Model': list(results.keys()),
#     'Recall': [results[model]['recall'] for model in results],
#     'F1 Score': [results[model]['f1'] for model in results],
#     'AUC Score': [results[model]['auc'] for model in results]
# })
# 
# print(performance_df)
# 
# # Select best model
# best_model_name = performance_df.loc[performance_df['F1 Score'].idxmax(), 'Model']
# best_model = results[best_model_name]['model']
# print(f'Best model: {best_model_name}')

In [None]:
# SHAP explanations
# explainer = shap.Explainer(best_model, X_train)
# shap_values = explainer(X_test)
# 
# # Summary plot
# shap.summary_plot(shap_values, X_test, feature_names=X.columns)
# 
# # Feature importance
# shap.summary_plot(shap_values, X_test, plot_type='bar', feature_names=X.columns)

In [None]:
# LIME explanations
# explainer = LimeTabularExplainer(
#     X_train.values,
#     feature_names=X.columns,
#     class_names=['Not Satisfied', 'Satisfied'],
#     mode='classification'
# )
# 
# # Explain a single instance
# idx = 0  # Change to any test instance
# exp = explainer.explain_instance(
#     X_test.iloc[idx].values,
#     best_model.predict_proba,
#     num_features=10
# )
# 
# exp.show_in_notebook(show_table=True)

In [None]:
# Performance monitoring visualization
# fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# 
# metrics = ['Recall', 'F1 Score', 'AUC Score']
# for i, metric in enumerate(metrics):
#     performance_df.plot(x='Model', y=metric, kind='bar', ax=axes[i], rot=45)
#     axes[i].set_title(f'{metric} Comparison')
#     axes[i].set_ylabel(metric)
# 
# plt.tight_layout()
# plt.show()

### Key Steps Summary:

1. **Data Preparation**: Load data, handle categorical variables, split into train/test
2. **Model Training**: Train multiple classification models (Logistic Regression, Random Forest, Gradient Boosting)
3. **Performance Evaluation**: Compare models using recall, F1, and AUC scores
4. **Model Explanation**: Use SHAP for global feature importance and LIME for local explanations
5. **Performance Monitoring**: Visualize and track model performance metrics

**Next Steps:**
- Uncomment and adapt the code with your actual data
- Replace placeholder column names with your actual feature and target columns
- Consider hyperparameter tuning for the best performing model
- Analyze SHAP/LIME results to identify key satisfaction drivers