In [1]:
!pip install shap lime


Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=c9f13463fa8bfa335a5c4bf403b36e10f1f58730500453871c1a4cd62b3ab116
  Stored in directory: /root/.cache/pip/wheels/85/fa/a3/9c2d44c9f3cd77cf4e533b58900b2bf4487f2a17e8ec212a3d
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [2]:
import pandas as pd

# Read the diabetes CSV straight from its raw GitHub URL (fetched over HTTP;
# no manual download step is required).
url = "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows
df.head(n=5)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Dimensions of the dataset as (rows, columns)
print("Rows, Columns:", df.shape)

# Column dtypes; sep="\n" puts the heading and the values on separate lines
print("\nData Types:", df.dtypes, sep="\n")

# Descriptive statistics for every numeric column
print("\nStatistics:", df.describe(), sep="\n")

# Number of missing entries per column
print("\nMissing values:", df.isna().sum(), sep="\n")


Rows, Columns: (768, 9)

Data Types:
Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

Statistics:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max 

In [4]:
# Target: the Outcome label (0 = No Diabetes, 1 = Yes)
y = df['Outcome']
# Features: every column except the label
X = df.drop(columns='Outcome')


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier, allowing up to 1000 solver iterations
model = LogisticRegression(max_iter=1000)

# Fit on the training split (last expression, so the fitted estimator is displayed)
model.fit(X=X_train, y=y_train)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predicted labels for the held-out test rows
y_pred = model.predict(X=X_test)

# Overall accuracy on the test split
print(
    "Accuracy:",
    accuracy_score(y_true=y_test, y_pred=y_pred),
)

# Per-class precision / recall / F1 breakdown
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)


In [None]:
# Rank features by how strongly the logistic-regression coefficient pulls the prediction
import numpy as np

# Sort by |Weight| rather than the signed value: a large negative coefficient is
# just as influential as a large positive one and must not sink to the bottom.
# NOTE: the model is fitted on unscaled features, so coefficient sizes also reflect
# each feature's units; treat this ranking as a rough guide only.
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Weight': model.coef_[0]
}).sort_values(by='Weight', key=np.abs, ascending=False)

feature_importance


In [None]:
import shap

# Initialise SHAP's JavaScript visualisation support in this notebook
# (presumably needed by SHAP's interactive plots — confirm against the SHAP docs)
shap.initjs()


In [None]:
import shap

# Background data for SHAP, taken from the training features
masker = shap.maskers.Independent(X_train)

# Build the explainer from the fitted model and that background masker
explainer = shap.Explainer(model, masker=masker)

# Compute SHAP values for every row of the test set
shap_values = explainer(X_test)


In [None]:
# Summary plot in bar form for the SHAP values computed on X_test;
# X_test is passed alongside so the plot has the matching feature data.
shap.summary_plot(shap_values, X_test, plot_type="bar")


In [None]:
# Waterfall plot for a single prediction: index 0 selects the explanation of the
# first row of X_test (shap_values was computed on X_test).
shap.plots.waterfall(shap_values[0])


In [None]:
# Bar plot over the full set of SHAP values computed for the test set
shap.plots.bar(shap_values)

In [None]:
!pip install lime


In [None]:
import lime
import lime.lime_tabular
import numpy as np


In [None]:
# LIME works on plain NumPy arrays, so hand it the raw training values
X_np = X_train.values
feature_names = X.columns.tolist()
class_names = ['No Diabetes', 'Diabetes']  # index 0 / 1 matches the Outcome encoding

# Create the explainer.
# random_state is fixed because LIME builds explanations from randomly sampled
# perturbations; without a seed the explanation changes on every run.
# 42 matches the seed used for the train/test split.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_np,
    feature_names=feature_names,
    class_names=class_names,
    mode='classification',
    random_state=42
)


In [None]:
# Convert test data to numpy too
X_test_np = X_test.values

# Choose instance to explain (0 = first person in test set)
i = 0


def _predict_proba_with_names(rows):
    """Return the model's class probabilities for an array of feature rows.

    LIME calls the prediction function with plain NumPy arrays, whereas the
    model was fitted on a DataFrame. Re-attaching the column names keeps the
    input consistent with training and avoids scikit-learn's
    "X does not have valid feature names" warning on every call.
    """
    return model.predict_proba(pd.DataFrame(rows, columns=X_test.columns))


# Get explanation
exp = lime_explainer.explain_instance(
    data_row=X_test_np[i],
    predict_fn=_predict_proba_with_names
)


In [None]:
# Display the LIME explanation for the chosen test row inline in the notebook;
# show_table=True asks for the table view to be included as well.
exp.show_in_notebook(show_table=True)
