# Q4-a: Built-in feature importances of tree-based classifiers

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
def GetFeatureImportances(dic_models,filename):
    """Fit each model and tabulate its min-max normalised feature importances.

    Parameters
    ----------
    dic_models : dict
        Maps a display name to an unfitted estimator that exposes ``fit`` and,
        once fitted, a ``feature_importances_`` attribute.
    filename : str or path-like
        CSV file holding the feature columns plus a ``Loan_Status`` target column.

    Returns
    -------
    pandas.DataFrame
        One row per feature column (every CSV column except ``Loan_Status``)
        and one column per entry of ``dic_models``. Each column is rescaled so
        its smallest importance is 0 and its largest is 1; a model whose
        importances are all equal gets a column of zeros.
    """
    df = pd.read_csv(filename)
    features = df.drop(columns='Loan_Status')
    # Label rows from the real feature columns rather than "all but the last
    # CSV column", so the table stays correct wherever the target column sits.
    feature_names = list(features.columns)
    X = features.values
    y = df['Loan_Status'].values
    # Only the training part of the 80/20 split is needed to fit the models.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.20, random_state=42)

    df1 = pd.DataFrame(index=feature_names)
    for name, model in dic_models.items():
        model.fit(X_train, y_train)
        imp = pd.Series(model.feature_importances_, index=feature_names, dtype=float)
        lowest = imp.min()
        span = imp.max() - lowest
        if span == 0:
            # Min-max scaling is undefined for constant importances; report
            # zeros instead of dividing by zero and filling the column with NaN.
            df1[name] = 0.0
        else:
            df1[name] = (imp - lowest) / span
    return df1

In [None]:
# Tree ensembles to compare; the insertion order below is the column order of the table.
dic_models = dict(
    ET=ExtraTreesClassifier(random_state=1),
    XGB=XGBClassifier(random_state=2),
    RF=RandomForestClassifier(random_state=2),
    LGBM=LGBMClassifier(random_state=2),
)
# Fit every model and collect its normalised importances; the bare last line renders the table.
df_featImportances = GetFeatureImportances(dic_models, filename="q4_XAI_data.csv")
df_featImportances

# Q4-b: Classification metrics before and after removing the top-ranked features

In [None]:
from sklearn.metrics import recall_score, precision_score, f1_score,accuracy_score

In [None]:
def _fit_and_score(model, features, target):
    """Fit ``model`` on an 80/20 split (seed 42) and score it on the held-out 20%."""
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.20, random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
    }


def RunClassification_featureRemoval(dic_models, filename, n_feats_remove, feat_importances=None):
    """Score each model on all features and again without its top-ranked features.

    Parameters
    ----------
    dic_models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
    filename : str or path-like
        CSV file holding the feature columns plus a ``Loan_Status`` target column.
        It is read exactly as given (no machine-specific directory is prepended).
    n_feats_remove : int
        How many of the highest-importance features to drop for the second run.
    feat_importances : pandas.DataFrame, optional
        Importance table indexed by feature name with one column per model
        name. When omitted, the ranking comes from the model's own
        ``feature_importances_`` after the full-feature fit, so the function
        does not depend on any notebook-level variable.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``name`` with columns ``accuracy``, ``precision``, ``recall``
        and ``f1``. Each model contributes two rows: ``<name>`` (all features)
        and ``<name>-rtop<n_feats_remove>`` (top features removed).

    Raises
    ------
    ValueError
        If ``n_feats_remove`` would leave no feature columns to train on.
    """
    df = pd.read_csv(filename)
    X = df.drop(columns='Loan_Status')
    y = df['Loan_Status']

    if n_feats_remove >= X.shape[1]:
        raise ValueError(
            f"n_feats_remove={n_feats_remove} would remove all {X.shape[1]} feature columns"
        )

    records = []
    for name, model in dic_models.items():
        # Baseline: every feature column.
        records.append({'name': name, **_fit_and_score(model, X, y)})

        if feat_importances is None:
            # The model was just fitted on all features, so its own ranking is available.
            ranking = pd.Series(model.feature_importances_, index=X.columns)
        else:
            ranking = feat_importances[name]
        # Sort most-important first, then skip the first n_feats_remove entries.
        X_remaining_cols = ranking.sort_values(ascending=False).iloc[n_feats_remove:].index

        records.append({
            'name': f"{name}-rtop{n_feats_remove}",
            **_fit_and_score(model, X[X_remaining_cols], y),
        })

    # Build the frame once from the collected rows instead of growing it row by row.
    columns = ['name', 'accuracy', 'precision', 'recall', 'f1']
    return pd.DataFrame(records, columns=columns).set_index('name')

In [None]:
# Only the two boosted models are compared in this part.
dic_models = dict(
    XGB=XGBClassifier(random_state=2),
    LGBM=LGBMClassifier(random_state=2),
)
# Score each model with all features and again with its 3 top-ranked features removed.
df_fremovalRess = RunClassification_featureRemoval(
    dic_models,
    filename="q4_XAI_data.csv",
    n_feats_remove=3,
)
df_fremovalRess

# Q4-c: TreeSHAP-based feature importances

In [2]:
import shap
import numpy as np

In [3]:
def GetTreeSHAPImportance_values(dic_models, filename, test_size):
    """Fit each model and tabulate min-max normalised TreeSHAP feature importances.

    A feature's importance is the sum of the absolute SHAP values that
    ``shap.TreeExplainer`` assigns to it over the held-out test rows.

    Parameters
    ----------
    dic_models : dict
        Maps a display name to an unfitted estimator whose ``fit`` returns the
        fitted model (which is then handed to ``shap.TreeExplainer``).
    filename : str or path-like
        CSV file holding the feature columns plus a ``Loan_Status`` target column.
    test_size : float or int
        Forwarded to ``train_test_split``; the resulting test rows are the ones
        explained.

    Returns
    -------
    pandas.DataFrame
        One row per feature column (every CSV column except ``Loan_Status``)
        and one column per entry of ``dic_models``, each rescaled to span
        0 to 1. A model whose importances are all equal gets a column of zeros.
    """
    df = pd.read_csv(filename)
    features = df.drop(columns='Loan_Status')
    # Label rows from the real feature columns rather than "all but the last
    # CSV column", so the table stays correct wherever the target column sits.
    feature_names = list(features.columns)
    X = features.values
    y = df['Loan_Status'].values
    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=test_size, random_state=42)

    df1 = pd.DataFrame(index=feature_names)
    for name, clf in dic_models.items():
        model = clf.fit(X_train, y_train)
        explainer = shap.TreeExplainer(model)
        vals = explainer.shap_values(X_test)
        if isinstance(vals, list):
            # Some explainers return one array per class; keep the first one.
            vals = vals[0]

        # Sum |SHAP| over every axis except the feature axis (axis 1). This
        # handles any number of features (no hard-coded column count) and also
        # arrays carrying a trailing per-class axis.
        abs_vals = np.abs(np.asarray(vals))
        other_axes = tuple(ax for ax in range(abs_vals.ndim) if ax != 1)
        imp = abs_vals.sum(axis=other_axes)

        lowest = imp.min()
        span = imp.max() - lowest
        if span == 0:
            # Min-max scaling is undefined for constant importances; report
            # zeros instead of dividing by zero and filling the column with NaN.
            normalized = np.zeros(imp.shape, dtype=float)
        else:
            normalized = (imp - lowest) / span

        df1[name] = normalized
    return df1

In [4]:
# Same four tree ensembles as before; insertion order is the column order of the table.
dic_models = dict(
    ET=ExtraTreesClassifier(random_state=1),
    XGB=XGBClassifier(random_state=2),
    RF=RandomForestClassifier(random_state=2),
    LGBM=LGBMClassifier(random_state=2),
)

# Explain the held-out 20% of rows with TreeSHAP and collect normalised importances.
df_treeSHAPImportances = GetTreeSHAPImportance_values(
    dic_models,
    filename="q4_XAI_data.csv",
    test_size=0.2,
)


ntree_limit is deprecated, use `iteration_range` or model slicing instead.
LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


In [5]:
# Render the min-max normalised TreeSHAP importance table (rows: features, columns: models).
df_treeSHAPImportances

Unnamed: 0,ET,XGB,RF,LGBM
Gender,0.022885,0.059476,0.014118,0.013935
Married,0.124195,0.176754,0.096959,0.149068
Dependents,0.10266,0.119689,0.059385,0.085737
Education,0.040771,0.034346,0.013676,0.05203
Self_Employed,0.0,0.0,0.0,0.0
ApplicantIncome,0.055228,0.58162,0.152531,0.416175
CoapplicantIncome,0.066092,0.458876,0.109563,0.347184
LoanAmount,0.069007,0.545512,0.152045,0.358062
Loan_Amount_Term,0.068818,0.116028,0.116074,0.042146
Credit_History,1.0,1.0,1.0,1.0
