In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, entry) for entry in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/housing-prices-dataset/Housing.csv
/kaggle/input/mushroom-classification/mushrooms.csv
/kaggle/input/heart-disease-dataset/heart.csv
/kaggle/input/healthcare-dataset/healthcare_dataset.csv
/kaggle/input/logistic-regression/Social_Network_Ads.csv


In [2]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

In [3]:
! pip install imbalanced-learn



In [7]:
# classifier
import shap
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN, SMOTETomek
import time
import xgboost as xgb

def check_class_imbalance(y, threshold=0.1):
    """Print the class distribution of ``y`` and report whether it is imbalanced.

    Parameters
    ----------
    y : pandas.Series or array-like
        Class labels. Anything accepted by ``pd.Series`` (list, tuple,
        1-D NumPy array, Series) may be passed.
    threshold : float, default 0.1
        The labels are treated as imbalanced when the ratio
        ``minority count / majority count`` is strictly below this value.

    Returns
    -------
    bool
        ``True`` if the imbalance ratio is below ``threshold``, otherwise ``False``.
    """
    # Wrapping in a Series lets plain lists and NumPy arrays through; a Series
    # input keeps its name and values, so the printed distribution is unchanged.
    class_counts = pd.Series(y).value_counts()

    # Get the majority and minority class counts
    majority_class_count = class_counts.max()
    minority_class_count = class_counts.min()

    # Calculate the imbalance ratio
    imbalance_ratio = minority_class_count / majority_class_count

    print(f"Class distribution:\n{class_counts}\n")
    print(f"Imbalance ratio (minority/majority): {imbalance_ratio:.2f}")

    # Check if the imbalance ratio is below the threshold
    if imbalance_ratio < threshold:
        print("Class imbalance detected. SMOTE is recommended.")
        return True

    print("No significant class imbalance detected.")
    return False

# Preprocessing function with SHAP and SMOTE/undersampling/oversampling
def _mean_abs_shap(shap_values):
    """Collapse TreeExplainer output into one mean-|SHAP| score per feature."""
    if isinstance(shap_values, list):
        # Legacy SHAP layout: one (n_samples, n_features) array per class.
        values = np.stack([np.asarray(v) for v in shap_values], axis=-1)
    else:
        # Newer SHAP releases (0.45+ per the release notes - confirm against the
        # installed version) return a single array; for multi-class tree models
        # it is (n_samples, n_features, n_classes).
        values = np.asarray(shap_values)

    magnitudes = np.abs(values)
    if magnitudes.ndim == 3:
        # Average over samples and classes. For a binary forest the two classes
        # carry mirrored contributions, so this matches using class 1 alone.
        return magnitudes.mean(axis=(0, 2))
    return magnitudes.mean(axis=0)


def preprocess_data_with_sampling(df, target_column, sampling_strategy='none', shap_threshold=0.01):
    """Clean, encode, split, optionally resample, scale and SHAP-filter a dataset.

    Steps, in order:

    1. Drop rows with missing values and duplicated rows (on a private copy, so
       the caller's frame is not modified).
    2. Label-encode every ``object`` column, including the target if it is one.
    3. Split 80/20 into train and test with ``random_state=42``.
    4. Optionally resample the training split only (see ``sampling_strategy``).
    5. Standardize features; the scaler is fitted on the training split.
    6. Fit a Random Forest on the training split, compute mean absolute SHAP
       values per feature and keep features whose score exceeds
       ``shap_threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing the features and ``target_column``.
    target_column : str
        Name of the label column.
    sampling_strategy : str, default 'none'
        ``'smote'`` (applied only if ``check_class_imbalance`` reports an
        imbalance on the training labels), ``'undersample'``, ``'oversample'``,
        ``'smoteenn'`` or ``'smotetomek'``. ``'none'`` / ``None`` skip
        resampling. Any other value is ignored with a warning.
    shap_threshold : float, default 0.01
        Minimum mean absolute SHAP value a feature needs in order to be kept.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The feature sets are DataFrames
        restricted to the selected columns with a fresh integer index.

    Raises
    ------
    ValueError
        If the SHAP output does not yield exactly one score per feature.
    """
    import warnings

    # Handle missing values and duplicates on an explicit copy, so the column
    # assignments below never touch the caller's frame.
    df = df.dropna().drop_duplicates().copy()

    # Encode categorical features
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = LabelEncoder().fit_transform(df[col])

    # Split data into features and labels
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    feature_names = X.columns

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Apply sampling strategy (training split only, so the test set stays untouched)
    unconditional_samplers = {
        'undersample': RandomUnderSampler,
        'oversample': RandomOverSampler,
        'smoteenn': SMOTEENN,
        'smotetomek': SMOTETomek,
    }
    if sampling_strategy == 'smote':
        # SMOTE is only applied when the training labels are actually imbalanced.
        if check_class_imbalance(y_train):
            X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)
    elif sampling_strategy in unconditional_samplers:
        sampler = unconditional_samplers[sampling_strategy](random_state=42)
        X_train, y_train = sampler.fit_resample(X_train, y_train)
    elif sampling_strategy not in ('none', None):
        # Previously a typo such as 'SMOTE' silently disabled resampling.
        warnings.warn(
            f"Unknown sampling_strategy {sampling_strategy!r}; no resampling applied.",
            stacklevel=2,
        )

    # Standardize features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train a base model (Random Forest); seeded so feature selection is reproducible
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Calculate mean absolute SHAP values for feature importance
    explainer = shap.TreeExplainer(model)
    shap_importance = _mean_abs_shap(explainer.shap_values(X_train))
    if shap_importance.shape != (len(feature_names),):
        raise ValueError(
            f"Expected {len(feature_names)} SHAP importances, got shape {shap_importance.shape}."
        )
    feature_importance = pd.DataFrame({'feature': feature_names,
                                       'importance': shap_importance})

    # Remove features with importance less than the threshold
    important_features = feature_importance[feature_importance['importance'] > shap_threshold]['feature'].values
    X_train = pd.DataFrame(X_train, columns=feature_names)[important_features]
    X_test = pd.DataFrame(X_test, columns=feature_names)[important_features]

    return X_train, X_test, y_train, y_test

# Function to evaluate models
def evaluate_model(X_train, X_test, y_train, y_test):
    """Fit a fixed set of classifiers and report test accuracy and elapsed time.

    Each model is trained on ``(X_train, y_train)`` and scored with
    ``accuracy_score`` on ``(X_test, y_test)``. The reported time spans fitting,
    prediction and scoring. One progress line and one result line are printed
    per model.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels.

    Returns
    -------
    dict
        Maps each model name to ``{"accuracy": float, "time": float}``, so the
        results can be compared programmatically as well as read from the log.
    """
    # Estimators with internal randomness share one seed so reruns give the
    # same accuracy table; the seed matches the one used for the data split.
    seed = 42
    models = {
        "Logistic Regression": LogisticRegression(),
        "Decision Tree": DecisionTreeClassifier(random_state=seed),
        "Random Forest": RandomForestClassifier(random_state=seed),
        "SVM": SVC(),
        "KNN": KNeighborsClassifier(),
        "Gradient Boosting": GradientBoostingClassifier(random_state=seed),
        "XGBoost": xgb.XGBClassifier(random_state=seed),
        "AdaBoost": AdaBoostClassifier(random_state=seed),
        "Naive Bayes": GaussianNB(),
        "MLP Neural Network": MLPClassifier(random_state=seed)
    }

    results = {}
    for name, model in models.items():
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        time_consumed = time.perf_counter() - start_time
        print(f"{name} Accuracy: {np.round(accuracy,2)} Time: {time_consumed}")
        results[name] = {"accuracy": accuracy, "time": time_consumed}

    return results

# Run all classifiers with sampling strategies
def run_all_classifiers(df, target_column, sampling_strategy='none'):
    """Preprocess ``df`` and evaluate every classifier on the resulting split.

    ``df``, ``target_column`` and ``sampling_strategy`` are forwarded to
    ``preprocess_data_with_sampling``; the four pieces it returns are handed
    to ``evaluate_model`` in the same order. Nothing is returned.
    """
    train_features, test_features, train_labels, test_labels = preprocess_data_with_sampling(
        df, target_column, sampling_strategy
    )
    evaluate_model(train_features, test_features, train_labels, test_labels)

# Example usage with a dataset
# Driver: load the dataset, check the training labels for imbalance, then run the
# classifier comparison with or without resampling accordingly.
if __name__ == "__main__":
    # Load your dataset here
    df = pd.read_csv("/kaggle/input/mushroom-classification/mushrooms.csv")
    target_column = 'class'  # Change this to your dataset's target column

    # Handle missing values and duplicates without mutating a frame in place
    df = df.dropna().drop_duplicates()
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # Only y_train is used below; the split exists to look at the training labels.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    check = check_class_imbalance(y_train)
    if check:
        # Run all classifiers with different sampling strategies
        print("Running with SMOTE...")
        run_all_classifiers(df, target_column, sampling_strategy='smote')

        print("\nRunning with Random Oversampling...")
        run_all_classifiers(df, target_column, sampling_strategy='oversample')

        print("\nRunning with Random Undersampling...")
        run_all_classifiers(df, target_column, sampling_strategy='undersample')
    else:
        # Classes are reasonably balanced: run once with the default (no resampling)
        run_all_classifiers(df, target_column)

Class distribution:
class
e    3365
p    3134
Name: count, dtype: int64

Imbalance ratio (minority/majority): 0.93
No significant class imbalance detected.
Training Logistic Regression...
Logistic Regression Accuracy: 0.93 Time: 0.04582333564758301
Training Decision Tree...
Decision Tree Accuracy: 1.0 Time: 0.018677949905395508
Training Random Forest...
Random Forest Accuracy: 1.0 Time: 0.49588847160339355
Training SVM...
SVM Accuracy: 1.0 Time: 0.11274194717407227
Training KNN...
KNN Accuracy: 1.0 Time: 0.15916037559509277
Training Gradient Boosting...
Gradient Boosting Accuracy: 1.0 Time: 0.4706897735595703
Training XGBoost...
XGBoost Accuracy: 1.0 Time: 0.1379077434539795
Training AdaBoost...
AdaBoost Accuracy: 1.0 Time: 0.294081449508667
Training Naive Bayes...
Naive Bayes Accuracy: 0.88 Time: 0.00832366943359375
Training MLP Neural Network...
MLP Neural Network Accuracy: 1.0 Time: 4.089228868484497


In [8]:
# Disabled earlier draft of this pipeline, kept only for reference.
# The whole cell is a single triple-quoted string expression, so none of the code
# inside it runs; the notebook merely echoes the string back as the cell output.
# The draft defines preprocess_data_with_shap (no resampling step) together with
# its own evaluate_model and run_all_classifiers.
"""import shap
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import time
import xgboost as xgb

# Preprocessing function with SHAP
def preprocess_data_with_shap(df, target_column, shap_threshold=0.01):
    # Handle missing values
    df = df.dropna()
    
    # Encode categorical features
    label_encoders = {}
    for col in df.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
        
    # Split data into features and labels
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Standardize features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # Train a base model (Random Forest)
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    
    # Calculate SHAP values
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_train)
    
    # Calculate mean absolute SHAP values for feature importance
    shap_importance = np.mean(np.abs(shap_values[1]), axis=0)
    feature_importance = pd.DataFrame({'feature': df.drop(target_column, axis=1).columns, 
                                       'importance': shap_importance})
    
    # Remove features with importance less than the threshold
    important_features = feature_importance[feature_importance['importance'] > shap_threshold]['feature'].values
    X_train = pd.DataFrame(X_train, columns=df.drop(target_column, axis=1).columns)[important_features]
    X_test = pd.DataFrame(X_test, columns=df.drop(target_column, axis=1).columns)[important_features]
    
    return X_train, X_test, y_train, y_test

# Function to evaluate models
def evaluate_model(X_train, X_test, y_train, y_test):
    models = {
        "Logistic Regression": LogisticRegression(),
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
        "SVM": SVC(),
        "KNN": KNeighborsClassifier(),
        "Gradient Boosting": GradientBoostingClassifier(),
        "XGBoost": xgb.XGBClassifier(),
        "AdaBoost": AdaBoostClassifier(),
        "Naive Bayes": GaussianNB(),
        "MLP Neural Network": MLPClassifier()
    }
    
    for name, model in models.items():
        start_time = time.time()
        print(f"Training {name}...")
        model = model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        time_consumed = time.time() - start_time
        print(f"{name} Accuracy: {np.round(accuracy,2)} Time: {time_consumed}")

# Run all classifiers
def run_all_classifiers(df, target_column):
    X_train, X_test, y_train, y_test = preprocess_data_with_shap(df, target_column)
    evaluate_model(X_train, X_test, y_train, y_test)

# Example usage with a dataset
if __name__ == "__main__":
    # Load your dataset here
    df = pd.read_csv("/kaggle/input/mushroom-classification/mushrooms.csv")
    target_column = 'class'  # Change this to your dataset's target column
    
    # Run all classifiers on the dataset
    run_all_classifiers(df, target_column)
"""

'import shap\nimport numpy as np\nimport pandas as pd\nfrom sklearn.preprocessing import LabelEncoder, StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.neural_network import MLPClassifier\nimport time\nimport xgboost as xgb\n\n# Preprocessing function with SHAP\ndef preprocess_data_with_shap(df, target_column, shap_threshold=0.01):\n    # Handle missing values\n    df = df.dropna()\n    \n    # Encode categorical features\n    label_encoders = {}\n    for col in df.select_dtypes(include=[\'object\']).columns:\n        le = LabelEncoder()\n        df[col] = le.fit_transform(df[col])\n    

Training Linear Regression...
Linear Regression MSE: 1771751116594.04 Time: 0.015462398529052734
Training Decision Tree...
Decision Tree MSE: 2835977158256.88 Time: 0.002724170684814453
Training Random Forest...
Random Forest MSE: 1926061032226.21 Time: 0.292468786239624
Training SVM...
SVM MSE: 5567937467895.9 Time: 0.01466989517211914
Training KNN...
KNN MSE: 2106456221466.06 Time: 0.0030815601348876953
Training Gradient Boosting...
Gradient Boosting MSE: 1699486795075.28 Time: 0.09216928482055664
Training XGBoost...
XGBoost MSE: 2032404618961.44 Time: 0.06665825843811035
Training AdaBoost...
AdaBoost MSE: 2214324085042.79 Time: 0.12534332275390625
Training MLP Neural Network...
MLP Neural Network MSE: 30127923406265.48 Time: 0.8832221031188965


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
