In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBRegressor, XGBClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import HuberRegressor, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

ModuleNotFoundError: No module named 'xgboost'

In [10]:
#Gain insight on the dataset
def eda_pipeline(df):
    """Print a textual overview of ``df`` and draw basic exploratory plots.

    The report covers, in order: shape and dtypes, missing values per column,
    cardinality of ``object`` columns, a ``describe()`` summary of numeric
    columns, IQR-based outliers with one box plot per numeric column, one
    histogram per numeric column, and a correlation heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    None
        Everything is reported through ``print`` and matplotlib figures.
    """
    # Basic Information
    print("Dataset Shape:", df.shape)
    print("\nColumn Data Types:")
    print(df.dtypes)

    # Missing Values
    missing_values = df.isnull().sum()
    missing_percent = (missing_values / len(df)) * 100
    missing_info = pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percent})
    print("\nMissing Values:")
    print(missing_info[missing_info['Missing Values'] > 0])

    # Categorical Cardinality
    categorical_cols = df.select_dtypes(include=['object']).columns
    # "> 0", not "> 1": a frame with a single categorical column must be reported too.
    if len(categorical_cols) > 0:
        print("\nCategorical Columns:", categorical_cols)
        print("\nCategorical Cardinality:")
        for col in categorical_cols:
            # unique() keeps NaN as its own value, so missing data counts as a level.
            cardinality = len(df[col].unique())
            print(f"{col}: {cardinality} unique values")
    else:
        print("We do not have any categorical feature\nCheck for the categoricals with type of string")

    # Numerical Summary
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    if len(numerical_cols) == 0:
        # Nothing numeric: skip every numeric plot instead of failing on an
        # empty correlation matrix.
        print("We do not have any numeric feature")
        return

    print("\nNumerical Columns:", numerical_cols)
    print("\nNumerical Summary:")
    print(df[numerical_cols].describe())

    # Outlier examination with IQR and BoxPlot
    outliers = {}
    for col in numerical_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        # Tukey fences: values further than 1.5 * IQR from the quartiles.
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers[col] = df[(df[col] < lower_bound) | (df[col] > upper_bound)][col]
        print(f'Outliers of {col}: {outliers[col]}')

        plt.figure(figsize=(8, 4))
        sns.boxplot(data=df, y=col)
        plt.title(f'Box Plot of {col}')
        plt.show()

    # Distribution Plots for Numerical Columns
    for col in numerical_cols:
        plt.figure(figsize=(10, 5))
        sns.histplot(data=df, x=col, kde=True)
        plt.title(f'Distribution of {col}')
        plt.show()

    # Correlation Matrix for Numerical Columns
    # A correlation needs at least two variables to be informative.
    if len(numerical_cols) > 1:
        correlation_matrix = df[numerical_cols].corr()
        plt.figure(figsize=(12, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
        plt.title('Correlation Matrix')
        plt.show()

In [None]:
# Prepare the data so the models run without errors: fill missing values, encode categoricals, scale numericals
def baseline_prep(df, target, classification = None, scaler = None):
    """Impute, encode and scale ``df`` so it can be fed to the baseline models.

    Feature columns are processed as follows: non-``object`` columns are
    mean-imputed and scaled, ``object`` columns are mode-imputed and one-hot
    encoded with ``drop_first=True``. The ``target`` column is kept out of the
    scaling and one-hot steps so it survives under its own name and in its
    own units.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. A copy is processed; the caller's frame is left untouched.
    target : str
        Name of the target column.
    classification : bool, optional
        When truthy, the target is label-encoded to integer codes.
    scaler : object, optional
        Any object exposing ``fit_transform``; defaults to ``StandardScaler()``.

    Returns
    -------
    pandas.DataFrame
        The prepared frame, still containing the ``target`` column.
    """
    # Work on a copy: the column assignments below would otherwise modify the
    # frame owned by the caller.
    df = df.copy()
    has_target = target in df.columns

    # Feature columns only. Leaving the target in these lists would standardise
    # a numeric target and one-hot encode (i.e. remove) a categorical one.
    numerical_cols = df.select_dtypes(exclude=['object']).columns.drop(target, errors='ignore')
    categorical_cols = df.select_dtypes(include=['object']).columns.drop(target, errors='ignore')

    if len(numerical_cols) > 0:
        df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

    if len(categorical_cols) > 0:
        modes = df[categorical_cols].mode()
        # mode() has no rows when every categorical value is missing.
        if not modes.empty:
            df[categorical_cols] = df[categorical_cols].fillna(modes.iloc[0])

    if has_target and df[target].isnull().any():
        if classification or df[target].dtype == object:
            # Class labels must be filled with an existing label, never a mean.
            target_modes = df[target].mode()
            if not target_modes.empty:
                df[target] = df[target].fillna(target_modes.iloc[0])
        else:
            df[target] = df[target].fillna(df[target].mean())

    if classification:
        label_encoder = LabelEncoder()
        df[target] = label_encoder.fit_transform(df[target])

    if len(categorical_cols) > 0:
        df = pd.get_dummies(df, columns=list(categorical_cols), drop_first=True)

    if len(numerical_cols) > 0:
        if scaler is None:
            scaler = StandardScaler()
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    return df

In [9]:
# Baseline classifiers with diverse characteristics, evaluated with accuracy, F1, precision, and recall
def baseline_clf(df, target_col, test_size=0.2, random_state=42):
    """Fit four baseline classifiers and report their hold-out metrics.

    Logistic Regression and XGBoost are tuned with a 5-fold ``GridSearchCV``
    (scored on accuracy); K-Nearest Neighbors and Random Forest are fitted with
    their default hyper-parameters.

    Side effects: the module-level names ``X_train`` and ``tuned_models_clf``
    are (re)assigned so that they can be passed to
    ``baseline_feature_importance`` afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared data containing the features and ``target_col``.
    target_col : str
        Name of the label column.
    test_size : float, default 0.2
        Share of rows held out for evaluation.
    random_state : int, default 42
        Seed for the train/test split and for the stochastic models.

    Returns
    -------
    dict
        ``{model_name: {"Accuracy", "Precision", "Recall", "F1-Score"}}``.
        The same dictionary is also printed.
    """
    global X_train, tuned_models_clf

    X = df.drop(target_col, axis=1)
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    models = {
        # 'saga' handles both the 'l1' and 'l2' penalties of the grid below;
        # the default 'lbfgs' solver rejects 'l1', so half the grid would fail.
        "Logistic Regression": LogisticRegression(solver='saga', max_iter=5000, random_state=random_state),
        "XGBoost": XGBClassifier(random_state=random_state),
        "K-Nearest Neighbors": KNeighborsClassifier(),
        "Random Forest": RandomForestClassifier(random_state=random_state)
    }

    param_grids = {
        "Logistic Regression": {
            'C': [0.01, 0.1, 1, 10],
            'penalty': ['l1', 'l2']
        },
        "XGBoost": {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 4, 5],
            'learning_rate': [0.01, 0.1, 0.2]
        }
    }

    # Start from a fresh dict: the global is not guaranteed to exist yet, and a
    # leftover one from an earlier run would leak stale models into the results.
    tuned_models_clf = {}

    for model_name, model in models.items():
        param_grid = param_grids.get(model_name)
        if param_grid is None:
            tuned_models_clf[model_name] = model.fit(X_train, y_train)
        else:
            grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            # best_estimator_ is already refitted on the whole training set.
            tuned_models_clf[model_name] = grid_search.best_estimator_

    # The default average='binary' raises on targets with more than two classes.
    average = 'binary' if y.nunique() <= 2 else 'weighted'

    results = {}

    for model_name, model in tuned_models_clf.items():
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average=average)
        recall = recall_score(y_test, y_pred, average=average)
        f1 = f1_score(y_test, y_pred, average=average)
        results[model_name] = {"Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1-Score": f1}

    print(results)
    return results

In [8]:
# Baseline regression models with diverse characteristics, evaluated with MSE, MAE, MAPE, and R^2
def baseline_reg(df, target, test_size = 0.2, random_state = 42):
    """Fit four baseline regressors and report their hold-out metrics.

    Random Forest and XGBoost are tuned with a 5-fold ``GridSearchCV`` (scored
    on negative mean squared error); KNN and the Huber regressor are fitted
    with their default hyper-parameters.

    Side effects: the module-level names ``X_train`` and ``tuned_models_reg``
    are (re)assigned so that they can be passed to
    ``baseline_feature_importance`` afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared data containing the features and ``target``.
    target : str
        Name of the column to predict.
    test_size : float, default 0.2
        Share of rows held out for evaluation.
    random_state : int, default 42
        Seed for the train/test split and for the stochastic models.

    Returns
    -------
    dict
        ``{model_name: {"MSE", "MAE", "MAPE", "R-squared"}}``. The same
        dictionary is also printed.
    """
    global X_train, tuned_models_reg

    def mean_abs_percentage_error(y_true, y_pred):
        """MAPE in percent, computed over the rows whose true value is non-zero."""
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        # A true value of 0 would make the ratio infinite and poison the mean.
        nonzero = y_true != 0
        if not nonzero.any():
            return float('nan')
        return np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100

    X = df.drop(target, axis = 1)
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = random_state)

    models = {
        "Random Forest": RandomForestRegressor(random_state = random_state),
        "XGBoost": XGBRegressor(random_state = random_state),
        "KNN": KNeighborsRegressor(),
        "Huber Regressor": HuberRegressor()
    }

    param_grids = {
        "Random Forest": {
            'n_estimators': [100,200,300],
            'max_depth': [10,20,30],
            'min_samples_split': [2,5,10],
            'min_samples_leaf': [1,2,4]
        },
        "XGBoost": {
            'n_estimators': [100,200,300],
            'max_depth': [3,4,5],
            'learning_rate': [0.01,0.1,0.2]
        }
    }

    # Start from a fresh dict: the global is not guaranteed to exist yet, and a
    # leftover one from an earlier run would leak stale models into the results.
    tuned_models_reg = {}

    for model_name, model in models.items():
        param_grid = param_grids.get(model_name)
        if param_grid is None:
            tuned_models_reg[model_name] = model.fit(X_train, y_train)
        else:
            # Every model with a grid is searched and stored, XGBoost included.
            grid_search = GridSearchCV(model, param_grid, cv = 5, scoring = 'neg_mean_squared_error', n_jobs = -1)
            grid_search.fit(X_train, y_train)
            # best_estimator_ is already refitted on the whole training set.
            tuned_models_reg[model_name] = grid_search.best_estimator_

    results = {}

    for model_name, model in tuned_models_reg.items():
        y_pred = model.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        mape = mean_abs_percentage_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        results[model_name] = {"MSE": mse, "MAE": mae, "MAPE": mape, "R-squared": r2}

    print(results)
    return results
        

In [19]:
# Feature importance plots. Example for classification: baseline_feature_importance(tuned_models_clf, X_train)
def baseline_feature_importance(models, X_train):
    """Draw one feature-importance bar plot per fitted model.

    ``feature_importances_`` is used when the model exposes it, otherwise
    ``coef_``. Models exposing neither are reported with a message and skipped.

    Parameters
    ----------
    models : dict
        Mapping of model name to fitted estimator.
    X_train : pandas.DataFrame
        Training features; its column names label the bars.

    Returns
    -------
    None
        Output is a matplotlib figure per model.
    """
    # Use the argument: there is no module-level ``X`` to fall back on.
    feature_names = X_train.columns

    for model_name, model in models.items():
        if hasattr(model, 'feature_importances_'):
            feature_importances = np.asarray(model.feature_importances_)
        elif hasattr(model, 'coef_'):
            coefficients = np.asarray(model.coef_)
            if coefficients.ndim == 1:
                # Already one coefficient per feature; indexing [0] here would
                # collapse the vector to a single scalar.
                feature_importances = coefficients
            elif coefficients.shape[0] == 1:
                # Single row of coefficients: keep them signed.
                feature_importances = coefficients[0]
            else:
                # One row per class: summarise by mean absolute coefficient
                # instead of showing the first class only.
                feature_importances = np.abs(coefficients).mean(axis=0)
        else:
            print(f"Built-in feature importance not available for model: {model_name}")
            continue

        feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
        feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

        plt.figure(figsize = (10, 6))
        sns.barplot(x = 'Importance', y = 'Feature', data = feature_importance_df)
        plt.title(f"Feature Importance for {model_name}")
        plt.xlabel("Importance Score")
        plt.ylabel("Feature")
        plt.show()