In [None]:
# !pip3 install scikit-learn

In [None]:
# !pip3 uninstall -y lightgbm 
# !pip3 install lightgbm

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from lofo import LOFOImportance, FLOFOImportance, Dataset, plot_importance
from sklearn.metrics import make_scorer, f1_score
from data.test_data import generate_test_data, generate_unstructured_test_data
import gc

  from tqdm.autonotebook import tqdm


In [2]:
np.random.seed(42)

# Generate a synthetic dataset with 3000 samples, 10 features, and 3 classes
X, y = make_classification(n_samples=3000, n_features=100, n_informative=10, n_redundant=10, n_classes=2, random_state=42)


# Scale the features to the range 0 to 100
scaler = MinMaxScaler(feature_range=(0, 100))
X_scaled = scaler.fit_transform(X)

# Introduce sparsity by setting some of the numerical features to zero
sparsity_level = 0.6  # 30% of the data will be zero
mask = np.random.rand(*X_scaled.shape) < sparsity_level
X_scaled[mask] = 0

# Convert to DataFrame
df = pd.DataFrame(X_scaled, columns=[f"feature_{i}" for i in range(100)])
df['target'] = y

# Let's assume feature_0 and feature_1 are categorical for this example
# Convert these features to categorical type
df['feature_0'] = pd.cut(df['feature_0'], bins=2, labels=["low", "high"])
df['feature_1'] = pd.cut(df['feature_1'], bins=2, labels=["low", "high"])

# One-hot encode the categorical features
encoder = OneHotEncoder(sparse=False)
encoded_features = encoder.fit_transform(df[['feature_0', 'feature_1']])

# Create a DataFrame with the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['feature_0', 'feature_1']))

# Drop the original categorical columns and concatenate the one-hot encoded columns
df = df.drop(columns=['feature_0', 'feature_1'])
df = pd.concat([df, encoded_df], axis=1)

df.head()



Unnamed: 0,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,...,feature_95,feature_96,feature_97,feature_98,feature_99,target,feature_0_high,feature_0_low,feature_1_high,feature_1_low
0,47.722401,0.0,0.0,0.0,0.0,45.947408,40.196609,48.261393,0.0,74.144483,...,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0,1.0
1,0.0,0.0,58.461513,0.0,0.0,52.833007,0.0,0.0,0.0,0.0,...,0.0,61.469354,60.600476,48.028057,49.092541,0,0.0,1.0,1.0,0.0
2,0.0,54.864942,46.748569,0.0,0.0,35.78528,0.0,0.0,0.0,51.341374,...,0.0,42.134671,0.0,63.304027,0.0,1,0.0,1.0,0.0,1.0
3,0.0,54.285962,75.548895,55.214799,0.0,0.0,29.526191,0.0,0.0,0.0,...,57.295981,41.066785,0.0,61.872817,20.940964,1,0.0,1.0,0.0,1.0
4,0.0,61.342581,0.0,38.055779,0.0,0.0,33.686711,0.0,0.0,50.115391,...,0.0,0.0,0.0,41.626554,52.13627,0,0.0,1.0,1.0,0.0


In [3]:
df.columns

Index(['feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6',
       'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11',
       ...
       'feature_95', 'feature_96', 'feature_97', 'feature_98', 'feature_99',
       'target', 'feature_0_high', 'feature_0_low', 'feature_1_high',
       'feature_1_low'],
      dtype='object', length=103)

### Running on Logistic Regression

In [4]:
# Define the function
def run_importance_calculations(df, target, features, model_name ,n_runs=1000):
    scorer = make_scorer(f1_score, pos_label=1)
    # cv = KFold(n_splits=4, shuffle=True, random_state=0)
    cv = 4
    dataset = Dataset(df=df, target=target, features=features)
    importance_means = []

    for _ in range(n_runs):
        fi = LOFOImportance(dataset, scoring=scorer, model=model_name, cv=cv)
        importances = fi.get_importance()
        importance_means.append(importances['importance_mean'].values)
        del fi
        gc.collect()

    importance_means = np.array(importance_means)
    
    # Check if the same importance mean for each feature was generated
    same_means = np.all(importance_means == importance_means[0], axis=0)

    # Convert results to a DataFrame for better visualization
    results_df = pd.DataFrame(importance_means, columns=features)
    
    return results_df,same_means

In [5]:
results_df,same_means = run_importance_calculations(df=df,
                                                    target='target',
                                                    features=[x for x in df.columns if x != 'target'],
                                                    model_name =LogisticRegression(),
                                                    n_runs=5)

  0%|          | 0/102 [00:00<?, ?it/s]

  0%|          | 0/102 [00:00<?, ?it/s]

  0%|          | 0/102 [00:00<?, ?it/s]

  0%|          | 0/102 [00:00<?, ?it/s]

  0%|          | 0/102 [00:00<?, ?it/s]

In [6]:
results_df

Unnamed: 0,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,...,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99,feature_0_high,feature_0_low,feature_1_high,feature_1_low
0,0.011173,0.008403,0.006706,0.005005,0.004935,0.004708,0.004572,0.004489,0.004242,0.00397,...,-0.003653,-0.003847,-0.004071,-0.004106,-0.004393,-0.004568,-0.004641,-0.004865,-0.006021,-0.007685
1,0.011173,0.008403,0.006706,0.005005,0.004935,0.004708,0.004572,0.004489,0.004242,0.00397,...,-0.003653,-0.003847,-0.004071,-0.004106,-0.004393,-0.004568,-0.004641,-0.004865,-0.006021,-0.007685
2,0.011173,0.008403,0.006706,0.005005,0.004935,0.004708,0.004572,0.004489,0.004242,0.00397,...,-0.003653,-0.003847,-0.004071,-0.004106,-0.004393,-0.004568,-0.004641,-0.004865,-0.006021,-0.007685
3,0.011173,0.008403,0.006706,0.005005,0.004935,0.004708,0.004572,0.004489,0.004242,0.00397,...,-0.003653,-0.003847,-0.004071,-0.004106,-0.004393,-0.004568,-0.004641,-0.004865,-0.006021,-0.007685
4,0.011173,0.008403,0.006706,0.005005,0.004935,0.004708,0.004572,0.004489,0.004242,0.00397,...,-0.003653,-0.003847,-0.004071,-0.004106,-0.004393,-0.004568,-0.004641,-0.004865,-0.006021,-0.007685


In [7]:
same_means

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [8]:
results_df_2,same_means_2 = run_importance_calculations(df=df,
                                                    target='target',
                                                    features=[x for x in df.columns if x != 'target'],
                                                    model_name =RandomForestClassifier(n_estimators=50,random_state=42),
                                                    n_runs=5)

  0%|          | 0/102 [00:00<?, ?it/s]

  0%|          | 0/102 [00:00<?, ?it/s]

  0%|          | 0/102 [00:00<?, ?it/s]

  0%|          | 0/102 [00:00<?, ?it/s]

  0%|          | 0/102 [00:00<?, ?it/s]

In [9]:
results_df_2

Unnamed: 0,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,...,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99,feature_0_high,feature_0_low,feature_1_high,feature_1_low
0,0.028262,0.022092,0.021738,0.021607,0.02139,0.018448,0.016876,0.016458,0.015739,0.01341,...,-0.002974,-0.003193,-0.003284,-0.003762,-0.004123,-0.004672,-0.005635,-0.009029,-0.009747,-0.01328
1,0.028262,0.022092,0.021738,0.021607,0.02139,0.018448,0.016876,0.016458,0.015739,0.01341,...,-0.002974,-0.003193,-0.003284,-0.003762,-0.004123,-0.004672,-0.005635,-0.009029,-0.009747,-0.01328
2,0.028262,0.022092,0.021738,0.021607,0.02139,0.018448,0.016876,0.016458,0.015739,0.01341,...,-0.002974,-0.003193,-0.003284,-0.003762,-0.004123,-0.004672,-0.005635,-0.009029,-0.009747,-0.01328
3,0.028262,0.022092,0.021738,0.021607,0.02139,0.018448,0.016876,0.016458,0.015739,0.01341,...,-0.002974,-0.003193,-0.003284,-0.003762,-0.004123,-0.004672,-0.005635,-0.009029,-0.009747,-0.01328
4,0.028262,0.022092,0.021738,0.021607,0.02139,0.018448,0.016876,0.016458,0.015739,0.01341,...,-0.002974,-0.003193,-0.003284,-0.003762,-0.004123,-0.004672,-0.005635,-0.009029,-0.009747,-0.01328


In [10]:
same_means_2

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

## Random forest doesn't produce same results even after setting random state

https://github.com/scikit-learn/scikit-learn/discussions/25411

At times it is OS/Cloud dependent
https://community.databricks.com/t5/data-engineering/cannot-reproduce-result-scikit-learn-random-forest/m-p/27659#M19520

https://github.com/scikit-learn/scikit-learn/issues/28920

In [13]:
# Get versions
import sklearn
numpy_version = np.__version__
pandas_version = pd.__version__
sklearn_version = sklearn.__version__

# Print versions
print(f"NumPy version: {numpy_version}")
print(f"Pandas version: {pandas_version}")
print(f"Scikit-learn version: {sklearn_version}")

NumPy version: 1.22.4
Pandas version: 1.5.3
Scikit-learn version: 1.2.2
