In [None]:
# !pip3 install scikit-learn

In [None]:
# !pip3 uninstall -y lightgbm 
# !pip3 install lightgbm

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from lofo import LOFOImportance, FLOFOImportance, Dataset, plot_importance
from sklearn.metrics import make_scorer, f1_score
from data.test_data import generate_test_data, generate_unstructured_test_data
import gc

  from tqdm.autonotebook import tqdm


In [3]:
np.random.seed(42)

df = pd.read_csv("recruitment_data.csv")

df.head()

Unnamed: 0,Age,Gender,EducationLevel,ExperienceYears,PreviousCompanies,DistanceFromCompany,InterviewScore,SkillScore,PersonalityScore,RecruitmentStrategy,HiringDecision
0,26,1,2,0,3,26.783828,48,78,91,1,1
1,39,1,4,12,3,25.862694,35,68,80,2,1
2,48,0,2,3,2,9.920805,20,67,13,2,0
3,34,1,2,5,2,6.407751,36,27,70,3,0
4,30,0,1,6,1,43.105343,23,52,85,2,0


In [5]:
df.columns

Index(['Age', 'Gender', 'EducationLevel', 'ExperienceYears',
       'PreviousCompanies', 'DistanceFromCompany', 'InterviewScore',
       'SkillScore', 'PersonalityScore', 'RecruitmentStrategy',
       'HiringDecision'],
      dtype='object')

### Running on Logistic Regression

In [6]:
# Define the function
def run_importance_calculations(df, target, features, model_name ,n_runs=1000):
    scorer = make_scorer(f1_score, pos_label=1)
    # cv = KFold(n_splits=4, shuffle=True, random_state=0)
    cv = 4
    dataset = Dataset(df=df, target=target, features=features)
    importance_means = []

    for _ in range(n_runs):
        fi = LOFOImportance(dataset, scoring=scorer, model=model_name, cv=cv)
        importances = fi.get_importance()
        importance_means.append(importances['importance_mean'].values)
        del fi
        gc.collect()

    importance_means = np.array(importance_means)
    
    # Check if the same importance mean for each feature was generated
    same_means = np.all(importance_means == importance_means[0], axis=0)

    # Convert results to a DataFrame for better visualization
    results_df = pd.DataFrame(importance_means, columns=features)
    
    return results_df,same_means

In [7]:
results_df,same_means = run_importance_calculations(df=df,
                                                    target='HiringDecision',
                                                    features=[x for x in df.columns if x != 'HiringDecision'],
                                                    model_name =LogisticRegression(),
                                                    n_runs=5)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [8]:
results_df

Unnamed: 0,Age,Gender,EducationLevel,ExperienceYears,PreviousCompanies,DistanceFromCompany,InterviewScore,SkillScore,PersonalityScore,RecruitmentStrategy
0,0.315375,0.068724,0.044524,0.039634,0.037683,0.029929,0.020929,0.012507,0.012432,0.00135
1,0.315375,0.068724,0.044524,0.039634,0.037683,0.029929,0.020929,0.012507,0.012432,0.00135
2,0.315375,0.068724,0.044524,0.039634,0.037683,0.029929,0.020929,0.012507,0.012432,0.00135
3,0.315375,0.068724,0.044524,0.039634,0.037683,0.029929,0.020929,0.012507,0.012432,0.00135
4,0.315375,0.068724,0.044524,0.039634,0.037683,0.029929,0.020929,0.012507,0.012432,0.00135


In [9]:
same_means

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [10]:
results_df_2,same_means_2 = run_importance_calculations(df=df,
                                                        target='HiringDecision',
                                                        features=[x for x in df.columns if x != 'HiringDecision'],
                                                        model_name =RandomForestClassifier(n_estimators=50,random_state=42),
                                                        n_runs=5)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [11]:
results_df_2

Unnamed: 0,Age,Gender,EducationLevel,ExperienceYears,PreviousCompanies,DistanceFromCompany,InterviewScore,SkillScore,PersonalityScore,RecruitmentStrategy
0,0.396292,0.123195,0.110842,0.097286,0.083138,0.073661,0.004828,-0.002192,-0.00457,-0.008342
1,0.396292,0.123195,0.110842,0.097286,0.083138,0.073661,0.004828,-0.002192,-0.00457,-0.008342
2,0.396292,0.123195,0.110842,0.097286,0.083138,0.073661,0.004828,-0.002192,-0.00457,-0.008342
3,0.396292,0.123195,0.110842,0.097286,0.083138,0.073661,0.004828,-0.002192,-0.00457,-0.008342
4,0.396292,0.123195,0.110842,0.097286,0.083138,0.073661,0.004828,-0.002192,-0.00457,-0.008342


In [12]:
same_means_2

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

## Random forest doesn't produce same results even after setting random state

https://github.com/scikit-learn/scikit-learn/discussions/25411

At times it is OS/Cloud dependent
https://community.databricks.com/t5/data-engineering/cannot-reproduce-result-scikit-learn-random-forest/m-p/27659#M19520

https://github.com/scikit-learn/scikit-learn/issues/28920