In [172]:
import pandas as pd

In [173]:

df = pd.read_csv("space/imputed2.csv")

pre_remov = df.shape[1]
df = df.loc[:, df.nunique() != 1]
print(f"Removed {pre_remov - df.shape[1]} columns with only one unique value")

Removed 0 columns with only one unique value


In [174]:
mean_wl_time = df['wl_time'].mean()
print(f"Mean waiting list time: {mean_wl_time}")

df['wl_time'] = df['wl_time'].apply(lambda x: 0 if x < mean_wl_time else 1)
print(df['wl_time'].value_counts(normalize=True))


Mean waiting list time: 192.52839426430694
0    0.716807
1    0.283193
Name: wl_time, dtype: float64


In [175]:
import statsmodels.api as sm
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
CV = 5

X = df.drop(['wl_time'], axis=1)
#X = X.iloc[:, :20]# Scale down X to only 10 features
y = df['wl_time']

scaler = StandardScaler()
X = scaler.fit_transform(X)

log = LogisticRegression(max_iter=1000)
scores = cross_val_score(log, X, y, cv=CV, scoring='f1').mean()
print("F1 score:", scores)


logit_model = sm.OLS(y, X)
result = logit_model.fit()
print(result.summary())

X = sm.add_constant(X)
X_opt = df

print(X_opt.head())
SIGNIFICANCE_LEVEL = 0.05

while True:
    print(f"Number of variables: {len(X_opt.columns)}")
    model = sm.OLS(y, X_opt).fit()
    p_values = model.pvalues
    max_p_value = p_values.max()

    if max_p_value <= SIGNIFICANCE_LEVEL:
        break

    print(f"Removing {p_values.idxmax()} with p-value {max_p_value}")
    X_opt = X_opt.drop([p_values.idxmax()], axis=1)

X_opt.head()

#result = logit_model.fit()
#print(result.summary())

log = LogisticRegression(max_iter=1000)
scores = cross_val_score(log, X_opt, y, cv=CV, scoring='f1').mean()
print("F1 score:", scores)


F1 score: 0.8952750557996195
                                 OLS Regression Results                                
Dep. Variable:                wl_time   R-squared (uncentered):                   0.359
Model:                            OLS   Adj. R-squared (uncentered):              0.358
Method:                 Least Squares   F-statistic:                              323.0
Date:                Tue, 18 Apr 2023   Prob (F-statistic):                        0.00
Time:                        20:12:56   Log-Likelihood:                         -43793.
No. Observations:               77410   AIC:                                  8.785e+04
Df Residuals:                   77276   BIC:                                  8.909e+04
Df Model:                         134                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
------------

In [176]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm

# Backward elimination using F1 score as criterion

CV = 5

X = df.drop(['wl_time'], axis=1)
X = X.iloc[:, :20]# Scale down X to only 10 features

y = df['wl_time']

scaler = StandardScaler()
X = scaler.fit_transform(X)

log = LogisticRegression(max_iter=1000)

features = X.shape[1]
removed_features = []

tracker = tqdm(total=features, desc="Iterations")

for iteration in range(features):
    f1_iteration_baseline = cross_val_score(log, X, y, cv=CV, scoring="f1").mean()
    #f1_iteration_baseline = log.fit(X, y).score(X, y)
    print(f"{iteration}/{features}", f"Baseline F1 score: {f1_iteration_baseline}")

    f1_results = {}
    eligible_features = [i for i in range(features) if i not in removed_features]

    tracker2 = tqdm(total=features-iteration, desc="Features")


    for feature in eligible_features:
        X_subset = X[:, [i for i in range(features) if i not in removed_features + [feature]]]

        f1 = cross_val_score(log, X_subset, y, cv=CV, scoring="f1").mean()
        #f1 = log.fit(X_subset, y).score(X_subset, y)

        f1_results[feature] = f1_iteration_baseline - f1
        print(f"{iteration}/{features}",
              f"{feature}/{len(eligible_features)}",
              f"F1 = {f1}"
              )
        tracker2.update(1)


        # 90 91 = -1 # BAD
        # 90 89 = 1 # GOOD
        # 90 90 = 0 # BAD

    worst_feature = min(f1_results, key=f1_results.get)
    best_feature = max(f1_results, key=f1_results.get)

    if worst_feature == best_feature:
        print("No feature to remove")
        break

    print(f"{iteration}/{features}",
          f"Best feature: {best_feature} F1 = {f1_results[best_feature]}",
          f"Worst feature: {worst_feature} F1 = {f1_results[worst_feature]}",
          )

    removed_features.append(worst_feature)
    tracker.update(1)

print("ranking:", removed_features)



Iterations:   0%|          | 0/20 [00:00<?, ?it/s]

0/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

0/20 0/20 F1 = 0.36052876718467175
0/20 1/20 F1 = 0.3495625775774318
0/20 2/20 F1 = 0.3559559075333286
0/20 3/20 F1 = 0.35983261752095164
0/20 4/20 F1 = 0.36053856744595564
0/20 5/20 F1 = 0.35337475872728275
0/20 6/20 F1 = 0.3614389043544609
0/20 7/20 F1 = 0.35909242007446973
0/20 8/20 F1 = 0.3346404537997382
0/20 9/20 F1 = 0.3226628649354059
0/20 10/20 F1 = 0.36040109513966023
0/20 11/20 F1 = 0.36080327370300314
0/20 12/20 F1 = 0.3603882382241522
0/20 13/20 F1 = 0.35973271948027435
0/20 14/20 F1 = 0.35550269583625266
0/20 15/20 F1 = 0.3605916608849584
0/20 16/20 F1 = 0.2996030110122348
0/20 17/20 F1 = 0.34695054716962487
0/20 18/20 F1 = 0.29257285668340194
0/20 19/20 F1 = 0.35856589929878274
0/20 Best feature: 18 F1 = 0.06822191699532798 Worst feature: 6 F1 = -0.0006441306757309917
1/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

1/20 0/19 F1 = 0.36054015011394436
1/20 1/19 F1 = 0.3495025731996222
1/20 2/19 F1 = 0.35552207541887204
1/20 3/19 F1 = 0.3606771127159417
1/20 4/19 F1 = 0.362213502081814
1/20 5/19 F1 = 0.31483338632459096
1/20 7/19 F1 = 0.35954052312970547
1/20 8/19 F1 = 0.33339584411589346
1/20 9/19 F1 = 0.32248912815099995
1/20 10/19 F1 = 0.36166132951343966
1/20 11/19 F1 = 0.3615698823470906
1/20 12/19 F1 = 0.36212192774806534
1/20 13/19 F1 = 0.36011646094523597
1/20 14/19 F1 = 0.3565577318249794
1/20 15/19 F1 = 0.3615402022698575
1/20 16/19 F1 = 0.2999914729878513
1/20 17/19 F1 = 0.34822190006637976
1/20 18/19 F1 = 0.2920665894033698
1/20 19/19 F1 = 0.3574133490886595
1/20 Best feature: 18 F1 = 0.0687281842753601 Worst feature: 4 F1 = -0.0014187284030841019
2/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

2/20 0/18 F1 = 0.3612969563571464
2/20 1/18 F1 = 0.3495962825239095
2/20 2/18 F1 = 0.3554906187706134
2/20 3/18 F1 = 0.3606513066727163
2/20 5/18 F1 = 0.3132937170773623
2/20 7/18 F1 = 0.35973987219993264
2/20 8/18 F1 = 0.3333780435942901
2/20 9/18 F1 = 0.32266491274483344
2/20 10/18 F1 = 0.36181654809506053
2/20 11/18 F1 = 0.36224546975444627
2/20 12/18 F1 = 0.3618681133408029
2/20 13/18 F1 = 0.36080450797048425
2/20 14/18 F1 = 0.35672474747798055
2/20 15/18 F1 = 0.36215070204453725
2/20 16/18 F1 = 0.3000449115568293
2/20 17/18 F1 = 0.34783454392798135
2/20 18/18 F1 = 0.2916394473214655
2/20 19/18 F1 = 0.3574721238519918
2/20 Best feature: 18 F1 = 0.0691553263572644 Worst feature: 11 F1 = -0.0014506960757163512
3/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

3/20 0/17 F1 = 0.361581349295646
3/20 1/17 F1 = 0.34990416173007377
3/20 2/17 F1 = 0.3553796452039671
3/20 3/17 F1 = 0.3602904724835031
3/20 5/17 F1 = 0.3129655068736974
3/20 7/17 F1 = 0.35969674697406084
3/20 8/17 F1 = 0.33337509790140063
3/20 9/17 F1 = 0.322751989737674
3/20 10/17 F1 = 0.3619169356283731
3/20 12/17 F1 = 0.3617735451549744
3/20 13/17 F1 = 0.3610094969847398
3/20 14/17 F1 = 0.35663134117136697
3/20 15/17 F1 = 0.36224730538747874
3/20 16/17 F1 = 0.30001951019419526
3/20 17/17 F1 = 0.34802549366745567
3/20 18/17 F1 = 0.2914967050567264
3/20 19/17 F1 = 0.35717343009098645
3/20 Best feature: 18 F1 = 0.0692980686220035 Worst feature: 15 F1 = -0.0014525317087488165
4/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

4/20 0/16 F1 = 0.3615727615118547
4/20 1/16 F1 = 0.3498732273088875
4/20 2/16 F1 = 0.35517375426947567
4/20 3/16 F1 = 0.36040702981220596
4/20 5/16 F1 = 0.31308646250661065
4/20 7/16 F1 = 0.3596684501770619
4/20 8/16 F1 = 0.3332758192242701
4/20 9/16 F1 = 0.32275471647406756
4/20 10/16 F1 = 0.36188934868093353
4/20 12/16 F1 = 0.3617071055311857
4/20 13/16 F1 = 0.36098528661483276
4/20 14/16 F1 = 0.35685917933387523
4/20 16/16 F1 = 0.3000643944629895
4/20 17/16 F1 = 0.3480995155175628
4/20 18/16 F1 = 0.2914047498958202
4/20 19/16 F1 = 0.3570993568736656
4/20 Best feature: 18 F1 = 0.0693900237829097 Worst feature: 10 F1 = -0.0010945750022036127
5/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

5/20 0/15 F1 = 0.3614501850216113
5/20 1/15 F1 = 0.34909009299402105
5/20 2/15 F1 = 0.3555428707859639
5/20 3/15 F1 = 0.36119802682255847
5/20 5/15 F1 = 0.31278458270806825
5/20 7/15 F1 = 0.3585804699463964
5/20 8/15 F1 = 0.33334908891571463
5/20 9/15 F1 = 0.2784978109604455
5/20 12/15 F1 = 0.3617922869030261
5/20 13/15 F1 = 0.3607703112301265
5/20 14/15 F1 = 0.3565036617440292
5/20 16/15 F1 = 0.3002175384053377
5/20 17/15 F1 = 0.34774459923197104
5/20 18/15 F1 = 0.2922479039813861
5/20 19/15 F1 = 0.35757612447784065
5/20 Best feature: 9 F1 = 0.0822969627182844 Worst feature: 12 F1 = -0.0009975132242961982
6/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

6/20 0/14 F1 = 0.3615205797075361
6/20 1/14 F1 = 0.34581419627216975
6/20 2/14 F1 = 0.35555471545107636
6/20 3/14 F1 = 0.3606218742934073
6/20 5/14 F1 = 0.3144496477873468
6/20 7/14 F1 = 0.36007159826918667
6/20 8/14 F1 = 0.33415517665267946
6/20 9/14 F1 = 0.28243751448563137
6/20 13/14 F1 = 0.3604461096500111
6/20 14/14 F1 = 0.35706456781644585
6/20 16/14 F1 = 0.30130291896248207
6/20 17/14 F1 = 0.34662453393645104
6/20 18/14 F1 = 0.29404592302131205
6/20 19/14 F1 = 0.3585793691398013
6/20 Best feature: 9 F1 = 0.07835725919309855 Worst feature: 0 F1 = -0.0007258060288061752
7/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

7/20 1/13 F1 = 0.34540166857241955
7/20 2/13 F1 = 0.3554980572717603
7/20 3/13 F1 = 0.36024399657119704
7/20 5/13 F1 = 0.31419103695268974
7/20 7/13 F1 = 0.3599216766703314
7/20 8/13 F1 = 0.33439070898809503
7/20 9/13 F1 = 0.2828849311173293
7/20 13/13 F1 = 0.3605285721663563
7/20 14/13 F1 = 0.35629185188309526
7/20 16/13 F1 = 0.30149758052137
7/20 17/13 F1 = 0.3468005262534735
7/20 18/13 F1 = 0.294059437653
7/20 19/13 F1 = 0.3583486330610771
7/20 Best feature: 9 F1 = 0.0779098425614006 Worst feature: 13 F1 = 0.00026620151237360945
8/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

8/20 1/12 F1 = 0.34477380624160425
8/20 2/12 F1 = 0.3545482813757973
8/20 3/12 F1 = 0.3593593920889785
8/20 5/12 F1 = 0.312552102157915
8/20 7/12 F1 = 0.35793737558778793
8/20 8/12 F1 = 0.33286615902537425
8/20 9/12 F1 = 0.27785933944358526
8/20 14/12 F1 = 0.35475553961846334
8/20 16/12 F1 = 0.2999870153430546
8/20 17/12 F1 = 0.34529212785996033
8/20 18/12 F1 = 0.29353136190583873
8/20 19/12 F1 = 0.35747013814736983
8/20 Best feature: 9 F1 = 0.08293543423514466 Worst feature: 3 F1 = 0.0014353815897514277
9/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

9/20 1/11 F1 = 0.3433068137504962
9/20 2/11 F1 = 0.3519424924338269
9/20 5/11 F1 = 0.31180485486505105
9/20 7/11 F1 = 0.3569286324057207
9/20 8/11 F1 = 0.33143221093735653
9/20 9/11 F1 = 0.2759583243578848
9/20 14/11 F1 = 0.35476955015159967
9/20 16/11 F1 = 0.2994366400381934
9/20 17/11 F1 = 0.34425145533436585
9/20 18/11 F1 = 0.2914944531929139
9/20 19/11 F1 = 0.35628451125186567
9/20 Best feature: 9 F1 = 0.08483644932084511 Worst feature: 7 F1 = 0.003866141273009216
10/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

10/20 1/10 F1 = 0.3428499701146565
10/20 2/10 F1 = 0.35117482248085274
10/20 5/10 F1 = 0.3091998776096826
10/20 8/10 F1 = 0.32982665773405123
10/20 9/10 F1 = 0.2473549081037203
10/20 14/10 F1 = 0.35208626317765246
10/20 16/10 F1 = 0.29733227147238844
10/20 17/10 F1 = 0.3422734601264089
10/20 18/10 F1 = 0.2880552710812191
10/20 19/10 F1 = 0.35430104616420804
10/20 Best feature: 9 F1 = 0.11343986557500962 Worst feature: 19 F1 = 0.006493727514521874
11/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

11/20 1/9 F1 = 0.3379383203527418
11/20 2/9 F1 = 0.34578701426878544
11/20 5/9 F1 = 0.30859260477527173
11/20 8/9 F1 = 0.3269225051031351
11/20 9/9 F1 = 0.24542822805189565
11/20 14/9 F1 = 0.34853272900279986
11/20 16/9 F1 = 0.29360240268578536
11/20 17/9 F1 = 0.3421288301317296
11/20 18/9 F1 = 0.28940897384761
11/20 Best feature: 9 F1 = 0.11536654562683427 Worst feature: 14 F1 = 0.012262044675930062
12/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

12/20 1/8 F1 = 0.3331844831805749
12/20 2/8 F1 = 0.34137336467175583
12/20 5/8 F1 = 0.29648975286660795
12/20 8/8 F1 = 0.32011576531193214
12/20 9/8 F1 = 0.2424616812866825
12/20 16/8 F1 = 0.28194022290648924
12/20 17/8 F1 = 0.33605521027346114
12/20 18/8 F1 = 0.28216842057653013
12/20 Best feature: 9 F1 = 0.11833309239204742 Worst feature: 2 F1 = 0.01942140900697409
13/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

13/20 1/7 F1 = 0.3222900020807448
13/20 5/7 F1 = 0.2937279688267662
13/20 8/7 F1 = 0.31382315866519
13/20 9/7 F1 = 0.24324922272273136
13/20 16/7 F1 = 0.2739680308801988
13/20 17/7 F1 = 0.3301977663363591
13/20 18/7 F1 = 0.27957002264241104
13/20 Best feature: 9 F1 = 0.11754555095599856 Worst feature: 17 F1 = 0.030597007342370808
14/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

14/20 1/6 F1 = 0.3147628075448939
14/20 5/6 F1 = 0.274589306214473
14/20 8/6 F1 = 0.3015300444634485
14/20 9/6 F1 = 0.23524951247784295
14/20 16/6 F1 = 0.265499982131184
14/20 18/6 F1 = 0.26891979498145574
14/20 Best feature: 9 F1 = 0.12554526120088697 Worst feature: 1 F1 = 0.04603196613383603
15/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

15/20 5/5 F1 = 0.25839030244214684
15/20 8/5 F1 = 0.2887394657553083
15/20 9/5 F1 = 0.1967965448187055
15/20 16/5 F1 = 0.26042833444801616
15/20 18/5 F1 = 0.25347475264024877
15/20 Best feature: 9 F1 = 0.1639982288600244 Worst feature: 8 F1 = 0.07205530792342163
16/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

16/20 5/4 F1 = 0.21680126880575737
16/20 9/4 F1 = 0.13427092887501707
16/20 16/4 F1 = 0.23311710438583924
16/20 18/4 F1 = 0.195696639885233
16/20 Best feature: 9 F1 = 0.22652384480371285 Worst feature: 16 F1 = 0.12767766929289068
17/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

17/20 5/3 F1 = 0.11842262031590867
17/20 9/3 F1 = 0.0810325607042371
17/20 18/3 F1 = 0.13565196498908993
17/20 Best feature: 9 F1 = 0.27976221297449283 Worst feature: 18 F1 = 0.22514280868963998
18/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

18/20 5/2 F1 = 0.047141271727279
18/20 9/2 F1 = 0.0
18/20 Best feature: 9 F1 = 0.3607947736787299 Worst feature: 5 F1 = 0.3136535019514509
19/20 Baseline F1 score: 0.3607947736787299


Features:   0%|          | 0/20 [00:00<?, ?it/s]

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/envs/edap01/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/anaconda3/envs/edap01/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1138, in fit
    X, y = self._validate_data(
  File "/opt/homebrew/anaconda3/envs/edap01/lib/python3.10/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/homebrew/anaconda3/envs/edap01/lib/python3.10/site-packages/sklearn/utils/validation.py", line 1074, in check_X_y
    X = check_array(
  File "/opt/homebrew/anaconda3/envs/edap01/lib/python3.10/site-packages/sklearn/utils/validation.py", line 918, in check_array
    raise ValueError(
ValueError: Found array with 0 feature(s) (shape=(61928, 0)) while a minimum of 1 is required by LogisticRegression.


In [None]:
"""
import numpy as np
from time import time
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

X = df.drop(['wl_time'], axis=1)
y = df['wl_time']

scaler = StandardScaler()
X = scaler.fit_transform(X)

log = LogisticRegression(max_iter=1000)

# Evaluate the model using all features and the F1 score metric
print("Evaluating model with all features")
scores = cross_val_score(log, X, y, cv=5, scoring='f1_macro')
avg_score = np.mean(scores)
print("Initial F1 score:", avg_score)

# Iterate over all features and remove the one that has the smallest impact on the average F1 score
n_features = X.shape[1]
removed_features = []
t_total_start = time()

for i in range(n_features):
    t_start = time()
    print("Evaluating feature", i, "of", n_features)
    # Compute the F1 score for the current set of features
    current_features = [j for j in range(n_features) if j not in removed_features + [i]]
    X_subset = X[:, current_features]
    scores_subset = cross_val_score(log, X_subset, y, cv=5, scoring='f1_macro')
    avg_score_subset = np.mean(scores_subset)

    # Compute the difference in F1 score between the current set of features and the original set of features
    score_diff = avg_score - avg_score_subset

    # Remove the feature that has the smallest impact on the F1 score
    if score_diff > 0:
        removed_features.append(i)
        avg_score = avg_score_subset
        print("Removed feature", i, "F1 score:", avg_score)
    t_stop = time()
    t_elapsed = t_stop - t_start
    t_left = (n_features - i - 1) * t_elapsed
    print("Time elapsed:", t_elapsed, "Time left:", t_left)
    """


In [None]:
#Print removed features
print("Removed features:", removed_features)