In [40]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import classification_report
from sklearn.utils import all_estimators
from time import time
import scipy.stats as stats
from sklearn.utils.fixes import loguniform

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import pandas as pd
import numpy as np
import os

df = pd.read_csv(r'data/data_processed.csv')
df_continous = pd.read_csv(r'data/data_continous.csv')
df_categorical = pd.read_csv(r'data/data_categorical.csv')

# --------------------------------------------------------------------------------------------------
# copying job satisfaction into new array
JAtt = df['Attrition'].values

df_good = pd.concat([df_categorical, df["MonthlyIncome"], df["BusinessTravel"], df["StockOptionLevel"], df["DistanceFromHome"]], axis=1)
df = df_good.drop(['Attrition'], axis=1)

# splitting inputs by row index
# all data
df_training = df.iloc[:1200,:]
df_validation = df.iloc[1200:,:]
# continous data
df_training_continous = df_continous.iloc[:1200,:]
df_validation_continous = df_continous.iloc[1200:,:]
# categorical data
df_training_categorical = df_categorical.iloc[:1200,:]
df_validation_categorical = df_categorical.iloc[1200:,:]
# splitting outputs by number
JAtt_training = JAtt[:1200]
JAtt_validation = JAtt[1200:]
# --------------------------------------------------------------------------------------------------

# data classifier
clf = LinearDiscriminantAnalysis()

clf.fit(df_training,JAtt_training)
acc = clf.score(df_validation, JAtt_validation)
acc_all = clf.score(df, JAtt)

pred = clf.predict(df)

print(classification_report(JAtt, pred))

# print results
print('Data accuracy on only new samples: ', acc)
print('Data accuracy on all samples: ', acc_all)

# print(allColumnsNames)

              precision    recall  f1-score   support

           0       0.88      0.97      0.93      1233
           1       0.70      0.34      0.46       237

    accuracy                           0.87      1470
   macro avg       0.79      0.66      0.69      1470
weighted avg       0.85      0.87      0.85      1470

Data accuracy on only new samples:  0.8740740740740741
Data accuracy on all samples:  0.8700680272108844


In [10]:
# randomized search
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results["rank_test_score"] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print(
                "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    results["mean_test_score"][candidate],
                    results["std_test_score"][candidate],
                )
            )
            print("Parameters: {0}".format(results["params"][candidate]))
            print("")

In [29]:
# specify parameters and distributions to sample from
param_dist = {
    "solver": ["svd", "lsqr", "eigen"],
    "shrinkage": [None, 'auto'],
    "tol": np.power(10, np.arange(-6, -4, dtype=float))
}

In [30]:
# run randomized search
n_iter_search = 150
random_search = RandomizedSearchCV(
    clf, param_distributions=param_dist, n_iter=n_iter_search
)

In [31]:
start = time()
random_search.fit(df_training, JAtt_training)
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_iter_search)
)
report(random_search.cv_results_)

# use a full grid over all parameters
param_grid = {
    # "shrinkage": np.concatenate([[None, 'auto'], np.linspace(1e-7,1, num=100),], axis=0),
    "shrinkage": [None, 'auto'],
    "tol": np.power(10, np.arange(-6, -4, dtype=float)),
}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(df_training, JAtt_training)

print(
    "GridSearchCV took %.2f seconds for %d candidate parameter settings."
    % (time() - start, len(grid_search.cv_results_["params"]))
)
report(grid_search.cv_results_)



RandomizedSearchCV took 0.61 seconds for 1500 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.860 (std: 0.013)
Parameters: {'tol': 1e-06, 'solver': 'svd', 'shrinkage': None}

Model with rank: 1
Mean validation score: 0.860 (std: 0.013)
Parameters: {'tol': 1e-05, 'solver': 'svd', 'shrinkage': None}

Model with rank: 1
Mean validation score: 0.860 (std: 0.013)
Parameters: {'tol': 1e-06, 'solver': 'lsqr', 'shrinkage': None}

Model with rank: 1
Mean validation score: 0.860 (std: 0.013)
Parameters: {'tol': 1e-05, 'solver': 'lsqr', 'shrinkage': None}

Model with rank: 1
Mean validation score: 0.860 (std: 0.013)
Parameters: {'tol': 1e-06, 'solver': 'lsqr', 'shrinkage': 'auto'}

Model with rank: 1
Mean validation score: 0.860 (std: 0.013)
Parameters: {'tol': 1e-05, 'solver': 'lsqr', 'shrinkage': 'auto'}

Model with rank: 1
Mean validation score: 0.860 (std: 0.013)
Parameters: {'tol': 1e-06, 'solver': 'eigen', 'shrinkage': 'auto'}

Model with rank: 1
Mean validation s

20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\siman\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\siman\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\discriminant_analysis.py", line 605, in fit
    self._solve_eigen(
  File "c:\Users\siman\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\discriminant_analysis.py", line 445, in _solve_eigen
    evals, evecs = linalg.eigh(Sb, Sw)
  File "c:\Users\siman\AppData\Local\Programs\Pyth