In [2]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from scipy.stats import randint, uniform
from sklearn.metrics import classification_report, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Load data
data = pd.read_excel(r'C:\\Users\\HP\\Desktop\\final project12.xlsx')
# Drop rows with missing target values
data.dropna(subset=['Job status'], inplace=True)
# Define features and target
X = data.drop(columns=['Job status', 'No'])
y = data['Job status']
# Encode target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# Define categorical and numerical columns
categorical_cols = ['Sex', 'Label', 'Status', 'College', 'Fild of Study']
numerical_cols = ['Salary', 'age', 'year_of_service']
# Preprocessing for numerical data: impute missing values with mean
numerical_transformer = SimpleImputer(strategy='mean')
# Preprocessing for categorical data: impute missing values and one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])
# Function to convert sparse matrix to dense
def to_dense(X):
    return X.toarray() if hasattr(X, "toarray") else X

dense_transformer = FunctionTransformer(to_dense)
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Define models to evaluate
models = {
    'Logistic Regression': LogisticRegression(max_iter=10000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Naive Bayes': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('to_dense', dense_transformer),
        ('classifier', GaussianNB())
    ]),
    'Support Vector Machine': SVC(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier()
}
# Evaluate each model
for name, model in models.items():
    if name == 'Naive Bayes':
        clf = model
    else:
        clf = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
    
    # Train the model
    clf.fit(X_train, y_train)
    
    # Predict and evaluate the model
    y_pred = clf.predict(X_test)
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    print("="*60)
# Hyperparameter tuning for Decision Tree
dt = DecisionTreeClassifier(random_state=42)
pipeline_dt = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', dt)])

param_grid_dt = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__splitter': ['best', 'random'],
    'classifier__max_depth': [None, 10, 20, 30, 40, 50],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': [None, 'auto', 'sqrt', 'log2']
}

grid_search_dt = GridSearchCV(estimator=pipeline_dt, param_grid=param_grid_dt, 
                              cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

grid_search_dt.fit(X_train, y_train)

best_params_dt = grid_search_dt.best_params_
best_score_dt = grid_search_dt.best_score_

print("Best parameters for Decision Tree: ", best_params_dt)
print("Best cross-validation accuracy for Decision Tree: {:.2f}".format(best_score_dt))

best_dt = grid_search_dt.best_estimator_

y_pred_dt = best_dt.predict(X_test)
print("Tuned Decision Tree Model:")
print(classification_report(y_test, y_pred_dt))
print("="*60)


Model: Logistic Regression
                precision    recall  f1-score   support

not terminated       0.93      0.97      0.95       607
    terminated       0.92      0.83      0.87       263

      accuracy                           0.93       870
     macro avg       0.93      0.90      0.91       870
  weighted avg       0.93      0.93      0.93       870

Model: Decision Tree
                precision    recall  f1-score   support

not terminated       0.96      0.95      0.96       607
    terminated       0.89      0.90      0.90       263

      accuracy                           0.94       870
     macro avg       0.92      0.93      0.93       870
  weighted avg       0.94      0.94      0.94       870

Model: Random Forest
                precision    recall  f1-score   support

not terminated       0.96      0.96      0.96       607
    terminated       0.91      0.90      0.90       263

      accuracy                           0.94       870
     macro avg       0.93  

1080 fits failed out of a total of 4320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
416 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\HP\deploy\deploy-model-1\venv1\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\HP\deploy\deploy-model-1\venv1\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\HP\deploy\deploy-model-1\venv1\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\HP\d

Best parameters for Decision Tree:  {'classifier__criterion': 'gini', 'classifier__max_depth': 20, 'classifier__max_features': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__splitter': 'best'}
Best cross-validation accuracy for Decision Tree: 0.94
Tuned Decision Tree Model:
                precision    recall  f1-score   support

not terminated       0.96      0.96      0.96       607
    terminated       0.90      0.90      0.90       263

      accuracy                           0.94       870
     macro avg       0.93      0.93      0.93       870
  weighted avg       0.94      0.94      0.94       870



In [3]:
joblib.dump(best_dt, 'best_decision_tree_model.pkl')

['best_decision_tree_model.pkl']

In [1]:
#my