In [4]:
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier


In [24]:
# Chi-squared top-k feature selection helper
def selectkbest(indep_X, dep_y, n):
    """Keep the ``n`` best-scoring columns of ``indep_X`` using ``SelectKBest`` with ``chi2``.

    Parameters
    ----------
    indep_X : DataFrame-like
        Feature table. It must expose ``.columns``, because the labels of the
        surviving columns are looked up through it.
    dep_y : array-like
        Target values handed to the selector's ``fit``.
    n : int
        Number of features to keep (forwarded to ``SelectKBest`` as ``k``).

    Returns
    -------
    tuple
        ``(reduced, selector, kept_columns)``: the transformed feature matrix,
        the fitted ``SelectKBest`` instance, and the labels of the kept columns.
    """
    selector = SelectKBest(score_func=chi2, k=n)
    # fit() hands back the selector itself, so the transform can be chained.
    reduced = selector.fit(indep_X, dep_y).transform(indep_X)
    # get_support() is the boolean mask of kept columns; use it to index the labels.
    kept_columns = indep_X.columns[selector.get_support()]
    return reduced, selector, kept_columns

In [25]:
# Split into train/test partitions, then standardise using training statistics only
def split_scaler(indep_X, dep_y, test_size=0.25, random_state=0):
    """Split the data into train/test partitions and standardise the features.

    Parameters
    ----------
    indep_X : array-like
        Feature matrix.
    dep_y : array-like
        Target values, aligned row-for-row with ``indep_X``.
    test_size : float, default 0.25
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 0
        Forwarded to ``train_test_split`` as ``random_state`` so the split is
        reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, sc)`` where the two feature
        partitions are already scaled and ``sc`` is the fitted
        ``StandardScaler``. Reuse ``sc`` to transform any new rows before
        passing them to a model trained on ``X_train``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        indep_X, dep_y, test_size=test_size, random_state=random_state
    )
    sc = StandardScaler()
    # Fit on the training partition only; the test partition is transformed
    # with the training statistics so it does not influence the scaling.
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test, y_train, y_test, sc

In [26]:
# Train a decision-tree classifier and evaluate it on the held-out partition
def DT(X_train, y_train, X_test, y_test, criterion='entropy', random_state=0):
    """Fit a ``DecisionTreeClassifier`` and score it on the test partition.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels used to fit the tree.
    X_test, y_test : array-like
        Held-out features and labels used for evaluation.
    criterion : str, default 'entropy'
        Forwarded to ``DecisionTreeClassifier`` as ``criterion``.
    random_state : int, default 0
        Forwarded to ``DecisionTreeClassifier`` as ``random_state``.

    Returns
    -------
    tuple
        ``(classifier, accuracy, report, cm)``: the fitted classifier, the
        result of ``accuracy_score``, the result of ``classification_report``,
        and the result of ``confusion_matrix``, all computed on the test
        partition.
    """
    classifier = DecisionTreeClassifier(criterion=criterion, random_state=random_state)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return classifier, accuracy, report, cm

In [27]:
# Read the preprocessed CSV and one-hot encode it in one step (drop_first=True
# drops the first level of every encoded column), then separate the target
# column 'classification_yes' from the predictor columns.
df = pd.get_dummies(pd.read_csv("prep.csv"), drop_first=True)
dep_y = df['classification_yes']
indep_X = df.drop(columns=['classification_yes'])

In [28]:
# Number of features to keep; passed to selectkbest as its `n` argument.
k = 5
# selectkbest returns (transformed feature matrix, fitted selector, kept column labels).
# NOTE(review): the selector is fitted on every row of indep_X, i.e. before the
# train/test split performed later in the notebook, so held-out rows influence
# which features are chosen.
selected_features, feature_selector, best_feature_names = selectkbest(indep_X, dep_y, k)

In [29]:
# Display the reduced feature matrix (one column per kept feature) returned by selectkbest.
selected_features

array([[1.48112676e+02, 5.74821053e+01, 3.07735602e+00, 3.88689024e+01,
        8.40819113e+03],
       [1.48112676e+02, 2.20000000e+01, 7.00000000e-01, 3.40000000e+01,
        1.23000000e+04],
       [9.90000000e+01, 2.30000000e+01, 6.00000000e-01, 3.40000000e+01,
        8.40819113e+03],
       ...,
       [1.10000000e+02, 1.15000000e+02, 6.00000000e+00, 2.60000000e+01,
        9.20000000e+03],
       [2.07000000e+02, 8.00000000e+01, 6.80000000e+00, 3.88689024e+01,
        8.40819113e+03],
       [1.00000000e+02, 4.90000000e+01, 1.00000000e+00, 5.30000000e+01,
        8.50000000e+03]])

In [30]:
# Display the fitted selector object returned by selectkbest.
feature_selector

In [31]:
# Display the column labels of the features that were kept.
best_feature_names

Index(['bgr', 'bu', 'sc', 'pcv', 'wc'], dtype='object')

In [32]:
# Print the labels of the columns kept by the selector (a pandas Index).
print("Selected Best Features:", best_feature_names)

Selected Best Features: Index(['bgr', 'bu', 'sc', 'pcv', 'wc'], dtype='object')


In [33]:
# Split the selected features into train/test partitions and standardise them.
# `scaler` is the StandardScaler fitted on the training partition; any new rows
# must be passed through `scaler.transform` before being given to a model
# trained on X_train.
X_train, X_test, y_train, y_test, scaler = split_scaler(selected_features, dep_y)

In [34]:
# Train the decision tree on the scaled training partition and score it on the
# held-out partition. DT returns (classifier, accuracy, report, cm); `cm` (the
# confusion matrix) is kept but not displayed in this cell.
classifier, accuracy, report, cm = DT(X_train, y_train, X_test, y_test)
print("Accuracy:", accuracy)
print("Report:\n", report)

Accuracy: 0.96
Report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.95        36
           1       0.98      0.95      0.97        64

    accuracy                           0.96       100
   macro avg       0.95      0.96      0.96       100
weighted avg       0.96      0.96      0.96       100



In [36]:
# Persist the trained decision-tree classifier with pickle (imported in the
# first cell of the notebook). Only the classifier is stored here -- the fitted
# scaler is NOT part of this file, so whoever loads it must standardise inputs
# with the same scaler before calling predict().
filename = 'DTclassification.sav'
# 'wb' = write binary; the context manager guarantees the handle is flushed
# and closed even if serialisation fails.
with open(filename, 'wb') as f:
    pickle.dump(classifier, f)

In [38]:
# Reload the pickled classifier ('rb' = read binary). The context manager closes
# the file handle once the object has been deserialised.
# SECURITY: pickle.load can execute arbitrary code -- only load files you created.
with open('DTclassification.sav', 'rb') as f:
    loaded_model = pickle.load(f)
# The classifier was trained on standardised features, so the raw measurements
# (ordered like the selected columns: bgr, bu, sc, pcv, wc) must be passed
# through the fitted scaler before prediction.
result1 = loaded_model.predict(
    scaler.transform([[148.112676, 57.4821052631579, 3.077356021, 38.86890244, 8408.191126]])
)
# Print the predicted class label.
print(result1)

[1]


In [39]:
# Raw measurements (bgr, bu, sc, pcv, wc) are standardised with the fitted
# scaler first, because the classifier was trained on scaled features.
result2 = loaded_model.predict(scaler.transform([[100, 26, 0.6, 49, 6600]]))
# Print the predicted class label.
print(result2)

[1]


In [40]:
# Raw measurements (bgr, bu, sc, pcv, wc) are standardised with the fitted
# scaler first, because the classifier was trained on scaled features.
result2 = loaded_model.predict(scaler.transform([[70, 36, 1, 49, 9800]]))
# Print the predicted class label.
print(result2)

[1]


In [42]:
# Persist the whole inference pipeline -- classifier, fitted scaler and fitted
# feature selector -- as a single pickled tuple (pickle is imported in the
# first cell of the notebook).
# NOTE: this overwrites 'DTclassification.sav', which an earlier cell wrote as
# a bare classifier; after this cell the file holds a 3-tuple.
filename = 'DTclassification.sav'
with open(filename, 'wb') as f:
    # The fitted scaler lives in the notebook-level name `scaler`
    # (`sc` only exists inside split_scaler).
    pickle.dump((classifier, scaler, feature_selector), f)

# Load the model, scaler, and feature selector back.
# SECURITY: pickle.load can execute arbitrary code -- only load files you created.
with open(filename, 'rb') as f:
    loaded_model, loaded_scaler, loaded_feature_selector = pickle.load(f)

# Build one full-width row with every original feature set to 0, then write the
# known measurements into the columns the selector actually keeps. Placing them
# by column name matters: the selector picks columns by position, so putting the
# values in the first five columns would feed it the wrong cells.
new_data = pd.DataFrame(0.0, index=[0], columns=indep_X.columns)
new_data.loc[0, list(best_feature_names)] = [70, 36, 1, 49, 9800]
new_data_selected = loaded_feature_selector.transform(new_data)
new_data_scaled = loaded_scaler.transform(new_data_selected)

# Predict with the loaded model
result = loaded_model.predict(new_data_scaled)
print("Prediction:", result)


Prediction: [1]




In [44]:
# Persist the whole inference pipeline -- classifier, fitted scaler and fitted
# feature selector -- as a single pickled tuple (pickle is imported in the
# first cell of the notebook).
filename = 'DTclassification.sav'
with open(filename, 'wb') as f:
    # The fitted scaler lives in the notebook-level name `scaler`
    # (`sc` only exists inside split_scaler).
    pickle.dump((classifier, scaler, feature_selector), f)

# Load the model, scaler, and feature selector back.
# SECURITY: pickle.load can execute arbitrary code -- only load files you created.
with open(filename, 'rb') as f:
    loaded_model, loaded_scaler, loaded_feature_selector = pickle.load(f)

# Build one full-width row with every original feature set to 0, then write the
# known measurements into the columns the selector actually keeps. Placing them
# by column name matters: the selector picks columns by position, so putting the
# values in the first five columns would feed it the wrong cells.
new_data = pd.DataFrame(0.0, index=[0], columns=indep_X.columns)
new_data.loc[0, list(best_feature_names)] = [148.1127, 33, 1, 44, 10500]
new_data_selected = loaded_feature_selector.transform(new_data)
new_data_scaled = loaded_scaler.transform(new_data_selected)

# Predict with the loaded model
result = loaded_model.predict(new_data_scaled)
print("Prediction:", result)

Prediction: [1]


