<a href="https://colab.research.google.com/github/YatindraRai002/Employee-Performance-using-machine-learning/blob/main/CodingNinjas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler


In [None]:
df = pd.read_csv('/content/Employee_Performance_Retention.csv')
df.size

90000

In [None]:
df.head(5)

Unnamed: 0,Employee_ID,Age,Department,Years_of_Experience,Monthly_Working_Hours,Training_Hours_per_Year,Performance_Rating,Job_Satisfaction_Level,Promotion_in_Last_2_Years,Attrition
0,E1000,59,Sales,27,135,30,3,Medium,Yes,No
1,E1001,49,Tech,13,132,36,5,Medium,Yes,Yes
2,E1002,35,Operations,16,197,11,5,Low,No,No
3,E1003,28,HR,23,204,36,3,High,No,No
4,E1004,41,Operations,33,192,36,2,Medium,No,No


In [None]:
df.tail(5)

Unnamed: 0,Employee_ID,Age,Department,Years_of_Experience,Monthly_Working_Hours,Training_Hours_per_Year,Performance_Rating,Job_Satisfaction_Level,Promotion_in_Last_2_Years,Attrition
8995,E9995,44,Operations,26,187,17,2,Medium,No,No
8996,E9996,23,Operations,24,147,1,3,High,No,Yes
8997,E9997,50,Finance,3,208,39,2,Medium,No,No
8998,E9998,33,Operations,8,225,47,3,Low,No,No
8999,E9999,39,HR,14,186,3,3,Low,No,No


In [None]:
# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())



Missing values per column:
Employee_ID                  0
Age                          0
Department                   0
Years_of_Experience          0
Monthly_Working_Hours        0
Training_Hours_per_Year      0
Performance_Rating           0
Job_Satisfaction_Level       0
Promotion_in_Last_2_Years    0
Attrition                    0
dtype: int64


## Preprocessing

### Subtask:
Handle categorical features and split the data into training and testing sets.


**Reasoning**:
Identify categorical columns, apply one-hot encoding, separate features and target, and split the data into training and testing sets.



In [None]:
# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Apply one-hot encoding, dropping the first category to avoid multicollinearity
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Separate features (X) and target (y)
X = df_encoded.drop('Attrition_Yes', axis=1)  # Assuming 'Attrition_No' is the dropped column
y = df_encoded['Attrition_Yes']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (7200, 9011)
Shape of X_test: (1800, 9011)
Shape of y_train: (7200,)
Shape of y_test: (1800,)


## Random forest model

### Subtask:
Train a Random Forest model to predict Attrition, perform feature importance analysis, and evaluate the model's performance.


**Reasoning**:
Train a Random Forest model, predict on the test set, calculate accuracy, extract feature importances, and display the top 10 features.



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Instantiate RandomForestClassifier with fewer estimators and max depth
rf_model = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42, n_jobs=-1)

# Fit the model to the training data
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)

# Calculate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Model Accuracy: {accuracy_rf:.4f}")

# Extract feature importances
feature_importances = rf_model.feature_importances_

# Create a Series mapping feature names to importances
feature_importance_series = pd.Series(feature_importances, index=X_train.columns)

# Sort and display the top 10 features
top_10_features = feature_importance_series.sort_values(ascending=False).head(10)
print("\nTop 10 Most Important Features:")
print(top_10_features)

Random Forest Model Accuracy: 0.8033

Top 10 Most Important Features:
Employee_ID_E9207    0.006941
Employee_ID_E5579    0.006425
Employee_ID_E4430    0.005717
Employee_ID_E9308    0.005612
Employee_ID_E8811    0.005390
Employee_ID_E6137    0.005134
Employee_ID_E8126    0.005089
Employee_ID_E4355    0.005047
Employee_ID_E1001    0.005037
Employee_ID_E1955    0.004978
dtype: float64


## Support vector machine (svm) model




In [None]:
from sklearn.svm import SVC

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train Linear SVM with a smaller C value
svm_linear = SVC(kernel='linear', C=0.1, random_state=42)
svm_linear.fit(X_train_scaled, y_train)

# Predict and evaluate Linear SVM
y_pred_linear = svm_linear.predict(X_test_scaled)
accuracy_linear = accuracy_score(y_test, y_pred_linear)
print(f"Linear SVM Model Accuracy: {accuracy_linear:.4f}")

# Initialize and train RBF SVM with a smaller C value
svm_rbf = SVC(kernel='rbf', C=0.1, random_state=42)
svm_rbf.fit(X_train_scaled, y_train)

# Predict and evaluate RBF SVM
y_pred_rbf = svm_rbf.predict(X_test_scaled)
accuracy_rbf = accuracy_score(y_test, y_pred_rbf)
print(f"RBF SVM Model Accuracy: {accuracy_rbf:.4f}")

# Compare accuracies
print("\nAccuracy Comparison:")
print(f"Linear SVM: {accuracy_linear:.4f}")
print(f"RBF SVM: {accuracy_rbf:.4f}")

Linear SVM Model Accuracy: 0.8033
RBF SVM Model Accuracy: 0.8033

Accuracy Comparison:
Linear SVM: 0.8033
RBF SVM: 0.8033


## Xgboost model




In [None]:
from xgboost import XGBClassifier

# Instantiate an XGBClassifier object with reduced n_estimators and max_depth
xgb_model = XGBClassifier(n_estimators=100, max_depth=5, eval_metric='logloss', random_state=42, n_jobs=-1)

# Train the XGBoost model on the scaled training data
xgb_model.fit(X_train_scaled, y_train)

# Predict on the scaled test set
y_pred_xgb = xgb_model.predict(X_test_scaled)

# Calculate the accuracy of the XGBoost model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

# Print the XGBoost model's accuracy
print(f"XGBoost Model Accuracy: {accuracy_xgb:.4f}")

# Compare the accuracy of the XGBoost model with the accuracies obtained from the other models
print("\nModel Accuracy Comparison:")
print(f"Random Forest: {accuracy_rf:.4f}")
print(f"Linear SVM: {accuracy_linear:.4f}")
print(f"RBF SVM: {accuracy_rbf:.4f}")
print(f"XGBoost: {accuracy_xgb:.4f}")

XGBoost Model Accuracy: 0.7817

Model Accuracy Comparison:
Random Forest: 0.8033
Linear SVM: 0.8033
RBF SVM: 0.8033
XGBoost: 0.7817


## Conclusion




In [None]:
print("--- Model Performance Summary ---")
print(f"Random Forest Model Accuracy: {accuracy_rf:.4f}")
print(f"Linear SVM Model Accuracy: {accuracy_linear:.4f}")
print(f"RBF SVM Model Accuracy: {accuracy_rbf:.4f}")
print(f"XGBoost Model Accuracy: {accuracy_xgb:.4f}")

print("\n--- Performance Comparison ---")
print("Based on accuracy on the test set:")
if accuracy_rf >= accuracy_linear and accuracy_rf >= accuracy_rbf and accuracy_rf >= accuracy_xgb:
    print("- Random Forest performed best.")
elif accuracy_linear >= accuracy_rf and accuracy_linear >= accuracy_rbf and accuracy_linear >= accuracy_xgb:
     print("- Linear SVM performed best.")
elif accuracy_rbf >= accuracy_rf and accuracy_rbf >= accuracy_linear and accuracy_rbf >= accuracy_xgb:
     print("- RBF SVM performed best.")
else:
    print("- XGBoost performed best.")

if accuracy_rf <= accuracy_linear and accuracy_rf <= accuracy_rbf and accuracy_rf <= accuracy_xgb:
    print("- Random Forest performed worst.")
elif accuracy_linear <= accuracy_rf and accuracy_linear <= accuracy_rbf and accuracy_linear <= accuracy_xgb:
     print("- Linear SVM performed worst.")
elif accuracy_rbf <= accuracy_rf and accuracy_rbf <= accuracy_linear and accuracy_rbf <= accuracy_xgb:
     print("- RBF SVM performed worst.")
else:
    print("- XGBoost performed worst.")



--- Model Performance Summary ---
Random Forest Model Accuracy: 0.8033
Linear SVM Model Accuracy: 0.8033
RBF SVM Model Accuracy: 0.8033
XGBoost Model Accuracy: 0.7694

--- Performance Comparison ---
Based on accuracy on the test set:
- Random Forest performed best.
- XGBoost performed worst.
