In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# ---------------------------------------------------------------------------
# Data preparation: load the CSV, encode label columns, split into train/test
# and then impute + standardize using statistics learned from the training
# rows only (fitting them on the full data would leak test information).
# Produces: df, X, y, imputer, scaler, X_train, X_test, y_train, y_test
# (plus X_imputed / X_scaled for any later cell that still wants them).
# ---------------------------------------------------------------------------

# Load dataset
df = pd.read_csv("churn_data.csv")

# Columns the analysis treats as categorical. Only those that actually hold
# non-numeric labels are label-encoded below: running LabelEncoder over a
# numeric column replaces its values with rank codes and turns NaN into an
# ordinary class code, which would leave nothing for the imputer to fill.
categorical_cols = ['number_of_projects', 'work_accident', 'churn',
                    'promotion', 'department','satisfaction','evaluation', 'salary',
                    'average_montly_hours','time_spend_company']

# The target is always label-encoded so the classifiers see integer classes.
df['churn'] = LabelEncoder().fit_transform(df['churn'])

for col in categorical_cols:
    if col == 'churn':
        continue  # handled above
    column = df[col]
    already_numeric = (pd.api.types.is_numeric_dtype(column)
                       and not pd.api.types.is_bool_dtype(column))
    if already_numeric:
        continue  # keep real numeric values (and their NaNs) untouched
    # Encode only the rows that have a value; missing entries stay NaN so
    # the imputer below can still see and fill them.
    present = column.notna()
    codes = pd.Series(float('nan'), index=df.index, dtype='float64')
    codes.loc[present] = LabelEncoder().fit_transform(column[present])
    df[col] = codes

# Split features and target
X = df.drop('churn', axis=1)
y = df['churn']

# Train-test split happens BEFORE imputing/scaling; stratify keeps the churn
# class ratio the same in both parts.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute missing values (column means learned from the training rows only)
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_raw)
X_test_imputed = imputer.transform(X_test_raw)

# Standardize features (mean/std learned from the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_imputed)
X_test = scaler.transform(X_test_imputed)

# Full-dataset views kept for backward compatibility with later cells; they
# reuse the train-fitted transformers, so no test statistics are learned.
X_imputed = imputer.transform(X)
X_scaled = scaler.transform(X_imputed)

# Classifiers to compare, keyed by the display name used in the results table.
# The tree-based models draw random numbers while fitting, so they are seeded
# with the same value as the train/test split to make the comparison
# reproducible from run to run.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
}

# Fit every classifier on the training split and score it on the test split.
# Each entry of `results` is one row of the comparison table.
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results.append({
        "Model Name": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred)
    })

# Keep the unrounded scores for model selection; round only for display.
raw_results_df = pd.DataFrame(results)
results_df = raw_results_df.round(4)

# Determine best model based on F1-score. The winner is chosen on the
# unrounded scores so that two models differing only past the 4th decimal
# are not treated as tied; the row shown is taken from the rounded table.
best_model = results_df.loc[raw_results_df['F1-Score'].idxmax()]

# Display results
print("\nModel Comparison Table:")
print(results_df)
print("\nBest Model Based on F1-Score:")
print(best_model)