In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

# 1. Dataset Selection
# Read the three CSV inputs (paths are relative to the notebook's working
# directory) into DataFrames in a single unpacking step.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in ("train.csv", "test.csv", "SampleSubmission.csv")
)

In [None]:
# 2. Data Preprocessing
# This cell mutates train_df in place (later cells read the encoded frame from
# that name), so every step is guarded to keep the cell safe to re-run.

# Drop the 'sl_no' column as it's not useful; errors='ignore' avoids a
# KeyError when the column has already been removed by a previous run.
train_df.drop(columns='sl_no', errors='ignore', inplace=True)

# Fill missing 'salary' values with 0 (implies Not Placed)
train_df['salary'] = train_df['salary'].fillna(0)

# Encode 'status' column: Placed = 1, Not Placed = 0.
# Only map while the column still holds the raw labels; mapping the already
# numeric codes a second time would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(train_df['status']):
    train_df['status'] = train_df['status'].map({'Placed': 1, 'Not Placed': 0})

# Label encode all object (categorical) columns.
# Each column gets its own fitted encoder, stored by column name, so the
# label -> code mapping (encoder.classes_) stays available afterwards.
# A single shared encoder would only remember the last column it was fit on.
if 'label_encoders' not in globals():
    label_encoders = {}
for col in train_df.select_dtypes(include='object').columns:
    label_encoders[col] = LabelEncoder()
    train_df[col] = label_encoders[col].fit_transform(train_df[col])

In [None]:
# 3. Correlation heatmap
# Plot the pairwise correlations between the columns of train_df, annotated
# with two-decimal values.
fig, ax = plt.subplots(figsize=(10, 6))
corr_matrix = train_df.corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Single seed shared by every stochastic step in this cell so a
# Restart & Run All reproduces the same split and the same fitted models.
RANDOM_STATE = 42

# 4. Feature and Target Selection
# 'salary' is excluded from the features along with the target itself.
X = train_df.drop(['status', 'salary'], axis=1)  # Features
y = train_df['status']  # Target

# 5. Train-Test Split (70% train / 30% validation)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# 6. Model Selection and Training
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    # probability=True makes SVC shuffle the data internally to calibrate its
    # probability estimates; without a seed those estimates (and any
    # soft-voting ensemble built on them) differ from run to run.
    "SVM": SVC(probability=True, random_state=RANDOM_STATE)
}

In [None]:
# Fit every candidate model on the training split
for name in models:
    models[name].fit(X_train, y_train)
    print(f"{name} trained.")

# 7. Model Evaluation
# Metrics are listed once as (label, scoring function) pairs and reported in
# this order for each model, followed by its confusion matrix plot.
validation_metrics = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
)
for name, model in models.items():
    y_pred = model.predict(X_val)
    print(f"\n--- {name} ---")
    for metric_label, metric_fn in validation_metrics:
        print(f"{metric_label}:", metric_fn(y_val, y_pred))

    cm = confusion_matrix(y_val, y_pred)
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
    cm_display.plot()
    plt.title(f"{name} Confusion Matrix")
    plt.show()

In [None]:
# 9. Voting Classifier Implementation
# Combine the three entries of `models` into a soft-voting ensemble; each
# tuple pairs the short estimator name with the key it is stored under.
ensemble_members = [
    (short_name, models[model_key])
    for short_name, model_key in (
        ('lr', "Logistic Regression"),
        ('rf', "Random Forest"),
        ('svm', "SVM"),
    )
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

voting_clf.fit(X_train, y_train)
y_pred_voting = voting_clf.predict(X_val)

# Evaluate Voting Classifier on the validation split
print("\n--- Voting Classifier ---")
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"{metric_label}:", metric_fn(y_val, y_pred_voting))

cm = confusion_matrix(y_val, y_pred_voting)
voting_cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
voting_cm_display.plot()
plt.title("Voting Classifier Confusion Matrix")
plt.show()

In [None]:
# 10. Output: encode the test set like the training set, predict, and export

# By this point the categorical columns of train_df hold integer codes, not
# the original labels, so they cannot be used to build a label -> code mapping
# (every test string would fail the lookup and be encoded as -1).
# Re-read the raw training file to recover the original category labels.
raw_train_df = pd.read_csv("train.csv")

# Drop 'sl_no' and 'salary' if present (some test sets may include them);
# neither is part of the feature matrix X.
test_df.drop(columns=['sl_no', 'salary'], errors='ignore', inplace=True)

# Fill missing values with 0
test_df.fillna(0, inplace=True)

# Encode categorical columns with the same codes the training data received.
# LabelEncoder assigns codes by sorted label order, so fitting a fresh encoder
# on the raw training column reproduces the training-time mapping.
for col in test_df.select_dtypes(include='object').columns:
    if col not in raw_train_df.columns:
        continue
    train_classes = LabelEncoder().fit(raw_train_df[col].dropna()).classes_
    mapping = {label: code for code, label in enumerate(train_classes)}
    # Labels never seen in training (including the 0 used to fill missing
    # values above) are encoded as -1.
    test_df[col] = test_df[col].map(mapping).fillna(-1).astype(int)

# Ensure all columns from training features are present in test_df
for col in X.columns:
    if col not in test_df.columns:
        test_df[col] = 0  # or a neutral default like mean or -1

# Match the column order used when the models were fitted
X_test = test_df[X.columns]

# Predict using the trained voting classifier
test_preds = voting_clf.predict(X_test)

# Save predictions to submission file
# NOTE(review): test_preds are the 0/1 'status' predictions, yet they are
# written to a column named 'Salary' -- confirm this matches the columns
# expected in SampleSubmission.csv.
submission_df['Salary'] = test_preds
submission_df.to_csv("result.csv", index=False)
print("\n Submission file 'result.csv' created.")
