<a href="https://colab.research.google.com/github/aniketsharma00411/mba_placement_prediction/blob/main/status_xgboost_mba_placement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset link: https://www.kaggle.com/benroshan/factors-affecting-campus-placement

# Uploading dataset

In [None]:
from google.colab import files

# Colab-only: prompt for a file upload. The notebook later reads
# 'Placement_Data_Full_Class.csv' from the working directory, so that file
# must be supplied here.
uploaded = files.upload()

Saving Placement_Data_Full_Class.csv to Placement_Data_Full_Class.csv


# Initialization

In [None]:
import pandas as pd
import numpy as np

from itertools import product

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

import warnings
# Installs a global "ignore" filter with no category restriction, so every
# warning raised after this point is suppressed.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below -- confirm that silencing everything is intended.
warnings.filterwarnings("ignore")

In [None]:
# Load the uploaded CSV. 'sl_no' is only a serial number, so it is parsed as
# the index and then discarded in favour of a default 0..n-1 index.
DATA_FILE = 'Placement_Data_Full_Class.csv'

raw = pd.read_csv(DATA_FILE, index_col='sl_no')
df = raw.reset_index(drop=True)
df.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


# Preparing data

In [None]:
# Roughly 60/20/20 train/validation/test: hold out 20% of all rows for
# testing, then take 25% of the remaining 80% (= 20% of all rows) for
# validation. The same seed is used for both splits.
SPLIT_SEED = 42
TEST_FRACTION = 0.2
VAL_FRACTION_OF_REST = 0.25

df_full_train, df_test = train_test_split(
    df, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)
df_train, df_val = train_test_split(
    df_full_train, test_size=VAL_FRACTION_OF_REST, random_state=SPLIT_SEED
)

In [None]:
# Replace the row labels inherited from the split with a fresh 0..n-1 index
# (the old labels are dropped, not kept as a column), then preview the rows.
df_train = df_train.reset_index(drop=True)
df_train.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,M,44.0,Central,58.0,Central,Arts,55.0,Comm&Mgmt,Yes,64.25,Mkt&HR,58.54,Not Placed,
1,M,66.0,Central,76.0,Central,Commerce,72.0,Comm&Mgmt,Yes,84.0,Mkt&HR,58.95,Placed,275000.0
2,M,63.0,Central,60.0,Central,Commerce,57.0,Comm&Mgmt,Yes,78.0,Mkt&Fin,54.55,Placed,204000.0
3,F,73.0,Others,63.0,Others,Science,66.0,Comm&Mgmt,No,89.0,Mkt&Fin,60.5,Placed,216000.0
4,M,52.0,Others,65.0,Others,Arts,57.0,Others,Yes,75.0,Mkt&Fin,59.81,Not Placed,


In [None]:
# Input columns, grouped by the preprocessing branch that handles them.
numerical = [
    'hsc_p',
    'degree_p',
    'ssc_p',
]
categorical = [
    'gender',
    'ssc_b',
    'hsc_b',
    'hsc_s',
    'degree_t',
    'workex',
    'specialisation',
]

# Target columns, kept as one-item lists so that df[target] selects a
# DataFrame rather than a Series.
classification_target = ['status']
regression_target = ['salary']

In [None]:
# Separate each split into model inputs and the "placed" indicator label.
feature_columns = numerical + categorical


def _placed_label(frame):
    """Return the ``status_Placed`` dummy column built from ``frame``'s target."""
    return pd.get_dummies(frame[classification_target])['status_Placed']


X_train, y_train = df_train[feature_columns], _placed_label(df_train)
X_val, y_val = df_val[feature_columns], _placed_label(df_val)

In [None]:
# Preview the first rows of the training inputs (numerical columns first,
# then categorical, matching the order of numerical+categorical).
X_train.head()

Unnamed: 0,hsc_p,degree_p,ssc_p,gender,ssc_b,hsc_b,hsc_s,degree_t,workex,specialisation
0,58.0,55.0,44.0,M,Central,Central,Arts,Comm&Mgmt,Yes,Mkt&HR
1,76.0,72.0,66.0,M,Central,Central,Commerce,Comm&Mgmt,Yes,Mkt&HR
2,60.0,57.0,63.0,M,Central,Central,Commerce,Comm&Mgmt,Yes,Mkt&Fin
3,63.0,66.0,73.0,F,Others,Others,Science,Comm&Mgmt,No,Mkt&Fin
4,65.0,57.0,52.0,M,Others,Others,Arts,Others,Yes,Mkt&Fin


In [None]:
# Preview the training labels taken from the 'status_Placed' dummy column.
y_train.head()

0    0
1    1
2    1
3    1
4    0
Name: status_Placed, dtype: uint8

# Creating a Pipeline

In [None]:
def create_new_pipeline(params):
    """Build an unfitted preprocessing + XGBoost classification pipeline.

    Columns listed in the module-level ``numerical`` list are mean-imputed.
    Columns listed in the module-level ``categorical`` list are imputed with
    their most frequent value and one-hot encoded with the first level
    dropped. The combined output is standardised and passed to an
    ``XGBClassifier``.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``XGBClassifier`` alongside the fixed
        ``n_jobs=-1`` and ``random_state=42``.

    Returns
    -------
    Pipeline
        Pipeline with the steps ``'preprocessing'``, ``'scaling'`` and
        ``'model'``, in that order.
    """
    impute_and_encode = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoding', OneHotEncoder(drop='first')),
    ])

    column_preprocessor = ColumnTransformer(transformers=[
        ('numerical', SimpleImputer(strategy='mean'), numerical),
        ('categorical', impute_and_encode, categorical),
    ])

    classifier = XGBClassifier(n_jobs=-1, random_state=42, **params)

    return Pipeline(steps=[
        ('preprocessing', column_preprocessor),
        ('scaling', StandardScaler()),
        ('model', classifier),
    ])

# Hyperparameter Tuning

In [None]:
# Grid of candidate hyperparameters. Every combination is tried, so the grid
# size is 20 * 10 * 20 * 5 * 5 = 100,000 pipeline fits.
#
# n_estimators and max_depth are generated as floats here and cast to int
# where the parameters are assembled for each fit.
search_space = {
    'n_estimators': np.linspace(10, 1000, num=20),
    'max_depth': np.linspace(1, 10, num=10),
    # XGBoost documents learning_rate (eta) as lying in [0, 1]. The grid is
    # log-spaced over [0.001, 1] so no fits are spent on out-of-range step
    # sizes and the small values, where the parameter is most sensitive,
    # are sampled more densely.
    'learning_rate': np.logspace(-3, 0, num=20),
    'reg_alpha': np.logspace(-1, 1, num=5),
    'reg_lambda': np.logspace(-1, 1, num=5)
}

In [None]:
# Exhaustive grid search: fit one pipeline per parameter combination on the
# training split and remember the combination with the highest accuracy on
# the validation split.
max_score = 0
best_params = {}

for n_trees, depth, eta, alpha, lam in product(*search_space.values()):
    # The grid values for tree count and depth are cast to int before use.
    params = dict(
        n_estimators=int(n_trees),
        max_depth=int(depth),
        learning_rate=eta,
        reg_alpha=alpha,
        reg_lambda=lam,
    )

    pipeline = create_new_pipeline(params).fit(X_train, y_train)
    score = pipeline.score(X_val, y_val)

    # Strict comparison: on a tie the earlier combination is kept.
    if score > max_score:
        max_score, best_params = score, params

In [None]:
# Parameter combination that achieved the highest validation score in the
# grid search.
best_params

{'learning_rate': 0.5272631578947369,
 'max_depth': 6,
 'n_estimators': 10,
 'reg_alpha': 0.1,
 'reg_lambda': 1.0}

In [None]:
# Score on (X_val, y_val) obtained with best_params.
max_score

0.9767441860465116

# Training

In [None]:
# Inputs and labels for the final fit: df_full_train holds every row except
# the test split (i.e. the training and validation rows together).
full_train_columns = numerical + categorical
full_train_dummies = pd.get_dummies(df_full_train[classification_target])

X = df_full_train[full_train_columns]
y = full_train_dummies['status_Placed']

In [None]:
# Fresh, unfitted pipeline configured with the parameters selected by the
# grid search.
pipeline = create_new_pipeline(best_params)

In [None]:
# Fit the final pipeline on X, y (built from df_full_train).
pipeline.fit(X, y)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0),
                                                  ['hsc_p', 'degree_p',
                                                   'ssc_p']),
                                                 ('categorical',
                

# Validation

In [None]:
# Evaluate on the held-out test split. df_test was set aside by the first
# train_test_split and is not part of X, y, so this score is measured on rows
# the fitted pipeline has never seen. Scoring on X, y instead would only
# report accuracy on the data the pipeline was just fitted on.
X_test = df_test[numerical+categorical]
y_test = pd.get_dummies(df_test[classification_target])['status_Placed']

pipeline.score(X_test, y_test)

0.9883720930232558