<a href="https://colab.research.google.com/github/aniketsharma00411/mba_placement_prediction/blob/main/salary_feature_selection_mba_placement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset link: https://www.kaggle.com/benroshan/factors-affecting-campus-placement

# Uploading dataset

In [None]:
# Upload the dataset through Colab's file picker. `google.colab` only exists
# inside a Colab runtime, so the import is guarded: when running elsewhere
# (local Jupyter, CI) the CSV is expected to already be in the working
# directory and `uploaded` is left as an empty mapping.
try:
    from google.colab import files
except ImportError:
    files = None

uploaded = files.upload() if files is not None else {}

Saving Placement_Data_Full_Class.csv to Placement_Data_Full_Class.csv


# Initialization

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Read the CSV with `sl_no` as the index, then discard that index so the
# frame has a plain positional index and no `sl_no` column.
df = (
    pd.read_csv('Placement_Data_Full_Class.csv', index_col='sl_no')
    .reset_index(drop=True)
)

In [None]:
# Keep only the rows that have a salary value (the regression target) and
# renumber the remaining rows from 0.
df = (
    df
    .dropna(subset=['salary'])
    .reset_index(drop=True)
)
df.head(n=5)

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0
4,M,82.0,Central,64.0,Central,Science,66.0,Sci&Tech,Yes,67.0,Mkt&Fin,62.14,Placed,252000.0


# Preparing data

In [None]:
# Hold out 20% of the rows for testing, then take 25% of the remaining 80%
# for validation, i.e. a 60/20/20 train/validation/test split.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Replace the shuffled row labels left by the split with a fresh 0..n-1 index.
df_train = df_train.reset_index(
    drop=True,
)
df_train.head(n=5)

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,M,73.24,Others,50.83,Others,Science,64.27,Sci&Tech,Yes,64.0,Mkt&Fin,66.23,Placed,500000.0
1,M,74.0,Central,70.0,Central,Science,72.0,Comm&Mgmt,Yes,60.0,Mkt&Fin,57.24,Placed,260000.0
2,M,82.0,Others,61.0,Others,Science,62.0,Sci&Tech,No,89.0,Mkt&Fin,65.45,Placed,360000.0
3,M,69.6,Central,68.4,Central,Commerce,78.3,Comm&Mgmt,Yes,60.0,Mkt&Fin,63.7,Placed,250000.0
4,M,73.0,Central,73.0,Central,Science,66.0,Sci&Tech,Yes,70.0,Mkt&Fin,68.07,Placed,275000.0


In [None]:
# Names of the numeric feature columns.
numerical = [
    'ssc_p',
    'hsc_p',
    'degree_p',
    'etest_p',
    'mba_p',
]
# Names of the categorical feature columns.
categorical = [
    'gender',
    'ssc_b',
    'hsc_b',
    'hsc_s',
    'degree_t',
    'workex',
    'specialisation',
]

# Target columns, kept as single-element lists.
classification_target = ['status']
regression_target = ['salary']

In [None]:
# Feature matrices hold the numerical columns followed by the categorical
# ones; the targets are the `salary` column as a Series.
X_train, y_train = (
    df_train[numerical + categorical],
    df_train[regression_target]['salary'],
)
X_val, y_val = (
    df_val[numerical + categorical],
    df_val[regression_target]['salary'],
)

In [None]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,ssc_p,hsc_p,degree_p,etest_p,mba_p,gender,ssc_b,hsc_b,hsc_s,degree_t,workex,specialisation
0,73.24,50.83,64.27,64.0,66.23,M,Others,Others,Science,Sci&Tech,Yes,Mkt&Fin
1,74.0,70.0,72.0,60.0,57.24,M,Central,Central,Science,Comm&Mgmt,Yes,Mkt&Fin
2,82.0,61.0,62.0,89.0,65.45,M,Others,Others,Science,Sci&Tech,No,Mkt&Fin
3,69.6,68.4,78.3,60.0,63.7,M,Central,Central,Commerce,Comm&Mgmt,Yes,Mkt&Fin
4,73.0,73.0,66.0,70.0,68.07,M,Central,Central,Science,Sci&Tech,Yes,Mkt&Fin


In [None]:
# Preview the first five training targets.
y_train.head(n=5)

0    500000.0
1    260000.0
2    360000.0
3    250000.0
4    275000.0
Name: salary, dtype: float64

# Creating Pipeline

In [None]:
def create_new_pipeline(numerical, categorical):
    """Build a fresh, unfitted preprocessing + linear-regression pipeline.

    Parameters
    ----------
    numerical
        Column selection handed to the 'numerical' branch, which imputes
        missing values with the column mean.
    categorical
        Column selection handed to the 'categorical' branch, which imputes
        missing values with the most frequent value and then one-hot
        encodes with the first level of each feature dropped.

    Returns
    -------
    Pipeline
        Steps in order: 'preprocessing' (the ColumnTransformer),
        'scaling' (StandardScaler over the transformer's output) and
        'model' (LinearRegression).
    """
    column_transformer = ColumnTransformer(transformers=[
        ('numerical', SimpleImputer(strategy='mean'), numerical),
        (
            'categorical',
            Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoding', OneHotEncoder(drop='first')),
            ]),
            categorical,
        ),
    ])

    return Pipeline(steps=[
        ('preprocessing', column_transformer),
        ('scaling', StandardScaler()),
        ('model', LinearRegression()),
    ])

# Deciding features to use based on their correlation with target (calculated during EDA)

In [None]:
# Elimination order for each feature group: the experiments below remove
# entries from the front of these lists, one more per iteration.
# NOTE(review): presumably sorted by increasing correlation with salary, per
# the EDA mentioned in the section heading - confirm.
numerical_drop_order = [
    'hsc_p',
    'ssc_p',
    'degree_p',
    'etest_p',
    'mba_p',
]
categorical_drop_order = [
    'degree_t',
    'specialisation',
    'hsc_b',
    'hsc_s',
    'ssc_b',
    'workex',
    'gender',
]

In [None]:
# Backward elimination over the numerical features: iteration i drops the
# first i entries of `numerical_drop_order` and keeps every categorical
# feature, then reports train and validation RMSE for that feature set.
for i in range(len(numerical_drop_order)):
    dropped = numerical_drop_order[:i]
    kept = numerical_drop_order[i:]

    # Build each reduced frame once instead of re-dropping for every call.
    X_train_subset = X_train.drop(columns=dropped)
    X_val_subset = X_val.drop(columns=dropped)

    pipeline = create_new_pipeline(kept, categorical)
    pipeline.fit(X_train_subset, y_train)

    # RMSE is taken as sqrt(MSE) by hand: `squared=False` is deprecated in
    # scikit-learn 1.4 and removed in 1.6. Arguments follow the documented
    # (y_true, y_pred) order.
    train_rmse = mean_squared_error(y_train, pipeline.predict(X_train_subset)) ** 0.5
    val_rmse = mean_squared_error(y_val, pipeline.predict(X_val_subset)) ** 0.5

    print(f'Features included: {kept}')
    print(f'Training score: {train_rmse}')
    print(f'Validation score: {val_rmse}')
    print()
    print()

Features included: ['hsc_p', 'ssc_p', 'degree_p', 'etest_p', 'mba_p']
Training score: 90155.88760109483
Validation score: 72827.10459533164


Features included: ['ssc_p', 'degree_p', 'etest_p', 'mba_p']
Training score: 90714.5808924323
Validation score: 70987.00753007791


Features included: ['degree_p', 'etest_p', 'mba_p']
Training score: 91043.83537905151
Validation score: 73287.36862415534


Features included: ['etest_p', 'mba_p']
Training score: 94331.16582251826
Validation score: 65318.33470730808


Features included: ['mba_p']
Training score: 94345.9408342291
Validation score: 65131.27318674698




We are not removing any numerical features.

In [None]:
# Backward elimination over the categorical features: iteration i drops the
# first i entries of `categorical_drop_order` and keeps every numerical
# feature, then reports train and validation RMSE for that feature set.
for i in range(len(categorical_drop_order)):
    dropped = categorical_drop_order[:i]
    kept = categorical_drop_order[i:]

    # Build each reduced frame once instead of re-dropping for every call.
    X_train_subset = X_train.drop(columns=dropped)
    X_val_subset = X_val.drop(columns=dropped)

    pipeline = create_new_pipeline(numerical, kept)
    pipeline.fit(X_train_subset, y_train)

    # RMSE is taken as sqrt(MSE) by hand: `squared=False` is deprecated in
    # scikit-learn 1.4 and removed in 1.6. Arguments follow the documented
    # (y_true, y_pred) order.
    train_rmse = mean_squared_error(y_train, pipeline.predict(X_train_subset)) ** 0.5
    # The validation score must come from the held-out validation split;
    # scoring the training data again just repeats the training score.
    val_rmse = mean_squared_error(y_val, pipeline.predict(X_val_subset)) ** 0.5

    print(f'Features included: {kept}')
    print(f'Training score: {train_rmse}')
    print(f'Validation score: {val_rmse}')
    print()
    print()

Features included: ['degree_t', 'specialisation', 'hsc_b', 'hsc_s', 'ssc_b', 'workex', 'gender']
Training score: 90155.88760109483
Validation score: 90155.88760109483


Features included: ['specialisation', 'hsc_b', 'hsc_s', 'ssc_b', 'workex', 'gender']
Training score: 91327.66451701414
Validation score: 91327.66451701414


Features included: ['hsc_b', 'hsc_s', 'ssc_b', 'workex', 'gender']
Training score: 91920.88191234689
Validation score: 91920.88191234689


Features included: ['hsc_s', 'ssc_b', 'workex', 'gender']
Training score: 92877.52600326746
Validation score: 92877.52600326746


Features included: ['ssc_b', 'workex', 'gender']
Training score: 93039.34107887023
Validation score: 93039.34107887023


Features included: ['workex', 'gender']
Training score: 93039.42986867475
Validation score: 93039.42986867475


Features included: ['gender']
Training score: 93126.8348663811
Validation score: 93126.8348663811




We cannot remove any categorical feature.