<a href="https://colab.research.google.com/github/aniketsharma00411/mba_placement_prediction/blob/main/salary_elastic_net_mba_placement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset link: https://www.kaggle.com/benroshan/factors-affecting-campus-placement

# Uploading dataset

In [2]:
from google.colab import files

uploaded = files.upload()

Saving Placement_Data_Full_Class.csv to Placement_Data_Full_Class.csv


# Initialization

In [1]:
import pandas as pd
import numpy as np

from itertools import product

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('Placement_Data_Full_Class.csv', index_col='sl_no').reset_index(drop=True)

In [4]:
df = df.dropna(subset=['salary']).reset_index(drop=True)
df.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0
4,M,82.0,Central,64.0,Central,Science,66.0,Sci&Tech,Yes,67.0,Mkt&Fin,62.14,Placed,252000.0


# Preparing data

In [5]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [6]:
df_train = df_train.reset_index(drop=True)
df_train.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,M,73.24,Others,50.83,Others,Science,64.27,Sci&Tech,Yes,64.0,Mkt&Fin,66.23,Placed,500000.0
1,M,74.0,Central,70.0,Central,Science,72.0,Comm&Mgmt,Yes,60.0,Mkt&Fin,57.24,Placed,260000.0
2,M,82.0,Others,61.0,Others,Science,62.0,Sci&Tech,No,89.0,Mkt&Fin,65.45,Placed,360000.0
3,M,69.6,Central,68.4,Central,Commerce,78.3,Comm&Mgmt,Yes,60.0,Mkt&Fin,63.7,Placed,250000.0
4,M,73.0,Central,73.0,Central,Science,66.0,Sci&Tech,Yes,70.0,Mkt&Fin,68.07,Placed,275000.0


In [7]:
numerical = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
categorical = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']

classification_target = ['status']
regression_target = ['salary']

In [8]:
X_train = df_train[numerical+categorical]
y_train = df_train[regression_target]['salary']
X_val = df_val[numerical+categorical]
y_val = df_val[regression_target]['salary']

In [9]:
X_train.head()

Unnamed: 0,ssc_p,hsc_p,degree_p,etest_p,mba_p,gender,ssc_b,hsc_b,hsc_s,degree_t,workex,specialisation
0,73.24,50.83,64.27,64.0,66.23,M,Others,Others,Science,Sci&Tech,Yes,Mkt&Fin
1,74.0,70.0,72.0,60.0,57.24,M,Central,Central,Science,Comm&Mgmt,Yes,Mkt&Fin
2,82.0,61.0,62.0,89.0,65.45,M,Others,Others,Science,Sci&Tech,No,Mkt&Fin
3,69.6,68.4,78.3,60.0,63.7,M,Central,Central,Commerce,Comm&Mgmt,Yes,Mkt&Fin
4,73.0,73.0,66.0,70.0,68.07,M,Central,Central,Science,Sci&Tech,Yes,Mkt&Fin


In [10]:
y_train.head()

0    500000.0
1    260000.0
2    360000.0
3    250000.0
4    275000.0
Name: salary, dtype: float64

# Creating a Pipeline

In [11]:
def create_new_pipeline(params):
    numerical_transformer = SimpleImputer(strategy='mean')

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoding', OneHotEncoder(drop='first'))
    ])

    preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, numerical),
        ('categorical', categorical_transformer, categorical)
    ])

    scaler = StandardScaler()

    elasticnet = ElasticNet(
        random_state=42,
        **params
    )

    pipeline = Pipeline(
    steps=[
           ('preprocessing', preprocessor),
           ('scaling', scaler),
           ('model', elasticnet)
          ]
    )

    return pipeline

# Hyperparameter Tuning

In [14]:
search_space = {
    'alpha': np.logspace(-1, 0, num=5),
    'l1_ratio': np.linspace(0, 1, num=10),
    'fit_intercept': [False, True],
    'normalize': [False, True]
}

In [15]:
best_score = float('inf')
best_params = {}

for alpha, l1_ratio, fit_intercept, normalize in product(*search_space.values()):
    params = {
        'alpha': alpha,
        'l1_ratio': l1_ratio,
        'fit_intercept': fit_intercept,
        'normalize': normalize
    }

    pipeline = create_new_pipeline(params)

    pipeline.fit(X_train, y_train)

    score = mean_squared_error(pipeline.predict(X_val), y_val, squared=False)
    if score < best_score:
        best_score = score
        best_params = params

In [16]:
best_params

{'alpha': 0.1,
 'fit_intercept': True,
 'l1_ratio': 0.7777777777777777,
 'normalize': True}

In [17]:
best_score

58410.36220224665

# Training

In [18]:
X = df_full_train[numerical+categorical]
y = df_full_train[regression_target]['salary']

In [19]:
pipeline = create_new_pipeline(best_params)

In [20]:
pipeline.fit(X, y)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0),
                                                  ['ssc_p', 'hsc_p', 'degree_p',
                                                   'etest_p', 'mba_p']),
                                                 ('categorica

# Validation

In [21]:
mean_squared_error(pipeline.predict(X), y, squared=False)

89727.79991854979