# Prediction of Student Placement Status

# Overview

In [None]:
import numpy as np
import pandas as pd 

Source data: https://www.kaggle.com/datasets/benroshan/factors-affecting-campus-placement

In [None]:
# Load the placement dataset; the relative path is resolved against the
# current working directory.
train = pd.read_csv(filepath_or_buffer='Placement_Data_Full_Class.csv')

In [None]:
# Preview the first five rows of the raw data.
train.head(n=5)

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [None]:
# Print a summary of the frame: index range, column names, non-null counts,
# dtypes and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           215 non-null    int64  
 1   gender          215 non-null    object 
 2   ssc_p           215 non-null    float64
 3   ssc_b           215 non-null    object 
 4   hsc_p           215 non-null    float64
 5   hsc_b           215 non-null    object 
 6   hsc_s           215 non-null    object 
 7   degree_p        215 non-null    float64
 8   degree_t        215 non-null    object 
 9   workex          215 non-null    object 
 10  etest_p         215 non-null    float64
 11  specialisation  215 non-null    object 
 12  mba_p           215 non-null    float64
 13  status          215 non-null    object 
 14  salary          148 non-null    float64
dtypes: float64(6), int64(1), object(8)
memory usage: 25.3+ KB


# Preprocessing

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import ColumnTransformer

In [None]:
# Target: the placement status column.
y = train.loc[:, 'status']

# Features: everything except the row number, the salary column (which has
# missing values) and the target itself.
X = train.drop(['sl_no', 'salary', 'status'], axis=1)

In [None]:
# Column labels of the categorical (object-dtype) features.
cat = X.select_dtypes(include=['object']).columns

# Column labels of the numerical features, i.e. every non-object column.
num = X.select_dtypes(exclude=['object']).columns

In [None]:
# Display the labels of the numerical feature columns.
num

Index(['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p'], dtype='object')

In [None]:
# Display the labels of the categorical feature columns.
cat

Index(['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex',
       'specialisation'],
      dtype='object')

In [None]:
# SimpleImputer: replace missing categorical values with each column's most
# frequent value. The imputer is learned on, and applied to, the full
# categorical feature frame.
impute = SimpleImputer(strategy='most_frequent')
impute.fit(X[cat])
Xcat = impute.transform(X[cat])

In [None]:
# OneHotEncoder: learn the categories from the imputed array, then replace
# Xcat with its one-hot encoded form.
encode = OneHotEncoder()
Xcat = encode.fit(Xcat).transform(Xcat)

In [None]:
# PowerTransform
# Fit a PowerTransformer with its default settings on the numerical columns of
# the full feature frame and keep the transformed values in Xnum.
# NOTE(review): Xnum is not referenced again in the visible cells; the modeling
# pipeline builds its own PowerTransformer - confirm this cell is exploratory.
transform = PowerTransformer()
Xnum = transform.fit_transform(X[num])

# Modeling

In [None]:
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# Categorical branch: impute with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps predict() from raising when a split leaves a
# category out of the training rows; such a value is encoded as all zeros for
# that feature.
inter = Pipeline([
    ('i', SimpleImputer(strategy='most_frequent')),
    ('e', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own preprocessing: power-transform the
# numerical columns in `num`, impute + encode the categorical columns in `cat`.
transformers = [
    ('t', PowerTransformer(), num),
    ('inter', inter, cat),
]

# Full estimator: preprocessing followed by logistic regression.
steps = [
    ('pre', ColumnTransformer(transformers=transformers)),
    ('model', LogisticRegression(max_iter=100)),
]

In [None]:
# Assemble the preprocessing + classifier steps into a single estimator.
model = Pipeline(steps)

In [None]:
# Split data: rebuild the feature frame and target, then hold out 20% of the
# rows for testing.
X = train.drop(columns=['sl_no', 'salary', 'status'])
y = train['status']

# random_state fixes the shuffle so the reported metrics are reproducible
# between runs; stratify=y keeps the Placed / Not Placed proportions the same
# in the training and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Encode the target as integers: 1 for 'Placed', 0 for every other value.
y_train = y_train.eq('Placed').astype('int64')
y_test = y_test.eq('Placed').astype('int64')

In [None]:
# Fit the preprocessing steps and the classifier on the training split. The
# call returns the fitted pipeline, which the notebook displays.
model.fit(X=X_train, y=y_train)

Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('t', PowerTransformer(),
                                                  Index(['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p'], dtype='object')),
                                                 ('inter',
                                                  Pipeline(steps=[('i',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('e',
                                                                   OneHotEncoder())]),
                                                  Index(['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex',
       'specialisation'],
      dtype='object'))])),
                ('model', LogisticRegression())])

In [None]:
# Predict the held-out rows with the fitted pipeline.
y_pred = model.predict(X_test)

# Per-class precision, recall and F-score (one array entry per class); any
# remaining return values are collected into `_` and not used.
precision, recall, fscore, *_ = precision_recall_fscore_support(
    y_true=y_test,
    y_pred=y_pred,
)

In [None]:
# Performance report: one labelled line per metric, preceded by a blank line.
report_lines = [
    "",
    f"Precision : {precision}",
    f"Recall    : {recall}",
    f"F1-Score  : {fscore}",
]
result = "\n".join(report_lines)

print(result)


Precision : [0.8125     0.96296296]
Recall    : [0.92857143 0.89655172]
F1-Score  : [0.86666667 0.92857143]
