In [1]:
import numpy as np
import pandas as pd

In [332]:
# Read the placement dataset and discard the Workshop_Certification column
# in a single chained expression, then preview the first rows.
df = (
    pd.read_csv('college_placement_dataset_final.csv')
    .drop(columns=['Workshop_Certification'])
)
df.head()

Unnamed: 0,CGPA,Internship,Paid_Internship,Projects,Aptitude_Score,Skills_Rating,Placement_Training,SSC_Marks,Gender,Placement_Status
0,8.19,0,0,4,74.72,6.55,Not Attended,83.94,Male,Placed
1,7.17,0,0,3,68.51,6.79,Attended,67.28,Male,Placed
2,9.02,1,0,5,85.67,7.79,Attended,60.86,Male,Placed
3,8.44,1,0,4,55.29,6.47,Attended,76.2,Female,Placed
4,8.63,1,0,5,70.58,7.54,Attended,95.0,Female,Placed


In [334]:
# Keep only rows whose Aptitude_Score is strictly above 25.
# The printed count is derived from the same mask that does the filtering, so
# it always equals the number of rows actually removed (including rows exactly
# at the threshold and rows with a missing value, which `< 25` did not count).
aptitude_keep = df['Aptitude_Score'] > 25
print((~aptitude_keep).sum())
df = df[aptitude_keep]

# Same treatment for Skills_Rating with a threshold of 2.5; this count is taken
# on the frame that already went through the aptitude filter.
skills_keep = df['Skills_Rating'] > 2.5
print((~skills_keep).sum())
df = df[skills_keep]
print(df.shape)

51
27
(4921, 10)


In [336]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import pickle

In [338]:
# Numeric feature columns (passed to the scaler in the preprocessor).
num_cols = [
    'CGPA',
    'Internship',
    'Paid_Internship',
    'Projects',
    'Aptitude_Score',
    'Skills_Rating',
    'SSC_Marks',
]

# Categorical feature columns (passed to the one-hot encoder).
cat_cols = [
    'Placement_Training',
    'Gender',
]

In [340]:
# Column-wise preprocessing: standardize the numeric columns and one-hot
# encode the categorical ones (dropping the first level of each category).
# Columns not named in num_cols / cat_cols are not passed through, since
# ColumnTransformer's documented default is remainder='drop'.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), cat_cols),
    ],
)

In [342]:
# Base learners for the stacking ensemble.
# The tree, the forest and the probability-calibrated SVC all draw random
# numbers while fitting, so they are seeded (with the same value used for the
# train/test split) to make the reported metrics reproducible across runs.
lr = LogisticRegression()
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
# probability=True makes predict_proba available on the SVC.
svm = SVC(probability=True, random_state=42)
knn = KNeighborsClassifier(n_neighbors=8)

In [344]:
# (name, estimator) pairs handed to the stacking classifier.
estimators = [
    ('lr', lr),
    ('dt', dt),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
]

# Stacked ensemble: a fresh logistic regression is the final estimator on top
# of the base learners; n_jobs=-1 asks for all available cores.
stc = StackingClassifier(estimators, final_estimator=LogisticRegression(), n_jobs=-1)

In [346]:
# End-to-end estimator: preprocessing followed by the stacked model, so raw
# feature rows can be passed straight to fit / predict.
full_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', stc),
    ],
)

In [348]:
# Split the frame into the target column and everything else.
target_col = 'Placement_Status'
y = df[target_col]
x = df.drop(columns=[target_col])

In [350]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [352]:
# Fit the pipeline on the training split (fit returns the pipeline itself),
# then predict labels for the held-out rows.
y_pred = full_pipeline.fit(x_train, y_train).predict(x_test)

In [353]:
from sklearn.metrics import accuracy_score, f1_score

# Score the held-out predictions; F1 is computed with 'Placed' as the positive class.
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred, pos_label='Placed')

print('accuracy_score: ', test_accuracy)
print('f1_score: ', test_f1)

accuracy_score:  0.868020304568528
f1_score:  0.9013657056145675


In [354]:
# One hand-written sample row, keyed by column name so every value sits next
# to the feature it belongs to.
testing = pd.DataFrame([{
    'CGPA': 7,
    'Internship': 2,
    'Paid_Internship': 1,
    'Projects': 3,
    'Aptitude_Score': 70.97,
    'Skills_Rating': 8.8,
    'Placement_Training': 'Not Attended',
    'SSC_Marks': 87,
    'Gender': 'Female',
}])
columns = list(testing.columns)

# Predicted label first, then the class probabilities as the cell's output.
print(full_pipeline.predict(testing))
print('[P(Not_Placed), P(Placed)]')
full_pipeline.predict_proba(testing)

['Placed']
[P(Not_Placed), P(Placed)]


array([[0.15448644, 0.84551356]])

In [355]:
import pickle 

import sklearn
# Record the library versions in use when the model was trained.
for label, module in (('numpy: ', np), ('pandas: ', pd), ('scikit-learn: ', sklearn)):
    print(label, module.__version__)

numpy:  1.26.4
pandas:  2.2.2
scikit-learn:  1.6.1


In [356]:
# Persist the whole fitted pipeline (preprocessor + stacked model), as the
# file name promises. Dumping only `stc` would save the classifier without the
# scaler/encoder it was trained behind, so the loaded object could not be
# given raw feature rows.
# NOTE: scikit-learn pickles are not guaranteed to load under a different
# library version, and pickle files should only be loaded from trusted sources.
with open('Final_Placement_prediction_model_with_pipeline.pkl', 'wb') as file:
    pickle.dump(full_pipeline, file)