In [1]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score, mean_squared_error, accuracy_score

In [2]:
#importing Dataset
data = pd.read_csv("FinalDataset.csv")
data.head()

Unnamed: 0,SR. NO,Qualification,Branch,10th Year of passing,10th Percentage (Min 60 %),12th Year of passing,12th Percentage (Min 60 %),Diploma Year of Passing (If Any),Diploma (%),UG Starting Year,...,Sem 6 %,Aggregate UG Percentage (Min 60 %),UG Year of passing,Gaps (If Any) Also Specify Year,Placed or Not,Job Offer,Aptitude,Package,Internships,Certifications
0,119,BE,Computer Engineering,2013,95.0,2015.0,92.3,,,2016,...,84.89,83.26,2020,,yes,CAPGEMINI,1,10.926,3,3
1,151,BE,Electronics and Telecommunication,2014,88.2,2016.0,86.0,,,2016,...,70.4,83.2,2020,,yes,LTI,1,9.62,3,2
2,20,BE,IT,2011,95.27,2013.0,74.0,,,2013,...,70.8,82.46,2020,THREE,no,No,1,0.0,1,0
3,48,BE,Electronics and Telecommunication,2014,95.0,2016.0,90.46,,,2016,...,81.41,81.41,2020,,yes,BIRLASOFT,1,9.241,3,2
4,10,BE,Electronics and Telecommunication,2014,87.0,2016.0,87.0,,,2016,...,83.632,77.49,2020,,yes,CAPGEMINI,1,8.449,3,2


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197 entries, 0 to 196
Data columns (total 25 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   SR. NO                              197 non-null    int64  
 1   Qualification                       197 non-null    object 
 2   Branch                              197 non-null    object 
 3   10th Year of passing                197 non-null    int64  
 4   10th Percentage (Min 60 %)          197 non-null    float64
 5   12th  Year of passing               144 non-null    float64
 6   12th Percentage (Min 60 %)          144 non-null    float64
 7   Diploma Year of Passing (If Any)    55 non-null     float64
 8   Diploma (%)                         55 non-null     float64
 9   UG Starting Year                    197 non-null    int64  
 10  Sem 1 %                             141 non-null    float64
 11  Sem 2 %                             140 non-n

In [4]:
data.isnull().sum()

SR. NO                                  0
Qualification                           0
Branch                                  0
10th Year of passing                    0
10th Percentage (Min 60 %)              0
12th  Year of passing                  53
12th Percentage (Min 60 %)             53
Diploma Year of Passing (If Any)      142
Diploma (%)                           142
UG Starting Year                        0
Sem 1 %                                56
Sem 2 %                                57
Sem 3 %                                 0
Sem 4 %                                 0
Sem 5 %                                 0
Sem 6 %                                 0
Aggregate UG Percentage (Min 60 %)      0
UG Year of passing                      0
Gaps (If Any) Also Specify Year       194
Placed or Not                           0
Job Offer                               0
Aptitude                                0
Package                                 0
Internships                       

In [5]:
#DROPPING COLUMNS THAT WON'T CONTRIBUTE IN THE MODEL
data=data.drop(['Sem 1 %','Sem 2 %','SR. NO','Qualification','UG Starting Year','10th Year of passing','12th  Year of passing','Diploma Year of Passing (If Any)','UG Year of passing'], axis=1)

In [6]:
data.head(10)

Unnamed: 0,Branch,10th Percentage (Min 60 %),12th Percentage (Min 60 %),Diploma (%),Sem 3 %,Sem 4 %,Sem 5 %,Sem 6 %,Aggregate UG Percentage (Min 60 %),Gaps (If Any) Also Specify Year,Placed or Not,Job Offer,Aptitude,Package,Internships,Certifications
0,Computer Engineering,95.0,92.3,,86.0,86.0,81.63,84.89,83.26,,yes,CAPGEMINI,1,10.926,3,3
1,Electronics and Telecommunication,88.2,86.0,,80.3,71.94,75.0,70.4,83.2,,yes,LTI,1,9.62,3,2
2,IT,95.27,74.0,,59.73,66.93,67.73,70.8,82.46,THREE,no,No,1,0.0,1,0
3,Electronics and Telecommunication,95.0,90.46,,82.6,82.45,84.89,81.41,81.41,,yes,BIRLASOFT,1,9.241,3,2
4,Electronics and Telecommunication,87.0,87.0,,83.78,77.712,77.194,83.632,77.49,,yes,CAPGEMINI,1,8.449,3,2
5,Computer Engineering,83.4,,86.65,79.71,78.304,80.524,71.2,77.49,,no,No,1,0.0,1,1
6,Computer Engineering,94.2,86.92,,81.19,73.64,72.31,71.2,76.27,,no,No,1,0.0,1,1
7,Electronics and Telecommunication,89.8,80.41,,83.71,75.34,75.05,69.42,76.23,,no,No,0,0.0,1,1
8,Instrumentation,95.6,84.6,,76.97,77.86,80.89,78.89,76.15,,no,No,1,0.0,1,1
9,Electronics,91.8,71.69,,78.008,81.042,74.604,82.448,75.788,,yes,TCS NINJA,1,6.9788,1,0


In [7]:
data['Branch'].unique()

array(['Computer Engineering', 'Electronics and Telecommunication', 'IT',
       'Instrumentation', 'Electronics'], dtype=object)

In [8]:
#RENAMING THE COLUMNS
data = data.rename(columns={'10th Percentage (Min 60 %)':'10th_p'})
data = data.rename(columns={'12th Percentage (Min 60 %)':'12th_p'})
data = data.rename(columns={'Diploma (%)':'Diploma_p'})
data = data.rename(columns={'Aggregate UG Percentage (Min 60 %)':'Agg_UG_p'})
data = data.rename(columns={'Gaps (If Any) Also Specify Year':'Gaps'})
data = data.rename(columns={'Placed or Not':'status'})
data = data.rename(columns={'Job Offer':'Offer'})

In [9]:
#Label Encoding of String type data
le = LabelEncoder()
data['Branch'] = le.fit_transform(data.Branch.values)
data['Gaps'] = le.fit_transform(data.Gaps.values)
data['status'] = le.fit_transform(data.status.values)
data['Gaps'] = data['Gaps'].map({1:1,0:1,2:0})

In [10]:
data.head(5)

Unnamed: 0,Branch,10th_p,12th_p,Diploma_p,Sem 3 %,Sem 4 %,Sem 5 %,Sem 6 %,Agg_UG_p,Gaps,status,Offer,Aptitude,Package,Internships,Certifications
0,0,95.0,92.3,,86.0,86.0,81.63,84.89,83.26,0,1,CAPGEMINI,1,10.926,3,3
1,2,88.2,86.0,,80.3,71.94,75.0,70.4,83.2,0,1,LTI,1,9.62,3,2
2,3,95.27,74.0,,59.73,66.93,67.73,70.8,82.46,1,0,No,1,0.0,1,0
3,2,95.0,90.46,,82.6,82.45,84.89,81.41,81.41,0,1,BIRLASOFT,1,9.241,3,2
4,2,87.0,87.0,,83.78,77.712,77.194,83.632,77.49,0,1,CAPGEMINI,1,8.449,3,2


In [11]:
# 2-EXTC 0-Comps 3-IT 1-Electronics 4-Instru
data.Branch.unique()


array([0, 2, 3, 4, 1])

In [12]:
# 0 - NO GAPS, 1- ONE OR MORE GAPS 
data.Gaps.unique()

array([0, 1], dtype=int64)

In [13]:
# 0-no 1-yes
data.status.unique()

array([1, 0])

In [14]:
#IMPUTING NULL VALUES
data=data.fillna(value=1)


In [15]:
#Converting 12th Percent into 3 categories as [0-50,50-75,75-100] due to existence of null value in it
data['12th_P'] = pd.cut(data['12th_p'],
                      bins=[0, 50, 75, float('Inf')],
                      labels=[0, 1, 2])

In [16]:
#Converting Diploma Percent into 3 categories as [0-50,50-75,75-100] due to existence of null value in it
data['Diploma_P'] = pd.cut(data['Diploma_p'],
                      bins=[0, 50, 75, float('Inf')],
                      labels=[0, 1, 2])

In [17]:
#Dropping the 12th and diploma columns which had null values in it
data=data.drop(['12th_p','Diploma_p'], axis=1)

In [18]:
data.head()

Unnamed: 0,Branch,10th_p,Sem 3 %,Sem 4 %,Sem 5 %,Sem 6 %,Agg_UG_p,Gaps,status,Offer,Aptitude,Package,Internships,Certifications,12th_P,Diploma_P
0,0,95.0,86.0,86.0,81.63,84.89,83.26,0,1,CAPGEMINI,1,10.926,3,3,2,0
1,2,88.2,80.3,71.94,75.0,70.4,83.2,0,1,LTI,1,9.62,3,2,2,0
2,3,95.27,59.73,66.93,67.73,70.8,82.46,1,0,No,1,0.0,1,0,1,0
3,2,95.0,82.6,82.45,84.89,81.41,81.41,0,1,BIRLASOFT,1,9.241,3,2,2,0
4,2,87.0,83.78,77.712,77.194,83.632,77.49,0,1,CAPGEMINI,1,8.449,3,2,2,0


In [20]:
#creating a variable having the columns of the dataset which are needed for the model
predict1 = "status"
data1 = data[["Branch","10th_p","Sem 3 %","Sem 4 %","Sem 5 %","Sem 6 %","Agg_UG_p","Gaps","status","Aptitude","12th_P","Diploma_P","Certifications","Internships"]]

In [21]:
# x variable has all the column's data except of status and vice versa
x = np.array(data1.drop([predict1],axis=1))
y = np.array(data1[predict1])

In [22]:
#Splitting Dataset into test and train
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [23]:
#Training Different Classification models
xgb_model = XGBClassifier().fit(x_train,y_train)
lgbm_model = LGBMClassifier().fit(x_train,y_train)
dt_model = DecisionTreeClassifier().fit(x_train,y_train)
hgb_model = HistGradientBoostingClassifier().fit(x_train,y_train)
RF_model = RandomForestClassifier().fit(x_train,y_train)
gb_model = GradientBoostingClassifier().fit(x_train,y_train)
svc_model = svm.SVC().fit(x_train,y_train)

[LightGBM] [Info] Number of positive: 83, number of negative: 74
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000541 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 303
[LightGBM] [Info] Number of data points in the train set: 157, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.528662 -> initscore=0.114776
[LightGBM] [Info] Start training from score 0.114776


In [25]:
#creating a function to test the model and display its metrics
def evaluate_model(model,x_test,y_test):
    prediction = model.predict(x_test)
    print("Mean Absolute error", mean_absolute_error(y_test,prediction))
    print("Mean Absolute Percentage error", mean_absolute_percentage_error(y_test,prediction))
    print("Mean Squared error", mean_squared_error(y_test,prediction))
    print("Root Mean Squared error", np.sqrt(mean_squared_error(y_test,prediction)))
    print("R2 Score", r2_score(y_test,prediction))
    print("accuracy score:",accuracy_score(y_test,prediction))

In [26]:
#printing the values of the metrics of all the models
print("model: XG Boost Classifier")
print(evaluate_model(xgb_model,x_test,y_test))
print()
print("model: LGBM Classifier")
print(evaluate_model(lgbm_model,x_test,y_test))
print()
print("model: Decision Tree Classifier")
print(evaluate_model(dt_model,x_test,y_test))
print()
print("model: Hist Gradient Boosting Classifier")
print(evaluate_model(hgb_model,x_test,y_test))
print()
print("model: Random Forest Classifier")
print(evaluate_model(RF_model,x_test,y_test))
print()
print("model: Gradient Boosting Classifier")
print(evaluate_model(gb_model,x_test,y_test))
print()
print("model: SVM Classifier")
print(evaluate_model(svc_model,x_test,y_test))

model: XG Boost Classifier
Mean Absolute error 0.1
Mean Absolute Percentage error 337769972052787.2
Mean Squared error 0.1
Root Mean Squared error 0.31622776601683794
R2 Score 0.5989974937343359
accuracy score: 0.9
None

model: LGBM Classifier
Mean Absolute error 0.1
Mean Absolute Percentage error 225179981368524.84
Mean Squared error 0.1
Root Mean Squared error 0.31622776601683794
R2 Score 0.5989974937343359
accuracy score: 0.9
None

model: Decision Tree Classifier
Mean Absolute error 0.1
Mean Absolute Percentage error 225179981368524.84
Mean Squared error 0.1
Root Mean Squared error 0.31622776601683794
R2 Score 0.5989974937343359
accuracy score: 0.9
None

model: Hist Gradient Boosting Classifier
Mean Absolute error 0.025
Mean Absolute Percentage error 112589990684262.4
Mean Squared error 0.025
Root Mean Squared error 0.15811388300841897
R2 Score 0.899749373433584
accuracy score: 0.975
None

model: Random Forest Classifier
Mean Absolute error 0.1
Mean Absolute Percentage error 3377699

In [27]:
#saving the model using pickle and choosing the random forest classifier algorithm
import pickle
pickle.dump(RF_model,open("placement_model1.pkl","wb"))

In [28]:
#dropping data of people who are not placed for second model  
i = data[(data.Offer=='No')].index
data=data.drop(i)

In [29]:
#saving the changed dataset in an new csv file
data.to_csv("model2_data.csv")