# CampusPlacementPredictor

Predict the chances of getting placed in campus interviews based on certain conditions. 
Prediction is done using Machine Learning models. 
Data has been taken from Kaggle - https://www.kaggle.com/benroshan/factors-affecting-campus-placement

In [1]:
# Add libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split



In [2]:
# Load the placement dataset from the Kaggle input directory into a DataFrame
data = pd.read_csv(
    filepath_or_buffer="../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv"
)

### Do data analysis and visualization

In [3]:

# Preview the first five rows of the raw data
data.head(n=5)

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


* The sl_no column is not required. It can be dropped. 
* The columns ssc_b, hsc_b, hsc_s, degree_t and specialisation have categorical values. More analysis is required on how many categorical values are present.
* The gender column can also be dropped as it is not useful in determining the outcome.

In [4]:
# Show the distinct values of each categorical column under inspection
for col in ('ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'specialisation'):
    print(f'{col} unique values:', data[col].unique())

ssc_b unique values: ['Others' 'Central']
hsc_b unique values: ['Others' 'Central']
hsc_s unique values: ['Commerce' 'Science' 'Arts']
degree_t unique values: ['Sci&Tech' 'Comm&Mgmt' 'Others']
specialisation unique values: ['Mkt&HR' 'Mkt&Fin']


* Each categorical column has only a few distinct values, as seen above, so we can use one-hot encoding to convert them to numerical values.
* Find out some more information about the data distribution


In [6]:
# Report the dataset dimensions, then a blank line, then summary statistics
# of the numeric columns
print('size of data:', data.shape, end='\n\n')
print('info about data distribution...')
print(data.describe())

size of data: (215, 15)

info about data distribution...
            sl_no       ssc_p       hsc_p    degree_p     etest_p       mba_p  \
count  215.000000  215.000000  215.000000  215.000000  215.000000  215.000000   
mean   108.000000   67.303395   66.333163   66.370186   72.100558   62.278186   
std     62.209324   10.827205   10.897509    7.358743   13.275956    5.833385   
min      1.000000   40.890000   37.000000   50.000000   50.000000   51.210000   
25%     54.500000   60.600000   60.900000   61.000000   60.000000   57.945000   
50%    108.000000   67.000000   65.000000   66.000000   71.000000   62.000000   
75%    161.500000   75.700000   73.000000   72.000000   83.500000   66.255000   
max    215.000000   89.400000   97.700000   91.000000   98.000000   77.890000   

              salary  
count     148.000000  
mean   288655.405405  
std     93457.452420  
min    200000.000000  
25%    240000.000000  
50%    265000.000000  
75%    300000.000000  
max    940000.000000  



* Find out if there are any null values in the data
* Check whether any row with a NaN in the 'salary' column has a status other than 'Not Placed'
* Convert all NaN values in salary column to 0

In [7]:
# Count the missing values in every column and list them, largest count first
c_null = data.isnull().sum()
print('columns with null values: \n', c_null.sort_values(ascending=False))
print()

# Rows whose salary is missing even though the status is anything other than
# 'Not Placed'
salary_missing = data['salary'].isnull()
status_not_rejected = data['status'] != 'Not Placed'
d_status = data[salary_missing & status_not_rejected]
print('Number of rows present in resulting dataset with null values in salary that also have placed candidate: ', len(d_status))

# Replace every NaN in the frame with 0
data = data.fillna(0)

columns with null values: 
 salary            67
status             0
mba_p              0
specialisation     0
etest_p            0
workex             0
degree_t           0
degree_p           0
hsc_s              0
hsc_b              0
hsc_p              0
ssc_b              0
ssc_p              0
gender             0
sl_no              0
dtype: int64

Number of rows present in resulting dataset with null values in salary that also have placed candidate:  0


### Data Cleanup

* Drop unnecessary columns
* Do One-hot encoding on categorical data

In [8]:
# Remove the serial-number and gender columns from the working frame
data = data.drop(columns=['sl_no', 'gender'])

In [9]:
# Preview the first five rows after the column drop
data.head(n=5)

Unnamed: 0,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,0.0
4,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [10]:
# One hot encode the categorical columns and label encode the target.
# Produces: data_a (numeric columns as-is), en_data_bb (one-hot columns),
# en_data_cc (0/1 'status' column) - all indexed like `data`.
numeric_cols = ['ssc_p','hsc_p','degree_p','etest_p','mba_p','salary']
categorical_cols = ['ssc_b','hsc_b','hsc_s','degree_t','workex','specialisation']

data_a = data[numeric_cols]
data_b = data[categorical_cols]
data_c = data[['status']]

# The encoder returns a SciPy sparse matrix by default; densify it explicitly
# instead of using the `sparse=False` keyword, which newer scikit-learn
# releases renamed to `sparse_output` and then removed.
encoder = OneHotEncoder()
en_data_b = encoder.fit_transform(data_b).toarray()

# `get_feature_names` was replaced by `get_feature_names_out`; use whichever
# the installed scikit-learn provides.
if hasattr(encoder, 'get_feature_names_out'):
    col_names = encoder.get_feature_names_out(categorical_cols)
else:
    col_names = encoder.get_feature_names(categorical_cols)

#Label encode the 'status' column
# LabelEncoder expects a 1-D array; passing the single-column DataFrame
# triggers a DataConversionWarning, so hand it the Series instead.
le = preprocessing.LabelEncoder()
data_cc = le.fit_transform(data_c['status'])
# Reuse data's index so the later column-wise concat aligns row for row
en_data_cc = pd.DataFrame(data_cc, columns=['status'], index=data.index)
print('encoded status column:',en_data_cc.head())
print()
print('category list of status column:',list(le.classes_))
print()

#convert to dataframe
en_data_bb = pd.DataFrame(en_data_b, columns=col_names, index=data.index)
print('One Hot encoded categorical columns:', en_data_bb.head())
print()

encoded status column:    status
0       1
1       1
2       1
3       0
4       1

category list of status column: ['Not Placed', 'Placed']

One Hot encoded categorical columns:    ssc_b_Central  ssc_b_Others  hsc_b_Central  hsc_b_Others  hsc_s_Arts  \
0            0.0           1.0            0.0           1.0         0.0   
1            1.0           0.0            0.0           1.0         0.0   
2            1.0           0.0            1.0           0.0         1.0   
3            1.0           0.0            1.0           0.0         0.0   
4            1.0           0.0            1.0           0.0         0.0   

   hsc_s_Commerce  hsc_s_Science  degree_t_Comm&Mgmt  degree_t_Others  \
0             1.0            0.0                 0.0              0.0   
1             0.0            1.0                 0.0              0.0   
2             0.0            0.0                 1.0              0.0   
3             0.0            1.0                 0.0              0.0   
4    

  return f(**kwargs)


In [11]:
# Stitch the numeric columns, the one-hot columns and the encoded target
# side by side into the modelling dataset
final_data = pd.concat(objs=[data_a, en_data_bb, en_data_cc], axis='columns')
final_data.head(n=5)

Unnamed: 0,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary,ssc_b_Central,ssc_b_Others,hsc_b_Central,hsc_b_Others,...,hsc_s_Commerce,hsc_s_Science,degree_t_Comm&Mgmt,degree_t_Others,degree_t_Sci&Tech,workex_No,workex_Yes,specialisation_Mkt&Fin,specialisation_Mkt&HR,status
0,67.0,91.0,58.0,55.0,58.8,270000.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1
1,79.33,78.33,77.48,86.5,66.28,200000.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1
2,65.0,68.0,64.0,75.0,57.8,250000.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
3,56.0,52.0,52.0,66.0,59.43,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0
4,85.8,73.6,73.3,96.8,55.5,425000.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1


* Create test and train split (use 20% as test set as there is less data)
* Currently testing with linear regression only

In [12]:
# Display the column labels of the combined dataset
final_data.columns

Index(['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p', 'salary',
       'ssc_b_Central', 'ssc_b_Others', 'hsc_b_Central', 'hsc_b_Others',
       'hsc_s_Arts', 'hsc_s_Commerce', 'hsc_s_Science', 'degree_t_Comm&Mgmt',
       'degree_t_Others', 'degree_t_Sci&Tech', 'workex_No', 'workex_Yes',
       'specialisation_Mkt&Fin', 'specialisation_Mkt&HR', 'status'],
      dtype='object')

In [13]:
# Build the feature matrix and the two targets from the combined dataset
feature_columns = [
    'ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p',
    'ssc_b_Central', 'ssc_b_Others', 'hsc_b_Central', 'hsc_b_Others',
    'hsc_s_Arts', 'hsc_s_Commerce', 'hsc_s_Science',
    'degree_t_Comm&Mgmt', 'degree_t_Others', 'degree_t_Sci&Tech',
    'workex_No', 'workex_Yes',
    'specialisation_Mkt&Fin', 'specialisation_Mkt&HR',
]
X = final_data[feature_columns]
y_salary = final_data['salary']
y_status = final_data['status']

# One call splits features and both targets on the same shuffled row indices -
# the same partitions two separate calls with an identical random_state give.
(X_train, X_test,
 y_salary_train, y_salary_test,
 y_status_train, y_status_test) = train_test_split(
    X, y_salary, y_status, test_size=0.2, random_state=42
)

In [14]:
# Predict salary with an ordinary least-squares linear regression
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training are chained
lr_model_salary = LinearRegression().fit(X_train, y_salary_train)
lr_y_pred_salary = lr_model_salary.predict(X_test)

In [15]:
# Score the salary regression on the held-out test split
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

salary_mse = mean_squared_error(y_true=y_salary_test, y_pred=lr_y_pred_salary)
# The value printed under the 'Variance score' label is the R^2 coefficient
salary_r2 = r2_score(y_true=y_salary_test, y_pred=lr_y_pred_salary)

print('Mean Squared Error for linear regression:', salary_mse)
print('Variance score:', salary_r2)


Mean Squared Error for linear regression: 15394728691.156902
Variance score: 0.30194522126503054


In [17]:
#Save model for making predictions
import pickle

filename = '/kaggle/working/model_v01.pkl'

# The context manager flushes and closes the file even if pickling fails;
# the previous bare open() left the handle for the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(lr_model_salary, model_file)

# Load the model later (kept as comments so the cell does not echo a stray
# string as its output):
#
#   with open(filename, 'rb') as model_file:
#       loaded_model = pickle.load(model_file)
#   result = loaded_model.score(X_test, y_salary_test)
#   print(result)
#
# Only unpickle files you created yourself: pickle.load can run arbitrary code.

In [None]:
#Normalize the data
from sklearn import preprocessing
import numpy as np

# Scale only the predictor columns, and scale them column by column.
# preprocessing.normalize() defaults to axis=1, i.e. it rescales every ROW to
# unit norm; applied to the whole frame it divided each feature by a norm
# dominated by that row's salary (leaking the target into the features) and
# turned the 0/1 'status' label into a continuous value.
target_cols = ['salary', 'status']
feature_cols = [c for c in final_data.columns if c not in target_cols]

# Copy so final_data itself stays untouched and this cell can be re-run safely
normalized_data = final_data.copy()
# NOTE(review): the scaler is fitted on all rows before the train/test split;
# fit it on the training rows only if strict hold-out hygiene is required.
normalized_data[feature_cols] = preprocessing.MinMaxScaler().fit_transform(
    final_data[feature_cols]
)

normalized_data.head()


In [None]:
# Build the feature matrix and the two targets from the normalized dataset
normalized_feature_columns = [
    'ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p',
    'ssc_b_Central', 'ssc_b_Others', 'hsc_b_Central', 'hsc_b_Others',
    'hsc_s_Arts', 'hsc_s_Commerce', 'hsc_s_Science',
    'degree_t_Comm&Mgmt', 'degree_t_Others', 'degree_t_Sci&Tech',
    'workex_No', 'workex_Yes',
    'specialisation_Mkt&Fin', 'specialisation_Mkt&HR',
]
X = normalized_data[normalized_feature_columns]
y_salary = normalized_data['salary']
y_status = normalized_data['status']

# One call splits features and both targets on the same shuffled row indices -
# the same partitions two separate calls with an identical random_state give.
(X_train, X_test,
 y_salary_train, y_salary_test,
 y_status_train, y_status_test) = train_test_split(
    X, y_salary, y_status, test_size=0.2, random_state=42
)

In [None]:
#Predict Status

#Linear Regression

In [None]:

#Logistic Regression: basic linear classifier (good to baseline)
#Random Forest: ensemble bagging classifier
#K-Nearest Neighbors: instance based classifier
#Support Vector Machines: maximum margin classifier
#Gaussian Naive Bayes: probabilistic classifier
#XGBoost: ensemble (extreme!) boosting classifier

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd

#Compare multiple models - Predict salary
def compare_model_salary(X_train, y_salary_train, X_test, y_salary_test, target_names=None):
    """Cross-validate and test a fixed set of classifiers on one target.

    For each model (logistic regression, random forest, k-nearest neighbours,
    SVM, Gaussian naive Bayes, XGBoost) the function runs 5-fold
    cross-validation on the training split, refits on the whole training
    split, and prints a classification report for the test split.

    Every model in the list is a classifier and the metrics are classification
    metrics, so the target passed in must hold discrete class labels.
    NOTE(review): the 'roc_auc' scorer presumably requires a binary target -
    confirm before passing a multi-class one.

    Parameters
    ----------
    X_train, X_test : feature matrices for the training and test splits.
    y_salary_train, y_salary_test : target values aligned with X_train / X_test.
    target_names : optional list of display names for the classes, forwarded
        to ``classification_report``; ``None`` lets it use the label values.

    Returns
    -------
    pandas.DataFrame
        One row per cross-validation fold per model, holding the columns
        produced by ``cross_validate`` plus a ``model`` column with the
        model's short name.
    """
    models = [
        ('LogReg', LogisticRegression()),
        ('RF', RandomForestClassifier()),
        ('KNN', KNeighborsClassifier()),
        ('SVM', SVC()),
        ('GNB', GaussianNB()),
        ('XGB', XGBClassifier()),
    ]
    scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']

    # A seeded splitter yields the same folds on every use, so build it once
    # and share it across all models.
    kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)

    dfs = []
    for name, model in models:
        cv_results = model_selection.cross_validate(
            model, X_train, y_salary_train, cv=kfold, scoring=scoring
        )

        # Refit on the full training split to evaluate on the held-out test split
        clf = model.fit(X_train, y_salary_train)
        y_pred = clf.predict(X_test)

        print(name)
        # Compare against the test target that was passed in; the earlier code
        # read undefined globals `y_test` and `target_names` here.
        print(classification_report(y_salary_test, y_pred, target_names=target_names))

        this_df = pd.DataFrame(cv_results)
        this_df['model'] = name
        dfs.append(this_df)

    return pd.concat(dfs, ignore_index=True)


# Run the model comparison on the current train/test split.
# NOTE(review): the models compared are classifiers, yet the salary target is
# passed here - confirm whether the status target was intended.
compare_model_salary(
    X_train=X_train,
    y_salary_train=y_salary_train,
    X_test=X_test,
    y_salary_test=y_salary_test,
)

In [None]:
# Show the sizes of the salary target's train and test splits, one per line
print(y_salary_train.shape, y_salary_test.shape, sep='\n')