# Data Wrangling and Pre Processing

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
import warnings
warnings.simplefilter("ignore")


In [5]:
# Load the data from the file recruitmentdataset.csv. (2)
df = pd.read_csv("recruitmentdataset.csv")
df.head()


Unnamed: 0,Id,gender,age,nationality,sport,ind-university_grade,ind-debateclub,ind-programming_exp,ind-international_exp,ind-entrepeneur_exp,ind-languages,ind-exact_study,ind-degree,company,decision
0,x8011e,female,24,German,Swimming,70,False,False,False,False,1,True,phd,A,True
1,x6077a,male,26,German,Golf,67,False,True,False,False,2,True,bachelor,A,False
2,x6006e,female,23,Dutch,Running,67,False,True,True,False,0,True,master,A,False
3,x2173b,male,24,Dutch,Cricket,70,False,True,False,False,1,True,master,A,True
4,x6241a,female,26,German,Golf,59,False,False,False,False,1,False,master,A,True


In [6]:

print(df.isnull().sum()) 
# No null values, no need for data wrangling for null values

Id                       0
gender                   0
age                      0
nationality              0
sport                    0
ind-university_grade     0
ind-debateclub           0
ind-programming_exp      0
ind-international_exp    0
ind-entrepeneur_exp      0
ind-languages            0
ind-exact_study          0
ind-degree               0
company                  0
decision                 0
dtype: int64


### Restricting dataset to Company = D only

In [7]:
df = df[df['company']=='D']
df.head()

Unnamed: 0,Id,gender,age,nationality,sport,ind-university_grade,ind-debateclub,ind-programming_exp,ind-international_exp,ind-entrepeneur_exp,ind-languages,ind-exact_study,ind-degree,company,decision
3000,x4293e,female,28,Dutch,Golf,65,False,False,True,False,1,False,phd,D,True
3001,x8138a,female,22,Dutch,Football,70,False,True,False,False,1,True,master,D,True
3002,x8454a,male,26,Dutch,Tennis,59,False,False,False,False,2,False,bachelor,D,False
3003,x9679f,male,23,Dutch,Tennis,69,False,True,False,True,2,True,master,D,True
3004,x8821g,female,21,Dutch,Football,66,False,False,False,True,2,False,bachelor,D,True


### Changing categorical data in to Numerical data to make them ready for machine learning model

In [14]:
df[['gender', 'nationality','sport','ind-degree']] = df[['gender', 'nationality','sport','ind-degree']].apply(lambda x: pd.factorize(x)[0])

### Changing boolean variables from True/False to 1/0.

In [12]:
df["ind-debateclub"] = df["ind-debateclub"].astype(int)
df["ind-programming_exp"] = df["ind-programming_exp"].astype(int)
df["ind-international_exp"] = df["ind-international_exp"].astype(int)
df["ind-entrepeneur_exp"] = df["ind-entrepeneur_exp"].astype(int)
df["ind-exact_study"] = df["ind-exact_study"].astype(int)
df["decision"] = df["decision"].astype(int)

In [15]:
df.head()

Unnamed: 0,Id,gender,age,nationality,sport,ind-university_grade,ind-debateclub,ind-programming_exp,ind-international_exp,ind-entrepeneur_exp,ind-languages,ind-exact_study,ind-degree,company,decision
3000,x4293e,0,28,0,0,65,0,0,1,0,1,0,0,D,1
3001,x8138a,0,22,0,1,70,0,1,0,0,1,1,1,D,1
3002,x8454a,1,26,0,2,59,0,0,0,0,2,0,2,D,0
3003,x9679f,1,23,0,2,69,0,1,0,1,2,1,1,D,1
3004,x8821g,0,21,0,1,66,0,0,0,1,2,0,2,D,1


----------------------------------------------------
----------------------------------------------------
----------------------------------------------------

# Model 1 :  Support Vector Machines with linear kernel
Model M1 incorporates Support Vector Machines with linear kernel (i.e. sklearn.svm.SVM) and should only use four indicators.

### Selecting last four indicators as descriptive features for the model. 

- ind-entrepeneur_exp	
- ind-languages	
- ind-exact_study	
- ind-degree




In [37]:
# extracting descriptive features
descriptive_features_model1 = df.iloc[:,9:13]
descriptive_features_model1


Unnamed: 0,ind-entrepeneur_exp,ind-languages,ind-exact_study,ind-degree
3000,0,1,0,0
3001,0,1,1,1
3002,0,2,0,2
3003,1,2,1,1
3004,1,2,0,2
...,...,...,...,...
3995,0,0,0,1
3996,1,2,1,2
3997,1,2,0,2
3998,0,1,1,2


### Setting Decision as Target Feature

In [26]:
# extracting target feature
target_feature = df.iloc[:,-1:]
print(target_feature)

      decision
3000         1
3001         1
3002         0
3003         1
3004         1
...        ...
3995         0
3996         0
3997         0
3998         0
3999         0

[1000 rows x 1 columns]


### Developing Model 1 with last 4 indicators.

In [42]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
model1 = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the data
model1.fit(descriptive_features_model1, target_feature)

#Predict the decision for the dataset
prediction_model1 = model1.predict(descriptive_features_model1)

In [43]:
# Model Accuracy: how often is the classifier correct?
print("Accuracy of model1 with last 4 indicators:",metrics.accuracy_score(target_feature, prediction_model1)*100,'%')

Accuracy of model1 with last 4 indicators: 83.1 %


----------------------------------------------------
----------------------------------------------------
----------------------------------------------------

# Model 2: 

Build a prediction model M2 using the technique of your choice and any indicators you want. Does your model perform better? Explain how you trained and tested the model and why you have chosen these indicators.

### Selecting first four indicators as descriptive features for the model 2. 
- ind-university_grade,	
- ind-debateclub,	
- ind-programming_exp 	
- ind-international_exp 

In [41]:
# extracting descriptive features
descriptive_features_model2 = df.iloc[:,5:9]
descriptive_features_model2

Unnamed: 0,ind-university_grade,ind-debateclub,ind-programming_exp,ind-international_exp
3000,65,0,0,1
3001,70,0,1,0
3002,59,0,0,0
3003,69,0,1,0
3004,66,0,0,0
...,...,...,...,...
3995,63,0,0,0
3996,62,0,0,0
3997,60,1,0,0
3998,66,0,1,0


In [44]:
from sklearn.neighbors import KNeighborsClassifier

model2 = KNeighborsClassifier(n_neighbors=3)

#Train the model2 using the data
model2.fit(descriptive_features_model2,target_feature)

#Predict the decision for the dataset
prediction_model2 = model2.predict(descriptive_features_model2)

In [46]:

print("Accuracy of model2 with first 4 indicators:",metrics.accuracy_score(target_feature, prediction_model2)*100,'%')

Accuracy of model2 with first 4 indicators: 90.2 %
