In [17]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.max_columns = 200

In [3]:
data = pd.read_csv('Interview.csv')

In [4]:
data.shape

(1234, 28)

In [5]:
data = data.loc[:, ~data.columns.str.contains('^Unnamed')]

In [6]:
# Lets make the columns names easier to work with, by adding '_' where there's spaces. That way we invoke them as an object member
cols = data.columns
new_cols = []
for item in cols:
    new = item.replace(' ', '_')
    new_cols.append(new)
print(new_cols)

['Date_of_Interview', 'Client_name', 'Industry', 'Location', 'Position_to_be_closed', 'Nature_of_Skillset', 'Interview_Type', 'Name(Cand_ID)', 'Gender', 'Candidate_Current_Location', 'Candidate_Job_Location', 'Interview_Venue', 'Candidate_Native_location', 'Have_you_obtained_the_necessary_permission_to_start_at_the_required_time', 'Hope_there_will_be_no_unscheduled_meetings', 'Can_I_Call_you_three_hours_before_the_interview_and_follow_up_on_your_attendance_for_the_interview', 'Can_I_have_an_alternative_number/_desk_number._I_assure_you_that_I_will_not_trouble_you_too_much', 'Have_you_taken_a_printout_of_your_updated_resume._Have_you_read_the_JD_and_understood_the_same', 'Are_you_clear_with_the_venue_details_and_the_landmark.', 'Has_the_call_letter_been_shared', 'Expected_Attendance', 'Observed_Attendance', 'Marital_Status']


In [7]:
data.columns = new_cols

In [8]:
data = data.drop('Date_of_Interview', axis=1).drop('Name(Cand_ID)', axis=1)

In [9]:
# Now lets see where we have missing values
data.isnull().sum()

Client_name                                                                                             0
Industry                                                                                                1
Location                                                                                                1
Position_to_be_closed                                                                                   1
Nature_of_Skillset                                                                                      1
Interview_Type                                                                                          1
Gender                                                                                                  1
Candidate_Current_Location                                                                              1
Candidate_Job_Location                                                                                  1
Interview_Venue                               

In [10]:
# But we also have some Nan values that are strings - 'Na', 'NA'
data.replace(to_replace=['Na', 'NA'], value=np.nan, inplace=True)

In [11]:
data.isnull().sum()

Client_name                                                                                             0
Industry                                                                                                1
Location                                                                                                1
Position_to_be_closed                                                                                   1
Nature_of_Skillset                                                                                      1
Interview_Type                                                                                          1
Gender                                                                                                  1
Candidate_Current_Location                                                                              1
Candidate_Job_Location                                                                                  1
Interview_Venue                               

In [12]:
data[data['Are_you_clear_with_the_venue_details_and_the_landmark.'].isnull()]

Unnamed: 0,Client_name,Industry,Location,Position_to_be_closed,Nature_of_Skillset,Interview_Type,Gender,Candidate_Current_Location,Candidate_Job_Location,Interview_Venue,Candidate_Native_location,Have_you_obtained_the_necessary_permission_to_start_at_the_required_time,Hope_there_will_be_no_unscheduled_meetings,Can_I_Call_you_three_hours_before_the_interview_and_follow_up_on_your_attendance_for_the_interview,Can_I_have_an_alternative_number/_desk_number._I_assure_you_that_I_will_not_trouble_you_too_much,Have_you_taken_a_printout_of_your_updated_resume._Have_you_read_the_JD_and_understood_the_same,Are_you_clear_with_the_venue_details_and_the_landmark.,Has_the_call_letter_been_shared,Expected_Attendance,Observed_Attendance,Marital_Status
2,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Male,Chennai,Chennai,Hosur,Chennai,,,,,,,,Uncertain,No,Single
123,Standard Chartered Bank,BFSI,Chennai,Routine,Accounting Operations,Scheduled Walkin,Female,Chennai,Chennai,Chennai,Chennai,No,Yes,No,No,,,Yet to Check,Uncertain,Yes,Single
242,Standard Chartered Bank,BFSI,Bangalore,Routine,AML/KYC/CDD,Scheduled Walkin,Male,Bangalore,Bangalore,Bangalore,Chennai,,,,,,,,Uncertain,Yes,Single
243,Standard Chartered Bank,BFSI,Bangalore,Routine,AML/KYC/CDD,Scheduled Walkin,Female,Bangalore,Bangalore,Bangalore,Bangalore,No,,,,,,,Uncertain,No,Single
244,Standard Chartered Bank,BFSI,Bangalore,Routine,AML/KYC/CDD,Scheduled Walkin,Male,Bangalore,Bangalore,Bangalore,Chennai,,,,,,,,Uncertain,No,Single
245,Standard Chartered Bank,BFSI,Bangalore,Routine,AML/KYC/CDD,Scheduled Walkin,Male,Bangalore,Bangalore,Bangalore,Chennai,,,,,,,,Uncertain,Yes,Married
247,Standard Chartered Bank,BFSI,Bangalore,Routine,AML/KYC/CDD,Scheduled Walkin,Female,Bangalore,Bangalore,Bangalore,Allahabad,,,,,,,,Uncertain,No,Single
248,Standard Chartered Bank,BFSI,Bangalore,Routine,AML/KYC/CDD,Scheduled Walkin,Female,Bangalore,Bangalore,Bangalore,Chennai,,,,,,,,Uncertain,Yes,Single
251,Standard Chartered Bank,BFSI,Bangalore,Routine,AML/KYC/CDD,Scheduled Walkin,Male,Bangalore,Bangalore,Bangalore,Hyderabad,,,,,,,,Uncertain,Yes,Single
254,Standard Chartered Bank,BFSI,Bangalore,Routine,AML/KYC/CDD,Scheduled Walkin,Male,Bangalore,Bangalore,Bangalore,Bangalore,,,,,,,,Uncertain,No,Single


In [13]:
# lets check what will happen if we drop the rows with Nan for the column that has the most Nan.
data[~data['Are_you_clear_with_the_venue_details_and_the_landmark.'].isnull()].isnull().sum()

Client_name                                                                                           0
Industry                                                                                              0
Location                                                                                              0
Position_to_be_closed                                                                                 0
Nature_of_Skillset                                                                                    0
Interview_Type                                                                                        0
Gender                                                                                                0
Candidate_Current_Location                                                                            0
Candidate_Job_Location                                                                                0
Interview_Venue                                                 

In [14]:
# Well that was easy....
data = data[~data['Are_you_clear_with_the_venue_details_and_the_landmark.'].isnull()]

In [15]:
# Lets drop the last row where we have Nan
data = data[~data['Hope_there_will_be_no_unscheduled_meetings'].isnull()]

In [16]:
data.isnull().sum()

Client_name                                                                                           0
Industry                                                                                              0
Location                                                                                              0
Position_to_be_closed                                                                                 0
Nature_of_Skillset                                                                                    0
Interview_Type                                                                                        0
Gender                                                                                                0
Candidate_Current_Location                                                                            0
Candidate_Job_Location                                                                                0
Interview_Venue                                                 

In [18]:
# Now lets clean out the data a little bit.....
data.nunique()

Client_name                                                                                           15
Industry                                                                                               7
Location                                                                                              11
Position_to_be_closed                                                                                  7
Nature_of_Skillset                                                                                    84
Interview_Type                                                                                         5
Gender                                                                                                 2
Candidate_Current_Location                                                                            10
Candidate_Job_Location                                                                                 7
Interview_Venue                                        

In [19]:
# Our label column Observed_Attendance has too many values - 'Yes' 'yes' 'YES etc...
data.Observed_Attendance.unique()

array(['No', 'Yes', 'yes', 'no', 'yes ', 'No ', 'NO', 'no '], dtype=object)

In [20]:
data.Observed_Attendance = data.Observed_Attendance.apply(lambda a: 1 if a in ['Yes', 'yes', 'yes '] else 0)
print(data.Observed_Attendance.unique())

[0 1]


In [21]:
# Same issue with Expected Attendance
data.Expected_Attendance.unique()

array(['Yes', 'Uncertain', 'No', 'yes', '11:00 AM', '10.30 Am'],
      dtype=object)

In [22]:
data.Expected_Attendance = data.Expected_Attendance.apply(lambda a: 'Yes' if a in ['11:00 AM', '10.30 Am', 'yes'] else a)

In [23]:
data.Expected_Attendance.unique()

array(['Yes', 'Uncertain', 'No'], dtype=object)

In [24]:
data.Has_the_call_letter_been_shared.unique()

array(['Yes', 'Havent Checked', 'No', 'Need To Check', 'Not sure',
       'Not Sure', 'Not yet', 'no', 'yes'], dtype=object)

In [25]:
data.Has_the_call_letter_been_shared = data.Has_the_call_letter_been_shared.apply(lambda a: 'Yes' if a in ['Yes', 'yes'] else ('No' if a in ['No', 'no'] else 'Maybe'))

In [26]:
data.nunique()

Client_name                                                                                           15
Industry                                                                                               7
Location                                                                                              11
Position_to_be_closed                                                                                  7
Nature_of_Skillset                                                                                    84
Interview_Type                                                                                         5
Gender                                                                                                 2
Candidate_Current_Location                                                                            10
Candidate_Job_Location                                                                                 7
Interview_Venue                                        

In [27]:
data['Are_you_clear_with_the_venue_details_and_the_landmark.'].unique()

array(['Yes', 'No', 'No- I need to check', 'yes', 'no'], dtype=object)

In [28]:
data['Are_you_clear_with_the_venue_details_and_the_landmark.'] = data['Are_you_clear_with_the_venue_details_and_the_landmark.'].apply(lambda a: 'Yes' if a in ['Yes', 'yes'] else 'No')

In [29]:
data['Have_you_taken_a_printout_of_your_updated_resume._Have_you_read_the_JD_and_understood_the_same'].unique()

array(['Yes', 'No', 'No- will take it soon', 'Not yet', 'yes', 'Not Yet'],
      dtype=object)

In [30]:
data['Have_you_taken_a_printout_of_your_updated_resume._Have_you_read_the_JD_and_understood_the_same'] = data['Have_you_taken_a_printout_of_your_updated_resume._Have_you_read_the_JD_and_understood_the_same'].apply(lambda a: 'Yes' if a in ['Yes', 'yes'] else ('Not Yet' if a in ['Not Yet', 'Not yet'] else 'No'))
print(data['Have_you_taken_a_printout_of_your_updated_resume._Have_you_read_the_JD_and_understood_the_same'].unique())

['Yes' 'No' 'Not Yet']


In [31]:
data['Can_I_have_an_alternative_number/_desk_number._I_assure_you_that_I_will_not_trouble_you_too_much'].unique()

array(['Yes', 'No', 'No I have only thi number', 'yes'], dtype=object)

In [32]:
data['Can_I_have_an_alternative_number/_desk_number._I_assure_you_that_I_will_not_trouble_you_too_much'] = data['Can_I_have_an_alternative_number/_desk_number._I_assure_you_that_I_will_not_trouble_you_too_much'].apply(lambda a: 'Yes' if a in ['Yes', 'yes'] else 'No')
data['Can_I_have_an_alternative_number/_desk_number._I_assure_you_that_I_will_not_trouble_you_too_much'].unique()

array(['Yes', 'No'], dtype=object)

In [33]:
data['Can_I_Call_you_three_hours_before_the_interview_and_follow_up_on_your_attendance_for_the_interview'].unique()

array(['Yes', 'No', 'No Dont', 'yes'], dtype=object)

In [34]:
data['Can_I_Call_you_three_hours_before_the_interview_and_follow_up_on_your_attendance_for_the_interview'] = data['Can_I_Call_you_three_hours_before_the_interview_and_follow_up_on_your_attendance_for_the_interview'].apply(lambda a: 'Yes' if a in ['Yes', 'yes'] else 'No')
print(data['Can_I_Call_you_three_hours_before_the_interview_and_follow_up_on_your_attendance_for_the_interview'].unique())

['Yes' 'No']


In [35]:
data['Hope_there_will_be_no_unscheduled_meetings'].unique()

array(['Yes', 'No', 'yes', 'Not Sure', 'cant Say', 'Not sure'],
      dtype=object)

In [36]:
data['Hope_there_will_be_no_unscheduled_meetings'] = data['Hope_there_will_be_no_unscheduled_meetings'].apply(lambda a: 'Yes' if a in ['Yes', 'yes'] else ('No' if a == 'No' else 'Maybe'))
data['Hope_there_will_be_no_unscheduled_meetings'].unique()

array(['Yes', 'No', 'Maybe'], dtype=object)

In [37]:
data['Have_you_obtained_the_necessary_permission_to_start_at_the_required_time'].unique()

array(['Yes', 'No', 'Not yet', 'Yet to confirm', 'yes'], dtype=object)

In [38]:
data['Have_you_obtained_the_necessary_permission_to_start_at_the_required_time'] = data['Have_you_obtained_the_necessary_permission_to_start_at_the_required_time'].apply(lambda a: 'Yes' if a in ['Yes', 'yes'] else 'No')
data['Have_you_obtained_the_necessary_permission_to_start_at_the_required_time'].unique()

array(['Yes', 'No'], dtype=object)

In [39]:
data.nunique()

Client_name                                                                                           15
Industry                                                                                               7
Location                                                                                              11
Position_to_be_closed                                                                                  7
Nature_of_Skillset                                                                                    84
Interview_Type                                                                                         5
Gender                                                                                                 2
Candidate_Current_Location                                                                            10
Candidate_Job_Location                                                                                 7
Interview_Venue                                        

In [40]:
for col in data:
    print('%s: ' % col, data[col].unique())

Client_name:  ['Hospira' 'Aon Hewitt' 'UST' 'Standard Chartered Bank' 'ANZ' 'Pfizer'
 'Standard Chartered Bank Chennai' 'Aon hewitt Gurgaon' 'Astrazeneca'
 'Flextronics' 'Prodapt' 'Williams Lea' 'Barclays' 'Hewitt' 'Woori Bank']
Industry:  ['Pharmaceuticals' 'IT Services' 'BFSI' 'IT Products and Services'
 'Electronics' 'Telecom' 'IT']
Location:  ['Chennai' 'Gurgaon' 'Bangalore' 'Hyderabad' 'Gurgaonr' 'Delhi' 'chennai'
 '- Cochin- ' 'Noida' 'CHENNAI' 'chennai ']
Position_to_be_closed:  ['Production- Sterile' 'Selenium testing' 'Dot Net' 'AML' 'Trade Finance'
 'Routine' 'Niche']
Nature_of_Skillset:  ['Routine' 'Oracle' 'Accounting Operations' 'Banking Operations' 'Fresher'
 'AML/KYC/CDD' 'CDD KYC' 'Biosimiliars' 'RA Label' 'RA Publishing' 'EMEA'
 'generic drugs – RA' 'Regulatory' 'Analytical R & D' 'Analytical R&D'
 'Senior software engineer-Mednet' 'Tech lead-Mednet' 'Tech Lead- Mednet'
 'Technical Lead' 'TL' 'Sr Automation Testing' 'Senior Analyst'
 'production' 'Core Java' 'Java J2EE

In [41]:
# Editing Industry
data.Industry.unique()

array(['Pharmaceuticals', 'IT Services', 'BFSI',
       'IT Products and Services', 'Electronics', 'Telecom', 'IT'],
      dtype=object)

In [42]:
data.Industry = data.Industry.apply(lambda a: 'IT' if a in ['IT Services', 'IT Products and Services', 'IT'] else a)
data.Industry.unique()

array(['Pharmaceuticals', 'IT', 'BFSI', 'Electronics', 'Telecom'],
      dtype=object)

In [43]:
# Editing Location
data.Location.unique()

array(['Chennai', 'Gurgaon', 'Bangalore', 'Hyderabad', 'Gurgaonr',
       'Delhi', 'chennai', '- Cochin- ', 'Noida', 'CHENNAI', 'chennai '],
      dtype=object)

In [44]:
data.Location = data.Location.apply(lambda a: 'Gurgaon' if a in ['Gurgaon', 'Gurgaonr'] else ('Cochin' if a == '- Cochin- ' else a))
data.Location.unique()

array(['Chennai', 'Gurgaon', 'Bangalore', 'Hyderabad', 'Delhi', 'chennai',
       'Cochin', 'Noida', 'CHENNAI', 'chennai '], dtype=object)

In [45]:
data.Location = data.Location.apply(lambda a: a.upper())
data.Location.unique()

array(['CHENNAI', 'GURGAON', 'BANGALORE', 'HYDERABAD', 'DELHI', 'COCHIN',
       'NOIDA', 'CHENNAI '], dtype=object)

In [46]:
data.Location = data.Location.apply(lambda a: 'CHENNAI' if a in ['CHENNAI', 'CHENNAI '] else a)
data.Location.unique()

array(['CHENNAI', 'GURGAON', 'BANGALORE', 'HYDERABAD', 'DELHI', 'COCHIN',
       'NOIDA'], dtype=object)

In [47]:
data.Nature_of_Skillset = data.Nature_of_Skillset.apply(lambda a: a.upper())


In [48]:
data.Nature_of_Skillset = data.Nature_of_Skillset.apply(lambda a: 'ANALYTICAL_R&D' if a in ['ANALYTICAL R & D', 'ANALYTICAL R&D'] else ('TECHNICAL_LEAD' if a in ['TECHNICAL LEAD', 'TL'] else a))

In [49]:
data.Nature_of_Skillset = data.Nature_of_Skillset.apply(lambda a: 'LENDING_LIABILITY' if a in ['LENDING AND LIABILITIES', 'LENDING & LIABILITY', 'L & L', 'LENDING&LIABLITIES'] else a)

In [50]:
data[data.Nature_of_Skillset.isin(['10.00 AM', '9.00 AM', '11.30 AM', '9.30 AM'])]

Unnamed: 0,Client_name,Industry,Location,Position_to_be_closed,Nature_of_Skillset,Interview_Type,Gender,Candidate_Current_Location,Candidate_Job_Location,Interview_Venue,Candidate_Native_location,Have_you_obtained_the_necessary_permission_to_start_at_the_required_time,Hope_there_will_be_no_unscheduled_meetings,Can_I_Call_you_three_hours_before_the_interview_and_follow_up_on_your_attendance_for_the_interview,Can_I_have_an_alternative_number/_desk_number._I_assure_you_that_I_will_not_trouble_you_too_much,Have_you_taken_a_printout_of_your_updated_resume._Have_you_read_the_JD_and_understood_the_same,Are_you_clear_with_the_venue_details_and_the_landmark.,Has_the_call_letter_been_shared,Expected_Attendance,Observed_Attendance,Marital_Status
817,Standard Chartered Bank,BFSI,CHENNAI,Routine,10.00 AM,Scheduled Walk In,Male,chennai,Chennai,Chennai,Chennai,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,0,Single
818,Standard Chartered Bank,BFSI,CHENNAI,Routine,9.00 AM,Scheduled Walk In,Female,chennai,Chennai,Chennai,Chennai,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,0,Married
820,Standard Chartered Bank,BFSI,CHENNAI,Routine,11.30 AM,Scheduled Walk In,Male,Chennai,Chennai,Chennai,Chennai,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,1,Single
821,Standard Chartered Bank,BFSI,CHENNAI,Routine,9.30 AM,Scheduled Walk In,Male,Chennai,Chennai,Chennai,Chennai,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,1,Single


In [51]:
# Lets drop these 4 lines....
data = data[~data.Nature_of_Skillset.isin(['10.00 AM', '9.00 AM', '11.30 AM', '9.30 AM'])]

In [52]:
# Grouping all the JAVA related entries
data.Nature_of_Skillset = data.Nature_of_Skillset.apply(lambda a: 'JAVA' if 'JAVA' in a else a)

In [53]:
# Same for SCCM
data.Nature_of_Skillset = data.Nature_of_Skillset.apply(lambda a: 'SCCM' if 'SCCM' in a else a)

In [54]:
data.Nature_of_Skillset = data.Nature_of_Skillset.apply(lambda a: 'AML/KYC/CDD' if a in ['AML/KYC/CDD', 'CDD KYC'] else a)

In [55]:
data.Nature_of_Skillset = data.Nature_of_Skillset.apply(lambda a: 'TECH_LEAD' if a in ['TECH LEAD-MEDNET', 'TECH LEAD- MEDNET', 'TECHNICAL_LEAD'] else a)


In [56]:
data.Nature_of_Skillset = data.Nature_of_Skillset.apply(lambda a: 'COTS' if 'COTS' in a else a)

In [57]:
data.nunique()

Client_name                                                                                           15
Industry                                                                                               5
Location                                                                                               7
Position_to_be_closed                                                                                  7
Nature_of_Skillset                                                                                    40
Interview_Type                                                                                         5
Gender                                                                                                 2
Candidate_Current_Location                                                                            10
Candidate_Job_Location                                                                                 7
Interview_Venue                                        

In [58]:
data.Interview_Type.unique()

array(['Scheduled Walkin', 'Scheduled ', 'Walkin', 'Scheduled Walk In',
       'Walkin '], dtype=object)

In [59]:
# I assumed that a scheudle walkin is a like a scheduled interview....cause what ta hell is a scheduled walkin?
data.Interview_Type = data.Interview_Type.apply(lambda a: 'Scheduled' if 'Scheduled' in a else 'Walkin')

In [60]:
data.Candidate_Current_Location.unique()

array(['Chennai', 'Gurgaon', 'Bangalore', 'Hyderabad', 'Delhi', 'chennai',
       '- Cochin- ', 'Noida', 'CHENNAI', 'chennai '], dtype=object)

In [61]:
data.Candidate_Current_Location = data.Candidate_Current_Location.apply(lambda a: (a.upper()).strip())
data.Candidate_Current_Location.unique()

array(['CHENNAI', 'GURGAON', 'BANGALORE', 'HYDERABAD', 'DELHI',
       '- COCHIN-', 'NOIDA'], dtype=object)

In [62]:
print(data.Candidate_Job_Location.unique())
data.Candidate_Job_Location = data.Candidate_Job_Location.apply(lambda a: (a.upper()).strip())
print(data.Candidate_Job_Location.unique())

['Hosur' 'Bangalore' 'Chennai' 'Gurgaon' 'Visakapatinam' '- Cochin- '
 'Noida']
['HOSUR' 'BANGALORE' 'CHENNAI' 'GURGAON' 'VISAKAPATINAM' '- COCHIN-'
 'NOIDA']


In [63]:
print(data.Interview_Venue.unique())
data.Interview_Venue = data.Interview_Venue.apply(lambda a: (a.upper()).strip())
print(data.Interview_Venue.unique())

['Hosur' 'Gurgaon' 'Bangalore' 'Chennai' 'Hyderabad' '- Cochin- ' 'Noida']
['HOSUR' 'GURGAON' 'BANGALORE' 'CHENNAI' 'HYDERABAD' '- COCHIN-' 'NOIDA']


In [64]:
data.Candidate_Native_location = data.Candidate_Native_location.apply(lambda a: (a.upper()).strip())

In [65]:
data.Candidate_Native_location.unique()

array(['HOSUR', 'TRICHY', 'CHENNAI', 'GURGAON', 'NOIDA', 'DELHI /NCR',
       'COCHIN', 'TRIVANDRUM', 'BANGALORE', 'COIMBATORE', 'SALEM',
       'TANJORE', 'HYDERABAD', 'MUMBAI', 'PUNE', 'KOLKATA', 'PANJIM',
       'ALLAHABAD', 'CUTTACK', 'VISAKAPATINAM', 'ANANTAPUR', 'AHMEDABAD',
       'KURNOOL', 'VIJAYAWADA', 'VELLORE', 'PONDICHERRY', 'NAGERCOIL',
       'CHITOOR', 'AGRA', 'TUTICORIN', 'TIRUPATI', 'AMBUR', 'CHANDIGARH',
       'MYSORE', 'HISSAR', 'DELHI', 'KANPUR', 'LUCKNOW', '- COCHIN-',
       'WARANGAL'], dtype=object)

In [66]:
data['is_interview_in_native_town'] = data.apply(lambda a: True if a.Candidate_Native_location == a.Interview_Venue else False, axis=1)

In [67]:
data['is_interview_in_current_town'] = data.apply(lambda a: True if a.Candidate_Current_Location == a.Interview_Venue else False, axis=1)

In [68]:
data['is_interview_in_current_job_town'] = data.apply(lambda a: True if a.Candidate_Job_Location == a.Interview_Venue else False, axis=1)

In [69]:
data['is_job_in_native_town'] = data.apply(lambda a: True if a.Location == a.Candidate_Native_location else False, axis=1)
data['is_job_in_current_twon'] = data.apply(lambda a: True if a.Candidate_Current_Location == a.Location else False, axis=1)
data['is_job_in_current_job_town'] = data.apply(lambda a: True if a.Candidate_Job_Location == a.Location else False, axis=1)

In [70]:
data.shape

(961, 27)

In [71]:
features, labels = data.loc[:, data.columns != 'Observed_Attendance'], data.loc[:, 'Observed_Attendance']

In [72]:
print(features.shape)
print(labels.shape)

(961, 26)
(961,)


In [73]:
features_one_hot = pd.get_dummies(data=features)

In [74]:
features_one_hot.shape

(961, 167)

In [75]:
features_one_hot = features_one_hot.applymap(lambda a: 1 if a == True else (0 if a == False else 0))

In [76]:
from sklearn.model_selection import train_test_split

In [77]:
# Lets create our train, dev, and test sets
x_train, x_dev, y_train, y_dev = train_test_split(features_one_hot, labels, test_size=80, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=80, random_state=42)

In [78]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import confusion_matrix

In [79]:
model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=110, learning_rate=0.6)

In [80]:
model.fit(X=x_train, y=y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.6, n_estimators=110, random_state=None)

In [81]:
cross_validate(model, features_one_hot, labels, cv=5)



{'fit_time': array([0.30439496, 0.25031877, 0.25311041, 0.37816763, 0.35428143]),
 'score_time': array([0.01449609, 0.01427269, 0.01671219, 0.01732111, 0.01595998]),
 'test_score': array([0.79274611, 0.546875  , 0.69791667, 0.765625  , 0.63541667]),
 'train_score': array([0.74479167, 0.76723017, 0.77503251, 0.75162549, 0.77893368])}

In [82]:
test_preds = model.predict(x_test)

In [83]:
confusion_matrix(test_preds, y_test)

array([[ 5,  1],
       [19, 55]])

In [84]:
#using bagging classifier
from sklearn.ensemble import BaggingClassifier
clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=110, learning_rate=0.6)
bagger = BaggingClassifier(base_estimator=clf, verbose=1, n_estimators=50)
bagger.fit(X=x_train, y=y_train)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.4s finished


BaggingClassifier(base_estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.6, n_estimators=110, random_state=None),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=50, n_jobs=1, oob_score=False,
         random_state=None, verbose=1, warm_start=False)

In [85]:
bagger.score(X=x_test, y=y_test)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s finished


0.75

In [86]:
preds = bagger.predict(x_test)
confusion_matrix(preds, y_test)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s finished


array([[ 5,  1],
       [19, 55]])

In [None]:
#Bagging is slower and does not seem to improve our scores