In [17]:
# Required libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn import preprocessing
tqdm.pandas()

# Configuring notebook env: show every column and row, and never wrap wide frames.
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
# "Unlimited column width" is spelled None on pandas >= 1.0 (where -1 is deprecated and
# later rejected) but -1 on pandas < 1.0 (where None fails the integer validator).
# Try the modern spelling first and fall back so the cell runs on either version.
try:
    pd.set_option('display.max_colwidth', None)  # or 199
except ValueError:
    pd.set_option('display.max_colwidth', -1)
pd.set_option('display.expand_frame_repr', False)

# Data munging & preparation
# Jobs to score: give the job columns job-specific names, then lower-case every header.
df_job = (
    pd.read_csv(r'D:\vanhack\data\JobsToPredict.csv')
    .rename(columns={
        'POSITION': 'JobsPosition',
        'Skills': 'JobSkills',
        'Responsibilities': 'JobResponsibilities',
    })
    .rename(columns=str.lower)
)

# Available candidates: same header normalisation, keeping only the id, skills and
# position columns and a fresh 0..n-1 index.
df_available = (
    pd.read_csv(r'D:\vanhack\data\AvailableCandidates.csv')
    .rename(columns={'Skills': 'UserSkills'})
    .rename(columns=str.lower)
    .loc[:, ['userid', 'userskills', 'usersposition']]
    .reset_index(drop=True)
)

  from pandas import Panel


In [18]:
# Create a superset with all jobs available and all candidates available:
# one row per (candidate, job) pair, candidate-major (all jobs for the first
# candidate, then all jobs for the second, ...).
#
# Joining both frames on a constant dummy key yields the cartesian product in a
# single vectorised merge, instead of appending len(df_available) * len(df_job)
# dicts one at a time from a nested Python loop. The dummy-key form is used
# rather than merge(how='cross') so the cell also runs on older pandas releases.
full_df = (
    df_available[['userid', 'userskills', 'usersposition']]
    .assign(_cross_key=1)
    .merge(
        df_job[['jobid', 'jobsposition', 'jobskills']].assign(_cross_key=1),
        on='_cross_key',
    )
    .drop(columns='_cross_key')  # helper key is not part of the result
    .reset_index(drop=True)
)


In [None]:
# Feature engineering and calculating match score based on weights from lgb model
def _parse_skills(raw):
    """Split a comma-separated skills cell into a list of trimmed skill names.

    Missing cells (None / NaN) produce an empty list instead of the literal
    token 'nan', and blank tokens (empty cell, doubled or trailing commas) are
    dropped so they are not counted as skills. Duplicates and order are kept.
    """
    if pd.api.types.is_scalar(raw) and pd.isna(raw):
        return []
    tokens = (token.strip() for token in str(raw).split(','))
    return [token for token in tokens if token]


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def create_features(row):
    """Compute skill-overlap and position-similarity features for one candidate/job pair.

    Parameters
    ----------
    row : mapping-like (a DataFrame row when used with ``apply(axis=1)``)
        Must provide 'userskills' and 'jobskills' (comma-separated text) and
        'usersposition' and 'jobsposition'.

    Returns
    -------
    A copy of ``row`` extended with:
        userskills_no, jobskills_no, common_skills_no : skill counts
        common_skills_text        : user skills that also appear in the job skills
        common_skills_ratio       : common skills / job skills   (1 is ideal)
        other_skills_ratio        : non-matching user skills / user skills (0 is ideal)
        job_candiate_skill_ratio  : job skills / user skills     (1 is ideal)
        position_match            : fuzz.token_sort_ratio of the two position strings

    Ratios whose denominator would be zero (no skills listed) are reported as 0.0.
    """
    # Work on a copy: the function handed to DataFrame.apply should not mutate its input.
    row = row.copy()

    user_skills = _parse_skills(row['userskills'])
    job_skills = _parse_skills(row['jobskills'])

    # Set membership keeps the match test cheap; the lists keep order and duplicates.
    job_skill_set = set(job_skills)
    common_skills = [skill for skill in user_skills if skill in job_skill_set]
    other_skills = [skill for skill in user_skills if skill not in job_skill_set]

    row['userskills_no'] = len(user_skills)
    row['jobskills_no'] = len(job_skills)
    row['common_skills_no'] = len(common_skills)
    row['common_skills_text'] = common_skills

    row['common_skills_ratio'] = _safe_ratio(len(common_skills), len(job_skills))  # 1 is ideal
    row['other_skills_ratio'] = _safe_ratio(len(other_skills), len(user_skills))  # 0 is ideal
    # Column name keeps its historical spelling because downstream cells reference it.
    row['job_candiate_skill_ratio'] = _safe_ratio(len(job_skills), len(user_skills))  # 1 is ideal

    row['position_match'] = fuzz.token_sort_ratio(str(row['usersposition']), str(row['jobsposition']))

    # Feature scores recorded alongside this function:
    #     score                  feature
    #        4            position_match
    #        3       common_skills_ratio
    #        3             userskills_no
    #        2  job_candiate_skill_ratio
    #        1        other_skills_ratio

    return row

# Columns that get rescaled before scoring.
features = [
    'userskills_no',
    'common_skills_ratio',
    'other_skills_ratio',
    'job_candiate_skill_ratio',
    'position_match',
]

# Engineer the features row by row (with a tqdm progress bar) and snapshot the
# unscaled table to disk.
df = full_df.progress_apply(create_features, axis=1)
df.to_csv('df.csv')

# Min-max scale every feature column in place so the weights below act on comparable ranges.
df[features] = preprocessing.MinMaxScaler().fit_transform(df[features])

# Weighted match score, multiplied by 10. Only three of the scaled features are
# used; job_candiate_skill_ratio and other_skills_ratio are scaled but carry no weight.
df['match_score'] = (
    3 * df['common_skills_ratio']
    + 5 * df['position_match']
    + 5 * df['userskills_no']
) * 10

# Best matches first.
df = df.sort_values('match_score', ascending=False)

  6%|██████████▎                                                                                                                                                             | 6132/100000 [00:42<11:48, 132.46it/s]

In [None]:
# Filtering to only top 10 candidates by match score for each job.
# df was sorted by match_score (descending) in the previous cell, so the first
# ten rows of each jobid group are that job's ten best-scoring candidates.
essential_features = ['jobid', 'jobsposition', 'jobskills', 'userid', 'usersposition', 'userskills', 'match_score']

top_matches = df.groupby('jobid').head(10).reset_index(drop=True)

# Order the report by job position and keep only the essential columns.
result = top_matches.sort_values('jobsposition', axis=0)[essential_features]
result.to_csv('result.csv')

* The Years of experience and English level features were not used: they are not included in the training data, so the only way to use them in prediction would be to hard-code their importance based on intuition
* Used an LGB model because of (TODO: the reason is missing from this note)
* Create viz on correlation between years of Exp and English level

Further room for improvements:
* Analyze the responsibilities data
* Get more training data
