In [32]:
import pandas as pd
import re
from datetime import datetime

In [17]:
# use google pre-trained model
from gensim.models.keyedvectors import KeyedVectors
# the file path should be changed to the local absolute path
word_vectors = KeyedVectors.load_word2vec_format("/Users/zhiyuzhang/Desktop/ocbang/Word2Vec/Google-source/GoogleNews-vectors-negative300.bin",  binary = True, limit = 1000000)

# deal with parsing dataset (requirements are shown below)

In [None]:
### JOB DESCRIPTION PART
## It is better to parse data into the following format. (dictionary level: company - position - id - details)
## Details should include following keys: 
# 1) Position country & city (better to use code to represent)
# 2) Recruiter
# 3) Type (self-defined)
# 4) Seniority
# 5) Degree Requirement & YOE Requirement (Seperated by |)
# 6) Responsibilities
# 7) Basic Qualifications (excluded 5)
# 8) Preferred Qualifications
# 9) Recruiting History

## Some import points needed to be notified:
# 1) 4,5,6,7,8 Could be None.
# 2) 1 could be a filtering requirement.
#    2 could be a sorting requirement/ vectorized requirement/ comparable requirement.
#    3 could be a comparable requirement.
#    4 could be a filtering requirement/ sorting requirement.
#    5,6,7,8 could be vectorized requirements.
#    9 could be a comparable requirement.
# 3) Some problems may occur in parsing data.
#   - 5 always appears in the first two sentences/ one sentence of qualification part.
#   -- one sentence: https://careers.tiktok.com/position/6986837310156720397/detail
#   -- two sentences: https://careers.tiktok.com/position/6992655739160774942/detail
#   -- If both basic and preferred qualications have 5, use preferred standard.
#   - Most JDs have good structure. (just represented by different key words)
#   -- Special condition: https://sg.talent.com/view?id=ec5949e2589e
#   - If basic qualifications and preferred qualifications should be treated differently?
#   - 6 always has general description and 7 always has specific terms.
#   - Should internship experiences be counted as working experience? - Depend on the seniority?
#   - For recruiting history, how long do we need to consider/ do we need to include all different positions a specific HR recruits.

"""
EXAMPLE:
company_info = {
    "Amazon":{
        "SDE":{"A000001":{"Position_Country&City":001123, "Recruiter":"Lam", "Type":"CS", "Seniority":"Manager", "Degree&YOE_Requirement": "at least bachelor with 3-year working experience","Responsibilities":"xxx","Basic_Qualifications":"xxx","Preferred_Qualifications":"xxx","Recruiting_History":["Axxxxxx","Axxxxxx","Axxxxxx"]}},
        "DS":{"A000002":{"Position_Country&City":001124, "Recruiter":"Tam", "Type":"DS", "Seniority":"Entry", "Degree&YOE_Requirement": "at least bachelor with 3-year working experience","Responsibilities":"xxx","Basic_Qualifications":"xxx","Preferred_Qualifications":"xxx","Recruiting_History":["Axxxxxx","Axxxxxx","Axxxxxx"]}}
            },
    "Google":{
        "Communication":{"G000001":{"Position_Country&City":001125, "Recruiter":"Bob", "Type":"Functional", "Seniority":"Manager", "Degree&YOE_Requirement": "at least bachelor with 3-year working experience","Responsibilities":"xxx","Basic_Qualifications":"xxx","Preferred_Qualifications":"xxx","Recruiting_History":["Axxxxxx","Axxxxxx","Axxxxxx"]}},
        "ML Researcher":{"G000002":{"Position_Country&City":001126, "Recruiter":"Kaite", "Type":"Research", "Seniority":"Entry", "Degree&YOE_Requirement": "at least bachelor with 3-year working experience","Responsibilities":"xxx","Basic_Qualifications":"xxx","Preferred_Qualifications":"xxx","Recruiting_History":["Axxxxxx","Axxxxxx","Axxxxxx"]}}
            },
}
"""

In [None]:
### CANDIDATE RESUME PART
## It is better to parse data into the following format. (dictionary level: id - name - details)
## Details should include following keys: 
# 1) Willing to work country & city (better to use code to represent)
# 2) Current Status (Working as a full-time/ Still trying to find a job)
# 3) Highest Degree & Major (Seperated by |)
# 4) YOE & Industry
# 5) Skillset
# 6) Working Experience
# 7) Project Experience
# 8) Posting History

## Some import points needed to be notified:
# 1) 5,6,7,8 could be None.
# 2) 1 could be a filtering requirement.
#    2 could be a sorting requirement.
#    3,4 could be a filtering requirement/ sorting requirement.
#    5,6,7 could be vectorized requirements.
#    8 could be a comparable requirement.
# 3) Some problems may occur in parsing data.
#   - 1 could be a flexible problem - if has work authorization & if needs sponsor
#   - Current status can be deducted from working experience, but could be inaccurate
#   - If treat the name of related course in skillset.
#   - For posting history, how long do we need to consider.
#   - Should internship experiences be counted as working experience? - Depend on the seniority?

"""
EXAMPLE:
candidates_info = {
    "C000001":{
        "Bob":{"Country&City":"US_Citizen",
        "Current_Status":"out of work",
        "Degree&Major":"MIT Master of Science in Computer Science",
        "YOE&Industry":"5 years with Software Engineer",
        "skills":"Python, JAVA, SQL, Data_Structure",
        "working_experiences":"Google...",
        "project_experiences":"recommendation system",
        "posting_history":["A000001","A000003"]},
            },
    "C000002":{
        "Tim":{"Country&City":"F1_Visa",
        "Current_Status":"out of work",
        "Degree&Major":"Stanford BS in Data Science",
        "YOE&Industry":"1 years with Data Analyst",
        "skills":"Python, JAVA, SQL, Statistics",
        "working_experiences":"TikTok...",
        "project_experiences":"Fraud Analysis",
        "posting_history":["A000002","A000004"]},
            },

}
"""

In [None]:
### RECRUITER PREFERENCE PART
## It is better to parse data into the following format. (dictionary level: company - Recruiter - positionID - details)
## Details should include following keys: 
# 1) Rcruiting Frequency
# 2) Interview Candidates List
# 3) Final Candidates List
# 4) Preferred Candidates Features ##TODO

In [18]:
# should be replaced by actual text, can also be parsed from a dataframe type of data
# Must do data cleaning before storing in this dictionary
company_info = {
    "TikTok":{
        "Site Reliability Engineer_Seattle /NYC":
        {"A000001":{"Position_Country&City":"001123", "Recruiter":"Audrey", "Type":"CS", "Seniority":"Entry", 
                    "Degree&YOE_Requirement": "Bachelor or above degree in Computer Science or a related technical discipline | 1. 3-5+ years experience in the deployment and administration of large-scale distributed systems",
                    "Responsibilities":"1. Help improve the whole lifecycle of infrastructure services from inception and design, throughout development, capacity planning and launch reviews, to deployment, operation and refinement; 2. Design and implement software platforms and monitor frameworks for efficient, automated and intelligent service-oriented architecture (SOA) governance; 3. Scale systems sustainability through mechanisms such as automation; evolve systems reliability, efficiency, and velocity by pushing for changes; 4. Maintain services to meet service-level-agreements (SLAs) or service-level-objectives (SLOs) by measuring and monitoring availability, performance, and overall system health； 5. Provide user support, incident responses and postmortems.",
                    "Basic_Qualifications":" 2. Familiar with Unix/Linux operating systems internals and administration, networking (e.g. TCP/IP, routing, network topologies and hardware), storage systems, and database systems; 3. Experience in one of the following programmings: C, C++, Java, Python, Go, Perl, Ruby or shell scripting; 4. Experience in debugging and optimizing code and automate routine tasks; 5. Experience in the development, test, deployment and administration of one of the following types of systems: Ngnix, Kubernetes, Docker, OpenStack, Hadoop, Spark, Flink, etc. is preferred; 6. Experience in designing and analyzing large-scale distributed systems is preferred; 7. Strong skills in problem solving and communication.",
                    "Preferred_Qualifications":"None",
                    "Recruiting_History":["Axxxxx1","Axxxxx2","Axxxxx3"]}},
        "Tech Lead Manager, Cloud Data Engineer":
        {"A000002":{"Position_Country&City":"001124", "Recruiter":"Audrey", "Type":"DS", "Seniority":"Manager", 
                    "Degree&YOE_Requirement": "BS or MS degree in Computer Science or related technical field or equivalent practical experience | • 5+ years of experience with leading engineering and/or SRE teams",
                    "Responsibilities":"You will identify issues and opportunities, design operational excellence strategies to bring the best out of our teams to to keep our community safe in the current fast-changing & uncertain environment. Drive consensus and realignment to ensure execution of such strategies.- You will identify deficiencies in communications / collaborations / operations in current cross-function and cross-location workflows. You will recognize the difficulty and pain points of cross-culture and cross-language communications, and continue to explore better ways of support to improve communication effectiveness and quality within the organization. You will design and deliver feasible projects with a thorough understanding of organizational structures and business needs, in order to optimize the above processes and enhance overall functioning efficiency. You will oversee the making and optimization of global processes such as headcount and budget approval processes.- You will design and develop people strategy by partnering with HR, Global Well-Being & Learning & Development leaders to improve talent development plans, competency models, that contribute to the organizational growth needs and implement projects that contributes to people well-being.",
                    "Basic_Qualifications":"You have a strong ability of planning and leading strategic complex programs with the ability to execute at a tactical level and complete project implementations. You have respect for other cultures and deep empathy for users. You are eager to shape a new space within a rapidly growing and changing tech company.You have a keen interest and acute knowledge in social media challenges, sensitivity to safety or regulatory issues, and experience with crisis management.",
                    "Preferred_Qualifications":"None",
                    "Recruiting_History":["Axxxxx4","Axxxxx5","Axxxxx6"]}}
            },
    "Google":{
        "Quality Assurance Engineer, Games":
        {"G000001":{"Position_Country&City":"001125", "Recruiter":"Audrey", "Type":"Engineer", "Seniority":"Entry", 
                    "Degree&YOE_Requirement": "None",
                    "Responsibilities":"1. Responsible for the product quality assurance for all features, including system functions, front-end/back-end performance, optimization as well as other aspects to ensure the smooth release of projects and provide good user experience. 2. Manage the overall quality risk of the project, promote the implementation and optimization of the testing process and quality standards, improve the efficiency of version control, and ensure the quality of each release. 3. Develop effective corporate communication strategies and ensure good collaboration between different teams.",
                    "Basic_Qualifications":"1. Strong background of software, understanding operating systems and computer networks. 2. Familiar with at least one programming or scripting language, such as C#, Java, Python, Go, etc. 3. Great knowledge of software QA methodologies, tools and processes. 4. 2+ years' experience of working on a software QA Engineer position 5. Good communication and teamwork skills, strong ability of organization and execution and be initiative during work. 6. Love playing games, have a great understanding of games and be willing to learn about game settings. 7. Fluent English and Mandarin will be a plus.",
                    "Preferred_Qualifications":"None",
                    "Recruiting_History":["Axxxxx7"]}},
        "Compliance Safety leader - Pico":
        {"G000002":{"Position_Country&City":"001126", "Recruiter":"Kaite", "Type":"Research", "Seniority":"Entry", 
                    "Degree&YOE_Requirement": "Bachelor's degree or above; | More than 5 years of business compliance related experience, team management experience is preferred;",
                    "Responsibilities":"1.Establish a mechanism related to Pico’s international compliance; plan the long-term strategic direction of the international compliance business, provide guaranteed value for the business, manage the compliance process in a systematic way, and ensure the safety of business products 2. Support the implementation of Pico's international compliance management framework, including implementing process formulation in accordance with management framework standards, communicating with stakeholders, recording decision-making, and promoting the maturity of business compliance management; 3. Responsible for the identification, governance, improvement, tracking and post-assessment of business compliance risks, cooperate with the legal team, convert legal compliance requirements into technical and process solutions, deal with corresponding risks, and further precipitate into standard compliance solutions , guidelines, improve the compliance efficiency of the business; cooperate with the internal audit and internal control team to implement internal compliance audits, including data permissions, cross-border transmission compliance, and APP/SDK compliance. 4.Promote business compliance review, identify compliance risks in a timely manner, and ensure timely delivery; 5. Summarize compliance demands, and cooperate with product R&D team to implement compliance capabilities.",
                    "Basic_Qualifications":"3. English can be used as a working language, with experience communicating with multinational/functional teams 4.Understand the laws and regulations of important countries, such as GDPR, CCPA, COPPA, etc.; 5.Have a certain understanding of the Internet industry, video, and content industries.",
                    "Preferred_Qualifications":"None",
                    "Recruiting_History":[]}}
            },
}

# create recruiting matrix

In [None]:
## example from linkedin

## create basic features (list)

In [26]:
# build the recruiting matrix framework
# three main aspects of matrix
hard_skills = ["Python","SQL","Java","C","C#","Data_Engineering","Statitics","Machine_Learning","Deep_Learning","NoSQL","Office","Excel"]
soft_skills = ["Languages","Leadership","Team","Communication","Plan"]
activities = ["Most_Recently_Recruiting","Company_Preference"]

## create extend features (dictionary)

In [27]:
# find related sql skills (this part of code can be overwritten with other skills)
def find_sql_skills(resume):
    sql_skill = re.compile(r"[a-z]*sql[a-z]*")
    skillset = sql_skill.search(resume.lower())
    if skillset == None:
        return None
    else:
        return True


In [30]:
# Extend each elements with dictionary (all lower case)

# hardskills can refer to workday skillset/ LinkedIn (web scraping)
hard_skills_extend = {
    "python":["python","jupyter","idle","pycharm","djan","spyder","pydev","rodeo","sublime_text","wing","eric_python","atom","thonny"],
    "sql":[find_sql_skills(resume),"oracle","sybase","db2","snowflake"],
    "data_engineering":["web_scraping","data_cleaning","data_gathering","feature_engineering","data_analysis","EDA","exploratory_data_analysis","data_visualization"],
    "machine_learning":["supervised_learning","unsupervised_learning","feature_engineering","sklearn","tensorflow","nerual_network"]
    # can be extended manually or with auto-refilled script, but not listed here
}

# this parts should add error inspection, because not all words can be included in the google resouce
soft_skills_extend = {core.lower():[i[0] for i in word_vectors.most_similar(positive=[core.lower()],topn=10)] for core in soft_skills}

In [31]:
print(hard_skills_extend,soft_skills_extend)

{'python': ['python', 'jupyter', 'idle', 'pycharm', 'djan', 'spyder', 'pydev', 'rodeo', 'sublime_text', 'wing', 'eric_python', 'atom', 'thonny'], 'sql': [None, 'oracle', 'sybase', 'db2', 'snowflake'], 'data_engineering': ['web_scraping', 'data_cleaning', 'data_gathering', 'feature_engineering', 'data_analysis', 'EDA', 'exploratory_data_analysis', 'data_visualization'], 'machine_learning': ['supervised_learning', 'unsupervised_learning', 'feature_engineering', 'sklearn', 'tensorflow', 'nerual_network']} {'languages': ['dialects', 'language', 'Languages', 'multilingual', 'languages_fluently', 'Hindi_Bengali', 'translations', 'Bahasa_Indonesia', 'vernacular_languages', 'English'], 'leadership': ['leader', 'Leadership', 'leaderships', 'leaders', 'chairmanship', 'organizational_structure', 'managerial_competence', 'organizational', 'stewardship', 'competence'], 'team': ['squad', 'teams', 'Team', 'teammates', 'league', 'sqaud', 'coach', 'players', 'championship', 'game'], 'communication': ['

## create final matrix (dataframe)

In [None]:
# Construct the dataframe
recruiting_matrix = pd.DataFrame(columns=hard_skills+soft_skills+activities)
recruiting_matrix

Unnamed: 0,Python,SQL,Data_Engineering,Statitics,Machine_Learning,Deep_Learning,NoSQL,Office,Excel,Languages,Leadership,Team_Working,Communication,Planning,Hard_Working,Most_Recently_Recruiting,Company_Preference


# define feature measurements

In [33]:
### It would be better if we have a system that requires the HRs and Candidates fill in the blank. (Parsing system can be an assist.)
### Two ways to implement - like what Workday is doing or/and like what LinkedIn is doing.

In [34]:
# create a dictionary that stores all company info in JSON/ Nested Dictionary format. (can be used to 1. vectorize the features 2. compare similar JD and Resume)
company_info = {
    "Amazon":{
        "SDE":{"A000001":{"edu_requirement":"bachelor degree in cs or related field","candidates_id":["Axxxxxx"]}},
        "DS":{"A000002":{"edu_requirement":"at least master or equal experience","candidates_id":["Axxxxxx"]}}
            },
    "Google":{
        "G000001":{"2020-10":{"edu_requirement":"PhD candidates","candidates_id":["Axxxxxx"]}},
        "G000002":{"2011-12":{"edu_requirement":"finish high school in the US","candidates_id":["Axxxxxx"]}}
            },

}

In [None]:
class MeasureHardSkillsHR():

    company_hardskill_info = {}

    def __init__(self,company_name,company_position,id):
        self.company_name = company_name
        self.company_position = company_position
        self.id =id
        self.post_date = datetime.now()
        self.hardskill_info = {"company":self.company_name,"position":self.company_position,"id":id,"post_date":self.post_date}

    # type1: multi-classification problem with considering thredshod
    def measure_degree(self):
        # get the edu_require with the above infomation
        edu_requirement = company_info[self.company_name][self.company_position][self.id]["edu_requirement"]
        # just example
        # edu_requirement = "bachelor degree in cs or related field"
        # edu_requirement = "at least four-year university experience"
        degree_type = {"kindergarden":1,"primary_scholl":2,"senior_high_school":3,"high_school":4,"bachelor":5,"master":6,"PhD":7,"post-doc":8}
        for i in edu_requirement:
            if i in degree_type:
                degree_vect = degree_type[i]
            else:
                continue
        self.hardskill_info["degree"] = degree_vect
        # this should be a threshod that could be used by candidates filtering
        return i
        
                

In [None]:
# create a dictionary that stores all candidates info in JSON/ Nested Dictionary format. (can be used to 1. vectorize the features 2. compare similar JD and Resume)
candidates_info = {
    "C000001":{
        "Bob":{"edu_background":"MIT Master of Science in Computer Science Related courses:...",
        "skills":"Python, JAVA, SQL, Data_Structure",
        "working_experiences":"Google...",
        "project_experiences":"recommendation system",
        "delivered_position_id":["A000001"]},
            },
    "C000002":{
        "Alice":{"edu_background":"Peking University Master in Data Science Related courses:...",
        "skills":"Python, SQL, Hadoop",
        "working_experiences":"Bytedance...",
        "project_experiences":"Marketing_Analysis",
        "delivered_position_id":["A000002"]},
            },

}

In [None]:
class MeasureHardSkillsCandidates():

    candidate_hardskill_info = {}
    
    def __init__(self,name,candidate_id):
        pass

    # type1: multi-classification problem with considering thredshod
    def measure_degree(self,):
        # get the edu_require with the above infomation
        edu_requirement = company_info[self.company_name][self.company_position][self.id]["edu_requirement"]
        # just example
        # edu_requirement = "bachelor degree in cs or related field"
        # edu_requirement = "at least four-year university experience"
        degree_type = {"kindergarden":1,"primary_scholl":2,"senior_high_school":3,"high_school":4,"bachelor":5,"master":6,"PhD":7,"post-doc":8}
        for i in edu_requirement:
            if i in degree_type:
                degree_vect = degree_type[i]
            else:
                continue
        self.hardskill_info["degree"] = degree_vect
        
