In [1]:
# Install the required packages (Cohere's Python SDK and python-dotenv); uncomment the lines below on first run
# %pip install cohere
# %pip install python-dotenv

In [2]:
import cohere
import pandas as pd
import requests
import datetime
from tqdm import tqdm
from sklearn.model_selection import train_test_split
# Do not truncate long cell values when displaying DataFrames
# (None disables the column-width limit).
pd.set_option('display.max_colwidth', None)

import sys,os
# Append the absolute path of ../scripts to the module search path so the
# project-local imports that follow can be resolved.
# NOTE(review): presumably `logger` lives in ../scripts — confirm.
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from logger import Logger
from dotenv import load_dotenv

In [3]:
# Load environment variables via python-dotenv (by default from a `.env` file).
load_dotenv()
# Read the Cohere API key from the COHERE_API_KEY environment variable instead
# of hardcoding it in the notebook. Never commit or share the key publicly.
# NOTE(review): os.getenv returns None when the variable is unset; nothing in
# this cell checks for that.
api_key = os.getenv('COHERE_API_KEY')

# Application logger used by the data-loading cells.
# NOTE(review): presumably writes to the given log file path — confirm in the
# project's `logger` module.
logger = Logger("../logs/jd_entity_extraction.log").get_app_logger()

In [4]:
# Load the dataset used for training from ../data/relations_dev.json.
# `except Exception` replaces the bare `except:` so that KeyboardInterrupt and
# SystemExit are not swallowed. The error is re-raised after logging because
# continuing would leave `df_train` undefined and only fail later with a
# less helpful NameError.
try:
    df_train = pd.read_json('../data/relations_dev.json')
except Exception:
    # logger.exception records the message together with the traceback.
    logger.exception('failed to load dataset')
    raise
else:
    logger.info('load data successfully')

In [5]:
# Preview the first rows of the training frame (columns: document, tokens, relations)
df_train.head()

Unnamed: 0,document,tokens,relations
0,"Bachelor's degree in Mechanical Engineering or Physical Science 3+ years track record of developing or specifying fiber optic cables and connector related products Knowledge of fiber optic component, cabling, and interconnect products, technologies, and standards Experience in statistical data analysis Experience with product life cycle management (PLM) process Experience providing solutions to problems and meeting deadlines Experience engaging stakeholders PREFERRED Advanced degree Experience using a software tool for statistical data analysis such as JMP Experience using Agile as product life-cycle management tool Data center or other mission critical development experience","[{'text': 'Bachelor', 'start': 0, 'end': 8, 'token_start': 0, 'token_end': 0, 'entityLabel': 'DIPLOMA'}, {'text': 'Mechanical Engineering', 'start': 21, 'end': 43, 'token_start': 4, 'token_end': 5, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': 'Physical Science', 'start': 47, 'end': 63, 'token_start': 7, 'token_end': 8, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': '3+ years', 'start': 64, 'end': 72, 'token_start': 9, 'token_end': 11, 'entityLabel': 'EXPERIENCE'}, {'text': 'developing', 'start': 89, 'end': 99, 'token_start': 15, 'token_end': 15, 'entityLabel': 'SKILLS'}, {'text': 'fiber optic cables', 'start': 114, 'end': 132, 'token_start': 18, 'token_end': 20, 'entityLabel': 'SKILLS'}, {'text': 'connector related products', 'start': 137, 'end': 163, 'token_start': 22, 'token_end': 24, 'entityLabel': 'SKILLS'}]","[{'child': 4, 'head': 0, 'relationLabel': 'DEGREE_IN'}, {'child': 7, 'head': 0, 'relationLabel': 'DEGREE_IN'}, {'child': 15, 'head': 9, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 18, 'head': 9, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 22, 'head': 9, 'relationLabel': 'EXPERIENCE_IN'}]"
1,"10+ years of software engineering work experience. Technical experience in release automation engineering, CI/CD or related roles. Experience building and leading a software organization through product design, delivery and commercialization of consumer electronics devices. Experience recruiting and managing technical teams, including performance management. BS/MS in Computer Science. Experience in leading timeline, multi-partner initiatives. Organizational communication and coordination experience. PREFERRED 5+ years of experience with hands-on technical management, release engineering, tools engineering, DevOps, or related area.","[{'text': '10+ years', 'start': 0, 'end': 9, 'token_start': 0, 'token_end': 2, 'entityLabel': 'EXPERIENCE'}, {'text': 'software engineering', 'start': 13, 'end': 33, 'token_start': 4, 'token_end': 5, 'entityLabel': 'SKILLS'}, {'text': '5+ years', 'start': 515, 'end': 523, 'token_start': 77, 'token_end': 79, 'entityLabel': 'EXPERIENCE'}, {'text': 'technical management', 'start': 552, 'end': 572, 'token_start': 86, 'token_end': 87, 'entityLabel': 'SKILLS'}, {'text': 'release engineering', 'start': 574, 'end': 593, 'token_start': 89, 'token_end': 90, 'entityLabel': 'SKILLS'}, {'text': 'tools engineering', 'start': 595, 'end': 612, 'token_start': 92, 'token_end': 93, 'entityLabel': 'SKILLS'}, {'text': 'DevOps', 'start': 614, 'end': 620, 'token_start': 95, 'token_end': 95, 'entityLabel': 'SKILLS'}, {'text': 'BS/MS', 'start': 361, 'end': 366, 'token_start': 53, 'token_end': 55, 'entityLabel': 'DIPLOMA'}, {'text': 'Computer Science', 'start': 370, 'end': 386, 'token_start': 57, 'token_end': 58, 'entityLabel': 'DIPLOMA_MAJOR'}]","[{'child': 4, 'head': 0, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 86, 'head': 77, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 89, 'head': 77, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 92, 'head': 77, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 95, 'head': 77, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 57, 
'head': 53, 'relationLabel': 'DEGREE_IN'}]"
2,"3+ years Swift & Objective-C and experience with iOS internals Experience building an entire app from scratch and ideally a portfolio of apps featured in the App Store Someone who knows every trick in the book on UI transitions, network communication and memory/battery efficiency Strong UI/design skill experience is a plus","[{'text': '3+ years', 'start': 0, 'end': 8, 'token_start': 0, 'token_end': 2, 'entityLabel': 'EXPERIENCE'}, {'text': 'Swift & Objective-C', 'start': 9, 'end': 28, 'token_start': 3, 'token_end': 7, 'entityLabel': 'SKILLS'}]","[{'child': 3, 'head': 0, 'relationLabel': 'EXPERIENCE_IN'}]"
3,"8+ years experience in software engineering leadership 5+ years people management experience including managing leaders and managing remotely across regions Strategic thinker with proven track record of transforming operations to provide customer experience through innovation and improvement Track record of working with VP, C-level Executives Experience deploying operational support models across enterprise organizations Communication/presentations experience Experience working with all levels of management internally and externally Experience meeting objectives in an entrepreneurial environment Collaboration and relationship-building experience BA/BS degree or higher","[{'text': '8+ years', 'start': 0, 'end': 8, 'token_start': 0, 'token_end': 2, 'entityLabel': 'EXPERIENCE'}, {'text': 'software engineering', 'start': 23, 'end': 43, 'token_start': 5, 'token_end': 6, 'entityLabel': 'SKILLS'}, {'text': '5+ years', 'start': 55, 'end': 63, 'token_start': 8, 'token_end': 10, 'entityLabel': 'EXPERIENCE'}, {'text': 'people management', 'start': 64, 'end': 81, 'token_start': 11, 'token_end': 12, 'entityLabel': 'SKILLS'}, {'text': 'managing leaders', 'start': 103, 'end': 119, 'token_start': 15, 'token_end': 16, 'entityLabel': 'SKILLS'}]","[{'child': 5, 'head': 0, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 11, 'head': 8, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 15, 'head': 8, 'relationLabel': 'EXPERIENCE_IN'}]"
4,"BS degree in Computer Science or related field. 7+ years C++ experience, including C++11 features and principles. 5+ years experience creating software for real-time environments such as games or robotics. 2+ years experience managing software engineers. Proven track record of software development, including shipping one or more products on large code bases that span platforms and tools. Problem solving and optimization experience. Communication experience and demonstrated experience working across disciplines and teams to drive solutions. PREFERRED Hands-on experience with 3D computer vision algorithms including Calibration, SLAM, Reconstruction, Mapping, Localization, Sensor Fusion, State Estimation and Image Processing Experience with designing (products or open-source/git software) of inertial/optical sensing devices Publication in leading workshops or conferences such as CVPR, ECCV, ICCV, SIGGRAPH, ICCP, RSS, ICRA, etc.","[{'text': 'BS', 'start': 0, 'end': 2, 'token_start': 0, 'token_end': 0, 'entityLabel': 'DIPLOMA'}, {'text': 'Computer Science', 'start': 13, 'end': 29, 'token_start': 3, 'token_end': 4, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': '7+ years', 'start': 48, 'end': 56, 'token_start': 9, 'token_end': 11, 'entityLabel': 'EXPERIENCE'}, {'text': 'C++', 'start': 57, 'end': 60, 'token_start': 12, 'token_end': 12, 'entityLabel': 'SKILLS'}, {'text': 'C++11', 'start': 83, 'end': 88, 'token_start': 16, 'token_end': 16, 'entityLabel': 'SKILLS'}, {'text': '5+ years', 'start': 114, 'end': 122, 'token_start': 21, 'token_end': 23, 'entityLabel': 'EXPERIENCE'}, {'text': 'creating software for real-time environments', 'start': 134, 'end': 178, 'token_start': 25, 'token_end': 31, 'entityLabel': 'SKILLS'}, {'text': 'games', 'start': 187, 'end': 192, 'token_start': 34, 'token_end': 34, 'entityLabel': 'SKILLS'}, {'text': 'robotics', 'start': 196, 'end': 204, 'token_start': 36, 'token_end': 36, 'entityLabel': 'SKILLS'}, {'text': '2+ years', 'start': 206, 'end': 214, 
'token_start': 38, 'token_end': 40, 'entityLabel': 'EXPERIENCE'}, {'text': 'managing software engineers', 'start': 226, 'end': 253, 'token_start': 42, 'token_end': 44, 'entityLabel': 'SKILLS'}]","[{'child': 3, 'head': 0, 'relationLabel': 'DEGREE_IN'}, {'child': 12, 'head': 9, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 16, 'head': 9, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 25, 'head': 21, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 34, 'head': 21, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 36, 'head': 21, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 42, 'head': 38, 'relationLabel': 'EXPERIENCE_IN'}]"


In [6]:
# Load the test dataset from ../data/relations_test.json.
# `except Exception` replaces the bare `except:` so that KeyboardInterrupt and
# SystemExit are not swallowed. The error is re-raised after logging because
# continuing would leave `df_test` undefined and only fail later with a
# less helpful NameError.
try:
    df_test = pd.read_json('../data/relations_test.json')
except Exception:
    # logger.exception records the message together with the traceback.
    logger.exception('failed to load dataset')
    raise
else:
    logger.info('load data successfully')

In [7]:
# Preview the first rows of the test frame (columns: document, tokens, relations)
df_test.head()

Unnamed: 0,document,tokens,relations
0,"\nCurrently holding a faculty, industry, or government researcher position.\nPh.D. and publications in machine learning, AI, computer science, statistics, applied mathematics, data science, or related technical fields.\nExperience leading a team in solving analytical problems using quantitative approaches.\nExperience manipulating and analyzing data from different sources.\nExperience in theoretical and empirical research and for answering questions with research.\nAbility to communicate research for public audiences of peers.\nKnowledge in a programming language.\nAbility to obtain and maintain work authorization in the country of employment in 2018.\n\nPREFERRED \n1+ year(s) of work experience in a university, industry, or government lab(s), in a role with primary emphasis on AI research.\nExperience driving original scholarship in collaboration with a team.\nFirst-author publications at peer-reviewed AI conferences (e.g. NIPS, CVPR, ICML, ICLR, ICCV, and ACL).\nExperience in developing and debugging in C/C++, Python, C# and/or Java.","[{'text': 'Ph.D.', 'start': 75, 'end': 80, 'token_start': 14, 'token_end': 14, 'entityLabel': 'DIPLOMA'}, {'text': 'machine learning', 'start': 101, 'end': 117, 'token_start': 18, 'token_end': 19, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': 'AI', 'start': 119, 'end': 121, 'token_start': 21, 'token_end': 21, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': 'computer science', 'start': 123, 'end': 139, 'token_start': 23, 'token_end': 24, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': 'statistics', 'start': 141, 'end': 151, 'token_start': 26, 'token_end': 26, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': 'applied mathematics', 'start': 153, 'end': 172, 'token_start': 28, 'token_end': 29, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': 'data science', 'start': 174, 'end': 186, 'token_start': 31, 'token_end': 32, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': '1+ year(s', 'start': 664, 'end': 673, 'token_start': 113, 'token_end': 115, 'entityLabel': 
'EXPERIENCE'}, {'text': 'university', 'start': 699, 'end': 709, 'token_start': 122, 'token_end': 122, 'entityLabel': 'SKILLS'}, {'text': 'industry', 'start': 711, 'end': 719, 'token_start': 124, 'token_end': 124, 'entityLabel': 'SKILLS'}, {'text': 'government lab(s)', 'start': 724, 'end': 741, 'token_start': 127, 'token_end': 129, 'entityLabel': 'SKILLS'}, {'text': 'AI', 'start': 778, 'end': 780, 'token_start': 138, 'token_end': 138, 'entityLabel': 'SKILLS'}]","[{'child': 18, 'head': 14, 'relationLabel': 'DEGREE_IN'}, {'child': 21, 'head': 14, 'relationLabel': 'DEGREE_IN'}, {'child': 23, 'head': 14, 'relationLabel': 'DEGREE_IN'}, {'child': 26, 'head': 14, 'relationLabel': 'DEGREE_IN'}, {'child': 28, 'head': 14, 'relationLabel': 'DEGREE_IN'}, {'child': 31, 'head': 14, 'relationLabel': 'DEGREE_IN'}, {'child': 122, 'head': 113, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 124, 'head': 113, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 127, 'head': 113, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 138, 'head': 113, 'relationLabel': 'EXPERIENCE_IN'}]"
1,"\n2+ years experience in the online advertising or research\nBS/BA Degree in Economics, Statistics, Political Science, History, Psychology\nExperience with presenting and partnering with technical and non-technical teams\nExperience communicating analyses and results to any audience\nExperience working with structured and unstructured data-sets, statistical software such as R, STATA, SPSS, SAS as well as data extraction tools such as Hive and/or SQL\n\nPREFERRED \nExperience with causal measurement, machine learning or lab-based research\nExperience in quantitative field, such as consulting, market research, strategy and planning, or user experience research\nStrong organizational and project management skills\nMasters in quantitative field or a MBA","[{'text': '2+ years', 'start': 1, 'end': 9, 'token_start': 1, 'token_end': 3, 'entityLabel': 'EXPERIENCE'}, {'text': 'online advertising', 'start': 28, 'end': 46, 'token_start': 7, 'token_end': 8, 'entityLabel': 'SKILLS'}, {'text': 'research', 'start': 50, 'end': 58, 'token_start': 10, 'token_end': 10, 'entityLabel': 'SKILLS'}, {'text': 'BS/BA', 'start': 59, 'end': 64, 'token_start': 12, 'token_end': 14, 'entityLabel': 'DIPLOMA'}, {'text': 'Economics', 'start': 75, 'end': 84, 'token_start': 17, 'token_end': 17, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': 'Statistics', 'start': 86, 'end': 96, 'token_start': 19, 'token_end': 19, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': 'Political Science', 'start': 98, 'end': 115, 'token_start': 21, 'token_end': 22, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': 'History', 'start': 117, 'end': 124, 'token_start': 24, 'token_end': 24, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': 'Psychology', 'start': 126, 'end': 136, 'token_start': 26, 'token_end': 26, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': 'Masters', 'start': 710, 'end': 717, 'token_start': 126, 'token_end': 126, 'entityLabel': 'DIPLOMA'}, {'text': 'quantitative field', 'start': 721, 'end': 739, 'token_start': 128, 'token_end': 129, 
'entityLabel': 'DIPLOMA_MAJOR'}]","[{'child': 7, 'head': 1, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 10, 'head': 1, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 17, 'head': 12, 'relationLabel': 'DEGREE_IN'}, {'child': 19, 'head': 12, 'relationLabel': 'DEGREE_IN'}, {'child': 21, 'head': 12, 'relationLabel': 'DEGREE_IN'}, {'child': 24, 'head': 12, 'relationLabel': 'DEGREE_IN'}, {'child': 26, 'head': 12, 'relationLabel': 'DEGREE_IN'}, {'child': 128, 'head': 126, 'relationLabel': 'DEGREE_IN'}]"
2,"\nBA/BS\n5+ years of program or project management experience\n2+ years of technical project/program management experience\nTrack record of operating independently\nExperience understanding user needs, gathering requirements, and defining scope\nCommunication experience interacting with a variety of audiences from engineers, to vendors, to research leaders\nTrack record of building cross-functional relationships\n\nPREFERRED \nExperience working with UX Research and/or UX Design","[{'text': '5+ years', 'start': 7, 'end': 15, 'token_start': 5, 'token_end': 7, 'entityLabel': 'EXPERIENCE'}, {'text': 'project management', 'start': 30, 'end': 48, 'token_start': 11, 'token_end': 12, 'entityLabel': 'SKILLS'}, {'text': '2+ years', 'start': 60, 'end': 68, 'token_start': 15, 'token_end': 17, 'entityLabel': 'EXPERIENCE'}, {'text': 'technical project', 'start': 72, 'end': 89, 'token_start': 19, 'token_end': 20, 'entityLabel': 'SKILLS'}, {'text': 'program management', 'start': 90, 'end': 108, 'token_start': 22, 'token_end': 23, 'entityLabel': 'SKILLS'}]","[{'child': 11, 'head': 5, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 19, 'head': 15, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 22, 'head': 15, 'relationLabel': 'EXPERIENCE_IN'}]"
3,"\nCurrently enrolled in a full-time degree program and returning to the program after the completion of the internship.\nPublications or experience in fields related to in machine learning, AI, computer vision, natural language processing, computational neuroscience, optimization, computer science, statistics, applied mathematics, or data science.\nExperience solving analytical problems using quantitative approaches.\nAbility to manipulate and analyze large scale, high-dimensionality data from varying sources.\nExperience in utilizing theoretical and empirical research to solve problems.\nAbility to communicate research for public audiences of peers.\nKnowledge in a programming language.\nAbility to obtain and maintain work authorization in the country of employment in 2018.\n\nPREFERRED \nPursuing a Ph.D. degree in Computer Science or related field\nFirst-author publications at peer-reviewed AI conferences (e.g. NIPS, ICML, ICLR, CVPR, ICCV, ECCV, ACL, NAACL, and EMNLP).\nExperience building systems based on machine learning and/or deep learning methods.\nResearch and software engineer experience demonstrated via an internship, work experience, or coding competitions.\nKnowledge in Python, Lua, C++, C, C# and/or Java.","[{'text': 'Ph.D.', 'start': 801, 'end': 806, 'token_start': 137, 'token_end': 137, 'entityLabel': 'DIPLOMA'}, {'text': 'Computer Science', 'start': 817, 'end': 833, 'token_start': 140, 'token_end': 141, 'entityLabel': 'DIPLOMA_MAJOR'}]","[{'child': 140, 'head': 137, 'relationLabel': 'DEGREE_IN'}]"
4,"\nCurrently enrolled in a full-time degree program and returning to the program after the completion of the internship.\nPublications or experience in fields related to in machine learning, AI, computer vision, natural language processing, computational neuroscience, optimization, computer science, statistics, applied mathematics, or data science.\nExperience solving analytical problems using quantitative approaches.\nAbility to manipulate and analyze large scale, high-dimensionality data from varying sources.\nExperience in utilizing theoretical and empirical research to solve problems.\nAbility to communicate research for public audiences of peers.\nKnowledge in a programming language.\nAbility to obtain and maintain work authorization in the country of employment in 2019.\n\nPREFERRED \nPursuing a Ph.D. degree in Computer Science or related field.\nFirst-author publications at peer-reviewed AI conferences (e.g. NIPS, ICML, ICLR, CVPR, ICCV, ECCV, ACL, NAACL, and EMNLP).\nExperience building systems based on machine learning and/or deep learning methods.\nResearch and software engineer experience demonstrated via an internship, work experience, or coding competitions.\nKnowledge in Python, Lua, C++, C, C# and/or Java.\n","[{'text': 'Ph.D.', 'start': 801, 'end': 806, 'token_start': 137, 'token_end': 137, 'entityLabel': 'DIPLOMA'}, {'text': 'Computer Science', 'start': 817, 'end': 833, 'token_start': 140, 'token_end': 141, 'entityLabel': 'DIPLOMA_MAJOR'}]","[{'child': 140, 'head': 137, 'relationLabel': 'DEGREE_IN'}]"


In [8]:
from preprocessing import Preporcess  # (sic) spelling matches the name imported from `preprocessing`
preprocess = Preporcess()

# Build a `label` column on both the training and the testing frame by passing
# each row's `tokens` value through `preprocess.preprocess_tokens`.
for frame in (df_train, df_test):
    frame['label'] = frame['tokens'].apply(preprocess.preprocess_tokens)

In [9]:
# Inspect the generated labels: each value is a newline-separated string with
# one `ENTITY_LABEL:value1,value2,...` line per entity type.
df_train['label']

0                                                                                            DIPLOMA:Bachelor\nDIPLOMA_MAJOR:Mechanical Engineering,Physical Science\nEXPERIENCE:3+ years\nSKILLS:developing,fiber optic cables,connector related products\n
1                                                                              EXPERIENCE:10+ years,5+ years\nSKILLS:software engineering,technical management,release engineering,tools engineering,DevOps\nDIPLOMA:BS/MS\nDIPLOMA_MAJOR:Computer Science\n
2                                                                                                                                                                                                          EXPERIENCE:3+ years\nSKILLS:Swift & Objective-C\n
3                                                                                                                                                             EXPERIENCE:8+ years,5+ years\nSKILLS:software engineering,people management,managin

In [10]:
# Build the training examples from the training frame. As the output shows,
# each element is the document text followed by "\n\nExtracted Text:\n", the
# label string, and a "----\n" separator.
train_data = preprocess.preprocess_document(df_train)
train_data

["Bachelor's degree in Mechanical Engineering or Physical Science 3+ years track record of developing or specifying fiber optic cables and connector related products Knowledge of fiber optic component, cabling, and interconnect products, technologies, and standards Experience in statistical data analysis Experience with product life cycle management (PLM) process Experience providing solutions to problems and meeting deadlines Experience engaging stakeholders PREFERRED Advanced degree Experience using a software tool for statistical data analysis such as JMP Experience using Agile as product life-cycle management tool Data center or other mission critical development experience\n\nExtracted Text:\nDIPLOMA:Bachelor\nDIPLOMA_MAJOR:Mechanical Engineering,Physical Science\nEXPERIENCE:3+ years\nSKILLS:developing,fiber optic cables,connector related products\n----\n",
 '10+ years of software engineering work experience. Technical experience in release automation engineering, CI/CD or related

In [11]:
# Build the test inputs from the test frame. In the displayed output the
# document's newlines appear as spaces.
# NOTE(review): the output shown is truncated, so the full format of each
# element cannot be confirmed from here — check `entity_extraction`.
test_data = preprocess.entity_extraction(df_test)
test_data

[' Currently holding a faculty, industry, or government researcher position. Ph.D. and publications in machine learning, AI, computer science, statistics, applied mathematics, data science, or related technical fields. Experience leading a team in solving analytical problems using quantitative approaches. Experience manipulating and analyzing data from different sources. Experience in theoretical and empirical research and for answering questions with research. Ability to communicate research for public audiences of peers. Knowledge in a programming language. Ability to obtain and maintain work authorization in the country of employment in 2018.  PREFERRED  1+ year(s) of work experience in a university, industry, or government lab(s), in a role with primary emphasis on AI research. Experience driving original scholarship in collaboration with a team. First-author publications at peer-reviewed AI conferences (e.g. NIPS, CVPR, ICML, ICLR, ICCV, and ACL). Experience in developing and debu

In [None]:
# Create the Cohere API client with the key read from COHERE_API_KEY earlier in the notebook.
coh = cohere.Client(api_key)