In [21]:
# !which conda
# !which python

In [22]:
# !pip install transformers
# !pip install sentence-transformers

# !pip -q install langchain huggingface_hub openai
# !pip install pandas
# !pip install numpy
# !pip install python-dotenv

# !pip install fireworks-ai

In [23]:
from dotenv import load_dotenv

load_dotenv()

True

In [24]:
import pandas as pd

file_path = './data/Benchmark dataset/tech_test_annotations.csv'
test_dataset = pd.read_csv(file_path)

test_dataset

Unnamed: 0,sentence,span,sub_span,label
0,django postgresql redis,django,,LABEL NOT PRESENT
1,django postgresql redis,postgresql,,PostgreSQL
2,django postgresql redis,redis,,LABEL NOT PRESENT
3,django postgresql redis,django,,LABEL NOT PRESENT
4,django postgresql redis,postgresql,,PostgreSQL
...,...,...,...,...
1159,Independently and quick adoption of new techno...,quick adoption of new technologies,,emergent technologies
1160,Willingness to speak at conferences and engage...,speak at conferences,,address an audience
1161,Willingness to speak at conferences and engage...,engage in the community,,communicate with target community
1162,Effective oral and written English communicati...,communication,,communication


In [25]:
test_dataset.shape

(1164, 4)

In [26]:
file_path = './data/Benchmark dataset/tech_validation_annotations.csv'
validation_dataset = pd.read_csv(file_path)

validation_dataset

Unnamed: 0,sentence,span,sub_span,label
0,javascript reactjs java,javascript,,JavaScript
1,javascript reactjs java,reactjs,,LABEL NOT PRESENT
2,javascript reactjs java,java,,Java (computer programming)
3,javascript reactjs java,javascript,,JavaScript
4,javascript reactjs java,reactjs,,LABEL NOT PRESENT
...,...,...,...,...
288,2 ) Knowledge of public and private cloud offe...,public and private cloud offerings,public cloud offerings,cloud technologies
289,2 ) Knowledge of public and private cloud offe...,public and private cloud offerings,private cloud offerings,LABEL NOT PRESENT
290,3 ) Data warehousing/ETL experience would be u...,Data warehousing/ETL,,implement data warehousing techniques
291,4 ) Knowledge of fixed income instruments and ...,fixed income instruments,,LABEL NOT PRESENT


In [27]:
validation_dataset.shape

(293, 4)

In [28]:
def dataset_insights(df):
    filtered_dataset = df[~df['label'].isin(['LABEL NOT PRESENT', 'UNDERSPECIFIED'])]
    unique_skills_with_label = filtered_dataset['span'].nunique()
    total_skills_with_label = filtered_dataset['span'].count()
    label_not_present_count = df[df['label'] == 'LABEL NOT PRESENT']['span'].count()
    underspecified_count = df[df['label'] == 'UNDERSPECIFIED']['span'].count()
    insights = {
        'Unique Skills with Labels': unique_skills_with_label,
        'Total Annotated Skills': total_skills_with_label,
        'LABEL NOT PRESENT Count': label_not_present_count,
        'UNDERSPECIFIED Count': underspecified_count
    }

    return insights

dataset_insights(test_dataset)

{'Unique Skills with Labels': 437,
 'Total Annotated Skills': 674,
 'LABEL NOT PRESENT Count': 390,
 'UNDERSPECIFIED Count': 100}

In [29]:
dataset_insights(validation_dataset)

{'Unique Skills with Labels': 89,
 'Total Annotated Skills': 153,
 'LABEL NOT PRESENT Count': 127,
 'UNDERSPECIFIED Count': 13}

In [30]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('intfloat/e5-large-v2')

query_embedding = model.encode('Java (computer programming)')
passage_embedding = model.encode(['javascript reactjs java',
                                  'Able to frame problems drive teams to collect data guide the analysis and to present the data to influence leadership'])

print("Similarity:", util.dot_score(query_embedding, passage_embedding))

Similarity: tensor([[0.8491, 0.7260]])


In [31]:
file_path = './data/ESCO/skills_en.csv'
esco_dataset = pd.read_csv(file_path)

esco_dataset

Unnamed: 0,conceptType,conceptUri,skillType,reuseLevel,preferredLabel,altLabels,hiddenLabels,status,modifiedDate,scopeNote,definition,inScheme,description
0,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/0005c151-5b5a...,skill/competence,sector-specific,manage musical staff,manage staff of music\ncoordinate duties of mu...,,released,2016-12-20T17:43:43Z,,,http://data.europa.eu/esco/concept-scheme/skil...,Assign and manage staff tasks in areas such as...
1,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00064735-8fad...,skill/competence,occupation-specific,supervise correctional procedures,oversee prison procedures\nmanage correctional...,,released,2016-12-20T20:17:49Z,,,http://data.europa.eu/esco/concept-scheme/memb...,Supervise the operations of a correctional fac...
2,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/000709ed-2be5...,skill/competence,sector-specific,apply anti-oppressive practices,apply non-oppressive practices\napply an anti-...,,released,2016-12-20T19:18:19Z,,,http://data.europa.eu/esco/concept-scheme/skil...,"Identify oppression in societies, economies, c..."
3,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/0007bdc2-dd15...,skill/competence,sector-specific,control compliance of railway vehicles regulat...,monitoring of compliance with railway vehicles...,,released,2016-12-20T20:02:19Z,,,http://data.europa.eu/esco/concept-scheme/skil...,"Inspect rolling stock, components and systems ..."
4,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00090cc1-1f27...,skill/competence,cross-sector,identify available services,establish available services\nidentify availab...,,released,2016-12-20T20:15:17Z,,,http://data.europa.eu/esco/concept-scheme/memb...,Identify the different services available for ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13891,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/ffef5eb3-a15e...,skill/competence,sector-specific,remediate healthcare user's occupational perfo...,restore healthcare user's occupational perform...,,released,2016-12-20T19:25:53Z,,,http://data.europa.eu/esco/concept-scheme/memb...,"Remediate or restore the cognitive, sensorimot..."
13892,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/fff0b074-5a76...,skill/competence,sector-specific,install transport equipment lighting,install transport equipment illumination\nfix ...,,released,2016-12-20T20:03:21Z,,,http://data.europa.eu/esco/concept-scheme/skil...,Install lighting elements in transport equipme...
13893,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/fff0e2cd-d0bd...,knowledge,sector-specific,natural language processing,natural language processing\nNLP,,released,2022-07-05T14:34:09.904Z,,,http://data.europa.eu/esco/concept-scheme/skil...,The technologies which enable ICT devices to u...
13894,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/fff5bc45-b506...,skill/competence,cross-sector,coordinate construction activities,reviewing construction progress\nconstruction ...,,released,2016-12-20T18:22:35Z,,,http://data.europa.eu/esco/concept-scheme/skil...,Coordinate the activities of several construct...


In [32]:
# !docker run --name skill-extraction-redis -d -p 6379:6379 -v ./redis_data:/data redis/redis-stack:latest

import redis 

client = redis.Redis(host='localhost', port=6379, db=0)

keys = client.keys('skill:*')

print(len(keys))

for key in keys[0:5]:
    skill_data = client.json().get(key)
    label = skill_data['label']

    print(f"Key: {key.decode('utf-8')}, Label: {label}" )

13896
Key: skill:06063, Label: conserve new media
Key: skill:06483, Label: maintain gaming equipment
Key: skill:07497, Label: economics
Key: skill:03605, Label: examine cell specimens microscopically
Key: skill:08146, Label: strategic planning


In [33]:
import numpy as np

def create_query_table(query, queries, encoded_queries, extra_params={}):
    results_list = []
    for i, encoded_query in enumerate(encoded_queries):
        result_docs = (
            client.ft("idx:skills")
            .search(
                query,
                {
                    "query_vector": np.array(
                        encoded_query, dtype=np.float32
                    ).tobytes()
                }
                | extra_params,
            )
            .docs
        )
        for doc in result_docs:
            vector_score = round(1 - float(doc.vector_score), 2)
            results_list.append(
                {
                    "query": queries[i],
                    "score": vector_score,
                    "label": doc.label,
                }
            )

    # Optional: convert the table to Markdown using Pandas
    queries_table = pd.DataFrame(results_list)
    queries_table.sort_values(
        by=["query", "score"], ascending=[True, False], inplace=True
    )
    queries_table["query"] = queries_table.groupby("query")["query"].transform(
        lambda x: [x.iloc[0]] + [""] * (len(x) - 1)
    )
    queries_table.to_markdown(index=False)
    
    return queries_table

In [34]:
from redis.commands.search.query import Query

queries = ["plan business strategy"]
encoded_queries = model.encode(queries)

query = (
    Query("(*)=>[KNN 5 @embedding $query_vector AS vector_score]")
    .sort_by("vector_score")
    .return_fields('vector_score', 'label')
    .dialect(2)
)


create_query_table(query, queries, encoded_queries)

Unnamed: 0,query,score,label
0,plan business strategy,0.95,plan marketing strategy
1,,0.92,develop business plans
2,,0.91,analyse business plans
3,,0.9,plan marketing campaigns
4,,0.9,make strategic business decisions


In [35]:
import os

def load_text_files(folder_path):
    text_data_with_names = []
    text_contents_only = []
    
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                # Store content and file names in a list of dictionaries
                text_data_with_names.append({
                    'file_name': filename,
                    'content': content
                })
                # Store just the content in a separate list
                text_contents_only.append(content)
    
    return text_data_with_names, text_contents_only

folder_path = './data/Job postings'
job_postings_with_names, job_postings_contents = load_text_files(folder_path)

for job_posting in job_postings_with_names:
    print(job_posting['file_name'], len(job_posting['content']))

5.txt 2820
4.txt 4828
3.txt 1497
2.txt 2192
1.txt 1108


In [36]:
from langchain import LLMChain, PromptTemplate
from langchain.memory import ConversationBufferWindowMemory

template = """
You are a recruiter.
Your task is to thoroughly analyze the following job posting. 
Extract a complete list of required skills, encompassing both technical and interpersonal aspects. 
Ensure this list is non-repetitive, and reflective of the varied skill sets demanded. 
Present your findings in a single python array. Each skill should be a string enclosed in double quotes 
Output just the array, wihtout any other text. 
Start your response with "[" symbol, and finish with "]". Make sure your response doesn't include anything else.

Job posting:
{job_posing}

Response:"""

prompt_template = PromptTemplate(input_variables=["job_posing"], template=template)

In [37]:
from langchain.llms.fireworks import Fireworks 

llm = Fireworks(model="accounts/fireworks/models/llama-v2-70b-chat", max_tokens=256, temperature=1)

In [38]:
responses = []
for job_posting in job_postings_contents:
    prompt_val = prompt_template.invoke({"job_posing": job_posting})
    prompt = prompt_val.to_string()
    output = llm(prompt)
    responses.append(output)

for response in responses:
    print(response)


[
"Node/Nest JS", 
"ReactJS",
"PostgreSQL",
"MongoDB",
"Redis",
"Python",
"REST APIs",
"AWS cloud services",
"Kubernetes",
"Infrastructure as Code (IAAC)",
"OpenAI API",
"LLM propmt engineering",
"Twilio",
"CRMs",
"Mail",
"Follow Up Boss",
"Outlook",
"Architecture",
"Design",
"Development",
"Team management",
"Mentoring",
"Code reviews",
"Git repositories",
"Coding practices",
"Security",
"Efficiency",
"Cloud infrastructure",
"Performance",
"Scalability",
"UX/UI",
"Sales",
"Product development",
"Dashboard",
"Call center software",
"Speech to text",
"Transcribe",
"Conversation analyze software"
]


[
"Microsoft Dynamics 365",
"ERP systems",
"project management",
"Microsoft Project",
"project planning",
"task distribution",
"project documentation",
"budget management",
"time management",
"scope management",
"risk management",
"client communication",
"team management",
"interviewing",
"recruiting",
"project methodology",
"ERP implementation",
"data collection",
"Go-Live",
"Ukrainian lan

In [39]:
import ast

skills_lists = []

for response in responses:
    array_start = response.find('[')
    array_end = response.find(']', array_start)

    array_string = response[array_start:array_end+1].strip()
    skills_lists.append(ast.literal_eval(array_string))

In [40]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

for skills in skills_lists[:5]:  # Process only the first 5 arrays
    queries = skills  # Each sublist is an array of skills/queries
    encoded_queries = model.encode(queries)

    # Constructing the query might involve replacing placeholders
    query = (
        Query("(*)=>[KNN 5 @embedding $query_vector AS vector_score]")
        .sort_by("vector_score")
        .return_fields('vector_score', 'label')
        .dialect(2)
    )

    # Call the function that creates and prints the table
    table = create_query_table(query, queries, encoded_queries)
    print(table)
    print("\n")  # Just to add space between tables

# Reset pandas options after printing
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')
pd.reset_option('display.width')

                             query  score                                          label
35              AWS cloud services   0.86                    develop with cloud services
36                                   0.85                             cloud technologies
37                                   0.84                              SAP Data Services
38                                   0.84                  manage cloud data and storage
39                                   0.83                           automate cloud tasks
85                    Architecture   0.91                           architectural design
86                                   0.89                           architectural theory
87                                   0.88                         landscape architecture
88                                   0.87                          historic architecture
89                                   0.87                                    archaeology
65                   

                                     query  score                                             label
80                      ERP implementation   0.87                  government policy implementation
81                                           0.85           manage government policy implementation
82                                           0.85                check implementation security plan
83                                           0.85                                business processes
84                                           0.85                    implement strategic management
5                              ERP systems   0.89                       database management systems
6                                            0.88                                e-commerce systems
7                                            0.88                                       ATM systems
8                                            0.86                       learning management systems


                                                query  score                                              label
25  Ability to occasionally collaborate (join meet...   0.85         collaborate in company's daily operations 
26                                                      0.85           consider time zones in execution of work
27                                                      0.83                          cooperate with colleagues
28                                                      0.83                             liaise with colleagues
29                                                      0.83       use communication and collaboration software
20  Ability to work independently in a remote-firs...   0.87                         handle tasks independently
21                                                      0.86                  work independently in agriculture
22                                                      0.86                                 work indepe

                                                query  score                                              label
60                       Bazel (build tool by Google)   0.83           Cain and Abel (penetration testing tool)
61                                                      0.82  Jenkins (tools for software configuration mana...
62                                                      0.82                                       Apache Maven
63                                                      0.81  Chef (tools for software configuration managem...
64                                                      0.81         John The Ripper (penetration testing tool)
0                            Desire to learn and grow   0.90                   demonstrate willingness to learn
1                                                       0.85                              demonstrate curiosity
2                                                       0.85                          strive for company