In [1]:
import re
import pandas as pd
import spacy
from spacy.matcher import Matcher
from spacy.pipeline import EntityRuler

nlp = spacy.load("en_core_web_sm")

yoe_matcher = Matcher(nlp.vocab)
degree_matcher = Matcher(nlp.vocab)

yoe_patterns = [
    # 2 years experience
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": {"IN": ["experience", "exp"]}}],
    # 2 years of experience
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": "of"}, {"LOWER": {"IN": ["experience", "exp"]}}],
    # 2 years work experience
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": {"IN": ["work", "working", "professional"]}},{"LOWER": "experience"}],
    # 2 years of work experience
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": "of"}, {"LOWER": {"IN": ["work", "working", "professional"]}},{"LOWER": "experience"}],
    # experience of 3 years
    [{"LOWER": {"IN": ["experience", "exp"]}}, {"LOWER": {"IN": ["of", ""]}}, {"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}],
    # 2 years as a developer / 2 years in a similar role
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": {"IN": ["as", "in"]}}],
    # 1-2 years experience
    [{"LIKE_NUM": True}, {"IS_SPACE": True, "OP": "*"}, {"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}],
    # 2 years of relevant
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": "of"}, {"LOWER": "relevant"}],
    # 2 years relevant 
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": "relevant"}],
    # 2 years of hands on experience
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": "of"}, {"LOWER": "hands"}, {"LOWER": "on"}, {"LOWER": "experience"}],
    # 2 years hands on experience
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": "hands"}, {"LOWER": "on"}, {"LOWER": "experience"}],
]

degree_patterns = [
    [{"LOWER": {"IN": ["bachelor", "bachelors", "undergraduate"]}}],
    [{"LOWER": "master"}, {"LOWER": {"IN": ["'s", "s"]}}],
    [{"LOWER": "mba"}],
    [{"LOWER": {"IN": ["phd", "doctorate"]}}],
]

for pattern in yoe_patterns:
    yoe_matcher.add("YEARS_EXPERIENCE", [pattern])

for pattern in degree_patterns:
    degree_matcher.add("DEGREE", [pattern])

text = '''
Ciena is committed to our people-first philosophy. Our teams enjoy a culture focused on prioritizing a personalized and flexible work environment that empowers an individual’s passions, growth, wellbeing and belonging. We’re a technology company that leads with our humanity—driving our business priorities alongside meaningful social, community, and societal impact. Not ready to apply? Join our Talent Community to get relevant job alerts straight to your inbox. Responsibilities Full Account Level management responsibilities of the Services Project Management team responsible for the successful implementation of customer projects, with high complexity, within a Customer network. Ensure Service target margins are achieved, contributing to accurate revenue recognition, ensuring committed schedules and deliverables are met, and that projects meet Customer’s expectations for quality and service. Analyze project proposals to determine time frame, funding limitations and appropriate process for accomplishing projects. Identify and schedule project deliverables, milestones, and required activities and tasks. Provide leadership and motivation to project team members throughout the project life cycle and confer with project staff to outline work plan. Establish work plan and staffing for project activities, iterations or phases, and arrange for recruitment or assignment of project personnel. Implement project Communication plan. Document and communicate project status, obstacles and resolutions to management. Perform risk assessment and implement mitigation plans. Services Project Manager may support pre-sales activities, providing project management subject matter expertise and contributing to Customer proposals and RFP responses, as required. Interface across Ciena teams and Customer operations teams, Sales, Bid Management, Service Delivery Groups, and Finance to ensure project deliverables and financial targets are met. May be required to contribute to process management optimization efforts associated with project management Preferred Qualifications Critical Thinking / Analysis- Analyzes Multi-dimensional problems; gathers information over extended periods of time and applies complex concepts to generate possible solutions. Judgment - Makes Complex decisions/Judgment taking into account multiple alternatives. Decisiveness - Delivers decisions in the face of competing alternatives Outstanding planning, scheduling, and coordination skills Breadth of technology, Services and business acumen. Strong business judgment and ability to think through complex business issues. Works collaboratively in matrix environment, with many stakeholders Results oriented, Collaborator and Accountable Outstanding verbal and written communications. Must have proficiency with Project Management tools Minimum Qualifications Bachelor’s degree in Telecom/Computer Networking Engineering or related field More than three years working experience in project management Experience in managing project of Optical and/or Packet transport network is advantage PMP Certified is advantage Experience having worked in matrix organization that includes demonstrating strong influencing and collaborative skills across multiple teams Not ready to apply? Join our Talent Community to get relevant job alerts straight to your inbox. At Ciena, we are committed to building and fostering an environment in which our employees feel respected, valued, and heard. Ciena values the diversity of its workforce and respects its employees as individuals. We do not tolerate any form of discrimination. Ciena is an Equal Opportunity Employer, including disability and protected veteran status. If contacted in relation to a job opportunity, please advise Ciena of any accommodation measures you may require."
'''

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
text = re.sub('[^A-Za-z0-9]+', ' ', text)
text = text.lower()
doc = nlp(text)

yoe_matches = yoe_matcher(doc)
yoe_found = False

degree_matches = degree_matcher(doc)

print('Years of Experience Found:')
for match_id, start, end in yoe_matches:
    span = doc[start:end]
    print(span.text)

print('\nDate NER:')
for ent in doc.ents:
    if ent.label_ == "DATE" and re.search(r'\b(\d+)\s+years?\b', ent.text):
        print(ent.text)
    
print("\nDEGREE:")
for match_id, start, end in degree_matches:
    span = doc[start:end]
    print(span.text)

Years of Experience Found:
three years working experience

Date NER:

DEGREE:
bachelor


In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

job_skills = "Cascading Style Sheets (CSS),Front-End Development,HTML,JavaScript Frameworks,Laravel,Mobile Application Development,Vue.js,Web Applications,Doctrine (PHP),Web Architecture"
my_skills = "Back-End Web Development,Cascading Style Sheets (CSS),JavaScript,Object-Oriented Programming (OOP),Software Development,Web Development,Databases,JSON,MongoDB,PostgreSQL"

job_skills = job_skills.lower().split(",")
my_skills = my_skills.lower().split(",")

job_skills = [skill.strip() for skill in job_skills]
my_skills = [skill.strip() for skill in my_skills]

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# model = SentenceTransformer("all-mpnet-base-v2")
# model = SentenceTransformer('all-roberta-large-v1')
score = 0
sen = job_skills + my_skills
sen_embeddings = model.encode(sen)
job_embeddings = sen_embeddings[:len(job_skills)]
my_embeddings = sen_embeddings[len(job_skills):]
matches = []
for i in range(len(job_skills)):
    if job_skills[i] in my_skills:
        score += 1
        matches.append({"matched_skill": job_skills[i], "user_skill": my_skills[i], "similarity": 100})
        continue
    similarities = cosine_similarity([job_embeddings[i]], my_embeddings)[0]
    max_similarity = max(similarities)
    max_index = np.argmax(similarities)
    if max_similarity > 0.4:
        matches.append({"matched_skill": job_skills[i], "user_skill": my_skills[max_index], "similarity": max_similarity * 100})
        score += max_similarity
print(f'You have a {round(score/len(job_skills) * 100, 3)}% match with the job description')
for i in matches:
    print(i)    

You have a 52.695% match with the job description
{'matched_skill': 'cascading style sheets (css)', 'user_skill': 'back-end web development', 'similarity': 100}
{'matched_skill': 'front-end development', 'user_skill': 'back-end web development', 'similarity': 75.9119987487793}
{'matched_skill': 'html', 'user_skill': 'javascript', 'similarity': 54.000067710876465}
{'matched_skill': 'javascript frameworks', 'user_skill': 'javascript', 'similarity': 62.70819902420044}
{'matched_skill': 'mobile application development', 'user_skill': 'software development', 'similarity': 57.21094608306885}
{'matched_skill': 'vue.js', 'user_skill': 'javascript', 'similarity': 41.346943378448486}
{'matched_skill': 'web applications', 'user_skill': 'web development', 'similarity': 70.79660892486572}
{'matched_skill': 'web architecture', 'user_skill': 'web development', 'similarity': 64.97543454170227}


In [25]:
# Original data
matches = [{'matched_skill': 'data structures', 'user_skill': 'data engineering', 'similarity': 58.722}, {'matched_skill': 'programming', 'user_skill': 'python', 'similarity': 61.316}, None]
match_id = [1, 2]

filtered_matches = [match for match in matches if match is not None]

json_data = pd.DataFrame(filtered_matches).to_json(orient="records")

final_df = pd.DataFrame({'match_id': match_id, 'skills': [json_data]})

print(final_df)


ValueError: All arrays must be of the same length

In [37]:
match_ids = [1, 2]

json_data = pd.DataFrame([{'matched_skill': 'computer science', 'user_skill': 'data science', 'similarity': 71.507}, {'matched_skill': 'data structures', 'user_skill': 'data engineering', 'similarity': 58.722}, {'matched_skill': 'programming', 'user_skill': 'python', 'similarity': 61.316}]).to_json(orient="records")

json_data2 = pd.DataFrame([{"matched_skill":"data analysis","user_skill":"data analysis","similarity":100}]).to_json(orient="records")

final_df = pd.DataFrame({'match_id': match_ids, 'skills': [json_data, json_data2]})
print(final_df.iloc[0]['skills'])

[{"matched_skill":"computer science","user_skill":"data science","similarity":71.507},{"matched_skill":"data structures","user_skill":"data engineering","similarity":58.722},{"matched_skill":"programming","user_skill":"python","similarity":61.316}]


In [4]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

job_titles = "Business Development"
my_titles = "Software Engineer,Fullstack Developer,Backend Engineer"

job_titles = job_titles.lower().split(",")
my_titles = my_titles.lower().split(",")

model = SentenceTransformer("all-mpnet-base-v2")
# model = SentenceTransformer('all-roberta-large-v1')
score = 0
sen = job_titles + my_titles
sen_embeddings = model.encode(sen)
job_embeddings = sen_embeddings[:len(job_titles)]
my_embeddings = sen_embeddings[len(job_titles):]
matches = []
for i in range(len(job_titles)):
    similarities = cosine_similarity([job_embeddings[i]], my_embeddings)[0]
    print(similarities)
    max_similarity = max(similarities)
    max_index = np.argmax(similarities)
    if max_similarity >= 0.5:
        matches.append({"Skill": job_titles[i], "Matched Skill": my_titles[max_index], "Similarity": round(max_similarity * 100, 3)})
        score += max_similarity
print(f'You have a {round(score/len(job_titles) * 100, 3)}% match with the job title')
for i in matches:
    print(i)    

[0.38538784 0.3254546  0.4138806 ]
You have a 0.0% match with the job title


In [5]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token.isalnum()]
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

text =''' 
Company Description About Grab and our workplace Grab is Southeast Asia’s leading superapp. We are dedicated to improving the lives of millions of users across the region by providing them everyday services such as deliveries, mobility, financial services, enterprise services and others. More than that, we provide the opportunity for them to have a better life. And that aspiration starts inside Grab because we believe in a seamless blend of work and home life, making every aspect of life better for all. Guided by The Grab Way, which spells out our mission, how we believe we can achieve it, and our operating principles—the 4Hs: Heart, Hunger, Honour and Humility—we work to create economic empowerment for the people of Southeast Asia. With our unwavering commitment to our values, we believe that we're more than a service provider; we're agents of positive change. Job Description Get to know Merchant Experience Tech Family At Business Platform our goal is to make merchants happy, and help them thrive in their business. We support all businesses, between tiny merchants (like a warung or hawker) and big, international franchises. Merchants have a lot of different businesses: restaurants, supermarkets, fashion, online stores, people who sell on social media, and many more. We level the playing field between tiny merchants and big merchants by giving them the same tools to grow their business, to attract customers, to become more efficient and be able to concentrate on what they do best inside one GrabMerchant Super App and Portal. If you go to a restaurant, chances are high you have seen some of the products of our team already. We have millions of very different merchant end-users who earn money in Southeast Asia that use our tools: cashiers, store owners, managers, merchant back office for marketing, and reporting. Stability, quality and innovation to engage more customers are the most important qualities for them running their business. Our tools collect and analyze billions of transactions every month that lead to merchant insights on how they can improve their business. We are a distributed team with two thirds in Indonesia, and third in Singapore. Our communication is in English, both in spoken and written form. Our team has direct end-user contact, and impact on the bottom line for merchants and thus, Grab Get to know the role We are seeking talented & passionate Engineers to join our team, you will have opportunities to work on multiple backend services as well as participating in merchant immersions, talking directly to the end-users, identifying their challenges and how we can help them in their work life. It is very important that our team members take initiatives to identify problems, and have the right mindset and skill sets to solve them. The Day-to-Day Activities You use technology to solve well defined problems, building individual components or features based on well defined tasks. You understand the requirements of your projects and use that understanding in your designs. You understand your codebase and systems, ensuring reliability through design reviews, monitoring, alerting, and applying OE (Operational Excellence) standards. You take ownership of your code and ensure it’s readable, maintainable, and well-tested. You understand and apply the appropriate data structures and algorithms. You give clear, actionable feedback during code reviews and respond well to feedback from others. You respond promptly to issues and keep the working team constantly updated. Your tasks are delivered on time and with high quality, and you’re able to explain your solutions to other technical stakeholders through both verbal and written communication. Qualifications The Must-Haves You can write clean code in any language (C++, C, Java, Scala, Rust, Haskell, OCaml, Erlang, Python, Ruby, PHP, Node.JS, C#, etc.), and are willing to learn Golang Ability to write functionally correct, modular, readable and maintainable code Awareness of basic security concepts Understanding of common data structures and common algorithms A good understanding of the clean architecture principles
'''

job_experience = [text]

my_experience = ["2 years as Data Scientist at PT. XYZ Indonesia", "Worked on various projects using Python", "Developed machine learning models using scikit-learn", "Used deep learning for image classification", "Created data pipelines for data processing", "Used Self Supervised Learning for text encoding"]

# summary = summarizer(text[:1024], max_length=250, min_length=50, do_sample=False)
# job_experience[0] = summary[0]['summary_text']
# print(job_experience)

# job_experience = [preprocess_text(exp) for exp in job_experience]
# my_experience = [preprocess_text(exp) for exp in my_experience]

model = SentenceTransformer("all-roberta-large-v1")
concat = job_experience + my_experience

embeddings = []
batch_size = 32
for i in range(0, len(concat), batch_size):
    batch = concat[i:i + batch_size]
    embeddings.extend(model.encode(batch, show_progress_bar=False))
embeddings = np.array(embeddings)

job_embeddings = embeddings[:len(job_experience)]
user_embeddings = embeddings[len(job_experience):]

similarities = cosine_similarity(user_embeddings, job_embeddings)

score = np.mean(np.max(similarities, axis=1))
matches = []
for i in range(len(my_experience)):
    best_match_idx = np.argmax(similarities[i])
    best_similarity = similarities[i][best_match_idx]
    matches.append({"Experience": my_experience[i], "Similarity": best_similarity * 100})
print(f'You have a {round(score * 100, 3)}% match with the job description')
print(matches)

KeyboardInterrupt: 

In [None]:
import json
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, SentencesDataset, models, InputExample, losses
from torch.utils.data import DataLoader

with open('about.json') as f:
    json_data = json.load(f)

data = []

model = SentenceTransformer('all-roberta-large-v1')

for item in json_data:
    anchor = item['anchor']
    positive = item['positive']
    negatives = item['negatives']
    
    data.append(InputExample(texts=[anchor, positive], label=1))
    
    for negative in negatives:
        data.append(InputExample(texts=[anchor, negative], label=0))

In [None]:
train_dataset = SentencesDataset(data, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
train_loss = losses.ContrastiveLoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=1,
          warmup_steps=10,  
          output_path='output/trained_model')

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration: 100%|██████████| 3/3 [00:45<00:00, 15.22s/it]
Epoch: 100%|██████████| 1/1 [00:45<00:00, 45.70s/it]


In [None]:
model = SentenceTransformer('output/trained_model')

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

sentence1 = "Experience in project management and team leadership."
sentence2 = "Leader of an organization that manages Technologies"

embedding1 = model.encode([sentence1])[0]
embedding2 = model.encode([sentence2])[0]

similarity = cosine_similarity([embedding1], [embedding2])[0][0]
print(f"Cosine Similarity: {similarity}")

Cosine Similarity: 0.4986788332462311


In [35]:
user_title = "Data Scientist,Data Analyst,Data Engineer,Machine Learning Engineer"
user_skills = "Python,Machine Learning,Deep Learning,Data Science,Data Analysis,Data Engineering,SQL,Tensorflow,Pytorch,Scikit-learn,Natural Language Processing,R"
user_degree = "Bachelor's Degree"
user_experience = 1
user_id = 1
user_data = {
    "user_id": user_id,
    "job_title": user_title,
    "skills": user_skills,
    "degree": user_degree,
    "years_of_experience": user_experience
}
user_data = pd.DataFrame(user_data, index=[0])
user_data['user_id'].iloc[0]

1

In [1]:
import pandas as pd

data = pd.read_csv("job_for_migration.csv")
skills = pd.read_csv("skills.csv")
data.head()

Unnamed: 0,job_id,job_title,company_id,job_type,job_level,work_model,skills,min_experience,degree,skills.1,about,qualifications
0,3935168115,DevOps Engineer,1,Contract,Entry level,On-site,"Agile Application Development,Confluence,Conti...",4.0,bachelor,"Agile Application Development,Confluence,Conti...",Requirements : Memiliki pengalaman minimal 4 t...,"Bachelor's Degree,4+ years of work experience ..."
1,3951984858,"Software Engineer, Backend - Business Platform",2,Full-time,Associate,Hybrid,"Back-End Web Development,Code Review,Communica...",,,"Back-End Web Development,Code Review,Communica...",Company Description About Grab and our workpla...,
2,3954206104,Sr. Officer-Rebuy & CVM (Circle Jakarta Raya),3,Full-time,Associate,On-site,"Project Management,Analytical Skills,Customer ...",5.0,bachelor,"Project Management,Analytical Skills,Customer ...",Develop and implement product strategies aimed...,"Bachelor's Degree,5+ years of work experience ..."
3,3966866576,Product Innovation,4,Full-time,Mid-Senior level,On-site,"Collaboration Tools,Concept Development,Creati...",2.0,bachelor,"Collaboration Tools,Concept Development,Creati...",REQUIREMENTS: A graduate of Bachelor/Master's ...,"Bachelor's Degree,Can start immediately,2+ yea..."
4,3965410324,Account Executive,5,Full-time,Entry level,On-site,"Communication,Microsoft Office,Sales",1.0,bachelor,"Communication,Microsoft Office,Sales",Responsibilities Build and maintain client rel...,"Bachelor's Degree,1+ years of work experience ..."


In [4]:
job_skills

0      3935168115
1      3951984858
2      3954206104
3      3966866576
4      3965410324
          ...    
984    3966845899
985    3921442968
986    3975184878
987    3973946771
988    3969560987
Name: job_id, Length: 989, dtype: int64

In [5]:
def get_skill_id(skill_name):
    try:
        skill_names = skill_name.split(",")
    except:
        skill_names = []
    skill_ids = []
    for skill in skill_names:
        skill = skill.strip() 
        matching_skills = skills[skills['skills'] == skill]
        if not matching_skills.empty:
            skill_ids.append(str(matching_skills['id'].iloc[0]))
        else:
            print(f"Skill '{skill}' not found in the DataFrame.")
    return skill_ids

skill_ids = data['skills'].apply(get_skill_id)

In [9]:
job_skills = pd.concat([job_skills, skill_ids], axis=1)

In [11]:
# Use the explode method to flatten the skills list
job_skills_flattened = job_skills.explode('skills')

# Rename the columns for clarity (optional)
job_skills_flattened = job_skills_flattened.rename(columns={'skills': 'skill_id'})

In [18]:
job_skills_flattened.isna().sum()

job_id       0
skill_id    28
dtype: int64

In [19]:
job_skills_flattened = job_skills_flattened.dropna(subset=['skill_id'])

In [21]:
job_skills_flattened.to_csv("job_skills.csv", index=False)

In [22]:
data = pd.read_csv("job_for_migration.csv")
data.isna().sum()

job_id              0
job_title           0
company_id          0
job_type           11
job_level         152
work_model         89
skills             28
min_experience    518
degree            493
skills.1           28
about               1
qualifications    836
dtype: int64

In [128]:
job_skills.head()

0                       [1, 2, 3, 4, 5, 6, 7]
1          [8, 9, 10, 11, 12, 13, 14, 15, 16]
2    [17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
3        [27, 28, 29, 30, 31, 32, 33, 34, 35]
4                                [10, 36, 37]
Name: skills, dtype: object

In [78]:
def get_company_id(company_name):
    company = companies[companies['company_name'] == company_name]
    if company.empty:
        return None
    return company['id'].iloc[0]

data['company_id'] = data['company_name'].apply(get_company_id)

In [79]:
data[['company_id', 'company_name']].sample(5)

Unnamed: 0,company_id,company_name
437,217.0,Tekvaly
940,86.0,Cermati.com
171,3.0,PT. Indosat Tbk
639,56.0,Rocketindo
654,8.0,TikTok


In [24]:
column_order = ['job_id', 'job_title', 'company_id', 'job_type', "job_level", 'work_model', 'skills', 'min_experience', 'degree', 'about', 'qualifications']

In [25]:
data = data[column_order]
data.head()

Unnamed: 0,job_id,job_title,company_id,job_type,job_level,work_model,skills,min_experience,degree,about,qualifications
0,3935168115,DevOps Engineer,1,Contract,Entry level,On-site,"Agile Application Development,Confluence,Conti...",4.0,bachelor,Requirements : Memiliki pengalaman minimal 4 t...,"Bachelor's Degree,4+ years of work experience ..."
1,3951984858,"Software Engineer, Backend - Business Platform",2,Full-time,Associate,Hybrid,"Back-End Web Development,Code Review,Communica...",,,Company Description About Grab and our workpla...,
2,3954206104,Sr. Officer-Rebuy & CVM (Circle Jakarta Raya),3,Full-time,Associate,On-site,"Project Management,Analytical Skills,Customer ...",5.0,bachelor,Develop and implement product strategies aimed...,"Bachelor's Degree,5+ years of work experience ..."
3,3966866576,Product Innovation,4,Full-time,Mid-Senior level,On-site,"Collaboration Tools,Concept Development,Creati...",2.0,bachelor,REQUIREMENTS: A graduate of Bachelor/Master's ...,"Bachelor's Degree,Can start immediately,2+ yea..."
4,3965410324,Account Executive,5,Full-time,Entry level,On-site,"Communication,Microsoft Office,Sales",1.0,bachelor,Responsibilities Build and maintain client rel...,"Bachelor's Degree,1+ years of work experience ..."


In [83]:
data['company_id'].isnull().sum()
data['company_id'] = data['company_id'].fillna(0).astype(int) 

In [85]:
data.sample(5)

Unnamed: 0,job_id,job_title,company_id,job_type,job_level,work_model,skills,min_experience,degree,skills.1,about,qualifications
104,3941469729,Software Engineer,6,Full-time,Mid-Senior level,On-site,"Back-End Web Development,Java,Analytical Skill...",3.0,bachelor,"Back-End Web Development,Java,Analytical Skill...",It's fun to work in a company where people tru...,
503,3976903819,TikTok Shop - Agency Partner Intern (Indonesia...,8,Full-time,,,"Analytical Skills,Critical Thinking,Data Analy...",,bachelor,"Analytical Skills,Critical Thinking,Data Analy...",Responsibilities TikTok is the leading destina...,
859,3948665572,Web Programmer,419,Full-time,Entry level,Hybrid,"Cascading Style Sheets (CSS),HTML,Java,JavaScr...",,,"Cascading Style Sheets (CSS),HTML,Java,JavaScr...",Build Web or API using Java Spring MVC. Handle...,
914,3953972345,Risk Analyst,13,Full-time,Entry level,On-site,"Analytical Skills,Attention to Detail,Communic...",2.0,bachelor,"Analytical Skills,Attention to Detail,Communic...","As a Fraud Risk Analyst at Gojek, you will pla...",
493,3930734892,Software Engineer (Android) - Travel Activities,6,Full-time,Entry level,On-site,"Application Development,Back-End Web Developme...",0.0,bachelors,"Application Development,Back-End Web Developme...",It's fun to work in a company where people tru...,


In [26]:
data.to_csv("job_for_migration.csv", index=False)

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("job_for_migration.csv")
data.isna().sum()

job_id              0
job_title           0
company_id          0
job_type           11
job_level         152
work_model         89
skills             28
min_experience      0
degree            493
about               1
qualifications    836
dtype: int64

In [3]:
# fill missing min_experience with -1
data['min_experience'] = data['min_experience'].fillna(-1)

In [9]:
data2 = pd.read_csv("job_preprocessed.csv")
data2 = data2[['job_id', 'location']]

In [11]:
merged_data = pd.merge(data, data2, on='job_id')
merged_data.columns

Index(['job_id', 'job_title', 'company_id', 'job_type', 'job_level',
       'work_model', 'skills', 'min_experience', 'degree', 'about',
       'qualifications', 'location'],
      dtype='object')

In [17]:
new_col = ['job_id', 'job_title', 'company_id', 'job_type', 'job_level',
       'work_model','location', 'skills', 'min_experience', 'degree', 'about',
       'qualifications']
merged_data = merged_data[new_col]
merged_data.to_csv("job_for_migration.csv", index=False)

In [16]:
data.isna().sum()

job_id              0
job_title           0
company_id          0
job_type           11
job_level         152
work_model         89
skills             28
min_experience      0
degree            493
about               1
qualifications    836
dtype: int64

In [4]:
data.isna().sum()
data.to_csv("job_for_migration2.csv", index=False)