In [1]:
import pandas as pd

In [2]:
skills_df = pd.read_csv('./Data/Scoring/data-science-skills-normalized-2.csv')
exp_df = pd.read_csv('./Data/Scoring/exp_category.csv')
degree_df = pd.read_csv('./Data/Scoring/degree.csv', delimiter = ";")

In [3]:
data = pd.read_csv('./Data/Profile.csv')

In [4]:
data = data.drop('Unnamed: 0', axis =1)
data = data.rename(columns = {'highest_degree': 'degree'})

In [5]:
data

Unnamed: 0,full_name,skills,Normalized Experience,degree
0,Hafizh Adi,"time series analysis,python (bahasa pemrograma...",2.916667,Bachelor
1,Anthony Jaya,"sql,exploratory data analysis,crop insurance,e...",3.166667,Bachelor
2,Dininta Annisa,,1.500000,Master
3,Andi Mardinsyah,"data science,machine learning,deep learning,ph...",5.833333,Master
4,Sherin Sari,,0.333333,Bachelor
...,...,...,...,...
95,Fian Adinata,"python (programming language),sql,machine lear...",2.583333,Bachelor
96,Almaira Ayudhiya,"sas (programming language),credit scoring,data...",3.250000,Bachelor
97,L P,"data analysis,sql,programming,java,php,android...",3.166667,Bachelor
98,Firda Devi,"supervised learning,unsupervised learning,arti...",2.583333,Master


In [6]:
import re
from typing import Dict, List, Tuple

def create_skill_matcher(skills_df: pd.DataFrame) -> Dict[str, tuple]:
    """Create skill matching dictionary with weights"""
    skill_matcher = {}
    for _, row in skills_df.iterrows():
        variations = [row['skill']]
        if pd.notna(row['aliases']):
            variations.extend([v.strip() for v in row['aliases'].split(',')])
        for variation in variations:
            skill_matcher[variation.lower()] = (row['skill'], row['weight'])
    return skill_matcher

def create_degree_weights(degree_df: pd.DataFrame) -> Dict[str, float]:
    """Create degree weight dictionary"""
    return dict(zip(degree_df['degree'].str.lower(), degree_df['weight']))

def create_exp_weights(exp_df: pd.DataFrame) -> Dict[str, float]:
    """Create experience weight dictionary"""
    return dict(zip(exp_df['Experience Category'], exp_df['Weight']))

def preprocess_text(text: str) -> str:
    """Clean and standardize text"""
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    return text

def score_skills(skill_text: str, skill_matcher: Dict[str, tuple]) -> Tuple[float, List[str]]:
    """Score skills and return total score and matched skills"""
    skill_text = preprocess_text(skill_text)
    words = skill_text.split()
    
    matched_skills = set()
    total_score = 0
    
    # Single word skills
    for word in words:
        if word in skill_matcher:
            skill, weight = skill_matcher[word]
            matched_skills.add(skill)
            total_score += weight
    
    # Multi-word skills (2-3 words)
    for i in range(len(words)):
        for j in range(2, 4):
            if i + j <= len(words):
                phrase = ' '.join(words[i:i+j])
                if phrase in skill_matcher:
                    skill, weight = skill_matcher[phrase]
                    matched_skills.add(skill)
                    total_score += weight
    
    return total_score, sorted(list(matched_skills))

def score_degree(degree: str, degree_weights: Dict[str, float]) -> float:
    """Score education level"""
    if pd.isna(degree) or not degree:
        return 0
    degree = preprocess_text(degree)
    return degree_weights.get(degree, 0)


def score_experience(years: float, exp_weights: dict) -> float:
    """Score experience level based on normalized years of experience and experience weight dictionary"""
    if pd.isna(years):
        return 0
    
    # Map the years to the correct experience category
    if years < 1:
        category = 'Fresh Graduate'
    elif 1 <= years < 2:
        category = '1 - 2 Years'
    elif 2 <= years < 3:
        category = '2 - 3 Years'
    elif 3 <= years < 5:
        category = '3 - 5 Years'
    elif years >= 5:
        category = '5+ Years'
    else:
        category = 'Not Stated Specifically in Years'

    # Look up the weight in the dictionary
    return exp_weights.get(category, 0)


def calculate_final_score(skill_score: float, degree_score: float, exp_score: float) -> float:
    """Calculate final weighted score
    
    Weights:
    - Skills: 60% of total score
    - Experience: 25% of total score
    - Education: 15% of total score
    """
    normalized_skill_score = min(skill_score / 180, 1) * 55
    normalized_exp_score = (exp_score / 4) * 35
    normalized_degree_score = (degree_score / 2) * 10
    
    return normalized_skill_score + normalized_exp_score + normalized_degree_score

def score_resume(resume: pd.Series, skill_matcher: Dict[str, tuple], 
                degree_weights: Dict[str, float], exp_weights: Dict[str, float]) -> Dict:
    """Score a single resume and return detailed results"""
    skill_score, matched_skills = score_skills(resume['skills'], skill_matcher)
    degree_score = score_degree(resume['degree'], degree_weights)
    exp_score = score_experience(resume['Normalized Experience'], exp_weights)
    
    final_score = calculate_final_score(skill_score, degree_score, exp_score)
    
    return {
        'full_name': resume['full_name'],
        'final_score': final_score,
        'skill_score': skill_score,
        'degree_score': degree_score,
        'experience_score': exp_score,
        'matched_skills': matched_skills,
        'skill_count': len(matched_skills),
        'experience_years': resume['Normalized Experience']
    }

def score_all_resumes(resumes_df: pd.DataFrame, skill_matcher: Dict[str, tuple],
                     degree_weights: Dict[str, float], exp_weights: Dict[str, float]) -> pd.DataFrame:
    """Score all resumes and return detailed results"""
    results = []
    for _, resume in resumes_df.iterrows():
        results.append(score_resume(resume, skill_matcher, degree_weights, exp_weights))
    
    results_df = pd.DataFrame(results)
    return results_df

In [7]:
# First, create your lookup dictionaries
skill_matcher = create_skill_matcher(skills_df)
degree_weights = create_degree_weights(degree_df)
exp_weights = create_exp_weights(exp_df)

# Score all resumes
results_df = score_all_resumes(data, skill_matcher, degree_weights, exp_weights)


In [8]:
data

Unnamed: 0,full_name,skills,Normalized Experience,degree
0,Hafizh Adi,"time series analysis,python (bahasa pemrograma...",2.916667,Bachelor
1,Anthony Jaya,"sql,exploratory data analysis,crop insurance,e...",3.166667,Bachelor
2,Dininta Annisa,,1.500000,Master
3,Andi Mardinsyah,"data science,machine learning,deep learning,ph...",5.833333,Master
4,Sherin Sari,,0.333333,Bachelor
...,...,...,...,...
95,Fian Adinata,"python (programming language),sql,machine lear...",2.583333,Bachelor
96,Almaira Ayudhiya,"sas (programming language),credit scoring,data...",3.250000,Bachelor
97,L P,"data analysis,sql,programming,java,php,android...",3.166667,Bachelor
98,Firda Devi,"supervised learning,unsupervised learning,arti...",2.583333,Master


In [9]:
results_df.sort_values('final_score', ascending = False).head(15)

Unnamed: 0,full_name,final_score,skill_score,degree_score,experience_score,matched_skills,skill_count,experience_years
98,Firda Devi,82.5,187,2,2.0,"[aws, bigquery, docker, excel, gcp, git, hadoo...",18,2.583333
6,Sandy Utama,73.944444,152,2,2.0,"[llm, matlab, mongodb, mysql, neural_networks,...",15,2.916667
79,Muhamad Hatab,65.791667,156,1,1.5,"[bigquery, excel, git, keras, matplotlib, mysq...",14,1.416667
1,Anthony Jaya,65.472222,112,1,3.0,"[bigquery, eda, etl, gcp, neural_networks, nlp...",9,3.166667
69,Alfa Rabi,64.361111,137,1,2.0,"[bigquery, docker, eda, matplotlib, neural_net...",14,2.666667
21,Zein Himami,64.055556,91,2,3.0,"[bigquery, docker, etl, excel, neural_networks...",10,3.0
17,Jordi Hasianta,59.972222,94,1,3.0,"[etl, gcp, neural_networks, nlp, numpy, opencv...",9,3.0
20,Irfan Amal,59.791667,120,2,1.5,"[aws, docker, java, neural_networks, nlp, nump...",12,1.333333
18,Sa'dan,59.472222,121,1,2.0,"[bigquery, excel, gcp, jupyter, mysql, nlp, nu...",11,2.666667
42,Putri Mukhlashin,57.333333,69,2,3.0,"[azure, excel, python, r, sql]",5,3.333333


In [10]:
exp_df

Unnamed: 0.1,Unnamed: 0,Experience Category,Weight
0,5,Fresh Graduate,0.0
1,6,1 Year,1.0
2,3,1 - 2 Years,1.5
3,4,2 - 3 Years,2.0
4,0,3 - 5 Years,3.0
5,2,5+ Years,4.0
6,1,Not Stated Specifically in Years,


In [11]:
results_df[['full_name', 'final_score', 'skill_score', 'degree_score', 'experience_score']].sort_values('final_score', ascending = False).head(15)

Unnamed: 0,full_name,final_score,skill_score,degree_score,experience_score
98,Firda Devi,82.5,187,2,2.0
6,Sandy Utama,73.944444,152,2,2.0
79,Muhamad Hatab,65.791667,156,1,1.5
1,Anthony Jaya,65.472222,112,1,3.0
69,Alfa Rabi,64.361111,137,1,2.0
21,Zein Himami,64.055556,91,2,3.0
17,Jordi Hasianta,59.972222,94,1,3.0
20,Irfan Amal,59.791667,120,2,1.5
18,Sa'dan,59.472222,121,1,2.0
42,Putri Mukhlashin,57.333333,69,2,3.0


In [12]:
export = results_df[['full_name', 'final_score', 'skill_score', 'degree_score', 'experience_score']].sort_values('final_score', ascending = False).reset_index()
export

Unnamed: 0,index,full_name,final_score,skill_score,degree_score,experience_score
0,98,Firda Devi,82.500000,187,2,2.0
1,6,Sandy Utama,73.944444,152,2,2.0
2,79,Muhamad Hatab,65.791667,156,1,1.5
3,1,Anthony Jaya,65.472222,112,1,3.0
4,69,Alfa Rabi,64.361111,137,1,2.0
...,...,...,...,...,...,...
95,74,Kevin Jonathan,5.000000,0,1,0.0
96,54,Lalu Pratama,5.000000,0,1,0.0
97,67,Salsabila Yasmin,5.000000,0,1,0.0
98,4,Sherin Sari,5.000000,0,1,0.0


In [13]:
export.to_csv('./Data/Analysis/scoring_result2.csv')