<a href="https://colab.research.google.com/github/cbonnin88/Talent-Analysis/blob/main/HireSight_application_reader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
!pip install streamlit spacy PyMuPDF scikit-learn



In [49]:
# Download the small English spaCy pipeline that app.py loads with
# spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [50]:
!pip install streamlit pyngrok



# **Creating the Streamlit Application**

In [73]:
%%writefile app.py
import streamlit as st
import spacy
import re
import fitz # PyMuPDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

@st.cache_resource
def load_spacy_model():
  """Load and return spaCy's ``en_core_web_sm`` pipeline.

  The function is decorated with ``st.cache_resource``; presumably so the
  pipeline is loaded once and reused across Streamlit reruns.
  """
  model_name = 'en_core_web_sm'
  pipeline = spacy.load(model_name)
  return pipeline

# Shared pipeline instance used by clean_text().
nlp = load_spacy_model()

# Skill vocabulary, grouped by category. Every entry is lower-case because the
# matcher lower-cases the text it searches and looks for each entry as a whole
# word/phrase. Entries must therefore be spelled the way they appear in real
# resumes ('sql server', 'databricks'), otherwise they can never match.
skills_db = {
    'Programming & Scripting': [
        'python', 'r', 'scala', 'sql', 'bash', 'shell'
    ],
    'Database & Data Warehousing': [
        'bigquery', 'snowflake', 'redshift', 'postgresql', 'mysql', 'sql server',
        'mongodb', 'cassandra', 'neo4j', 'firebase', 'teradata', 'data warehouse',
        'data lakehouse', 'data mart', 'etl'
    ],
    'Cloud Platforms': [
        'gcp', 'google cloud platform', 'aws', 'amazon web services', 'azure',
        's3', 'ec2', 'lambda', 'gcs', 'google cloud storage', 'cloud functions',
        'azure blob storage', 'heroku', 'digitalocean'
    ],
    'Data Processing & ETL': [
        'apache spark', 'spark', 'pyspark', 'hadoop', 'mapreduce', 'kafka',
        'airflow', 'dbt', 'luigi', 'databricks', 'fivetran', 'stitch'
    ],
    'Data Visualization & BI': [
        'tableau', 'power bi', 'looker', 'looker studio', 'qlik', 'spotfire',
        'matplotlib', 'seaborn', 'plotly', 'ggplot', 'd3.js', 'metabase',
        'superset'
    ],
    'Machine Learning & Data Science': [
        'machine learning', 'deep learning', 'natural language processing', 'nlp',
        'computer vision', 'scikit-learn', 'tensorflow', 'keras', 'pytorch',
        'pandas', 'numpy', 'scipy', 'statsmodels', 'xgboost', 'lightgbm',
        'regression', 'classification', 'clustering', 'a/b testing', 'experimentation',
        'polars'
    ],
    'HR Information Systems (HRIS)': [
        'workday', 'sap successfactors', 'oracle hcm', 'bamboohr', 'peoplesoft',
        'adp', 'applicant tracking system', 'ats', 'people analytics'
    ],
    'Project Management & DevOps': [
        'agile', 'scrum', 'kanban', 'jira', 'git', 'github', 'gitlab', 'docker',
        'kubernetes', 'ci/cd', 'jenkins', 'terraform'
    ],
    'Soft Skills': [
        'communication', 'leadership', 'teamwork', 'collaboration', 'problem-solving',
        'analytical skills', 'critical thinking', 'creativity', 'adaptability',
        'stakeholder management'
    ]
}


# Function to Extract Text from the PDF
def extract_text_from_pdf(resume):
  """Return the text of every page of an uploaded PDF, concatenated in page order.

  Args:
    resume: A binary file-like object exposing ``read()``; its bytes are
      handed to PyMuPDF (``fitz``) as an in-memory PDF stream.

  Returns:
    The concatenated page text on success. On any failure the function does
    not raise; it returns a string of the form ``'Error reading PDF: <exc>'``.
  """
  try:
    file_bytes = resume.read()
    with fitz.open(stream=file_bytes, filetype='pdf') as doc:
      # join() avoids repeatedly re-allocating a growing string.
      return ''.join(page.get_text() for page in doc)
  except Exception as e:
    # The caller detects failures by looking for the 'Error reading PDF'
    # prefix, so the casing here has to match that check exactly.
    return f'Error reading PDF: {e}'


# Function to Clean and Preprocess Text
def clean_text(text):
  """Normalise ``text`` for similarity scoring.

  The text is lower-cased and run through the module-level spaCy pipeline
  ``nlp``. Stop words, punctuation and whitespace tokens are discarded, and
  the lemmas of the remaining tokens are returned joined by single spaces.
  """
  def _is_noise(tok):
    # Tokens that carry no content for the comparison.
    return tok.is_stop or tok.is_punct or tok.is_space

  lemmas = [tok.lemma_ for tok in nlp(text.lower()) if not _is_noise(tok)]
  return ' '.join(lemmas)



def extract_skills_categorized(text, skills_db):
  """Find which skills from ``skills_db`` occur in ``text``, grouped by category.

  Matching is case-insensitive and whole-word/phrase: a skill matches only
  when it is not directly preceded or followed by a word character, so 'r'
  does not match inside 'rust'.

  Args:
    text: The text to search. ``None`` or an empty string yields ``{}``.
    skills_db: Mapping of category name -> iterable of skill strings.

  Returns:
    A dict mapping each category with at least one hit to the list of matched
    skills, in the order they are listed in ``skills_db`` and without
    duplicates. Categories with no hits are omitted.
  """
  if not text:
    return {}

  haystack = text.lower()
  found_skills = {}

  for category, skills in skills_db.items():
    matched = [
        # dict.fromkeys() drops duplicate entries while keeping their order,
        # which keeps the result deterministic (a set would not).
        skill for skill in dict.fromkeys(skills)
        # Lookarounds instead of \b: \b needs a word character on one side, so
        # it can never match skills that start or end with punctuation ('c++').
        if re.search(r'(?<!\w)' + re.escape(skill.lower()) + r'(?!\w)', haystack)
    ]
    if matched:
      found_skills[category] = matched
  return found_skills



def calculate_similarity(resume_text,jd_text):
  """Return the TF-IDF cosine similarity of two texts as a float in [0.0, 1.0].

  Args:
    resume_text: Text of the resume.
    jd_text: Text of the job description.

  Returns:
    The cosine similarity between the two TF-IDF vectors. ``0.0`` is returned
    when either text is empty/blank or when no usable vocabulary can be built
    from them, instead of letting the vectorizer raise.
  """
  # Nothing to compare: avoid the vectorizer's "empty vocabulary" ValueError.
  if not resume_text or not jd_text:
    return 0.0
  if not resume_text.strip() or not jd_text.strip():
    return 0.0

  vectorizer = TfidfVectorizer()
  try:
    tfidf_matrix = vectorizer.fit_transform([resume_text, jd_text])
  except ValueError:
    # Raised when the texts contain no token the vectorizer keeps.
    return 0.0

  score = float(cosine_similarity(tfidf_matrix)[0, 1])
  # Floating-point error can push the value marginally outside [0, 1];
  # clamp it so callers can use it directly as a fraction.
  return min(max(score, 0.0), 1.0)



# -- Page setup and introduction --
st.set_page_config(page_title='HireSight | AI Resume Matcher', page_icon='🚀')

st.title('HireSight: AI-Powered Resume & Job Matcher')

st.markdown("""
  Welcome to HireSight ! This tool helps you analyze how well a resume matches a job description.
  - **Upload a resume** in PDF format
  - **Paste the job description** into the text box
  - Click **'Analyze'** to see the results
""")

st.divider()

#-- Layout with columns --
col1,col2 = st.columns(2)

with col1:
  st.header('📄 Job Description')
  jd_text = st.text_area('Paste the full Job Description here:', height=300)

with col2:
  st.header('🧑‍💻 Candidate Resume')
  resume_file = st.file_uploader('Upload resume (PDF only)', type=['pdf'])


# -- Analysis Button and Results --
if st.button('Analyze ✨', type='primary'):
  if resume_file is not None and jd_text.strip():
    with st.spinner('Analyzing... this may take a moment.'):
      # 1. Extracting and cleaning text
      resume_text = extract_text_from_pdf(resume_file)

      # extract_text_from_pdf reports failures in-band as an
      # 'Error reading PDF: ...' string; compare case-insensitively so the
      # check does not depend on the exact capitalisation of that prefix.
      if resume_text.lower().startswith('error reading pdf'):
        st.error(resume_text)
      elif not resume_text.strip():
        st.error('No readable text was found in the uploaded PDF.')
      else:
        cleaned_resume = clean_text(resume_text)
        cleaned_jd = clean_text(jd_text)

        # 2. Calculate Similarity Score
        match_score = calculate_similarity(cleaned_resume, cleaned_jd)
        # st.progress only accepts a fraction in [0.0, 1.0].
        match_score = min(max(float(match_score), 0.0), 1.0)

        st.header('📊 Analysis Results')
        st.metric(label='**Overall Match Score**', value=f'{match_score:.2%}')
        st.progress(match_score)

        # 3. Extract and Compare Skills
        # Skills are searched in the original text (whitespace-normalised so a
        # phrase split across lines still matches), not in the clean_text()
        # output: that output drops punctuation and stop-word tokens, so
        # entries such as 'ci/cd' or 'a/b testing' would presumably no longer
        # appear verbatim in it.
        st.subheader('Skills Analysis')
        resume_skills_cat = extract_skills_categorized(' '.join(resume_text.split()), skills_db)
        jd_skills_cat = extract_skills_categorized(' '.join(jd_text.split()), skills_db)

        resume_skill_set = {skill for skills_list in resume_skills_cat.values() for skill in skills_list}
        jd_skill_set = {skill for skills_list in jd_skills_cat.values() for skill in skills_list}

        matching_skills = resume_skill_set & jd_skill_set
        # Missing = requested by the job description but absent from the resume.
        missing_skills = jd_skill_set - resume_skill_set

        if matching_skills:
          matching_list = ', '.join(sorted(matching_skills))
          st.info(f'**✅ Skills Matching the Job Description:** {matching_list}')

        if missing_skills:
          # Built outside the f-string: reusing the same quote character inside
          # an f-string expression is a syntax error before Python 3.12.
          missing_list = ', '.join(sorted(missing_skills))
          st.warning(f'**❌ Key Skills Missing from Resume:** {missing_list}')
        elif not jd_skill_set:
          st.info('No skills from the database were detected in the job description.')
        else:
          st.balloons()
          st.success('🎉 All required skills from the job description appear to be present!')

        st.divider()

        # 4. Display Detected Skills in Resume
        st.subheader('Detected Skills in Resume')
        if resume_skills_cat:
          for category, skills in resume_skills_cat.items():
            st.markdown(f'**{category}:**')
            st.write(', '.join(skills))
        else:
          st.write('No specific skills from the database were detected in the resume')
  else:
    # Only complain after the button was clicked with missing inputs,
    # not on every render of the page.
    st.error('Please upload a resume and provide a job description to analyze')

Overwriting app.py


In [61]:
from pyngrok import ngrok
from google.colab import userdata
import time

In [62]:
# Read the ngrok auth token stored under 'NGROK_AUTH_TOKEN' via Colab's
# `userdata` and register it with pyngrok.
try:
    ngrok_token = userdata.get('NGROK_AUTH_TOKEN')
    ngrok.set_auth_token(ngrok_token)
except Exception as e:
    # Best effort: report the failure and let the notebook continue; the
    # later ngrok.connect() cell has its own error handling.
    print(f"Could not set ngrok token. Make sure it's saved in Colab Secrets. Error: {e}")


In [74]:
# Stop any ngrok process left over from an earlier run before opening a new
# tunnel (presumably to avoid hitting the account's tunnel limit).
ngrok.kill()

In [64]:
# Start the Streamlit app in the background on port 8501; nohup appends the
# server's output to nohup.out.
!nohup streamlit run app.py --server.port 8501 &

nohup: appending output to 'nohup.out'


In [65]:
# Pause for 5 seconds before opening the tunnel (presumably to give the
# background Streamlit server time to start listening).
time.sleep(5)

In [66]:
# Open an ngrok tunnel to local port 8501 (the port passed to
# `streamlit run`) and print the resulting tunnel object.
try:
    public_url = ngrok.connect(8501)
    print(f"Click the URL to open your Streamlit app: {public_url}")
except Exception as e:
    # Report the failure instead of raising so the cell still completes.
    print(f"Could not connect to ngrok. Error: {e}")
    print("Please check your ngrok token and make sure no other tunnels are running on your account.")


Click the URL to open your Streamlit app: NgrokTunnel: "https://b0e58a082e39.ngrok-free.app" -> "http://localhost:8501"
