In [None]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text
    return text


In [None]:
!pip install spacy pdfplumber nltk scikit-learn pandas numpy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m136.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
!pip install pdfplumber




In [None]:
raw_text = extract_text_from_pdf("Resume(2).pdf")
print(raw_text[:500])

Rushi roy
(617) 432-8765
rushiroy6t@gmail.com
EDUCATION
Tufts University, Medford, MA May 2025
Bachelor of Science in Chemical Engineering
Relevant Courses: Physical Chemistry of Polymers, Hazardous Waste Treatment Technology, Chemical and
Biological Separations, Surface and Colloid Chemistry, Reactor Design, Data Structures, Process Dynamics and
Controls
ENGINEERING EXPERIENCE
Aramco Services Company May 2025 – Present
Gas Separations Intern, Cambridge, MA
• Separated gas molecules for experime


In [None]:
import re
import nltk
import spacy
from nltk.corpus import stopwords

# download stopwords
nltk.download("stopwords")

# load spacy model
nlp = spacy.load("en_core_web_sm")

STOPWORDS = set(stopwords.words("english"))

def clean_text(text):
    if not isinstance(text, str):
        return ""

    # make lowercase
    text = text.lower()

    # remove emails
    text = re.sub(r"\S+@\S+", " ", text)

    # remove phone numbers
    text = re.sub(r"\+?\d[\d -]{8,}\d", " ", text)

    # remove urls
    text = re.sub(r"http\S+|www\S+", " ", text)

    # remove special characters and numbers
    text = re.sub(r"[^a-z\s]", " ", text)

    # tokenize + lemmatize
    doc = nlp(text)
    tokens = []

    for token in doc:
        if token.text not in STOPWORDS and len(token.text) > 2:
            tokens.append(token.lemma_)

    return " ".join(tokens)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
test_text = """
Looking for a Data Science Intern!
Skills: Python, SQL, Machine Learning
Email: hr@test.com
"""

print(clean_text(test_text))


look data science intern skill python sql machine learn email   



In [None]:
cleaned_resume_text = clean_text(raw_text)
print(cleaned_resume_text[:500])


rushi roy 
              
 
 education tuft university medford may     
 bachelor science chemical engineering relevant course physical chemistry polymer hazardous waste treatment technology chemical biological separation surface colloid chemistry reactor design datum structure process dynamic control engineering experience aramco service company may         present gas separation intern cambridge 
   separate gas molecule experiment cast membrane post processing characterization 
   experiment 


In [None]:
import pandas as pd

jobs_df = pd.read_csv("clean_jobs.csv")
print(jobs_df.head())


   id            title    company           location  \
0   1     Data Analyst       Meta       New York, NY   
1   2     Data Analyst       Meta  San Francisco, CA   
2   3     Data Analyst       Meta    Los Angeles, CA   
3   4     Data Analyst       Meta     Washington, DC   
4   5  Data Analyst II  Pinterest        Chicago, IL   

                                                link    source date_posted  \
0  https://www.linkedin.com/jobs/view/data-analys...  LinkedIn  2025-04-14   
1  https://www.linkedin.com/jobs/view/data-analys...  LinkedIn  2025-04-14   
2  https://www.linkedin.com/jobs/view/data-analys...  LinkedIn  2025-04-14   
3  https://www.linkedin.com/jobs/view/data-analys...  LinkedIn  2025-04-14   
4  https://www.linkedin.com/jobs/view/data-analys...  LinkedIn  2025-04-16   

   work_type  employment_type  \
0        NaN              NaN   
1        NaN              NaN   
2        NaN              NaN   
3        NaN              NaN   
4        NaN              NaN

In [None]:
print(jobs_df.columns)



Index(['id', 'title', 'company', 'location', 'link', 'source', 'date_posted',
       'work_type', 'employment_type', 'description'],
      dtype='object')


In [None]:
jobs_df["clean_description"] = jobs_df["description"].apply(clean_text)


In [None]:
print(jobs_df["clean_description"].head())


0    social measurement team grow team high visibil...
1    social measurement team grow team high visibil...
2    social measurement team grow team high visibil...
3    social measurement team grow team high visibil...
4    pinterest million people around world come pla...
Name: clean_description, dtype: object


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
resume_text = cleaned_resume_text
job_texts = jobs_df["clean_description"].tolist()


In [None]:
vectorizer = TfidfVectorizer()

tfidf_matrix = vectorizer.fit_transform([resume_text] + job_texts)

In [None]:
similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]


In [None]:
jobs_df["similarity_score"] = similarity_scores
jobs_df[["title", "company", "similarity_score"]].head()


Unnamed: 0,title,company,similarity_score
0,Data Analyst,Meta,0.036706
1,Data Analyst,Meta,0.036706
2,Data Analyst,Meta,0.036759
3,Data Analyst,Meta,0.036759
4,Data Analyst II,Pinterest,0.024215


In [None]:
recommended_jobs = jobs_df.sort_values(
    by="similarity_score",
    ascending=False
)


In [None]:
recommended_jobs[["title", "company", "location", "similarity_score"]].head(5)


Unnamed: 0,title,company,location,similarity_score
833,** ******** - ******,******,"*******, *******, ******",0.076493
1006,Contract Senior Quality Analyst - Data Migration,Park Place Technologies,"Cleveland, OH",0.076431
863,Swim Instructor,Solar Swim,"Antioch, CA, US",0.068694
785,Data Scientist,Caterpillar Inc.,"Mossville, IL",0.068315
558,Data Engineer,Indiana University Indianapolis,"Indianapolis, IN",0.063226


In [None]:
jobs_df["location"].head(10)

Unnamed: 0,location
0,"New York, NY"
1,"San Francisco, CA"
2,"Los Angeles, CA"
3,"Washington, DC"
4,"Chicago, IL"
5,"New York, NY"
6,"Los Angeles, CA"
7,"New York, NY"
8,"New York, NY"
9,"New York, NY"


In [None]:
remote_jobs = recommended_jobs[
    recommended_jobs["location"].str.contains("remote", case=False, na=False)
]

remote_jobs[["title", "company", "location", "similarity_score"]].head(5)


Unnamed: 0,title,company,location,similarity_score
905,Business Analyst,Lams Technology LLC,"Remote, US",0.041544
1000,Senior Data Engineer,Mercury Insurance Company,"Remote, US",0.040867
925,Senior Data Analyst,Smith.ai,"Remote, US",0.036256
995,Senior Data Collection Engineer (Spain),Centric Software,"Remote, US",0.035491
924,Entry Level Full Stack AI Engineer,HYPERPROOF,"Remote, US",0.034575


In [None]:
jobs_df["work_type"].value_counts()

Unnamed: 0_level_0,count
work_type,Unnamed: 1_level_1


In [None]:
recommended_jobs["work_type"] = recommended_jobs["work_type"].astype(str)

In [None]:
remote_work_jobs = recommended_jobs[
    recommended_jobs["work_type"]
    .fillna("")
    .astype(str)
    .str.contains("remote", case=False)
]

In [None]:
remote_work_jobs = recommended_jobs[
    recommended_jobs["work_type"].str.contains("remote", case=False, na=False)
]

remote_work_jobs[["title", "company", "work_type", "similarity_score"]].head(5)


Unnamed: 0,title,company,work_type,similarity_score


In [None]:
recommended_jobs["work_type"].value_counts()


Unnamed: 0_level_0,count
work_type,Unnamed: 1_level_1
,1048


In [None]:
hybrid_jobs = recommended_jobs[
    recommended_jobs["work_type"]
    .fillna("")
    .astype(str)
    .str.contains("nan", case=False)
]

hybrid_jobs[["title", "company", "work_type", "similarity_score"]].head(5)


Unnamed: 0,title,company,work_type,similarity_score
833,** ******** - ******,******,,0.076493
1006,Contract Senior Quality Analyst - Data Migration,Park Place Technologies,,0.076431
863,Swim Instructor,Solar Swim,,0.068694
785,Data Scientist,Caterpillar Inc.,,0.068315
558,Data Engineer,Indiana University Indianapolis,,0.063226


In [None]:
cambridge_jobs = recommended_jobs[
    recommended_jobs["location"]
    .fillna("")
    .astype(str)
    .str.contains("cambridge", case=False)
]
cambridge_jobs[["title", "company", "location", "similarity_score"]].head(5)


Unnamed: 0,title,company,location,similarity_score
139,Data Scientist,Raspberry Pi Foundation,"Cambridge, England, United Kingdom",0.028346
967,Machine Learning Research Engineer,Luminance,"Cambridge, England, UK",0.023388


In [None]:
US_jobs = recommended_jobs[
    recommended_jobs["location"]
    .fillna("")
    .astype(str)
    .str.contains("US", case=False)
]

US_jobs[["title", "company", "location", "similarity_score"]].head(5)


Unnamed: 0,title,company,location,similarity_score
863,Swim Instructor,Solar Swim,"Antioch, CA, US",0.068694
328,Data Analyst,ExxonMobil,"Houston, TX",0.062568
55,Data Analyst,ExxonMobil,"Houston, TX",0.059973
973,Senior Software Engineer (AI Integration),Fivecast,"Adelaide, South Australia, Australia",0.057283
795,"Data Scientist, Load Forecasting",NRG Energy,"Houston, TX",0.054339


In [3]:
import numpy as np

def top_matching_keywords(resume_text, job_text, top_n=5):
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform([resume_text, job_text])

    feature_names = np.array(vectorizer.get_feature_names_out())

    resume_vec = tfidf[0].toarray().flatten()
    job_vec = tfidf[1].toarray().flatten()

    # shared importance = product
    shared_score = resume_vec * job_vec

    if shared_score.sum() == 0:
        return []

    top_indices = shared_score.argsort()[-top_n:][::-1]
    return feature_names[top_indices].tolist()


In [10]:
import pandas as pd

jobs_df = pd.read_csv("clean_jobs.csv")  



FileNotFoundError: [Errno 2] No such file or directory: 'clean_jobs.csv'

In [11]:
!ls


'ls' is not recognized as an internal or external command,
operable program or batch file.
