# Python Web Scraper for Job Listings 

In [16]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

driver = webdriver.Chrome() 
driver.get("https://www.indeed.com/jobs?q=data+scientist&l=United+States")

driver.implicitly_wait(10)
page_source = driver.page_source
soup = BeautifulSoup(page_source, "html.parser")
driver.quit()

job_titles, companies, locations, salaries, descs = [], [], [], [], []

for job_card in soup.find_all("div", class_="job_seen_beacon"):
    # Job Title
    title = job_card.find("h2", class_="jobTitle")
    job_titles.append(title.text.strip() if title else "N/A")

    # Company Name
    company = job_card.find("span", attrs={"data-testid": "company-name"})
    companies.append(company.text.strip() if company else "N/A")

    # Location
    location = job_card.find("div", attrs={"data-testid": "text-location"})
    locations.append(location.text.strip() if location else "N/A")

    # Salary
    salary = job_card.find("div", attrs={"data-testid": "attribute_snippet_testid"})
    salaries.append(salary.text.strip() if salary else "N/A")

    # Job Description
    desc = job_card.find("div", attrs={"role": "presentation"})
    descs.append(desc.text.strip() if desc else "N/A")


jobs_df = pd.DataFrame({
    "Title": job_titles,
    "Company": companies,
    "Location": locations,
    "Salary": salaries,
    "Descriptions": descs
})

In [17]:
from pypdf import PdfReader

reader = PdfReader('Venkatesan_Akshay_Resume.pdf')
page = reader.pages[0]
resume = page.extract_text()
print(resume)

AKSHAYVENKATESANP:(408)429-5391|av3157@columbia.edu|linkedin.com/in/akshay-venkatesan
EDUCATION
ColumbiaUniversity NewYork,NYMSinDataScience ExpectedDec2024Courses:AppliedMachineLearning,AppliedDeepLearning,AlgorithmsforDataScience, CausalInference, StatisticalInference, ComputerSystemsforDataScience
UniversityofWashington Seattle, WABSinAppliedMathematics,Minors inDataScienceandComputationalFinance Mar2023Dean’sListeveryquarterUWPurpleandGoldScholarCourses:ArtificialIntelligence, DatabaseManagement,StatisticsandDataAnalysis, DataStructuresandAlgorithms
WORKEXPERIENCE
Walmart Connect Hoboken,NJDataScienceIntern June2024–Aug2024● DevelopedaMarkovChainadattributionmodeltomeasureimpactofadsoncustomerpurchaserate● StreamlinedmodeldeploymentbycreatingreusablePythonMarkovAttributionClasswhichusesfilteredSQLdatastacks● DesignedanestimationtechniquefortheMarkovattributionmodelwhichimprovedruntimeby50%● GeneratedAttributedRevenue, Share, andROASreportsforMarkov, rule-basedandsiloedapproachestoi

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

documents = [resume] + jobs_df["Descriptions"]
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(documents)

cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
best_matches = cosine_similarities.flatten()
best_matching_jobs = pd.DataFrame({
    "Similarity Score": best_matches
})

if "Similarity Score" not in jobs_df.columns:
    jobs_df = jobs_df.join(best_matching_jobs)
jobs_df = jobs_df.sort_values(by="Similarity Score", ascending=False)
print(jobs_df.head())

                                Title                    Company  \
1   Quantitative Analyst - AI Trainer             DataAnnotation   
0                      Data Scientist                      Gierd   
6                      Data Scientist                 ECU Health   
12                     Data Scientist  Texas Children's Hospital   
9                      Data Scientist                 Demandbase   

                                  Location                      Salary  \
1                 Remote in Fort Wayne, IN            From $40 an hour   
0                      Sandpoint, ID 83864  $115,000 - $150,000 a year   
6           Remote in Greenville, NC 27835                         N/A   
12                             Houston, TX                         N/A   
9   Hybrid work in San Francisco, CA 94107                         N/A   

                                         Descriptions  Similarity Score  
1   In this role you will need to hold an expert l...          0.549692 