<a href="https://colab.research.google.com/github/akshatamadavi/data_mining/blob/main/Job_Recommendations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Job Recommendation System
# Matches your resume and skills to New Grad positions from SimplifyJobs repo

# Cell 1: Install Dependencies

In [None]:
# !pip install -q gradio PyPDF2 pdfplumber spacy scikit-learn requests beautifulsoup4 pandas
# !python -m spacy download en_core_web_sm

In [None]:
# Cell 2: Import Libraries
import gradio as gr
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
import pdfplumber
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import io

In [None]:
# Cell: Fix Asyncio Event Loop for Colab + Gradio + CrewAI
# Purpose: patch asyncio via nest_asyncio, then make sure the module-level
# name `loop` refers to an event loop that is not currently running and that
# this loop is installed as the current event loop.
import nest_asyncio
import asyncio

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Reset the event loop
try:
    loop = asyncio.get_event_loop()
    if loop.is_running():
        # The current loop is already running: create a fresh loop and
        # install it as the current one instead of reusing the running loop.
        # NOTE(review): nest_asyncio.apply() above is meant to make a running
        # loop re-entrant, so replacing it may be unnecessary - confirm that
        # Gradio/CrewAI still behave correctly with the replaced loop.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
except RuntimeError:
    # get_event_loop() raised (presumably because no current loop exists for
    # this thread): create one and install it.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

print("‚úÖ Asyncio event loop configured for Colab")


‚úÖ Asyncio event loop configured for Colab


In [None]:
# Cell 3: Load spaCy model
# Loads the 'en_core_web_sm' pipeline once into the module-level `nlp`, which
# parse_resume() uses for sentence splitting.
# NOTE(review): the `python -m spacy download en_core_web_sm` command in the
# install cell is commented out, so this line presumably relies on the model
# already being present in the runtime - confirm on a fresh kernel.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Cell 4: Resume Parsing Function
def _contains_term(text_lower, term):
    """Return True if `term` occurs in `text_lower` as a whole word or phrase.

    Plain substring tests produce false positives ("java" inside "javascript",
    "bs" inside "jobs", "ms" inside "systems"). Lookarounds are used instead of
    ``\\b`` so that terms ending in punctuation ("c++", "b.s.") still match.
    An optional trailing "s" keeps simple plurals ("masters", "rest apis").
    """
    pattern = rf"(?<!\w){re.escape(term)}s?(?!\w)"
    return re.search(pattern, text_lower) is not None


def parse_resume(pdf_file):
    """
    Parse a resume PDF and extract skills and education sentences.

    Args:
        pdf_file: Path or file-like object accepted by ``pdfplumber.open``,
            or None when no resume was uploaded.

    Returns:
        dict with the same four keys on every path:
            "skills":     matched skill keywords, in keyword-list order.
            "education":  up to 3 sentences mentioning an education keyword.
            "experience": always an empty list (experience is not extracted
                          yet; the key is present so callers see one schema).
            "text":       the extracted resume text, "" when `pdf_file` is
                          None, or "Error: <message>" when the PDF cannot be
                          read.
    """
    if pdf_file is None:
        return {"skills": [], "education": [], "experience": [], "text": ""}

    try:
        with pdfplumber.open(pdf_file) as pdf:
            # extract_text() may return None for pages without a text layer.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
    except Exception as e:
        return {"skills": [], "education": [], "experience": [], "text": f"Error: {str(e)}"}

    # Join pages with a newline so the last word of one page is not fused
    # with the first word of the next.
    text = "\n".join(page_text for page_text in page_texts if page_text)

    # Process with spaCy (used for sentence segmentation below)
    doc = nlp(text)

    # Extract skills (whole-word keyword matching)
    skill_keywords = ["python", "java", "javascript", "c++", "sql", "react", "node",
                     "machine learning", "data", "aws", "docker", "kubernetes", "tensorflow",
                     "pytorch", "git", "agile", "scrum", "rest api", "mongodb", "postgresql"]
    text_lower = text.lower()
    # The keyword list has no duplicates, so the result is already unique and
    # keeps a deterministic order (unlike list(set(...))).
    skills = [skill for skill in skill_keywords if _contains_term(text_lower, skill)]

    # Extract education (whole-word keyword matching per sentence)
    education_keywords = ["bachelor", "master", "phd", "bs", "ms", "b.s.", "m.s.", "university", "college"]
    education = []
    for sent in doc.sents:
        sent_lower = sent.text.lower()
        if any(_contains_term(sent_lower, keyword) for keyword in education_keywords):
            education.append(sent.text.strip())

    return {
        "skills": skills,
        "education": education[:3],  # Limit to first 3 matches
        "experience": [],  # Not extracted yet; kept for a consistent schema
        "text": text
    }

In [None]:
# Cell 5: Crawl SimplifyJobs - HTML Parser (WORKING!)
def fetch_jobs():
    """
    Download the SimplifyJobs New-Grad README and parse its HTML tables.

    Every table row with at least three <td> cells and a non-empty company
    and role becomes one job record.

    Returns:
        pandas.DataFrame with columns ['company', 'role', 'location', 'text'],
        where 'text' is "<company> <role> <location>". The same columns are
        present even when no rows were found or when the download/parsing
        failed (the frame is then empty and the error is printed).
    """
    url = "https://raw.githubusercontent.com/SimplifyJobs/New-Grad-Positions/dev/README.md"
    columns = ['company', 'role', 'location', 'text']
    try:
        response = requests.get(url, timeout=15)
        # Fail loudly on 4xx/5xx instead of parsing an error page as if it
        # were the README and silently reporting zero jobs.
        response.raise_for_status()
        content = response.text

        # Use BeautifulSoup to parse HTML tables
        soup = BeautifulSoup(content, 'html.parser')
        tables = soup.find_all('table')

        jobs = []
        for table in tables:
            for row in table.find_all('tr')[1:]:  # Skip header
                cells = row.find_all('td')
                if len(cells) < 3:
                    continue

                company = cells[0].get_text(strip=True)
                role = cells[1].get_text(strip=True)
                # Multi-location cells hold several text fragments; without a
                # separator they run together ("...UTBoston, MA...").
                location = cells[2].get_text(separator="; ", strip=True)

                if company and role:
                    jobs.append({
                        'company': company,
                        'role': role,
                        'location': location,
                        'text': f"{company} {role} {location}"
                    })

        print(f"‚úÖ Fetched {len(jobs)} jobs from {len(tables)} tables")
        return pd.DataFrame(jobs, columns=columns)
    except Exception as e:
        # Best-effort: report the failure and hand callers an empty frame
        # (they check `.empty`) rather than crashing the UI callback.
        print(f"‚ùå Error: {e}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame(columns=columns)

In [None]:
# Debug: smoke-test fetch_jobs() and preview what came back
test_df = fetch_jobs()
print(f"Number of jobs fetched: {len(test_df)}")
print("\nFirst 3 jobs:")
# Show a short preview when rows exist; otherwise state explicitly that the
# fetch produced nothing.
print("No jobs found - DataFrame is empty" if test_df.empty else test_df.head(3))

‚úÖ Fetched 1400 jobs from 12 tables
Number of jobs fetched: 1400

First 3 jobs:
            company                                           role  \
0            Circle      New Grad Software Engineer - Backend 2026   
1  American Express  Software Engineer 1 - Enterprise Architecture   
2             CLEAR             Software Engineer ‚Äì New Grad - Web   

                                            location  \
0  12 locationsSalt Lake City, UTBoston, MASeattl...   
1                                         London, UK   
2                                                NYC   

                                                text  
0  Circle New Grad Software Engineer - Backend 20...  
1  American Express Software Engineer 1 - Enterpr...  
2       CLEAR Software Engineer ‚Äì New Grad - Web NYC  


In [None]:
# Cell 6: Job Matching Function
def match_jobs(resume_file, visa_required, user_skills, job_type):
    """
    Match a user profile (resume skills + manually entered skills) with job listings.

    Args:
        resume_file: Resume PDF passed through to parse_resume(); may be None.
        visa_required: Whether the user needs visa sponsorship. No sponsorship
            filter is implemented; when True a note saying so is appended to
            the summary instead of silently ignoring the flag.
        user_skills: Optional comma-separated extra skills; blank entries are
            ignored.
        job_type: "Internship" keeps intern roles, "Full-time"/"Part-time"
            drop them; any other value applies no role filter.

    Returns:
        Tuple (summary_markdown, top_jobs_dataframe, resume_text_preview).
        On any failure or empty result the DataFrame is empty and the first
        element carries the message.
    """
    # Parse resume
    resume_data = parse_resume(resume_file)
    resume_skills = resume_data['skills']

    # Combine with user-entered skills, dropping blanks such as "python, , "
    # and de-duplicating while keeping a stable order.
    extra_skills = []
    if user_skills:
        extra_skills = [s.strip().lower() for s in user_skills.split(',') if s.strip()]
    all_skills = list(dict.fromkeys(list(resume_skills) + extra_skills))

    # Nothing to match on: bail out before the network round-trip.
    if len(all_skills) == 0:
        return "No skills found in resume. Please add skills manually.", pd.DataFrame(), "No skills detected"

    # Fetch jobs
    jobs_df = fetch_jobs()

    if jobs_df.empty:
        return "No jobs found. Please try again.", pd.DataFrame(), f"Parsed Skills: {all_skills}"

    # Filter by job type. Match "intern"/"interns"/"internship(s)" as whole
    # words so roles like "Internal Tools" or "International" are not
    # mistaken for internships.
    if job_type in ["Internship", "Full-time", "Part-time"]:
        is_intern = jobs_df['role'].str.contains(r'\bintern(?:ship)?s?\b', case=False, na=False)
        jobs_df = jobs_df[is_intern] if job_type == "Internship" else jobs_df[~is_intern]

    # Work on an independent copy so adding the score column below never
    # writes into a slice of the fetched frame.
    jobs_df = jobs_df.copy()

    if jobs_df.empty:
        return (f"No {job_type} jobs found in the current listings.",
                pd.DataFrame(), f"Parsed Skills: {all_skills}")

    user_profile = " ".join(all_skills)
    job_texts = jobs_df['text'].tolist()

    # Use TF-IDF to compute similarity; row 0 is the user, the rest are jobs.
    vectorizer = TfidfVectorizer()
    all_texts = [user_profile] + job_texts

    try:
        tfidf_matrix = vectorizer.fit_transform(all_texts)
        user_vector = tfidf_matrix[0:1]
        job_vectors = tfidf_matrix[1:]

        similarities = cosine_similarity(user_vector, job_vectors)[0]
        jobs_df['score'] = similarities

        # Get top 10
        top_jobs = jobs_df.nlargest(10, 'score')[['company', 'role', 'location', 'score']].copy()
        top_jobs['score'] = top_jobs['score'].round(3)

        summary = f"""üìã **Parsed Skills:** {', '.join(resume_skills)}
üîß **All Skills Used:** {', '.join(all_skills)}
üìö **Education:** {', '.join(resume_data['education']) if resume_data['education'] else 'Not detected'}
üéØ **Job Type:** {job_type}
‚úÖ **Total Jobs Found:** {len(jobs_df)}
        """

        if visa_required:
            # The listings are not filtered by sponsorship; tell the user.
            summary += ("\n**Note:** Visa sponsorship filtering is not implemented yet; "
                        "results are not filtered by sponsorship.")

        # Only add an ellipsis when the preview was actually truncated.
        resume_text = resume_data['text']
        preview = resume_text[:500] + "..." if len(resume_text) > 500 else resume_text

        return summary, top_jobs, preview

    except Exception as e:
        return f"Error: {str(e)}", pd.DataFrame(), ""

In [None]:
# Cell 7: Create Gradio Interface
def create_interface():
    """
    Build the Gradio Blocks UI for the job recommender and return it.

    Layout: an input column (resume upload, visa checkbox, extra-skills
    textbox, job-type radio, submit button), a results column (summary
    markdown and a jobs table), and a collapsed accordion with the parsed
    resume text. Clicking the submit button calls match_jobs() with the four
    inputs and routes its three return values to the three output components.

    Returns:
        The constructed (not yet launched) gr.Blocks instance.
    """
    with gr.Blocks(title="Job Recommendation System", theme=gr.themes.Soft()) as demo:
        # Page header
        gr.Markdown("""
        # üöÄ New Grad Job Recommendation System
        ### Match your resume with top New Grad positions from SimplifyJobs

        Upload your resume, specify your preferences, and get personalized job recommendations!
        """)

        with gr.Row():
            # Left column: user inputs
            with gr.Column(scale=1):
                gr.Markdown("### üìù Input Your Details")
                resume_file = gr.File(label="Upload Resume (PDF)", file_types=[".pdf"])
                visa_required = gr.Checkbox(label="Visa Sponsorship Required?", value=False)
                user_skills = gr.Textbox(
                    label="Additional Skills (comma-separated, optional)",
                    placeholder="e.g., Python, Machine Learning, AWS"
                )
                job_type = gr.Radio(
                    ["Full-time", "Internship", "Part-time"],
                    label="Job Type",
                    value="Full-time"
                )
                submit_btn = gr.Button("üîç Find Top 10 Jobs", variant="primary", size="lg")

            # Right column (twice as wide): results
            with gr.Column(scale=2):
                gr.Markdown("### üéØ Results")
                summary_output = gr.Markdown(label="Summary")
                jobs_output = gr.Dataframe(
                    label="Top 10 Matching Jobs",
                    headers=["Company", "Role", "Location", "Match Score"],
                    interactive=False
                )

        # Collapsed by default; shows the third value returned by match_jobs
        with gr.Accordion("üìÑ View Resume Text", open=False):
            resume_text_output = gr.Textbox(label="Parsed Resume Text (Preview)", lines=5)

        # The order of `inputs` must match match_jobs' parameter order, and
        # the order of `outputs` must match its returned tuple.
        submit_btn.click(
            fn=match_jobs,
            inputs=[resume_file, visa_required, user_skills, job_type],
            outputs=[summary_output, jobs_output, resume_text_output]
        )

        # Footer with usage tips
        gr.Markdown("""
        ---
        üí° **Tips:**
        - Make sure your resume clearly lists your skills and education
        - Add extra skills in the text box if they're not detected
        - Jobs are fetched live from [SimplifyJobs GitHub Repo](https://github.com/SimplifyJobs/New-Grad-Positions)
        """)

    return demo

# Launch the interface
# share=True publishes the app on a public *.gradio.live URL (see the cell
# output), so anyone with the link can upload files to this runtime.
# debug=True keeps this cell running until interrupted; per the launch
# message, set debug=False to turn that off.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(share=True, debug=True)

  with gr.Blocks(title="Job Recommendation System", theme=gr.themes.Soft()) as demo:


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://a720b90ba1ed44839d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


‚úÖ Fetched 1400 jobs from 12 tables
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://a720b90ba1ed44839d.gradio.live


In [None]:
# ============================================
# MULTI-AGENT JOB RECOMMENDATION SYSTEM
# Using CrewAI for intelligent resume parsing
# and job matching
# ============================================

In [None]:
# Install Multi-Agent Dependencies
!pip install -q crewai crewai-tools langchain-openai langchain-community
!pip install -q pdfplumber PyPDF2