In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
github_token = os.getenv("GITHUB_TOKEN")
groq_api_key = os.getenv("GROG_API_KEY")

In [2]:
from langchain_groq import ChatGroq

llm = ChatGroq(
    temperature=0, 
    groq_api_key=groq_api_key, 
    model_name="llama-3.1-70b-versatile"
)

## part 1 (extracting the job offer )

In [3]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://www.indeed.com/jobs?q=ai&l=Chicago%2C+IL&from=searchOnDesktopSerp&vjk=d53bad1ef075b73c")
page_data = loader.load().pop().page_content
print(page_data)

USER_AGENT environment variable not set, consider setting it to identify your requests.






Ai Jobs, Employment in Chicago, IL | Indeed.com










































        Skip to main content






HomeCompany reviewsFind salariesSign inSign inEmployers / Post Job1 new updateStart of main content


Keyword : all jobs&nbsp;Edit location input box labelSearchDate postedLast 24 hoursLast 3 daysLast 7 daysLast 14 daysRemoteHybrid workRemoteWithin 25 milesExact location onlyWithin 5 milesWithin 10 milesWithin 15 milesWithin 25 milesWithin 35 milesWithin 50 milesWithin 100 milesPay$90,000+$115,000+$135,000+$160,000+$185,000+Job typeFull-timePart-timeContractTemporaryNewEncouraged to applyfilterLocationChicago, ILItasca, ILEvanston, ILOak Brook, ILRosemont, ILRiver Forest, ILNiles, ILOakbrook Terrace, ILWood Dale, ILBensenville, ILDes Plaines, ILSkokie, ILMaywood, ILIllinoisElk Grove Village, ILCompanyAccentureNorthwestern MedicinePwCVelvetech, LLCDeloitteCapgeminiCapb InfotekNorthwestern UniversityGoogleThe University of ChicagoCodalNorthern Trust Corp.NTT DA

In [4]:
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate.from_template(
    """
    ### SCRAPED TEXT FROM WEBSITE:
    {page_data}
    ### INSTRUCTION:
    The scraped text is from a career page of a website and describes a single job posting.
    Extract and return the details in JSON format, including the following keys:
    
    - `title`: The main position title for the job.
    - `requirements`: A list of all specific requirements mentioned for the job.
    - `responsibilities`: A concise summary of the main responsibilities for this role.
    - `experience_years`: The required experience in years as an integer.
      - If specific years are provided, use that value.
      - If experience is implied (e.g., "senior" or "junior"), set `experience_years` to `>0`.
      - If no indication of experience is given, set it to 0.

    Only return the valid JSON.
    ### VALID JSON (NO PREAMBLE):    
    """
)

# Assuming `page_data` contains the job posting text.
chain_extract = prompt | llm 
res = chain_extract.invoke(input={'page_data': page_data})
print(res.content)


```
{
  "title": "Senior Data Scientist (AI)",
  "requirements": [
    "Proven experience as a Data Scientist or similar role",
    "Experience with exploratory data analysis and pretraining models",
    "Familiarity with AI tools and techniques, and the ability to train others in their use",
    "Prior experience Gen AI testing 5 TOSCA (AI capabilities)"
  ],
  "responsibilities": "Collect and analyze data, integrate AI insights, and present findings to stakeholders",
  "experience_years": ">0"
}
```


In [5]:
from langchain_core.output_parsers import JsonOutputParser

json_parser = JsonOutputParser()
job_offer = json_parser.parse(res.content)
job_offer 

{'title': 'Senior Data Scientist (AI)',
 'requirements': ['Proven experience as a Data Scientist or similar role',
  'Experience with exploratory data analysis and pretraining models',
  'Familiarity with AI tools and techniques, and the ability to train others in their use',
  'Prior experience Gen AI testing 5 TOSCA (AI capabilities)'],
 'responsibilities': 'Collect and analyze data, integrate AI insights, and present findings to stakeholders',
 'experience_years': '>0'}

## part 2 extracting the skills of the user 

## read me file 

In [6]:
github_readme_link="https://github.com/amenallahbenothmen"

In [7]:
def get_github_readme_raw_link(username):
    return f"https://github.com/{username}/{username}/raw/main/README.md"

In [8]:
username = "amenallahbenothmen"
link = get_github_readme_raw_link(username)

In [9]:
loader2 = WebBaseLoader(link)
page_data_2= loader2.load().pop().page_content
print(page_data_2)

# 💫 About Me:
Hi there! 👋I'm Amenallah, a final-year engineering student at SUP'COM (Higher School of Communication of Tunis), specializing in data science. Here's a snapshot of my expertise:📊 Data Visualization – Crafting insightful visualizations for data-driven decision-making.🛠️ Model Development – Building and fine-tuning models to tackle complex problems.🤖 Machine Learning & Deep Learning – Applying advanced algorithms and neural networks to unlock AI potential.🌐 Exploring Large Language Models (LLMs) – Currently enhancing my AI skills with the latest in NLP and generative models.Feel free to browse my repositories to see my work in action!


## 🌐 Socials:
[![LinkedIn](https://img.shields.io/badge/LinkedIn-%230077B5.svg?logo=linkedin&logoColor=white)](https://linkedin.com/in/www.linkedin.com/in/amen-allah-ben-othmen-662b78274) 

# 💻 Tech Stack:
![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54) ![NumPy](https://img.shields.io/ba

In [10]:

prompt_2 = PromptTemplate.from_template(
    """
    ### SCRAPED TEXT FROM GITHUB README:
    {page_data_2}
    
    ### INSTRUCTION:
    The scraped text above is from a GitHub README file that provides details about an individual's professional background. 
    Your task is to extract and format the following information as valid JSON:
    
    - `role`: A list of roles or titles of the individual, with the main or primary role listed first. If no specific role is mentioned, use the area the individual is specializing in as the primary role.
    - `experience_duration`: The number of years of experience as an integer. If the individual is a student, set `experience_duration` to `0`.
    - `experience`: A brief summary of relevant experience, including areas of expertise, specific domains, or specializations (e.g., data science, machine learning, cloud computing).
    - `skills`: A list of key technical and non-technical skills highlighted in the README, such as programming languages, frameworks, tools, and soft skills.
    - `description`: A concise summary that introduces the individual's background, education, or current focus.
    
    Please follow these rules:
    - Only include information that is explicitly mentioned in the README text.
    - For `experience_duration`, provide an integer (e.g., `2` for two years of experience, or `0` if the individual is a student).
    - If no specific role is found, infer the primary role based on the individual's area of specialization.
    - If a specific field is not available, return an empty string ("") for text fields or an empty list ([]) for the `skills` field.
    - Do not add any explanatory text outside of the JSON format.
    
    ### VALID JSON (NO PREAMBLE) 

    """
)
chain_extract_2= prompt_2 | llm 
res_2 = chain_extract_2.invoke(input={'page_data_2':page_data_2})
print(res_2.content)

```json
{
    "role": ["Data Science Student", "Data Scientist"],
    "experience_duration": 0,
    "experience": "Data Visualization, Model Development, Machine Learning & Deep Learning, Exploring Large Language Models (LLMs)",
    "skills": ["Python", "NumPy", "Anaconda", "Flutter", "AmazonDynamoDB", "Firebase", "MySQL", "Figma", "Matplotlib", "Pandas", "scikit-learn", "Scipy", "TensorFlow", "Plotly", "GitHub Actions", "GitHub", "Git"],
    "description": "Final-year engineering student at SUP'COM (Higher School of Communication of Tunis), specializing in data science."
}
```


In [11]:

json_parser = JsonOutputParser()
read_me = json_parser.parse(res_2.content)
read_me

{'role': ['Data Science Student', 'Data Scientist'],
 'experience_duration': 0,
 'experience': 'Data Visualization, Model Development, Machine Learning & Deep Learning, Exploring Large Language Models (LLMs)',
 'skills': ['Python',
  'NumPy',
  'Anaconda',
  'Flutter',
  'AmazonDynamoDB',
  'Firebase',
  'MySQL',
  'Figma',
  'Matplotlib',
  'Pandas',
  'scikit-learn',
  'Scipy',
  'TensorFlow',
  'Plotly',
  'GitHub Actions',
  'GitHub',
  'Git'],
 'description': "Final-year engineering student at SUP'COM (Higher School of Communication of Tunis), specializing in data science."}

## github api 

In [12]:
import requests

In [13]:
import requests
import json
from langchain_core.prompts import PromptTemplate

# Summarization function for requirements and README using LLM
def llm_summarize(content):
    """Summarize content using the LLM, if content is too long."""
    prompt_template = PromptTemplate.from_template(
        """
        ### CONTENT TO SUMMARIZE:
        {content}

        ### INSTRUCTION:
        Provide a concise summary of the content above, focusing on the main technologies, functionality, and purpose if relevant.

        ### SUMMARY:
        """
    )
    chain = prompt_template | llm
    result = chain.invoke(input={'content': content})
    return result.content.strip()

# Function to get content from requirements.txt and README.md in each project
def get_project_content(username, repo_name, token):
    headers = {"Authorization": f"token {token}"}
    contents_url = f"https://api.github.com/repos/{username}/{repo_name}/contents"
    
    # Check for requirements.txt
    requirements_url = f"{contents_url}/requirements.txt"
    requirements_response = requests.get(requirements_url, headers=headers)
    requirements_content = requests.get(requirements_response.json()["download_url"]).text if requirements_response.status_code == 200 else None
    
    # Check for README.md
    readme_url = f"{contents_url}/README.md"
    readme_response = requests.get(readme_url, headers=headers)
    readme_content = requests.get(readme_response.json()["download_url"]).text if readme_response.status_code == 200 else None
    
    return requirements_content, readme_content

# Main function to fetch and format user projects as JSON
def get_user_projects(username, token):
    headers = {"Authorization": f"token {token}"}
    projects = []
    
    # GitHub API URL to fetch user's repositories
    url = f"https://api.github.com/users/{username}/repos"
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        repos = response.json()
        
        # Skip the first repository, assumed to be the profile README
        for repo in repos[1:]:  # Start from the second item
            repo_name = repo['name']
            repo_url = repo['html_url']
            language = repo['language']
            
            # Get requirements and README content
            requirements_content, readme_content = get_project_content(username, repo_name, token)
            
            # Generate a concise description based on project type, technologies, and goal
            project_type, technologies, primary_goal = generate_llm_project_summary(repo_name, language, requirements_content, readme_content)
            
            # Append project data as a dictionary
            project_data = {
                "Project Name": repo_name,
                "URL": repo_url,
                "Primary Language": language,
                "Project Type": project_type,
                "Main Technologies": technologies.split(", "),  # Splitting by comma for a list format
                "Primary Goal": primary_goal
            }
            projects.append(project_data)
    else:
        print(f"Failed to fetch repositories: {response.status_code}")
    
    # Return the projects list in JSON format
    return json.dumps(projects, indent=4)

# Function to summarize and extract project type, technologies, and primary goal
def generate_llm_project_summary(repo_name, language, requirements_content, readme_content):
    """Generates project type, main technologies, and primary goal using the custom LLM."""
    
    # Summarize requirements and README content using the custom LLM
    summarized_requirements = llm_summarize(requirements_content) if requirements_content else ""
    summarized_readme = llm_summarize(readme_content) if readme_content else ""

    # Create input data for the LLM based on the summaries
    input_data = (
        f"Project Name: {repo_name}\n"
        f"Primary Language: {language}\n"
        f"Requirements Summary: {summarized_requirements}\n"
        f"README Summary:\n{summarized_readme}"
    )

    # Define the LLM prompt template to create a structured project summary
    prompt_template = PromptTemplate.from_template(
        """
        ### PROJECT INFORMATION:
        {input_data}

        ### INSTRUCTION:
        Based on the project name, language, requirements, and README summary above, provide:
        - Project Type (e.g., Machine Learning, Data Visualization, Web Application)
        - Main Technologies used in the project
        - The primary goal or purpose of the project

        ### OUTPUT (AS THREE SEPARATE LINES WITHOUT LABELS):
        """
    )

    # Invoke the LLM with the summarized input
    llm_chain = prompt_template | llm
    result = llm_chain.invoke(input={'input_data': input_data})

    # Split the output into project type, technologies, and goal
    output_lines = result.content.strip().splitlines()
    project_type = output_lines[0] if len(output_lines) > 0 else ""
    technologies = output_lines[1] if len(output_lines) > 1 else ""
    primary_goal = output_lines[2] if len(output_lines) > 2 else ""

    return project_type, technologies, primary_goal

# Example usage
username = "amenallahbenothmen"
token = github_token
projects_output = get_user_projects(username, token)
print(projects_output)


[
    {
        "Project Name": "BTC_PRICE_PREDICTION_MODEL",
        "URL": "https://github.com/amenallahbenothmen/BTC_PRICE_PREDICTION_MODEL",
        "Primary Language": "Jupyter Notebook",
        "Project Type": "Machine Learning and Data Science",
        "Main Technologies": [
            "pandas",
            "numpy",
            "yfinance",
            "tensorflow",
            "keras-tuner",
            "joblib",
            "Flask",
            "Flask-Cors",
            "and streamlit"
        ],
        "Primary Goal": "Predicting Bitcoin prices using machine learning and data science techniques."
    },
    {
        "Project Name": "Cold-Outreach-Email-Generator",
        "URL": "https://github.com/amenallahbenothmen/Cold-Outreach-Email-Generator",
        "Primary Language": "Jupyter Notebook",
        "Project Type": "Natural Language Processing (NLP) and Machine Learning",
        "Main Technologies": [
            "python-dotenv",
            "transformers",
         

In [14]:
projects = json_parser.parse(projects_output)
projects

[{'Project Name': 'BTC_PRICE_PREDICTION_MODEL',
  'URL': 'https://github.com/amenallahbenothmen/BTC_PRICE_PREDICTION_MODEL',
  'Primary Language': 'Jupyter Notebook',
  'Project Type': 'Machine Learning and Data Science',
  'Main Technologies': ['pandas',
   'numpy',
   'yfinance',
   'tensorflow',
   'keras-tuner',
   'joblib',
   'Flask',
   'Flask-Cors',
   'and streamlit'],
  'Primary Goal': 'Predicting Bitcoin prices using machine learning and data science techniques.'},
 {'Project Name': 'Cold-Outreach-Email-Generator',
  'URL': 'https://github.com/amenallahbenothmen/Cold-Outreach-Email-Generator',
  'Primary Language': 'Jupyter Notebook',
  'Project Type': 'Natural Language Processing (NLP) and Machine Learning',
  'Main Technologies': ['python-dotenv',
   'transformers',
   'langchain',
   'bs4',
   'requests',
   'chromadb',
   'aiohttp'],
  'Primary Goal': 'Automated generation of personalized recommendation letters using NLP and machine learning.'},
 {'Project Name': 'Emotio

In [15]:
import chromadb
client=chromadb.Client()
collection = client.create_collection(name='user_data_collection')

In [16]:
def register_readme_in_chromadb(readme_data):
    # Convert list fields to comma-separated strings
    role_str = ", ".join(readme_data['role']) if isinstance(readme_data['role'], list) else readme_data['role']
    skills_str = ", ".join(readme_data['skills']) if isinstance(readme_data['skills'], list) else readme_data['skills']
    
    # Add the document to ChromaDB with formatted metadata
    collection.add(
        documents=[readme_data['description']],
        ids=["readme"],
        metadatas={
            "type": "README",
            "role": role_str,
            "experience_duration": readme_data['experience_duration'],
            "experience": readme_data['experience'],
            "skills": skills_str
        }
    )

In [17]:
def register_projects_in_chromadb(projects):
    for project in projects:
        # Convert main technologies list to a comma-separated string
        main_technologies_str = ", ".join(project["Main Technologies"]) if isinstance(project["Main Technologies"], list) else project["Main Technologies"]
        
        collection.add(
            documents=[project["Primary Goal"]],
            ids=[project["Project Name"]],
            metadatas={
                "type": "Project",
                "project_name": project["Project Name"],
                "url": project["URL"],
                "primary_language": project["Primary Language"],
                "project_type": project["Project Type"],
                "main_technologies": main_technologies_str
            }
        )


In [18]:
def query_for_application_letter(job_title, job_requirements, job_responsibilities):
    query_text = (
        f"Generate a job application letter for the position '{job_title}'. "
        f"Focus on user projects, skills, and experiences that demonstrate alignment with the following requirements:\n{job_requirements}\n"
        f"and responsibilities:\n{job_responsibilities}\n"
        "Identify relevant information to emphasize the user's qualifications and enthusiasm for the position."
    )

    results = collection.query(
        query_texts=[query_text],
        n_results=5  # Retrieve the top 5 relevant documents
    )
    
    return results


In [27]:
def generate_application_letter(job_title, job_requirements, job_responsibilities, query_results):
    # Extract the relevant project documents from the query results
    relevant_projects = "\n\n".join(
        [f"- {doc[0]}: {doc[1]}" for doc in zip(query_results["documents"], query_results["metadatas"])]
    )

    # Prepare the input data for the application letter
    input_data = (
        f"Job Title: {job_title}\n"
        f"Requirements: {job_requirements}\n"
        f"Responsibilities: {job_responsibilities}\n"
        f"Relevant User Projects:\n{relevant_projects}"
    )

    # Define a prompt template to generate the application letter
    prompt_template = PromptTemplate.from_template(
        """
        ### JOB AND USER INFORMATION:
        {input_data}

        ### INSTRUCTION:
        Write a job application letter for the user, emphasizing their relevant skills and experiences in alignment with the job title, requirements, and responsibilities. Mention specific projects by name, placing the project name in parentheses, and briefly explain how each project supports the user’s qualifications and demonstrates their expertise in relation to the job’s demands.


        ### APPLICATION LETTER:
        """
    )

    # Generate the letter using the LLM
    llm_chain = prompt_template | llm
    result = llm_chain.invoke(input={'input_data': input_data})
    
    return result.content.strip()


In [20]:
register_readme_in_chromadb(read_me)
register_projects_in_chromadb(projects)

In [21]:
job_title = job_offer.get("title", "AI-related role")
job_requirements = "; ".join(job_offer.get("requirements", []))  # Join list to single string
job_responsibilities = job_offer.get("responsibilities", "N/A")


In [28]:
query_results = query_for_application_letter(job_title, job_requirements, job_responsibilities)

# Generate the recommendation letter
recommendation_letter = generate_application_letter(job_title, job_requirements, job_responsibilities, query_results)
print(recommendation_letter)

[Your Name]
[Your Address]
[City, State ZIP Code]
[Date]

[Recipient’s Name]
[Recipient’s Title]
[Company Name]
[Company Address]
[City, State ZIP Code]

Dear [Recipient’s Name],

I am excited to apply for the Senior Data Scientist (AI) position at [Company Name]. As a highly motivated and experienced data scientist with a strong background in machine learning and AI, I am confident that I possess the skills and expertise required to excel in this role.

With a proven track record of collecting and analyzing data, integrating AI insights, and presenting findings to stakeholders, I am well-equipped to handle the responsibilities of this position. My experience in exploratory data analysis and pretraining models has allowed me to develop a deep understanding of AI tools and techniques, which I am eager to apply in this role.

As a data science student and data scientist, I have had the opportunity to work on various projects that demonstrate my expertise in AI and machine learning. For i