In [3]:
import os
from dotenv import load_dotenv
load_dotenv()
github_token = os.getenv("GITHUB_TOKEN")
groq_api_key = os.getenv("GROG_API_KEY")

In [6]:
from langchain_groq import ChatGroq

llm = ChatGroq(
    temperature=0, 
    groq_api_key=groq_api_key, 
    model_name="llama-3.1-70b-versatile"
)

In [7]:
import chromadb
client=chromadb.Client()
collection=client.create_collection(name='test_collection')

In [8]:
collection.add(
    documents=[
        "This document is about New York",
        "This document is about Delhi"
    ],
    ids=["id3", "id4"],
    metadatas=[
        {"url": "https://en.wikipedia.org/wiki/New_York_City"},
        {"url": "https://en.wikipedia.org/wiki/New_Delhi"}
    ]
)

In [9]:
results = collection.query(
    query_texts=["Query is about liberty"],
    n_results=4
)
results

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


{'ids': [['id3', 'id4']],
 'embeddings': None,
 'documents': [['This document is about New York',
   'This document is about Delhi']],
 'uris': None,
 'data': None,
 'metadatas': [[{'url': 'https://en.wikipedia.org/wiki/New_York_City'},
   {'url': 'https://en.wikipedia.org/wiki/New_Delhi'}]],
 'distances': [[1.252912163734436, 1.541961669921875]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

## part 1 (extracting the job offer )

In [10]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://www.indeed.com/jobs?q=ai&l=&from=searchOnHP&vjk=cd7115d3a75e7bcd")
page_data = loader.load().pop().page_content
print(page_data)

USER_AGENT environment variable not set, consider setting it to identify your requests.






Ai Jobs, Employment | Indeed.com









































        Skip to main content






HomeCompany reviewsFind salariesSign inSign inEmployers / Post Job1 new updateStart of main content

Keyword : all jobs&nbsp;Edit location input box labelTip: Enter your city or zip code in the "where" box to show results in your area.SearchDate postedLast 24 hoursLast 3 daysLast 7 daysLast 14 daysRemoteRemoteHybrid workPay$45,000+$70,000+$85,000+$130,000+$170,000+Job typeFull-timeContractPart-timeTemporaryInternshipNewEncouraged to applyfilterLocationRemoteNew York, NYSan Francisco, CASeattle, WAAtlanta, GAChicago, ILAustin, TXBoston, MASan Jose, CAWashington, DCHouston, TXMountain View, CADallas, TXSanta Clara, CACompanyOutlier AIAccentureDeloitteHIIGoogleAmazon.comPwCMetaNTT DATAAmazon Web ServicesEXLMarsh McLennan AgencyVantagepoint ai, LLCAppleJPMorganChasePosted byEmployerStaffing agencyExperience levelEntry LevelMid LevelSenior LevelNo Experience RequiredEducationNo E

In [11]:
from langchain_core.prompts import PromptTemplate

prompt=PromptTemplate.from_template(
            """
        ### SCRAPED TEXT FROM WEBSITE:
        {page_data}
        ### INSTRUCTION:
        The scraped text is from the career's page of a website.
        Your job is to extract the job postings and return them in JSON format containing the 
        following keys: `role`, `experience`, `skills` and `description`.
        Only return the valid JSON.
        ### VALID JSON (NO PREAMBLE):    
        """
)
chain_extract = prompt | llm 
res = chain_extract.invoke(input={'page_data':page_data})
print(res.content)


[
  {
    "role": "AI Prompt Writer - Immediate Start",
    "experience": "Experienced English writer",
    "skills": "Human feedback, AI models",
    "description": "Outlier helps the world’s most innovative companies improve their AI models by providing human feedback. Are you an experienced English writer who would like to…"
  },
  {
    "role": "AI Engineer (LLM) (100% remote - US)",
    "experience": "Not specified",
    "skills": "Digital finance, AI development",
    "description": "Join Tether and Shape the Future of Digital Finance. At Tether, we’re not just building products, we’re pioneering a global financial revolution."
  },
  {
    "role": "AI Writing Trainer",
    "experience": "Experienced English writer",
    "skills": "Human feedback, AI models",
    "description": "Outlier helps the world’s most innovative companies improve their AI models by providing human feedback. Are you an experienced English writer who would like to…"
  },
  {
    "role": "AI Content Moderato

In [12]:
type(res.content)

str

In [13]:
from langchain_core.output_parsers import JsonOutputParser

json_parser = JsonOutputParser()
json_res = json_parser.parse(res.content)
json_res

[{'role': 'AI Prompt Writer - Immediate Start',
  'experience': 'Experienced English writer',
  'skills': 'Human feedback, AI models',
  'description': 'Outlier helps the world’s most innovative companies improve their AI models by providing human feedback. Are you an experienced English writer who would like to…'},
 {'role': 'AI Engineer (LLM) (100% remote - US)',
  'experience': 'Not specified',
  'skills': 'Digital finance, AI development',
  'description': 'Join Tether and Shape the Future of Digital Finance. At Tether, we’re not just building products, we’re pioneering a global financial revolution.'},
 {'role': 'AI Writing Trainer',
  'experience': 'Experienced English writer',
  'skills': 'Human feedback, AI models',
  'description': 'Outlier helps the world’s most innovative companies improve their AI models by providing human feedback. Are you an experienced English writer who would like to…'},
 {'role': 'AI Content Moderator (Tier 1)',
  'experience': 'Experienced English wri

## part 2 extracting the skills of the user 

## read me file 

In [14]:
github_readme_link="https://github.com/amenallahbenothmen"

In [15]:
def get_github_readme_raw_link(username):
    return f"https://github.com/{username}/{username}/raw/main/README.md"

In [16]:
username = "amenallahbenothmen"
link = get_github_readme_raw_link(username)

In [17]:
loader2 = WebBaseLoader(link)
page_data_2= loader2.load().pop().page_content
print(page_data_2)

# 💫 About Me:
Hi there! 👋I'm Amenallah, a final-year engineering student at SUP'COM (Higher School of Communication of Tunis), specializing in data science. Here's a snapshot of my expertise:📊 Data Visualization – Crafting insightful visualizations for data-driven decision-making.🛠️ Model Development – Building and fine-tuning models to tackle complex problems.🤖 Machine Learning & Deep Learning – Applying advanced algorithms and neural networks to unlock AI potential.🌐 Exploring Large Language Models (LLMs) – Currently enhancing my AI skills with the latest in NLP and generative models.Feel free to browse my repositories to see my work in action!


## 🌐 Socials:
[![LinkedIn](https://img.shields.io/badge/LinkedIn-%230077B5.svg?logo=linkedin&logoColor=white)](https://linkedin.com/in/www.linkedin.com/in/amen-allah-ben-othmen-662b78274) 

# 💻 Tech Stack:
![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54) ![NumPy](https://img.shields.io/ba

In [18]:

prompt_2 = PromptTemplate.from_template(
    """
    ### SCRAPED TEXT FROM GITHUB README:
    {page_data_2}
    
    ### INSTRUCTION:
    The scraped text above is from a GitHub README file that provides details about an individual's professional background. 
    Your task is to extract and format the following information as valid JSON:
    
    - `role`: A list of roles or titles of the individual, with the main or primary role listed first. If no specific role is mentioned, use the area the individual is specializing in as the primary role.
    - `experience_duration`: The number of years of experience as an integer. If the individual is a student, set `experience_duration` to `0`.
    - `experience`: A brief summary of relevant experience, including areas of expertise, specific domains, or specializations (e.g., data science, machine learning, cloud computing).
    - `skills`: A list of key technical and non-technical skills highlighted in the README, such as programming languages, frameworks, tools, and soft skills.
    - `description`: A concise summary that introduces the individual's background, education, or current focus.
    
    Please follow these rules:
    - Only include information that is explicitly mentioned in the README text.
    - For `experience_duration`, provide an integer (e.g., `2` for two years of experience, or `0` if the individual is a student).
    - If no specific role is found, infer the primary role based on the individual's area of specialization.
    - If a specific field is not available, return an empty string ("") for text fields or an empty list ([]) for the `skills` field.
    - Do not add any explanatory text outside of the JSON format.
    
    ### VALID JSON (NO PREAMBLE) 

    """
)
chain_extract_2= prompt_2 | llm 
res_2 = chain_extract_2.invoke(input={'page_data_2':page_data_2})
print(res_2.content)

```json
{
  "role": ["Data Science Student", "Data Scientist"],
  "experience_duration": 0,
  "experience": "Data Visualization, Model Development, Machine Learning & Deep Learning, Exploring Large Language Models (LLMs)",
  "skills": [
    "Python",
    "NumPy",
    "Anaconda",
    "Flutter",
    "Amazon DynamoDB",
    "Firebase",
    "MySQL",
    "Figma",
    "Matplotlib",
    "Pandas",
    "scikit-learn",
    "Scipy",
    "TensorFlow",
    "Plotly",
    "GitHub Actions",
    "GitHub",
    "Git"
  ],
  "description": "Final-year engineering student at SUP'COM (Higher School of Communication of Tunis), specializing in data science."
}
```


In [19]:

json_parser = JsonOutputParser()
json_res = json_parser.parse(res_2.content)
json_res

{'role': ['Data Science Student', 'Data Scientist'],
 'experience_duration': 0,
 'experience': 'Data Visualization, Model Development, Machine Learning & Deep Learning, Exploring Large Language Models (LLMs)',
 'skills': ['Python',
  'NumPy',
  'Anaconda',
  'Flutter',
  'Amazon DynamoDB',
  'Firebase',
  'MySQL',
  'Figma',
  'Matplotlib',
  'Pandas',
  'scikit-learn',
  'Scipy',
  'TensorFlow',
  'Plotly',
  'GitHub Actions',
  'GitHub',
  'Git'],
 'description': "Final-year engineering student at SUP'COM (Higher School of Communication of Tunis), specializing in data science."}

## github api 

In [12]:
import json

def get_user_projects(username, token):
    headers = {"Authorization": f"token {token}"}
    projects = []
    
    # GitHub API URL to fetch user's repositories
    url = f"https://api.github.com/users/{username}/repos"
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        repos = response.json()
        
        # Skip the first repository, assumed to be the profile README
        for repo in repos[1:]:  # Start from the second item
            repo_name = repo['name']
            repo_url = repo['html_url']
            language = repo['language']
            
            # Get requirements and README content
            requirements_content, readme_content = get_project_content(username, repo_name, token)
            
            # Generate a concise description based on project type, technologies, and goal
            project_type, technologies, primary_goal = generate_llm_project_summary(repo_name, language, requirements_content, readme_content)
            
            # Add each project’s details as a dictionary
            project_data = {
                "Project Name": repo_name,
                "URL": repo_url,
                "Primary Language": language,
                "Project Type": project_type,
                "Main Technologies": technologies,
                "Primary Goal": primary_goal
            }
            projects.append(project_data)
    else:
        print(f"Failed to fetch repositories: {response.status_code}")
    
    return json.dumps(projects, indent=4)

def generate_llm_project_summary(repo_name, language, requirements_content, readme_content):
    """Generates project type, main technologies, and primary goal using the custom LLM."""
    
    # Summarize requirements and README content using the custom LLM
    summarized_requirements = llm_summarize(requirements_content) if requirements_content else ""
    summarized_readme = llm_summarize(readme_content) if readme_content else ""

    # Create input data for the LLM based on the summaries
    input_data = (
        f"Project Name: {repo_name}\n"
        f"Primary Language: {language}\n"
        f"Requirements Summary: {summarized_requirements}\n"
        f"README Summary:\n{summarized_readme}"
    )

    # Define the LLM prompt template to create a structured project summary
    prompt_template = PromptTemplate.from_template(
        """
        ### PROJECT INFORMATION:
        {input_data}

        ### INSTRUCTION:
        Based on the project name, language, requirements, and README summary above, provide:
        - Project Type (e.g., Machine Learning, Data Visualization, Web Application)
        - Main Technologies used in the project
        - The primary goal or purpose of the project

        ### OUTPUT (AS THREE SEPARATE LINES WITHOUT LABELS):
        """
    )

    # Invoke the LLM with the summarized input
    llm_chain = prompt_template | llm
    result = llm_chain.invoke(input={'input_data': input_data})

    # Split the output into project type, technologies, and goal
    output_lines = result.content.strip().splitlines()
    project_type = output_lines[0] if len(output_lines) > 0 else ""
    technologies = output_lines[1] if len(output_lines) > 1 else ""
    primary_goal = output_lines[2] if len(output_lines) > 2 else ""

    return project_type, technologies, primary_goal


In [13]:
username = "amenallahbenothmen"  
token = github_token
projects_output = get_user_projects(username, token)
print(projects_output)

[
    {
        "Project Name": "BTC_PRICE_PREDICTION_MODEL",
        "URL": "https://github.com/amenallahbenothmen/BTC_PRICE_PREDICTION_MODEL",
        "Primary Language": "Jupyter Notebook",
        "Project Type": "Machine Learning",
        "Main Technologies": "Python, pandas, numpy, tensorflow, matplotlib, plotly, Flask, Streamlit, dvc, mlflow",
        "Primary Goal": "Predicting Bitcoin prices using machine learning models and deploying the model as a web application."
    },
    {
        "Project Name": "Cold-Outreach-Email-Generator",
        "URL": "https://github.com/amenallahbenothmen/Cold-Outreach-Email-Generator",
        "Primary Language": "Jupyter Notebook",
        "Project Type": "Natural Language Processing and Machine Learning",
        "Main Technologies": "Jupyter Notebook, transformers, BeautifulSoup4, requests, ChromaDB, aiohttp",
        "Primary Goal": "Automated generation of personalized recommendation letters based on job offers and users' GitHub reposit