In [5]:
import os
from dotenv import load_dotenv
load_dotenv()
github_token = os.getenv("GITHUB_TOKEN")
groq_api_key = os.getenv("GROG_API_KEY")

In [6]:
from langchain_groq import ChatGroq

llm = ChatGroq(
    temperature=0, 
    groq_api_key=groq_api_key, 
    model_name="llama-3.1-70b-versatile"
)

In [7]:
import chromadb
client=chromadb.Client()
collection=client.create_collection(name='test_collection')

In [8]:
collection.add(
    documents=[
        "This document is about New York",
        "This document is about Delhi"
    ],
    ids=["id3", "id4"],
    metadatas=[
        {"url": "https://en.wikipedia.org/wiki/New_York_City"},
        {"url": "https://en.wikipedia.org/wiki/New_Delhi"}
    ]
)

In [9]:
results = collection.query(
    query_texts=["Query is about liberty"],
    n_results=4
)
results

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


{'ids': [['id3', 'id4']],
 'embeddings': None,
 'documents': [['This document is about New York',
   'This document is about Delhi']],
 'uris': None,
 'data': None,
 'metadatas': [[{'url': 'https://en.wikipedia.org/wiki/New_York_City'},
   {'url': 'https://en.wikipedia.org/wiki/New_Delhi'}]],
 'distances': [[1.252912163734436, 1.541961669921875]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

## part 1 (extracting the job offer )

In [10]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://www.indeed.com/jobs?q=ai&l=&from=searchOnHP&vjk=cd7115d3a75e7bcd")
page_data = loader.load().pop().page_content
print(page_data)

USER_AGENT environment variable not set, consider setting it to identify your requests.






Ai Jobs, Employment | Indeed.com









































        Skip to main content






HomeCompany reviewsFind salariesSign inSign inEmployers / Post Job1 new updateStart of main content

Keyword : all jobs&nbsp;Edit location input box labelTip: Enter your city or zip code in the "where" box to show results in your area.SearchDate postedLast 24 hoursLast 3 daysLast 7 daysLast 14 daysRemoteRemoteHybrid workPay$45,000+$70,000+$85,000+$130,000+$170,000+Job typeFull-timeContractPart-timeTemporaryInternshipNewEncouraged to applyfilterLocationRemoteNew York, NYSan Francisco, CASeattle, WAAtlanta, GAChicago, ILAustin, TXBoston, MASan Jose, CAWashington, DCHouston, TXMountain View, CADallas, TXSanta Clara, CACompanyOutlier AIAccentureDeloitteHIIGoogleAmazon.comPwCMetaNTT DATAAmazon Web ServicesEXLMarsh McLennan AgencyVantagepoint ai, LLCAppleJPMorganChasePosted byEmployerStaffing agencyExperience levelEntry LevelMid LevelSenior LevelNo Experience RequiredEducationNo E

In [11]:
from langchain_core.prompts import PromptTemplate

prompt=PromptTemplate.from_template(
            """
        ### SCRAPED TEXT FROM WEBSITE:
        {page_data}
        ### INSTRUCTION:
        The scraped text is from the career's page of a website.
        Your job is to extract the job postings and return them in JSON format containing the 
        following keys: `role`, `experience`, `skills` and `description`.
        Only return the valid JSON.
        ### VALID JSON (NO PREAMBLE):    
        """
)
chain_extract = prompt | llm 
res = chain_extract.invoke(input={'page_data':page_data})
print(res.content)


[
  {
    "role": "AI Prompt Writer - Immediate Start",
    "experience": "Experienced English writer",
    "skills": "Human feedback, AI models",
    "description": "Outlier helps the world’s most innovative companies improve their AI models by providing human feedback. Are you an experienced English writer who would like to…"
  },
  {
    "role": "AI Engineer (LLM) (100% remote - US)",
    "experience": "Not specified",
    "skills": "Digital finance, AI development",
    "description": "Join Tether and Shape the Future of Digital Finance. At Tether, we’re not just building products, we’re pioneering a global financial revolution."
  },
  {
    "role": "AI Writing Trainer",
    "experience": "Experienced English writer",
    "skills": "Human feedback, AI models",
    "description": "Outlier helps the world’s most innovative companies improve their AI models by providing human feedback. Are you an experienced English writer who would like to…"
  },
  {
    "role": "AI Content Moderato

In [12]:
type(res.content)

str

In [13]:
from langchain_core.output_parsers import JsonOutputParser

json_parser = JsonOutputParser()
json_res = json_parser.parse(res.content)
json_res

[{'role': 'AI Prompt Writer - Immediate Start',
  'experience': 'Experienced English writer',
  'skills': 'Human feedback, AI models',
  'description': 'Outlier helps the world’s most innovative companies improve their AI models by providing human feedback. Are you an experienced English writer who would like to…'},
 {'role': 'AI Engineer (LLM) (100% remote - US)',
  'experience': 'Not specified',
  'skills': 'Digital finance, AI development',
  'description': 'Join Tether and Shape the Future of Digital Finance. At Tether, we’re not just building products, we’re pioneering a global financial revolution.'},
 {'role': 'AI Writing Trainer',
  'experience': 'Experienced English writer',
  'skills': 'Human feedback, AI models',
  'description': 'Outlier helps the world’s most innovative companies improve their AI models by providing human feedback. Are you an experienced English writer who would like to…'},
 {'role': 'AI Content Moderator (Tier 1)',
  'experience': 'Experienced English wri

## part 2 extracting the skills of the user 

## read me file 

In [14]:
github_readme_link="https://github.com/amenallahbenothmen"

In [15]:
def get_github_readme_raw_link(username):
    return f"https://github.com/{username}/{username}/raw/main/README.md"

In [16]:
username = "amenallahbenothmen"
link = get_github_readme_raw_link(username)

In [17]:
loader2 = WebBaseLoader(link)
page_data_2= loader2.load().pop().page_content
print(page_data_2)

# 💫 About Me:
Hi there! 👋I'm Amenallah, a final-year engineering student at SUP'COM (Higher School of Communication of Tunis), specializing in data science. Here's a snapshot of my expertise:📊 Data Visualization – Crafting insightful visualizations for data-driven decision-making.🛠️ Model Development – Building and fine-tuning models to tackle complex problems.🤖 Machine Learning & Deep Learning – Applying advanced algorithms and neural networks to unlock AI potential.🌐 Exploring Large Language Models (LLMs) – Currently enhancing my AI skills with the latest in NLP and generative models.Feel free to browse my repositories to see my work in action!


## 🌐 Socials:
[![LinkedIn](https://img.shields.io/badge/LinkedIn-%230077B5.svg?logo=linkedin&logoColor=white)](https://linkedin.com/in/www.linkedin.com/in/amen-allah-ben-othmen-662b78274) 

# 💻 Tech Stack:
![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54) ![NumPy](https://img.shields.io/ba

In [18]:

prompt_2 = PromptTemplate.from_template(
    """
    ### SCRAPED TEXT FROM GITHUB README:
    {page_data_2}
    
    ### INSTRUCTION:
    The scraped text above is from a GitHub README file that provides details about an individual's professional background. 
    Your task is to extract and format the following information as valid JSON:
    
    - `role`: A list of roles or titles of the individual, with the main or primary role listed first. If no specific role is mentioned, use the area the individual is specializing in as the primary role.
    - `experience_duration`: The number of years of experience as an integer. If the individual is a student, set `experience_duration` to `0`.
    - `experience`: A brief summary of relevant experience, including areas of expertise, specific domains, or specializations (e.g., data science, machine learning, cloud computing).
    - `skills`: A list of key technical and non-technical skills highlighted in the README, such as programming languages, frameworks, tools, and soft skills.
    - `description`: A concise summary that introduces the individual's background, education, or current focus.
    
    Please follow these rules:
    - Only include information that is explicitly mentioned in the README text.
    - For `experience_duration`, provide an integer (e.g., `2` for two years of experience, or `0` if the individual is a student).
    - If no specific role is found, infer the primary role based on the individual's area of specialization.
    - If a specific field is not available, return an empty string ("") for text fields or an empty list ([]) for the `skills` field.
    - Do not add any explanatory text outside of the JSON format.
    
    ### VALID JSON (NO PREAMBLE) 

    """
)
chain_extract_2= prompt_2 | llm 
res_2 = chain_extract_2.invoke(input={'page_data_2':page_data_2})
print(res_2.content)

```json
{
  "role": ["Data Science Student", "Data Scientist"],
  "experience_duration": 0,
  "experience": "Data Visualization, Model Development, Machine Learning & Deep Learning, Exploring Large Language Models (LLMs)",
  "skills": [
    "Python",
    "NumPy",
    "Anaconda",
    "Flutter",
    "Amazon DynamoDB",
    "Firebase",
    "MySQL",
    "Figma",
    "Matplotlib",
    "Pandas",
    "scikit-learn",
    "Scipy",
    "TensorFlow",
    "Plotly",
    "GitHub Actions",
    "GitHub",
    "Git"
  ],
  "description": "Final-year engineering student at SUP'COM (Higher School of Communication of Tunis), specializing in data science."
}
```


In [19]:

json_parser = JsonOutputParser()
json_res = json_parser.parse(res_2.content)
json_res

{'role': ['Data Science Student', 'Data Scientist'],
 'experience_duration': 0,
 'experience': 'Data Visualization, Model Development, Machine Learning & Deep Learning, Exploring Large Language Models (LLMs)',
 'skills': ['Python',
  'NumPy',
  'Anaconda',
  'Flutter',
  'Amazon DynamoDB',
  'Firebase',
  'MySQL',
  'Figma',
  'Matplotlib',
  'Pandas',
  'scikit-learn',
  'Scipy',
  'TensorFlow',
  'Plotly',
  'GitHub Actions',
  'GitHub',
  'Git'],
 'description': "Final-year engineering student at SUP'COM (Higher School of Communication of Tunis), specializing in data science."}

## github api 

In [None]:
API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
headers = {"Authorization": "Bearer hf_***"}

In [2]:
import requests
import os
from transformers import pipeline
from langchain_core.prompts import PromptTemplate
summarizer = pipeline("summarization", model="sshleifer/distilbart-xsum-12-6")



To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


KeyboardInterrupt: 

In [None]:

def get_user_projects(username, token):
    headers = {"Authorization": f"token {token}"}
    project_output = ""
    
    # GitHub API URL to fetch user's repositories
    url = f"https://api.github.com/users/{username}/repos"
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        repos = response.json()
        for repo in repos:
            repo_name = repo['name']
            repo_url = repo['html_url']
            language = repo['language']
            
            # Get requirements and README content
            requirements_content, readme_content = get_project_content(username, repo_name, token)
            
            # Generate a concise description based on project type, technologies, and goal
            description = generate_llm_description(repo_name, language, requirements_content, readme_content)
            
            # Append project summary to output
            project_output += (
                f"Project Name: {repo_name}\n"
                f"URL: {repo_url}\n"
                f"Primary Language: {language}\n"
                f"Description: {description}\n"
                f"{'-'*40}\n"
            )
    else:
        print(f"Failed to fetch repositories: {response.status_code}")
    
    return project_output

def get_project_content(username, repo_name, token):
    headers = {"Authorization": f"token {token}"}
    contents_url = f"https://api.github.com/repos/{username}/{repo_name}/contents"
    
    # Check for requirements.txt in the repository
    requirements_url = f"{contents_url}/requirements.txt"
    requirements_response = requests.get(requirements_url, headers=headers)
    
    if requirements_response.status_code == 200:
        # Store the raw content of requirements.txt
        requirements_content = requests.get(requirements_response.json()["download_url"]).text
    else:
        # Set requirements content to None if requirements.txt is missing
        requirements_content = None

    # Check for README.md in the repository
    readme_url = f"{contents_url}/README.md"
    readme_response = requests.get(readme_url, headers=headers)
    
    if readme_response.status_code == 200:
        # Store the raw content of README.md
        readme_content = requests.get(readme_response.json()["download_url"]).text
    else:
        # Set README content to None if README.md is missing
        readme_content = None
    
    return requirements_content, readme_content

def summarize_file_content(content, max_chars=500):
    """Summarizes a file's content using a pre-trained summarizer model."""
    if content is None or len(content) <= max_chars:
        return content
    else:
        # Summarize if content is too long
        summarized_text = summarizer(content, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
        return summarized_text

def generate_llm_description(repo_name, language, requirements_content, readme_content):
    """Generates a concise description using an LLM with the project type, technologies, and goal."""
    
    # Summarize requirements and README content
    summarized_requirements = summarize_file_content(requirements_content) if requirements_content else "None"
    summarized_readme = summarize_file_content(readme_content) if readme_content else "No README available"

    # Create input data for the LLM based on the summaries
    input_data = (
        f"Project Name: {repo_name}\n"
        f"Primary Language: {language}\n"
        f"Requirements Summary: {summarized_requirements}\n"
        f"README Summary:\n{summarized_readme}"
    )

    # Define the LLM prompt template to create a structured project description
    prompt_template = PromptTemplate.from_template(
        """
        ### PROJECT INFORMATION:
        {input_data}

        ### INSTRUCTION:
        Based on the project name, language, requirements, and README summary above, generate a concise description including:
        - The project type (e.g., Machine Learning, Data Visualization, Web Application).
        - The main technologies used in the project.
        - The project's primary goal or purpose.

        ### DESCRIPTION:
        """
    )

    # Invoke the LLM with the summarized input
    llm_chain = prompt_template | llm
    result = llm_chain.invoke(input={'input_data': input_data})
    
    return result.content.strip()




  from .autonotebook import tqdm as notebook_tqdm





KeyboardInterrupt: 

In [None]:
username = "08Youssef08"  
token = github_token
projects_output = get_user_projects(username, token)
print(projects_output)

NameError: name 'get_user_projects' is not defined

In [None]:


prompt_3 = PromptTemplate.from_template(
    """
    ### SCRAPED CODE FROM GITHUB PROJECTS:
    {projects_output}
    
    ### INSTRUCTION:
    The scraped text above contains details about an individual's GitHub projects and code structure. Your task is to analyze the project details and extract the following information as valid JSON:
    
    - `roles`: An ordered list of probable roles or titles based on the project types. Prioritize roles related to data science, machine learning, and AI, followed by software engineering or general engineering roles if applicable.
    - `libraries_used`: A list of key libraries, frameworks, or tools observed in the code (e.g., `NumPy`, `Pandas`, `TensorFlow`). Extract these from the project dependencies and code content.
    - `experience`: A brief summary of relevant experience inferred from the projects, including areas of expertise, specific domains, or specializations (e.g., data science, machine learning, cloud computing).
    
    Please follow these rules:
    - For `roles`, order the list from the most probable (like "Data Scientist" or "Machine Learning Engineer") to less probable roles based on project contents.
    - For `libraries_used`, only list libraries, frameworks, or tools that appear in the code or dependencies.
    - For `experience`, provide a concise summary based on the project types and areas of specialization (e.g., machine learning, data analysis).
    - If a specific field is not available, return an empty string ("") for `experience` or an empty list ([]) for `roles` and `libraries_used`.
    - Do not include any explanatory text outside of the JSON format.
    
    ### VALID JSON (NO PREAMBLE) 

    """
)

# Chain setup to use the prompt with the language model
chain_extract_3 = prompt_3 | llm
res_3 = chain_extract_3.invoke(input={'projects_output': projects_output})

# Output the result
print(res_3.content)


APIStatusError: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-70b-versatile` in organization `org_01jbes0nqgfqh8x2f7w1dxhgmm` on tokens per minute (TPM): Limit 6000, Requested 26219, please reduce your message size and try again. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}