In [1]:
#!pip install pymongo requests python-dotenv tqm

In [2]:
from pymongo import MongoClient
import os
from dotenv import load_dotenv
from urllib.parse import urlparse
from transformers import pipeline
import re
import requests
from bs4 import BeautifulSoup
from transformers.utils import logging
from tqdm import tqdm
from datetime import datetime

In [3]:
load_dotenv()

mongo_host = os.getenv('MONGO_HOST')
mongo_port = os.getenv('MONGO_PORT')
mongo_db = os.getenv('MONGO_DB')
mongo_collection = os.getenv('MONGO_COLLECTION')

github_token = os.getenv('GITHUB_TOKEN')

In [4]:
import os
from pymongo import MongoClient

mongo_host = os.getenv("MONGO_HOST", "localhost")
mongo_port = int(os.getenv("MONGO_PORT", 27017))
mongo_db = os.getenv("MONGO_DB", "PFE")  # valeur par défaut
mongo_collection = os.getenv("MONGO_COLLECTION", "resumes")  # valeur par défaut

client = MongoClient(
    host=mongo_host,
    port=mongo_port,
    serverSelectionTimeoutMS=2000,
    connectTimeoutMS=10000,
    socketTimeoutMS=10000
)

try:
    client.admin.command("ping")
except Exception as e:
    raise RuntimeError(f"MongoDB non accessible: {e}")

db = client[mongo_db]
resumes_collection = db[mongo_collection]


In [5]:
# Load summarization model (local inference)
logging.set_verbosity_error()
summarizer = pipeline("summarization", model="pszemraj/led-base-book-summary", tokenizer="pszemraj/led-base-book-summary")

config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/648M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [6]:
def clean_markdown(text):
    # Remove badges (e.g., ![alt](...) or <img ...>)
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
    text = re.sub(r'<img.*?>', '', text)

    # Remove raw HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove code blocks (```...```)
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

    # Remove inline code (`...`)
    text = re.sub(r'`[^`]+`', '', text)

    # Remove markdown links but keep the link text
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)

    # Remove headings (e.g., # Heading)
    text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)

    # Remove excess whitespace
    text = re.sub(r'\n{2,}', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)

    return text.strip()

In [7]:
def get_readme_content(github_readme_url):
    # Convert to raw content URL
    if "github.com" in github_readme_url and "blob" in github_readme_url:
        raw_url = github_readme_url.replace("github.com", "raw.githubusercontent.com").replace("/blob", "")
    else:
        raw_url = github_readme_url  # assume it's already raw

    try:
        readme_resp = requests.get(raw_url)
        readme_resp.raise_for_status()
        return clean_markdown(readme_resp.text)
    except requests.exceptions.RequestException as e:
        return None

In [8]:
def extract_github_projects(profile_url, token=None, existing_repo_names=None):
    username = urlparse(profile_url).path.strip("/").split("/")[0]
    headers = {"Accept": "application/vnd.github+json"}
    repos_url = f"https://api.github.com/users/{username}/repos"
    if token:
        headers["Authorization"] = f"Bearer {token}"

    existing_repo_names = {name.lower() for name in (existing_repo_names or [])}

    try:
        response = requests.get(repos_url, headers=headers)
        response.raise_for_status()
        repos = response.json()
        projects = []

        for repo in repos:
            repo_name = repo["name"]

            # Skip already existing repos
            if repo_name.lower() in existing_repo_names:
                print(f"repo {repo_name} already exists, skipping")
                continue

            print(f"processing repo {repo_name}")

            repo_url = repo["html_url"]
            description = repo["description"]

            langs_url = f"https://api.github.com/repos/{username}/{repo_name}/languages"
            langs_resp = requests.get(langs_url, headers=headers)
            all_languages = list(langs_resp.json().keys()) if langs_resp.status_code == 200 else []

            readme_url = f"https://github.com/{username}/{repo_name}/blob/main/README.md"
            readme_resp = requests.get(readme_url)

            if readme_resp.status_code != 200:
                readme_url = f"https://github.com/{username}/{repo_name}/blob/master/README.md"
                readme_resp = requests.get(readme_url)

            summary = None
            if readme_resp.status_code == 200:
                try:
                    content = get_readme_content(readme_url)
                    input_length = len(summarizer.tokenizer.encode(content))
                    max_length = min(200, int(input_length * 0.8)) if input_length > 30 else 40
                    summary = summarizer(content, max_length=max_length, min_length=30, do_sample=False)[0]['summary_text']
                except Exception:
                    summary = None

            project = {
                "repo_name": repo_name,
                "description": summary or description,
                "url": repo_url,
                "languages": all_languages
            }

            if all_languages:
                projects.append(project)

        return {"github_projects": projects}

    except requests.exceptions.RequestException as e:
        print(f"Error fetching GitHub repos for {username}: {e}")
        return {"github_projects": []}

In [9]:
def update_resumes_with_github_projects():
    resumes = list(resumes_collection.find({
        "personal_links.github": {"$exists": True, "$ne": None}
    }))

    for resume in tqdm(resumes, desc="Updating resumes"):
        github_url = resume["personal_links"]["github"]

        # Ensure the URL starts with https://
        if not github_url.startswith("http"):
            github_url = "https://" + github_url

        print(f"\nProcessing resume: {resume.get('_id')} | GitHub: {github_url}")

        existing_projects = resume.get("github_projects", [])

        existing_names = {proj["repo_name"].lower() for proj in existing_projects}

        extracted = extract_github_projects(github_url, token=github_token, existing_repo_names=existing_names)

        new_projects = extracted.get("github_projects", [])

        # Update if needed
        if new_projects:
            updated_projects = existing_projects + new_projects
            resumes_collection.update_one(
                {"_id": resume["_id"]},
                {
                    "$set": {
                        "github_projects": updated_projects,
                        "last_updated": datetime.utcnow()
                    }
                }
            )
            print(f"→ Updated with {len(new_projects)} new project(s).")
        else:
            print("→ No new projects to add.")

In [10]:
update_resumes_with_github_projects()

Updating resumes:   0%|                                                     | 0/15 [00:00<?, ?it/s]


Processing resume: 6816e2308c095f364cfca340 | GitHub: https://github.com/ExtremelySunnyYK


Updating resumes:   7%|███                                          | 1/15 [00:00<00:04,  2.84it/s]

Error fetching GitHub repos for ExtremelySunnyYK: 404 Client Error: Not Found for url: https://api.github.com/users/ExtremelySunnyYK/repos
→ No new projects to add.

Processing resume: 68175c842d7cc10ceb81604e | GitHub: https://github.com/ashlylau


repo adversarial-robustness-toolbox already exists, skipping
repo archive-website already exists, skipping
repo bg-anim already exists, skipping
repo bootstrap-test already exists, skipping
repo C-Lexis-Tests already exists, skipping
repo Camera-Application already exists, skipping
repo co362-group1 already exists, skipping
repo deep-income already exists, skipping
repo facebook-H-A-P already exists, skipping
repo First-Android-App already exists, skipping
repo first-website already exists, skipping
repo Flare-Flutter already exists, skipping
repo google-challenges-practice already exists, skipping
repo Handcrafted-DP already exists, skipping
repo HatchHack-Team2 already exists, skipping
processing repo hatchlondon-2017-projects


repo health-hack-20 already exists, skipping
repo hole-in-the-wall already exists, skipping
repo hole-in-the-wall-archive already exists, skipping
repo internship-coding-challenge-2019 already exists, skipping
repo intro-to-ml-repo already exists, skipping
repo judge-pet already exists, skipping
repo mixup-cifar10 already exists, skipping
repo ml-class already exists, skipping
repo NHANES-diabetes already exists, skipping
processing repo personal-website-1.0


processing repo personal-website-2.0


processing repo personal-website-3.0


Updating resumes:  13%|██████                                       | 2/15 [00:45<05:46, 26.62s/it]

repo private-data-generation already exists, skipping
repo private-pipelines already exists, skipping
→ Updated with 3 new project(s).

Processing resume: 68175d692d7cc10ceb816050 | GitHub: https://github.com/AnuvaGoyal


Updating resumes:  20%|█████████                                    | 3/15 [00:45<02:56, 14.68s/it]

repo FACE-MASK-DETECTION already exists, skipping
repo Heart-Attack-Prediction already exists, skipping
repo LDA-Topic-Modeling already exists, skipping
repo Mental-Healthcare-Chatbot already exists, skipping
repo TECH-A-THON-Emotion-Based-Movie-Recommender-System already exists, skipping
repo Web-Scraping already exists, skipping
→ No new projects to add.

Processing resume: 68175e0a2d7cc10ceb816051 | GitHub: https://github.com/AnuvaGoyal


Updating resumes:  27%|████████████                                 | 4/15 [00:46<01:38,  8.98s/it]

repo FACE-MASK-DETECTION already exists, skipping
repo Heart-Attack-Prediction already exists, skipping
repo LDA-Topic-Modeling already exists, skipping
repo Mental-Healthcare-Chatbot already exists, skipping
repo TECH-A-THON-Emotion-Based-Movie-Recommender-System already exists, skipping
repo Web-Scraping already exists, skipping
→ No new projects to add.

Processing resume: 6817625e2d7cc10ceb816053 | GitHub: https://github.com/aswathinrp


repo AD-Click-Predictions already exists, skipping
repo aswathi already exists, skipping
processing repo aswathinrp


repo bg_removal already exists, skipping
repo CarPricePrediction-Flask-app already exists, skipping
processing repo clone-microsoft


repo Driver-drowsiness-project already exists, skipping
repo facedetection-harcascade already exists, skipping
processing repo Facemask-detection


repo forest_fire-prediction already exists, skipping
repo HandgestureRecognition already exists, skipping
processing repo hello_world


processing repo image_summarisation-with-openai


repo interaction-with-db-tables already exists, skipping
repo login-home-page-using-python-django already exists, skipping
repo Movie-Recommendations-Using-Machine-Learning---Streamlit already exists, skipping
repo ms-clone already exists, skipping
repo mybudgethouse already exists, skipping
repo name-generator already exists, skipping
repo Netflix already exists, skipping
repo netflix-responsive already exists, skipping
repo NumberplateDetection already exists, skipping
repo object-detection-using-mobilenet-SSD already exists, skipping
repo object-detection-using-SSD already exists, skipping
repo open_cv already exists, skipping
processing repo parrot_detection_using-yolov5


repo pdf-e-sign-with-python already exists, skipping
repo pdf-esign-application already exists, skipping
repo Pencil-Drawing---cv2 already exists, skipping
processing repo personalsite


Updating resumes:  33%|███████████████                              | 5/15 [01:10<02:23, 14.39s/it]

→ No new projects to add.

Processing resume: 681763012d7cc10ceb816055 | GitHub: https://github.com/Fulkar-khan


Updating resumes:  40%|██████████████████                           | 6/15 [01:10<01:26,  9.61s/it]

Error fetching GitHub repos for Fulkar-khan: 404 Client Error: Not Found for url: https://api.github.com/users/Fulkar-khan/repos
→ No new projects to add.

Processing resume: 681763fc2d7cc10ceb816057 | GitHub: https://github.com/KunikaBhargav


Updating resumes:  47%|█████████████████████                        | 7/15 [01:10<00:52,  6.59s/it]

Error fetching GitHub repos for KunikaBhargav: 404 Client Error: Not Found for url: https://api.github.com/users/KunikaBhargav/repos
→ No new projects to add.

Processing resume: 6817649a2d7cc10ceb816058 | GitHub: https://github.com/kyotikhan


Updating resumes:  53%|████████████████████████                     | 8/15 [01:11<00:32,  4.62s/it]

Error fetching GitHub repos for kyotikhan: 404 Client Error: Not Found for url: https://api.github.com/users/kyotikhan/repos
→ No new projects to add.

Processing resume: 681764f32d7cc10ceb816059 | GitHub: https://github.com/ToshanTile


Updating resumes:  60%|███████████████████████████                  | 9/15 [01:11<00:19,  3.28s/it]

Error fetching GitHub repos for ToshanTile: 404 Client Error: Not Found for url: https://api.github.com/users/ToshanTile/repos
→ No new projects to add.

Processing resume: 681766372d7cc10ceb81605b | GitHub: https://github.com/AnmishaMurarka


Updating resumes:  67%|█████████████████████████████▎              | 10/15 [01:11<00:11,  2.37s/it]

Error fetching GitHub repos for AnmishaMurarka: 404 Client Error: Not Found for url: https://api.github.com/users/AnmishaMurarka/repos
→ No new projects to add.

Processing resume: 681766e72d7cc10ceb81605c | GitHub: https://phy-m


Updating resumes:  73%|████████████████████████████████▎           | 11/15 [01:12<00:06,  1.73s/it]

Error fetching GitHub repos for : 404 Client Error: Not Found for url: https://api.github.com/users//repos
→ No new projects to add.

Processing resume: 681b6fed5416ebae9d37160e | GitHub: https://github.com/KuruvaNagaraju/DataScience


repo DATA-SCIENCE already exists, skipping
repo DataScience already exists, skipping
processing repo Nagaraju


Updating resumes:  80%|███████████████████████████████████▏        | 12/15 [01:14<00:05,  1.88s/it]

→ No new projects to add.

Processing resume: 681b71645416ebae9d371610 | GitHub: https://github.com/gkapil801


Updating resumes:  87%|██████████████████████████████████████▏     | 13/15 [01:14<00:02,  1.41s/it]

Error fetching GitHub repos for gkapil801: 404 Client Error: Not Found for url: https://api.github.com/users/gkapil801/repos
→ No new projects to add.

Processing resume: 681b75995416ebae9d371612 | GitHub: https://github.com/dhaferalmakhles


Updating resumes:  93%|█████████████████████████████████████████   | 14/15 [01:15<00:01,  1.10s/it]

repo dhaferalmakhles.github.io already exists, skipping
→ No new projects to add.

Processing resume: 681b77195416ebae9d371615 | GitHub: https://github.com/shahidmumtaz


processing repo -Shahid


processing repo A_Sentiment_analysis


repo belly_button already exists, skipping
repo BootCampUCD.github.io already exists, skipping
processing repo CitiBike_Tableau


repo COVID-19-Socio_Economic_Analysis-Project already exists, skipping
repo D3_challenge already exists, skipping
repo IMDB_Movie_Analysis-Project already exists, skipping
repo javascript_challenge already exists, skipping
repo leaflet_challenge already exists, skipping
repo Matplotlib already exists, skipping
repo Pandas_Challenge already exists, skipping
repo Pet_Pals already exists, skipping
repo Plotly-HW already exists, skipping
processing repo Predictive-Analysis


repo project_2 already exists, skipping
processing repo Python-Homework


repo Python_API already exists, skipping
repo python_challenge already exists, skipping
processing repo Scientific-Decision-Making


processing repo Shahidmumtaz


repo Shahidmumtaz.github.io already exists, skipping
repo Shark_tank_ETL already exists, skipping
repo sqlalchemy_challenge already exists, skipping
repo sql_challenge already exists, skipping
processing repo Understanding-Visualizing-data


Updating resumes:  93%|█████████████████████████████████████████   | 14/15 [03:10<00:13, 13.59s/it]




In [None]:
print("Mongo host:", mongo_host)
print("Mongo port:", mongo_port)
print("Mongo DB:", mongo_db)
print("Mongo collection:", mongo_collection)
print("Resumes collection is None:", resumes_collection is None)
