In [12]:
#!pip install pymongo requests python-dotenv tqm

In [13]:
from pymongo import MongoClient
import os
from dotenv import load_dotenv
from urllib.parse import urlparse
from transformers import pipeline
import re
import requests
from bs4 import BeautifulSoup
from transformers.utils import logging
from tqdm import tqdm
from datetime import datetime

In [14]:
load_dotenv()

mongo_host = os.getenv('MONGO_HOST')
mongo_port = os.getenv('MONGO_PORT')
mongo_db = os.getenv('MONGO_DB')
mongo_collection = os.getenv('MONGO_COLLECTION')

github_token = os.getenv('GITHUB_TOKEN')

In [15]:
import os
from pymongo import MongoClient

mongo_host = os.getenv("MONGO_HOST", "localhost")
mongo_port = int(os.getenv("MONGO_PORT", 27017))
mongo_db = os.getenv("MONGO_DB", "PFE")  # valeur par défaut
mongo_collection = os.getenv("MONGO_COLLECTION", "resumes")  # valeur par défaut

client = MongoClient(
    host=mongo_host,
    port=mongo_port,
    serverSelectionTimeoutMS=2000,
    connectTimeoutMS=10000,
    socketTimeoutMS=10000
)

try:
    client.admin.command("ping")
except Exception as e:
    raise RuntimeError(f"MongoDB non accessible: {e}")

db = client[mongo_db]
resumes_collection = db[mongo_collection]


In [16]:
# Load summarization model (local inference)
logging.set_verbosity_error()
summarizer = pipeline("summarization", model="pszemraj/led-base-book-summary", tokenizer="pszemraj/led-base-book-summary")

In [17]:
def clean_markdown(text):
    # Remove badges (e.g., ![alt](...) or <img ...>)
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
    text = re.sub(r'<img.*?>', '', text)

    # Remove raw HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove code blocks (```...```)
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

    # Remove inline code (`...`)
    text = re.sub(r'`[^`]+`', '', text)

    # Remove markdown links but keep the link text
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)

    # Remove headings (e.g., # Heading)
    text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)

    # Remove excess whitespace
    text = re.sub(r'\n{2,}', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)

    return text.strip()

In [18]:
def get_readme_content(github_readme_url):
    # Convert to raw content URL
    if "github.com" in github_readme_url and "blob" in github_readme_url:
        raw_url = github_readme_url.replace("github.com", "raw.githubusercontent.com").replace("/blob", "")
    else:
        raw_url = github_readme_url  # assume it's already raw

    try:
        readme_resp = requests.get(raw_url)
        readme_resp.raise_for_status()
        return clean_markdown(readme_resp.text)
    except requests.exceptions.RequestException as e:
        return None

In [19]:
def extract_github_projects(profile_url, token=None, existing_repo_names=None):
    username = urlparse(profile_url).path.strip("/").split("/")[0]
    headers = {"Accept": "application/vnd.github+json"}
    repos_url = f"https://api.github.com/users/{username}/repos"
    if token:
        headers["Authorization"] = f"Bearer {token}"

    existing_repo_names = {name.lower() for name in (existing_repo_names or [])}

    try:
        response = requests.get(repos_url, headers=headers)
        response.raise_for_status()
        repos = response.json()
        projects = []

        for repo in repos:
            repo_name = repo["name"]

            # Skip already existing repos
            if repo_name.lower() in existing_repo_names:
                print(f"repo {repo_name} already exists, skipping")
                continue

            print(f"processing repo {repo_name}")

            repo_url = repo["html_url"]
            description = repo["description"]

            langs_url = f"https://api.github.com/repos/{username}/{repo_name}/languages"
            langs_resp = requests.get(langs_url, headers=headers)
            all_languages = list(langs_resp.json().keys()) if langs_resp.status_code == 200 else []

            readme_url = f"https://github.com/{username}/{repo_name}/blob/main/README.md"
            readme_resp = requests.get(readme_url)

            if readme_resp.status_code != 200:
                readme_url = f"https://github.com/{username}/{repo_name}/blob/master/README.md"
                readme_resp = requests.get(readme_url)

            summary = None
            if readme_resp.status_code == 200:
                try:
                    content = get_readme_content(readme_url)
                    input_length = len(summarizer.tokenizer.encode(content))
                    max_length = min(200, int(input_length * 0.8)) if input_length > 30 else 40
                    summary = summarizer(content, max_length=max_length, min_length=30, do_sample=False)[0]['summary_text']
                except Exception:
                    summary = None

            project = {
                "repo_name": repo_name,
                "description": summary or description,
                "url": repo_url,
                "languages": all_languages
            }

            if all_languages:
                projects.append(project)

        return {"github_projects": projects}

    except requests.exceptions.RequestException as e:
        print(f"Error fetching GitHub repos for {username}: {e}")
        return {"github_projects": []}

In [20]:
def update_resumes_with_github_projects():
    resumes = list(resumes_collection.find({
        "personal_links.github": {"$exists": True, "$ne": None}
    }))

    for resume in tqdm(resumes, desc="Updating resumes"):
        github_url = resume["personal_links"]["github"]

        # Ensure the URL starts with https://
        if not github_url.startswith("http"):
            github_url = "https://" + github_url

        print(f"\nProcessing resume: {resume.get('_id')} | GitHub: {github_url}")

        existing_projects = resume.get("github_projects", [])

        existing_names = {proj["repo_name"].lower() for proj in existing_projects}

        extracted = extract_github_projects(github_url, token=github_token, existing_repo_names=existing_names)

        new_projects = extracted.get("github_projects", [])

        # Update if needed
        if new_projects:
            updated_projects = existing_projects + new_projects
            resumes_collection.update_one(
                {"_id": resume["_id"]},
                {
                    "$set": {
                        "github_projects": updated_projects,
                        "last_updated": datetime.utcnow()
                    }
                }
            )
            print(f"→ Updated with {len(new_projects)} new project(s).")
        else:
            print("→ No new projects to add.")

In [21]:
update_resumes_with_github_projects()

Updating resumes:   0%|          | 0/16 [00:00<?, ?it/s]


Processing resume: 6816e2308c095f364cfca340 | GitHub: https://github.com/ExtremelySunnyYK


Updating resumes:   6%|▋         | 1/16 [00:00<00:05,  2.59it/s]

Error fetching GitHub repos for ExtremelySunnyYK: 404 Client Error: Not Found for url: https://api.github.com/users/ExtremelySunnyYK/repos
→ No new projects to add.

Processing resume: 68175c842d7cc10ceb81604e | GitHub: https://github.com/ashlylau
processing repo adversarial-robustness-toolbox
processing repo archive-website
processing repo ashlylau.github.io
processing repo bg-anim
processing repo bootstrap-test
processing repo C-Lexis-Tests




processing repo Camera-Application
processing repo co362-group1
processing repo deep-income
processing repo facebook-H-A-P
processing repo First-Android-App
processing repo first-website
processing repo Flare-Flutter
processing repo google-challenges-practice
processing repo Handcrafted-DP
processing repo HatchHack-Team2
processing repo hatchlondon-2017-projects
processing repo health-hack-20
processing repo hole-in-the-wall
processing repo hole-in-the-wall-archive
processing repo internship-coding-challenge-2019
processing repo intro-to-ml-repo
processing repo judge-pet
processing repo mixup-cifar10
processing repo ml-class
processing repo NHANES-diabetes
processing repo private-data-generation
processing repo private-pipelines
processing repo pygame-tutorial
processing repo PyStatDP


  "last_updated": datetime.utcnow()
Updating resumes:  12%|█▎        | 2/16 [05:14<43:12, 185.20s/it]

→ Updated with 29 new project(s).

Processing resume: 68175d692d7cc10ceb816050 | GitHub: https://github.com/AnuvaGoyal
processing repo FACE-MASK-DETECTION
processing repo Heart-Attack-Prediction
processing repo LDA-Topic-Modeling
processing repo Mental-Healthcare-Chatbot
processing repo TECH-A-THON-Emotion-Based-Movie-Recommender-System
processing repo Web-Scraping


Updating resumes:  19%|█▉        | 3/16 [06:40<30:18, 139.87s/it]

→ Updated with 6 new project(s).

Processing resume: 68175e0a2d7cc10ceb816051 | GitHub: https://github.com/AnuvaGoyal
processing repo FACE-MASK-DETECTION
processing repo Heart-Attack-Prediction
processing repo LDA-Topic-Modeling
processing repo Mental-Healthcare-Chatbot
processing repo TECH-A-THON-Emotion-Based-Movie-Recommender-System
processing repo Web-Scraping


Updating resumes:  25%|██▌       | 4/16 [08:06<23:40, 118.35s/it]

→ Updated with 6 new project(s).

Processing resume: 6817625e2d7cc10ceb816053 | GitHub: https://github.com/aswathinrp
processing repo AD-Click-Predictions
processing repo aswathi
processing repo aswathinrp
processing repo bg_removal
processing repo CarPricePrediction-Flask-app
processing repo clone-microsoft
processing repo Driver-drowsiness-project
processing repo facedetection-harcascade
processing repo Facemask-detection
processing repo forest_fire-prediction
processing repo HandgestureRecognition
processing repo hello_world
processing repo image_summarisation-with-openai
processing repo interaction-with-db-tables
processing repo login-home-page-using-python-django
processing repo Movie-Recommendations-Using-Machine-Learning---Streamlit
processing repo ms-clone
processing repo mybudgethouse
processing repo name-generator
processing repo Netflix
processing repo netflix-responsive
processing repo NumberplateDetection
processing repo object-detection-using-mobilenet-SSD
processing repo

Updating resumes:  31%|███▏      | 5/16 [09:39<20:00, 109.17s/it]

→ Updated with 23 new project(s).

Processing resume: 681763012d7cc10ceb816055 | GitHub: https://github.com/Fulkar-khan


Updating resumes:  38%|███▊      | 6/16 [09:39<12:01, 72.16s/it] 

Error fetching GitHub repos for Fulkar-khan: 404 Client Error: Not Found for url: https://api.github.com/users/Fulkar-khan/repos
→ No new projects to add.

Processing resume: 681763fc2d7cc10ceb816057 | GitHub: https://github.com/KunikaBhargav


Updating resumes:  44%|████▍     | 7/16 [09:39<07:18, 48.68s/it]

Error fetching GitHub repos for KunikaBhargav: 404 Client Error: Not Found for url: https://api.github.com/users/KunikaBhargav/repos
→ No new projects to add.

Processing resume: 6817649a2d7cc10ceb816058 | GitHub: https://github.com/kyotikhan


Updating resumes:  50%|█████     | 8/16 [09:40<04:26, 33.28s/it]

Error fetching GitHub repos for kyotikhan: 404 Client Error: Not Found for url: https://api.github.com/users/kyotikhan/repos
→ No new projects to add.

Processing resume: 681764f32d7cc10ceb816059 | GitHub: https://github.com/ToshanTile


Updating resumes:  56%|█████▋    | 9/16 [09:40<02:40, 22.97s/it]

Error fetching GitHub repos for ToshanTile: 404 Client Error: Not Found for url: https://api.github.com/users/ToshanTile/repos
→ No new projects to add.

Processing resume: 681766372d7cc10ceb81605b | GitHub: https://github.com/AnmishaMurarka


Updating resumes:  62%|██████▎   | 10/16 [09:40<01:35, 15.97s/it]

Error fetching GitHub repos for AnmishaMurarka: 404 Client Error: Not Found for url: https://api.github.com/users/AnmishaMurarka/repos
→ No new projects to add.

Processing resume: 681766e72d7cc10ceb81605c | GitHub: https://phy-m


Updating resumes:  69%|██████▉   | 11/16 [09:40<00:55, 11.17s/it]

Error fetching GitHub repos for : 404 Client Error: Not Found for url: https://api.github.com/users//repos
→ No new projects to add.

Processing resume: 681b6f745416ebae9d37160d | GitHub: https://
Error fetching GitHub repos for : 404 Client Error: Not Found for url: https://api.github.com/users//repos
→ No new projects to add.


Updating resumes:  75%|███████▌  | 12/16 [09:41<00:31,  7.83s/it]


Processing resume: 681b6fed5416ebae9d37160e | GitHub: https://github.com/KuruvaNagaraju/DataScience
processing repo DATA-SCIENCE
processing repo DataScience
processing repo Nagaraju


Updating resumes:  81%|████████▏ | 13/16 [10:08<00:41, 13.81s/it]

→ Updated with 2 new project(s).

Processing resume: 681b71645416ebae9d371610 | GitHub: https://github.com/gkapil801


Updating resumes:  88%|████████▊ | 14/16 [10:09<00:19,  9.76s/it]

Error fetching GitHub repos for gkapil801: 404 Client Error: Not Found for url: https://api.github.com/users/gkapil801/repos
→ No new projects to add.

Processing resume: 681b75995416ebae9d371612 | GitHub: https://github.com/dhaferalmakhles
processing repo dhaferalmakhles.github.io


Updating resumes:  94%|█████████▍| 15/16 [10:24<00:11, 11.45s/it]

→ Updated with 1 new project(s).

Processing resume: 681b77195416ebae9d371615 | GitHub: https://github.com/shahidmumtaz
processing repo -Shahid
processing repo A_Sentiment_analysis
processing repo belly_button
processing repo BootCampUCD.github.io
processing repo CitiBike_Tableau
processing repo COVID-19-Socio_Economic_Analysis-Project
processing repo D3_challenge
processing repo IMDB_Movie_Analysis-Project
processing repo javascript_challenge
processing repo leaflet_challenge
processing repo Matplotlib
processing repo Pandas_Challenge
processing repo Pet_Pals
processing repo Plotly-HW
processing repo Predictive-Analysis
processing repo project_2
processing repo Python-Homework
processing repo Python_API
processing repo python_challenge
processing repo Scientific-Decision-Making
processing repo Shahidmumtaz
processing repo Shahidmumtaz.github.io
processing repo Shark_tank_ETL
processing repo sqlalchemy_challenge
processing repo sql_challenge
processing repo Understanding-Visualizing-da

Updating resumes: 100%|██████████| 16/16 [16:11<00:00, 60.70s/it] 

→ Updated with 20 new project(s).





In [22]:
print("Mongo host:", mongo_host)
print("Mongo port:", mongo_port)
print("Mongo DB:", mongo_db)
print("Mongo collection:", mongo_collection)
print("Resumes collection is None:", resumes_collection is None)


Mongo host: localhost
Mongo port: 27017
Mongo DB: PFE
Mongo collection: resumes
Resumes collection is None: False
