In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import glob

# Raw string literal: the Windows path contains backslashes followed by letters
# (\y, \T, \P, \R). In a normal string these are unrecognised escape sequences,
# which Python only tolerates with a warning; a raw string keeps every
# backslash literally and yields exactly the same path value.
folder_path = r'E:\youssef\Testing\Python\Resumes'

# glob returns matches in filesystem-dependent order; sorting keeps the
# processing order (and therefore the printed listing) stable between runs.
files = sorted(glob.glob(os.path.join(folder_path, '*')))

for file in files:
    print(file)

E:\youssef\Testing\Python\Resumes\Amsterdam-Modern-Resume-Template.pdf
E:\youssef\Testing\Python\Resumes\Dublin-Resume-Template-Modern.pdf
E:\youssef\Testing\Python\Resumes\London-Resume-Template-Professional.pdf
E:\youssef\Testing\Python\Resumes\Madrid-Resume-Template-Modern.pdf
E:\youssef\Testing\Python\Resumes\Moscow-Creative-Resume-Template.pdf
E:\youssef\Testing\Python\Resumes\New-York-Resume-Template-Creative.pdf
E:\youssef\Testing\Python\Resumes\Santiago-Resume-Template-Professional.pdf
E:\youssef\Testing\Python\Resumes\Stockholm-Resume-Template-Simple.pdf
E:\youssef\Testing\Python\Resumes\Sydney-Resume-Template-Modern.pdf
E:\youssef\Testing\Python\Resumes\Vienna-Modern-Resume-Template.pdf
E:\youssef\Testing\Python\Resumes\Yosry-Negm-Resume.pdf


In [2]:
import mimetypes
import PyPDF2
from docx import Document
from pptx import Presentation

# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode and
            handed to ``PyPDF2.PdfReader``.

    Returns:
        One string containing the text of each page that produced any text,
        with consecutive pages separated by a single newline.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file handle is still open.
        # `or ""` guards against a page whose extract_text() yields None/empty
        # so that one text-less page cannot break the whole document.
        page_texts = [page.extract_text() or "" for page in reader.pages]

    # Join with a newline (as the DOCX/PPTX extractors do) so the last word of
    # one page is not fused with the first word of the next; text-less pages
    # are skipped so they do not introduce blank lines.
    return "\n".join(page_text for page_text in page_texts if page_text)

# Function to extract text from Word (DOCX)
def extract_text_from_docx(file_path):
    """Return the paragraph text of the Word document at ``file_path``.

    Only the entries of ``Document(...).paragraphs`` are read; their ``text``
    values are joined with a newline, one paragraph per line.
    """
    document = Document(file_path)
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    return '\n'.join(paragraph_texts)

# Function to extract text from PowerPoint (PPTX)
def extract_text_from_pptx(file_path):
    """Return the text of every shape in the presentation at ``file_path``.

    Slides and their shapes are visited in order; each shape that exposes a
    ``text`` attribute contributes that text followed by a newline. Shapes
    without a ``text`` attribute are skipped.
    """
    presentation = Presentation(file_path)
    collected = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            if not hasattr(shape, "text"):
                continue
            collected.append(shape.text + '\n')
    return ''.join(collected)

# Function to detect file type and extract text accordingly
def extract_text_from_file(file_path):
    """Detect the type of ``file_path`` and extract its text.

    The type is first guessed with ``mimetypes.guess_type``. That lookup
    depends on the MIME database available on the machine, which does not
    always know the Office formats, so when the guess is not one of the
    supported types the file extension (case-insensitive) is used instead.

    Args:
        file_path: Path of a PDF, DOCX or PPTX file.

    Returns:
        The text returned by the matching ``extract_text_from_*`` function.

    Raises:
        ValueError: If the file is neither PDF, DOCX nor PPTX.
    """
    pdf_mime = 'application/pdf'
    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    pptx_mime = 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    extension_to_mime = {'.pdf': pdf_mime, '.docx': docx_mime, '.pptx': pptx_mime}

    mime_type, _ = mimetypes.guess_type(file_path)

    # Fallback: an unknown (None) or unsupported guess for a file that carries
    # a supported extension is resolved from the extension itself. A supported
    # guess is always kept, so previously working inputs dispatch as before.
    if mime_type not in extension_to_mime.values():
        extension = os.path.splitext(file_path)[1].lower()
        mime_type = extension_to_mime.get(extension, mime_type)

    if mime_type == pdf_mime:
        return extract_text_from_pdf(file_path)
    elif mime_type == docx_mime:
        return extract_text_from_docx(file_path)
    elif mime_type == pptx_mime:
        return extract_text_from_pptx(file_path)
    else:
        raise ValueError("Unsupported file type")

In [3]:
#Split text into chunks
def split_text_into_chunks(text, max_length=512):
    """Split ``text`` into chunks of at most ``max_length`` characters.

    The text is cut into paragraphs on blank lines (``'\\n\\n'``) and
    consecutive paragraphs are packed greedily, separated by a space, while
    the running chunk stays within ``max_length``. A paragraph that is longer
    than ``max_length`` on its own is first wrapped on whitespace (and, for a
    single over-long word, cut mid-word) so that no returned chunk exceeds
    the limit; inside such a paragraph line breaks are turned into spaces.

    Args:
        text: The text to split.
        max_length: Maximum chunk size, counted in characters (not tokens).

    Returns:
        A list of non-empty, stripped chunk strings in document order.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    import textwrap  # local import: only this function needs it

    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    # Break oversized paragraphs into pieces that individually fit the limit.
    # Paragraphs that already fit are passed through untouched, so the result
    # for such input is identical to plain greedy packing.
    pieces = []
    for paragraph in text.split('\n\n'):
        if len(paragraph) > max_length:
            pieces.extend(
                textwrap.wrap(
                    paragraph,
                    width=max_length,
                    break_long_words=True,
                    break_on_hyphens=False,
                )
            )
        else:
            pieces.append(paragraph)

    chunks = []
    current_chunk = ""

    for piece in pieces:
        if len(current_chunk) + len(piece) <= max_length:
            current_chunk += piece + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = piece + " "

    if current_chunk:  # Add the last chunk
        chunks.append(current_chunk.strip())

    # Filter out empty or whitespace-only chunks
    return [chunk for chunk in chunks if chunk.strip()]

from transformers import BertModel, BertTokenizer
import torch

# Tokenizer and encoder are loaded from the same pre-trained checkpoint so the
# vocabulary used for tokenisation matches the one the model expects.
PRETRAINED_CHECKPOINT = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = BertModel.from_pretrained(PRETRAINED_CHECKPOINT)

def embed_text(text):
    """Embed ``text`` by mean-pooling the model's last hidden states.

    The text is tokenised with the module-level ``tokenizer`` (truncated and
    padded) and run through the module-level ``model`` without gradient
    tracking. The token vectors are then averaged along the sequence
    dimension, counting only positions whose attention mask is 1, so padding
    added for batched input does not dilute the result. For a single string
    (no padding) this equals the plain mean over all tokens.

    Args:
        text: A string, or a list of strings, accepted by ``tokenizer``.

    Returns:
        A tensor with one pooled vector per input text; the sequence
        dimension (dim 1) of ``last_hidden_state`` is reduced away.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state
        # Broadcast the mask over the hidden dimension: 1 for real tokens,
        # 0 for padding.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
        # clamp avoids a division by zero for a row with no unmasked token.
        token_counts = mask.sum(dim=1).clamp(min=1)
        embedding = (hidden_states * mask).sum(dim=1) / token_counts
    return embedding



In [4]:
import numpy as np
from pymongo import MongoClient
from datetime import datetime

def average_embeddings(embeddings):
    """Return the element-wise mean of a non-empty list of same-shaped tensors."""
    return torch.mean(torch.stack(embeddings), dim=0)


# Connect to MongoDB once for the whole run instead of once per file: every
# MongoClient owns its own connection pool, so creating one per iteration
# leaves a pile of unused clients behind. The client is deliberately left
# open in case later cells query `collection`.
client = MongoClient('mongodb://localhost:27017/')
db = client['VectorDBPython']
collection = db['CVs']

# Count of documents actually inserted.
number = 0

for file_path in files:
    try:
        text_content = extract_text_from_file(file_path)
        chunks = split_text_into_chunks(text_content)

        # A file without extractable text yields no chunks; torch.stack([])
        # would then raise RuntimeError, which the handler below does not
        # catch and which would abort the whole run. Report and skip instead.
        if not chunks:
            raise ValueError(f"No extractable text in {file_path}")

        # One embedding per chunk, averaged into a single document embedding.
        chunk_embeddings = [embed_text(chunk) for chunk in chunks]
        document_embedding = average_embeddings(chunk_embeddings)

        # Convert to a plain list so it can be stored in MongoDB.
        # NOTE(review): embed_text appears to keep a leading batch dimension,
        # so this is presumably a nested list ([[...]]) - confirm consumers
        # of the collection expect that shape.
        embedding_list = document_embedding.tolist()

        # Define the document with metadata, embedding, and text excerpt.
        # NOTE(review): the same hard-coded email and placeholder tags are
        # attached to every file, and "text_excerpt" holds the full text.
        document = {
            "metadata": {
                "file_path": file_path,
                "email": "yosrinegm@gmail.com",
                "created_at": datetime.now().isoformat(),
                "tags": ["tag1", "tag2"],
            },
            "embedding": embedding_list,
            "text_excerpt": text_content,
        }

        # Insert the document into the collection
        result = collection.insert_one(document)
        print(f"Inserted document ID: {result.inserted_id}")
        number += 1

    except ValueError as e:
        # Unsupported or text-less files are reported and skipped.
        print(e)
print(f"{number} documents added")

Inserted document ID: 66bfc407fd3ca604c3fce31d
Inserted document ID: 66bfc40bfd3ca604c3fce31f
Inserted document ID: 66bfc40efd3ca604c3fce321
Inserted document ID: 66bfc412fd3ca604c3fce323
Inserted document ID: 66bfc416fd3ca604c3fce325
Inserted document ID: 66bfc41afd3ca604c3fce327
Inserted document ID: 66bfc41efd3ca604c3fce329
Inserted document ID: 66bfc421fd3ca604c3fce32b
Inserted document ID: 66bfc425fd3ca604c3fce32d
Inserted document ID: 66bfc428fd3ca604c3fce32f
Inserted document ID: 66bfc42cfd3ca604c3fce331
11 documents added
