In [1]:
import datetime
import json
import os
import platform
import stat


def get_drives():
    """Return the root paths that should be scanned on this machine.

    On Windows, one root per logical drive reported by ``GetLogicalDrives``
    is returned (drive letter, colon, backslash). On every other platform
    with a single-rooted filesystem (Linux, macOS, the BSDs, ...) the result
    is ``["/"]``.

    Returns:
        list[str]: Root paths to walk; empty only on a platform that is
        neither Windows nor POSIX-like.
    """
    system = platform.system()
    if system == "Windows":
        import string
        from ctypes import windll

        # Bit i of the mask is set when the i-th drive letter (A=0, B=1, ...) exists.
        bitmask = windll.kernel32.GetLogicalDrives()
        return [
            f"{letter}:\\"
            for position, letter in enumerate(string.ascii_uppercase)
            if (bitmask >> position) & 1
        ]
    # Linux and macOS were the only systems handled before; any other
    # POSIX-like platform has the same single root.
    if system in ("Linux", "Darwin") or os.name == "posix":
        return ["/"]
    return []


def get_file_metadata(file_path):
    """Collect basic metadata for a single file or directory.

    Args:
        file_path: Path of the entry to inspect.

    Returns:
        dict: On success, the keys ``file`` (base name), ``file_type``
        (``"Directory"`` or ``"File"``), ``location`` (absolute path) and
        ``created_date`` (``YYYY-MM-DD HH:MM:SS`` in local time). If the
        entry cannot be inspected, ``{"file": file_path, "error": <message>}``
        is returned instead of raising.
    """
    try:
        info = os.stat(file_path)
        # st_ctime is the last metadata change on Unix, not the creation time.
        # Use the real birth time where the platform exposes it and fall back
        # to st_ctime otherwise.
        created_ts = getattr(info, "st_birthtime", None)
        if created_ts is None:
            created_ts = info.st_ctime
        return {
            "file": os.path.basename(file_path),
            # Reuse the stat result rather than stat-ing the path a second time.
            "file_type": "Directory" if stat.S_ISDIR(info.st_mode) else "File",
            "location": os.path.abspath(file_path),
            "created_date": datetime.datetime.fromtimestamp(created_ts).strftime("%Y-%m-%d %H:%M:%S"),
        }
    except Exception as e:
        # Best effort: one unreadable entry must not abort a whole-drive scan.
        return {"file": file_path, "error": str(e)}


def scan_drive(drive):
    """Walk ``drive`` recursively and gather metadata for every entry.

    Args:
        drive: Root path handed to ``os.walk``.

    Returns:
        list[dict]: One ``get_file_metadata`` result per file and directory
        found below ``drive``; within each visited directory the files are
        listed before the sub-directories.
    """
    return [
        get_file_metadata(os.path.join(parent, entry))
        for parent, subdirs, filenames in os.walk(drive)
        for entry in filenames + subdirs
    ]


def main():
    """Scan every drive and dump the collected metadata to ``file_metadata.json``.

    Progress is printed per drive; the JSON file is written to the current
    working directory once all drives have been walked.
    """
    collected = []
    for drive in get_drives():
        print(f"Scanning {drive}...")
        collected += scan_drive(drive)

    with open("file_metadata.json", "w", encoding="utf-8") as out_file:
        json.dump(collected, out_file, indent=4)

    print("Metadata saved to file_metadata.json")


# Entry point for the scan cell: walks every drive returned by get_drives()
# and writes file_metadata.json.
if __name__ == "__main__":
    main()


Scanning C:\...


KeyboardInterrupt: 

## Load the JSON metadata into SQLite

In [None]:
import sqlite3

def load_json():
    """Read ``file_metadata.json`` from the working directory and return its parsed content."""
    with open("file_metadata.json", "r", encoding="utf-8") as handle:
        records = json.load(handle)
    return records

def sanitize_text(text):
    """Return ``text`` with every character that cannot be encoded as UTF-8 removed.

    ``None`` is mapped to the empty string.
    """
    if text is not None:
        utf8_bytes = text.encode("utf-8", errors="ignore")
        return utf8_bytes.decode("utf-8")
    return ""

def save_to_sqlite(data):
    """Insert scanned metadata records into the ``files`` table of ``file_metadata.db``.

    The table is created on first use. All rows are written in a single
    transaction: either every record is stored or, if anything fails, none
    are, and the connection is closed either way. Calling the function again
    appends the records again (there is no uniqueness constraint).

    Args:
        data: Iterable of dicts; the keys ``file``, ``file_type``,
            ``location`` and ``created_date`` are read, and a missing value
            is stored as an empty string.

    Raises:
        sqlite3.Error: If the database cannot be opened or written.
    """
    columns = ("file", "file_type", "location", "created_date")
    insert_sql = "INSERT INTO files (file, file_type, location, created_date) VALUES (?, ?, ?, ?)"

    conn = sqlite3.connect("file_metadata.db")
    try:
        conn.execute("PRAGMA busy_timeout = 5000")  # Wait up to 5 seconds
        # `with conn` commits on success and rolls back if an insert raises.
        with conn:
            conn.execute('''CREATE TABLE IF NOT EXISTS files (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        file TEXT,
                        file_type TEXT,
                        location TEXT,
                        created_date TEXT
                    )''')
            conn.executemany(
                insert_sql,
                (tuple(sanitize_text(entry.get(column)) for column in columns) for entry in data),
            )
    finally:
        # Release the database file even when the insert failed.
        conn.close()
    print("Metadata saved to file_metadata.db")


def main():
    """Load the records from ``file_metadata.json`` and store them in ``file_metadata.db``."""
    save_to_sqlite(load_json())

# Entry point for the conversion cell: reads file_metadata.json and inserts
# its records into file_metadata.db.
if __name__ == "__main__":
    main()


Metadata saved to file_metadata.db


In [1]:
import sqlite3

def _escape_like(term):
    """Escape the LIKE wildcards (% and _) and the escape character so ``term`` matches literally."""
    return term.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


def find_pdf_files_by_name(search_name, file_format):
    """Print and return the indexed files whose name contains ``search_name``.

    Args:
        search_name: Text that must occur in the file name. It is matched
            literally; SQLite's LIKE ignores ASCII case.
        file_format: Suffix the file name must end with, e.g. ``".pdf"``.

    Returns:
        list[tuple[str, str]]: ``(file, location)`` pairs read from the
        ``files`` table of ``file_metadata.db``; empty when nothing matches.
    """
    conn = sqlite3.connect("file_metadata.db")
    try:
        cursor = conn.cursor()
        # Both patterns are bound as parameters; the user-supplied parts are
        # escaped so "%" and "_" in a name are not treated as wildcards.
        cursor.execute(
            "SELECT file, location FROM files "
            "WHERE file LIKE ? ESCAPE '\\' AND file LIKE ? ESCAPE '\\'",
            (f"%{_escape_like(search_name)}%", f"%{_escape_like(file_format)}"),
        )
        results = cursor.fetchall()
    finally:
        # Close the connection even if the query fails.
        conn.close()

    matches = [(row[0], row[1]) for row in results]

    if matches:
        print("PDF Files found:")
        for name, location in matches:
            print(f"File: {name}, Location: {location}")
    else:
        print("No PDF files found with the given name.")

    # Return the matches so callers can use them instead of receiving None.
    return matches

# Example usage
search_name = "yashvardhan"
file_format = ".pdf"
# find_pdf_files_by_name already prints the matches itself, so its return
# value is not printed again here.
search_list = find_pdf_files_by_name(search_name, file_format)


PDF Files found:
File: YASHVARDHAN ASHOK (1).pdf, Location: D:\downloads\YASHVARDHAN ASHOK (1).pdf
File: YASHVARDHAN ASHOK (2).pdf, Location: D:\downloads\YASHVARDHAN ASHOK (2).pdf
File: Yashvardhan Ashok Letter (1).pdf, Location: D:\downloads\Yashvardhan Ashok Letter (1).pdf
File: Yashvardhan Ashok Letter (2).pdf, Location: D:\downloads\Yashvardhan Ashok Letter (2).pdf
File: Yashvardhan Ashok Letter.pdf, Location: D:\downloads\Yashvardhan Ashok Letter.pdf
File: Yashvardhan Ashok Resume - 1 (1).pdf, Location: D:\downloads\Yashvardhan Ashok Resume - 1 (1).pdf
File: Yashvardhan Ashok Resume - 1 (2).pdf, Location: D:\downloads\Yashvardhan Ashok Resume - 1 (2).pdf
File: Yashvardhan Ashok Resume - 1.pdf, Location: D:\downloads\Yashvardhan Ashok Resume - 1.pdf
File: Yashvardhan Ashok Resume - 2.pdf, Location: D:\downloads\Yashvardhan Ashok Resume - 2.pdf
File: YASHVARDHAN ASHOK.pdf, Location: D:\downloads\YASHVARDHAN ASHOK.pdf
File: YashvardhanAshokResume.pdf, Location: D:\downloads\Yashvard

In [None]:
import os
import pypdf
import chromadb
from sentence_transformers import SentenceTransformer

# Initialize ChromaDB and Sentence Transformer
# The Chroma client is pointed at the local "vector_db" path and the
# "pdf_embeddings" collection is created if it does not exist yet.
chroma_client = chromadb.PersistentClient(path="vector_db")
collection = chroma_client.get_or_create_collection("pdf_embeddings")
# `model` is also read by query_pdf_database in the query cell, which does not
# create a model of its own.
model = SentenceTransformer("all-MiniLM-L6-v2")

def _escape_like(term):
    """Escape the LIKE wildcards (% and _) and the escape character so ``term`` matches literally."""
    return term.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


def find_pdf_files_by_name(search_name, file_format=".pdf"):
    """Return the indexed files whose name contains ``search_name``.

    Args:
        search_name: Text that must occur in the file name. It is matched
            literally; SQLite's LIKE ignores ASCII case.
        file_format: Suffix the file name must end with. Defaults to ``".pdf"``.

    Returns:
        list[tuple[str, str]]: ``(file, location)`` pairs read from the
        ``files`` table of ``file_metadata.db``; empty when nothing matches.
    """
    conn = sqlite3.connect("file_metadata.db")
    try:
        cursor = conn.cursor()
        # Both patterns are bound as parameters; the user-supplied parts are
        # escaped so "%" and "_" in a name are not treated as wildcards.
        cursor.execute(
            "SELECT file, location FROM files "
            "WHERE file LIKE ? ESCAPE '\\' AND file LIKE ? ESCAPE '\\'",
            (f"%{_escape_like(search_name)}%", f"%{_escape_like(file_format)}"),
        )
        results = cursor.fetchall()
    finally:
        # Close the connection even if the query fails.
        conn.close()

    return [(row[0], row[1]) for row in results]

def extract_text_from_pdf(pdf_path):
    """Return the text of all pages of a PDF joined by single spaces.

    Pages for which ``extract_text()`` yields nothing are skipped.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str | None: The concatenated page text (possibly empty), or ``None``
        when the file cannot be opened or parsed; the error is printed.
    """
    try:
        with open(pdf_path, "rb") as f:
            reader = pypdf.PdfReader(f)
            # Call extract_text() once per page and filter afterwards, rather
            # than once for the emptiness test and once more for the value.
            page_texts = [page.extract_text() for page in reader.pages]
        return " ".join(page_text for page_text in page_texts if page_text)
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None

def process_pdfs(search_name="yashvardhan"):
    """Embed every matching PDF and store it in the Chroma collection.

    For each ``(file, location)`` pair returned by ``find_pdf_files_by_name``
    the text is extracted, encoded with ``model`` and written to
    ``collection`` under the file name as id. Files that no longer exist are
    reported; files without extractable text are skipped silently.

    Args:
        search_name: Text the PDF file names must contain. Defaults to the
            name that used to be hard-coded here.
    """
    for file_name, file_path in find_pdf_files_by_name(search_name):
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue

        text = extract_text_from_pdf(file_path)
        if not text:
            continue

        embedding = model.encode(text).tolist()
        # upsert rather than add, so running the cell again refreshes the
        # stored entry instead of colliding with the id from the previous run.
        # NOTE(review): ids are bare file names, so two PDFs with the same
        # name in different folders share one id - consider keying by path.
        collection.upsert(documents=[text], embeddings=[embedding], ids=[file_name])
        print(f"Processed and stored: {file_name}")

# Run the ingestion when this cell executes: extracts, embeds and stores the
# PDFs found via the SQLite index.
process_pdfs()


  from .autonotebook import tqdm as notebook_tqdm


Processed and stored: YASHVARDHAN ASHOK.pdf


In [None]:
import ollama
import chromadb
from sentence_transformers import SentenceTransformer  


# Initialize ChromaDB
# Re-opens the same "vector_db" path and "pdf_embeddings" collection name that
# the ingestion cell writes to.
# NOTE(review): get_collection (unlike get_or_create_collection used by the
# ingestion cell) presumably requires the collection to exist already - run
# the ingestion cell first; confirm against the Chroma docs.
chroma_client = chromadb.PersistentClient(path="vector_db")
collection = chroma_client.get_collection("pdf_embeddings")

def query_pdf_database(query, n_results=3):
    """Return the stored documents most similar to ``query``.

    Args:
        query: Free-text question to embed and search for.
        n_results: Maximum number of documents to retrieve. Defaults to 3.

    Returns:
        list[str] | str: The matching document texts, or the string
        ``"No relevant documents found."`` when the collection returns none.
    """
    # `model` is the SentenceTransformer created in the ingestion cell; this
    # cell does not build its own.
    query_embedding = model.encode(query).tolist()
    results = collection.query(query_embeddings=[query_embedding], n_results=n_results)
    # The result is indexed with [0] for the single query sent above. An empty
    # inner list means nothing matched; testing only the outer list would let
    # [[]] through and return [] instead of the fallback message.
    documents = results["documents"] or [[]]
    matches = documents[0]
    return matches if matches else "No relevant documents found."

def chat_with_tinyllama(query):
    """Answer ``query`` with the ``tinyllama:latest`` model, using retrieved PDF text as context.

    Args:
        query: The user's question.

    Returns:
        str: The content of the model's reply message.
    """
    relevant_docs = query_pdf_database(query)

    # query_pdf_database returns a list of documents or a fallback string.
    if isinstance(relevant_docs, list):
        relevant_docs = "\n".join(relevant_docs)

    # Send the question together with the retrieved text. Previously only the
    # retrieved text was sent, so the model never saw what was being asked.
    prompt = (
        "Use the following context to answer the question.\n\n"
        f"Context:\n{relevant_docs}\n\n"
        f"Question: {query}"
    )

    response = ollama.chat(model="tinyllama:latest", messages=[{"role": "user", "content": prompt}])
    return response["message"]["content"]

# Entry point for the chat cell: asks one hard-coded question and prints the
# model's reply.
if __name__ == "__main__":
    # Interactive alternative, currently disabled:
    # user_query = input("Ask a question:")
    # NOTE(review): the query text looks misspelled (probably meant
    # "what is his expertise") - confirm, since it is what gets embedded.
    user_query = "what his epectise"
    print(chat_with_tinyllama(user_query))


The student mentioned in the text, who is currently interning at MyUpchar, has over two years of experience in web development, data analysis, and automation. They have a strong proficiency in Python, Ruby on Rails, React, and MySQL, complemented by their skills in data visualization. Their focus is on innovative projects that improve workflows and drive decision-making through their expertise in applying data analysis techniques to improve processes. The student's internship at MyUpchar has focused on creating an offline-compatible Gmail-like internal communication system and improving message retrieval speed by 30%, streamlining messaging capabilities, reducing annual operational costs by 25%, and eliminating dependency on Google Workspace to reduce operational costs. In addition, they have created a Python-based GUI application integrated with Zebra Mail Server for automated management of over 3,000 emails daily, improved communication efficiency by 60% through real-time tracking vi