In [1]:
import os

# Root folder of the resume dataset: one subfolder per resume category,
# optionally with loose resume files sitting directly in the root.
dataset_path = r"C:\Users\adity\Resume_Classification_Project\data"

print("RESUME DATASET EXPLORATION")

if not os.path.isdir(dataset_path):
    # isdir (rather than exists) also rejects a path that points at a regular
    # file, which would otherwise make os.listdir() below raise.
    print(f" Path not found: {dataset_path}")
    print("Please update the 'dataset_path' variable with your correct path")
else:
    print(" Dataset path found!\n")

    # Scan the dataset root once and split its entries into category
    # subfolders and files that live directly in the root.
    categories = []
    direct_files = []
    for item in os.listdir(dataset_path):
        item_path = os.path.join(dataset_path, item)
        if os.path.isdir(item_path):
            categories.append(item)
        elif os.path.isfile(item_path):
            direct_files.append(item)

    print(f"Total Subfolders Found: {len(categories)}\n")

    # Show files directly in the main folder (only when there are any)
    if direct_files:
        print(" Files in Main Resumes Folder (no subfolder):")
        print(f"   {len(direct_files)} files found")
        print(f"   Examples: {direct_files[:3]}")  # Show first 3
        print()

    # Show each category with its file count
    print("Category Subfolders:")

    total_files = len(direct_files)
    for category in sorted(categories):
        category_path = os.path.join(dataset_path, category)
        # Count regular files only: a directory nested inside a category
        # folder is not a resume and must not inflate the totals.
        file_count = sum(
            1
            for entry in os.listdir(category_path)
            if os.path.isfile(os.path.join(category_path, entry))
        )
        total_files += file_count

        print(f" {category}: {file_count} files")

    print(f"\n TOTAL Resume Files: {total_files}")
    print(f"   - Files in main folder: {len(direct_files)}")
    print(f"   - Files in subfolders: {total_files - len(direct_files)}")
    print(f" Total Subfolders: {len(categories)}")

RESUME DATASET EXPLORATION
 Dataset path found!

Total Subfolders Found: 4

Category Subfolders:
 Peoplesoft resumes: 20 files
 React.js Developers: 24 files
 SQL Developer Lightning insight: 14 files
 workday resumes: 21 files

 TOTAL Resume Files: 79
   - Files in main folder: 0
   - Files in subfolders: 79
 Total Subfolders: 4


In [3]:
import os
from collections import Counter

# Root folder of the resume dataset (category subfolders plus optional loose files).
dataset_path = r"C:\Users\adity\Resume_Classification_Project\data"

print("FILE TYPE ANALYSIS")

# Collect the lower-cased extension of every resume file. Root-level files
# are kept ahead of subfolder files so that Counter.most_common() breaks ties
# in the same order as before.
file_extensions = []
subfolder_extensions = []

for item in os.listdir(dataset_path):
    item_path = os.path.join(dataset_path, item)
    if os.path.isfile(item_path):
        file_extensions.append(os.path.splitext(item)[1].lower())
    elif os.path.isdir(item_path):
        for file in os.listdir(item_path):
            # Only regular files count: a directory nested inside a category
            # folder would otherwise be tallied as a file with no extension.
            if os.path.isfile(os.path.join(item_path, file)):
                subfolder_extensions.append(os.path.splitext(file)[1].lower())

file_extensions.extend(subfolder_extensions)

# Count file types
file_type_counts = Counter(file_extensions)
total_file_count = len(file_extensions)

print("\nFile Types Found:")

# The loop body never runs when no files were found, so the division is safe.
for ext, count in file_type_counts.most_common():
    percentage = (count / total_file_count) * 100
    print(f"{ext:10} : {count:3} files ({percentage:.1f}%)")

print(f"Total Files: {total_file_count}")

# Tell the reader which extraction library each format present will require
print("\n Libraries We'll Need:")

if '.pdf' in file_type_counts:
    print(" PyPDF2 or pdfplumber (for PDF files)")
if '.docx' in file_type_counts:
    print(" python-docx (for DOCX files)")
if '.doc' in file_type_counts:
    print(" textract or antiword (for old DOC files)")


FILE TYPE ANALYSIS

File Types Found:
.docx      :  52 files (65.8%)
.doc       :  26 files (32.9%)
.pdf       :   1 files (1.3%)
Total Files: 79

 Libraries We'll Need:
 PyPDF2 or pdfplumber (for PDF files)
 python-docx (for DOCX files)
 textract or antiword (for old DOC files)


In [7]:
!pip install python-docx


Collecting python-docx
  Using cached python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Using cached python_docx-1.2.0-py3-none-any.whl (252 kB)
Installing collected packages: python-docx
Successfully installed python-docx-1.2.0


In [9]:
import os
from docx import Document
import PyPDF2
import pandas as pd
import docx2txt

# Root folder of the resume dataset (an absolute path on the author's machine).
# The loops further down in this cell scan it for loose files and for
# category subfolders.
dataset_path = r"C:\Users\adity\Resume_Classification_Project\data"

# Banner printed once, ahead of the per-file progress lines.
print("READING ALL RESUMES - INCLUDING .DOC FILES")

def read_doc(file_path):
    """Extract the text of a ``.doc`` resume via ``docx2txt``.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text, or ``None`` when extraction fails for any reason.
        The failure reason is printed so that a failed file can be diagnosed
        instead of disappearing silently.

    NOTE(review): docx2txt is documented as a .docx extractor. Genuine legacy
    binary .doc files are expected to fail here and yield ``None`` -- confirm
    against the dataset, and convert such files to .docx or use a dedicated
    .doc tool if that is the case.
    """
    try:
        return docx2txt.process(file_path)
    except Exception as exc:
        # Best-effort reader: keep the batch running, but say why it failed.
        print(f"  [read_doc] could not read {file_path}: {type(exc).__name__}: {exc}")
        return None

def read_docx(file_path):
    """Extract the paragraph text of a ``.docx`` resume via python-docx.

    Args:
        file_path: Path of the .docx document to read.

    Returns:
        The text of all body paragraphs joined with newlines, or ``None``
        when the file cannot be opened or parsed. The failure reason is
        printed so that a failed file can be diagnosed.

    NOTE(review): only ``doc.paragraphs`` is read; text that lives inside
    tables is presumably not included -- confirm whether resumes in this
    dataset keep content (e.g. skills) in tables.
    """
    try:
        doc = Document(file_path)
        return '\n'.join(paragraph.text for paragraph in doc.paragraphs)
    except Exception as exc:
        # Best-effort reader: keep the batch running, but say why it failed.
        print(f"  [read_docx] could not read {file_path}: {type(exc).__name__}: {exc}")
        return None

def read_pdf(file_path):
    """Extract the text of a PDF resume via PyPDF2.

    Args:
        file_path: Path of the PDF document to read.

    Returns:
        The text of all pages joined with newlines, or ``None`` when the file
        cannot be opened or parsed. The failure reason is printed so that a
        failed file can be diagnosed.
    """
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # `or ""` guards against a page whose extraction yields no string
            # (e.g. an image-only page): a single such page must not make the
            # join raise and discard the text of every other page.
            pages_text = [page.extract_text() or "" for page in pdf_reader.pages]
        return '\n'.join(pages_text)
    except Exception as exc:
        # Best-effort reader: keep the batch running, but say why it failed.
        print(f"  [read_pdf] could not read {file_path}: {type(exc).__name__}: {exc}")
        return None

# One dict per successfully read resume: filename, category, content.
resume_data = []
success_count = 0
fail_count = 0


def _extract_content(path):
    """Return the text of the resume at `path`, or None if its extension is unsupported or reading fails."""
    ext = os.path.splitext(path)[1].lower()
    if ext == '.docx':
        return read_docx(path)
    if ext == '.pdf':
        return read_pdf(path)
    if ext == '.doc':
        return read_doc(path)
    return None


def _load_resume(path, filename, category):
    """Read one resume and append its record to `resume_data`; return True on success."""
    content = _extract_content(path)
    if not content:
        # None (reader failed / unsupported type) and "" (no text) both count as failures.
        return False
    resume_data.append({
        'filename': filename,
        'category': category,
        'content': content
    })
    return True


# Pass 1: resume files sitting directly in the dataset root.
for item in os.listdir(dataset_path):
    item_path = os.path.join(dataset_path, item)
    if not os.path.isfile(item_path):
        continue
    # NOTE(review): root-level files are always labelled 'React Developer';
    # presumably those resumes once lived in the root -- confirm the label.
    if _load_resume(item_path, item, 'React Developer'):
        success_count += 1
        print(f"Success: {item[:50]}")
    else:
        fail_count += 1
        print(f"Failed: {item}")

# Pass 2: one subfolder per category; the folder name is the label.
for folder in os.listdir(dataset_path):
    folder_path = os.path.join(dataset_path, folder)
    if not os.path.isdir(folder_path):
        continue
    category = folder
    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)
        if _load_resume(file_path, file, category):
            success_count += 1
            print(f"Success: [{category}] {file[:40]}")
        else:
            fail_count += 1
            print(f"Failed: {file}")

print("SUMMARY")
print(f"Successfully read: {success_count} resumes")
print(f"Failed to read: {fail_count} resumes")
print(f"Total processed: {success_count + fail_count}")

if resume_data:
    df = pd.DataFrame(resume_data)
    output_file = r"C:\Users\adity\Resume_Classification_Project\extracted_data\resumes_data.csv"
    # to_csv does not create missing folders; make sure the target exists so
    # the extraction work above is not lost to an OSError at the last step.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    df.to_csv(output_file, index=False)
    print(f"\nData saved to: {output_file}")
    print(f"Columns: {list(df.columns)}")
    print(f"Shape: {df.shape}")
else:
    print("\nNo resumes could be read; nothing was saved.")


ModuleNotFoundError: No module named 'PyPDF2'