<a href="https://colab.research.google.com/github/alaaboobaid/CV-Extraction/blob/main/Testing_ApplicantSystemWithDetailedFilesDownloading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pdfplumber spacy phonenumbers sentence-transformers pandas numpy matplotlib seaborn reportlab
!python -m spacy download en_core_web_sm

Collecting pdfplumber
  Downloading pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting phonenumbers
  Downloading phonenumbers-9.0.21-py2.py3-none-any.whl.metadata (10 kB)
Collecting reportlab
  Downloading reportlab-4.4.7-py3-none-any.whl.metadata (1.7 kB)
Collecting pdfminer.six==20251230 (from pdfplumber)
  Downloading pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.3.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.9-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# cv_full_pipeline_colab.py
# -*- coding: utf-8 -*-
import os
import sys
import re
import json
import warnings
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from google.colab import files
import tempfile

import pdfplumber
import spacy
import phonenumbers
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Startup banner.
print("="*50)
print("CV PROCESSING PIPELINE FOR GOOGLE COLAB")
print("="*50)

# ======================
# Install required packages
# ======================
# The two lines starting with `!` are IPython/Colab shell escapes, so this
# cell is only runnable inside a notebook kernel, not as a plain .py script.
# NOTE(review): these packages are already imported at the top of this cell,
# so on a fresh runtime the imports would fail before this step is reached --
# presumably the earlier install cell is what makes them available.
print("\n[INFO] Installing required packages...")
!pip install pdfplumber spacy phonenumbers sentence-transformers seaborn matplotlib -q
!python -m spacy download en_core_web_sm -q

# ======================
# Initialize paths and directories
# ======================
from google.colab import drive

# Mount Google Drive at /content/drive so Drive paths can be used as inputs.
drive.mount('/content/drive')

# Working tree of the pipeline: a base folder with one sub-folder for the
# generated output files and one for cached models.
BASE_DIR = "/content/cv_processing"
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
MODEL_DIR = os.path.join(BASE_DIR, "cv_ai_models")

# Create each folder when it is missing (parent first, then the children).
for _directory in (BASE_DIR, OUTPUT_DIR, MODEL_DIR):
    os.makedirs(_directory, exist_ok=True)

print(f"[INFO] Base directory: {BASE_DIR}")
print(f"[INFO] Output directory: {OUTPUT_DIR}")

# ======================
# Load SpaCy model
# ======================
# Load the small English spaCy pipeline once; it is later handed to
# EliteCVExtractor.  Only ordinary exceptions are intercepted so that
# KeyboardInterrupt / SystemExit still propagate (a bare ``except:`` would
# swallow those as well).
try:
    nlp = spacy.load("en_core_web_sm")
except Exception as exc:
    # Best effort: keep the cell running but surface the reason.  `nlp` stays
    # undefined in this case, so any later use of it raises NameError.
    print("[ERROR] Failed to load SpaCy model")
    print(f"[ERROR] Reason: {exc}")
else:
    print("[INFO] SpaCy model loaded successfully")

# ======================
# EliteCVExtractor Class
# ======================
class EliteCVExtractor:
    """Rule-based extractor that turns a PDF CV into a structured dictionary.

    The text is read with pdfplumber and then mined with spaCy entities, the
    ``phonenumbers`` matcher, regular expressions and section-header keywords
    to obtain contact details, education, experience and skills.

    Args:
        nlp: Callable spaCy pipeline.  It is invoked as ``nlp(text)`` and the
            result is read through ``.ents`` and per-token ``.pos_``/``.text``.
    """

    # Technology names that are always reported when they occur in the
    # analysed text (matched case-insensitively as whole tokens).
    _SKILL_WHITELIST = (
        'C++', 'C#', 'Go', 'React', 'Vue', 'Node.js', 'Latex', 'Docker', 'Kubernetes',
        'SQL', 'NoSQL', 'Git', 'Linux', 'Matlab', 'Simulink', 'IoT', 'Python', 'Java',
        'TensorFlow', 'PyTorch', 'OpenCV', 'Spark', 'AWS', 'Azure', 'MongoDB', 'Flutter',
        'React Native',
    )

    # Regex fallbacks for phone numbers, tried in order when the
    # ``phonenumbers`` matcher yields nothing.  The last pattern uses a
    # look-behind instead of ``\b`` because ``\b`` never matches between a
    # space and ``+``, which made the optional ``+`` impossible to capture.
    _PHONE_PATTERNS = (
        r'\(\+97[02]\)\s?\d{6,12}',
        r'\+97[02]\s?\d{1,3}[\s-]?\d{3,6}',
        r'\b05\d{7,9}\b',
        r'(?<!\w)\+?\d{7,15}\b',
    )

    def __init__(self, nlp):
        self.nlp = nlp
        # Upper-case header keywords that open each logical CV section.
        self.headers = {
            'education': ['EDUCATION', 'ACADEMIC BACKGROUND', 'QUALIFICATIONS'],
            'experience': ['EXPERIENCE', 'ACADEMIC EXPERIENCE', 'TEACHING EXPERIENCE',
                           'PROFESSIONAL EXPERIENCE', 'WORK HISTORY', 'EMPLOYMENT', 'WORK EXPERIENCE'],
            'skills': ['SKILLS', 'TECHNICAL SKILLS', 'TECHNOLOGIES', 'COMPETENCIES'],
            'projects': ['PROJECTS', 'RESEARCH & PROJECTS', 'PUBLICATIONS', 'RESEARCH INTERESTS']
        }

    def _read_pdf(self, pdf_path):
        """Return the text lines of every page of ``pdf_path``.

        Read errors are printed, not raised; whatever was collected before
        the error (possibly nothing) is returned.
        """
        lines = []
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    text = page.extract_text()
                    if text:
                        lines.extend(text.split('\n'))
        except Exception as e:
            print(f"Error reading PDF: {e}")
        return lines

    def process(self, pdf_path):
        """Extract all supported fields from the CV at ``pdf_path``.

        Returns:
            dict with the keys ``personal`` (``name``/``phone``/``email``),
            ``education`` (list of lines), ``experience`` (experience lines
            followed by project lines), ``skills`` (sorted list) and
            ``raw_text`` (the full extracted text).
        """
        print(f"[INFO] Processing: {pdf_path}")
        lines = self._read_pdf(pdf_path)
        full_text = "\n".join(lines)

        name = self._extract_name(lines)
        phone = self._extract_phone(full_text)
        email = self._extract_email(full_text)
        sections = self._segment_sections(lines)
        education_data = self._refine_education(sections.get('education', []))
        refined_skills = self._refine_skills(sections.get('skills', []), full_text)
        combined_experience = sections.get('experience', []) + sections.get('projects', [])

        return {
            "personal": {"name": name, "phone": phone, "email": email},
            "education": education_data,
            "experience": combined_experience,
            "skills": refined_skills,
            "raw_text": full_text
        }

    def _extract_name(self, lines):
        """Guess the applicant name from the first five lines.

        A line wins when spaCy tags a PERSON entity in it (and the line has
        fewer than six words) or when it is all upper case with fewer than
        five words.  Otherwise the first non-blank line is used, and
        ``"Unknown"`` when there is none.
        """
        for line in lines[:5]:
            clean = line.strip()
            if not clean:
                continue
            word_count = len(clean.split())
            doc = self.nlp(clean)
            for ent in doc.ents:
                if ent.label_ == "PERSON" and word_count < 6:
                    return ent.text.strip()
            if clean.isupper() and word_count < 5:
                return clean
        # Fall back to the first line that has visible text, so a blank first
        # line no longer produces an empty name.
        return next((line.strip() for line in lines if line.strip()), "Unknown")

    def _extract_phone(self, text):
        """Return the first phone number found in ``text`` or ``"Not Found"``.

        The ``phonenumbers`` matcher (default region ``"PS"``) is tried first
        and its hit is formatted as INTERNATIONAL, falling back to E164.  If
        the matcher finds nothing or fails, the regex fallbacks are tried.
        """
        try:
            for match in phonenumbers.PhoneNumberMatcher(text, "PS"):
                num = match.number
                try:
                    return phonenumbers.format_number(num, phonenumbers.PhoneNumberFormat.INTERNATIONAL)
                except Exception:
                    return phonenumbers.format_number(num, phonenumbers.PhoneNumberFormat.E164)
        except Exception:
            # Deliberate best effort: any matcher problem just means the
            # regex fallbacks below get their turn.
            pass
        for pattern in self._PHONE_PATTERNS:
            match = re.search(pattern, text)
            if match:
                return match.group(0).strip()
        return "Not Found"

    def _extract_email(self, text):
        """Return the first e-mail address in ``text`` or ``"Not Found"``."""
        match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
        return match.group(0) if match else "Not Found"

    def _segment_sections(self, lines):
        """Group CV lines under the section whose header precedes them.

        A stripped line shorter than 60 characters whose upper-case form
        starts with one of the header keywords switches the current section
        and is itself dropped.  Lines seen before the first header are
        discarded.

        Returns:
            dict mapping every key of ``self.headers`` to its list of lines.
        """
        sections = {k: [] for k in self.headers.keys()}
        current_sec = None
        for line in lines:
            clean = line.strip()
            if not clean:
                continue
            up = clean.upper()
            matched_sec = None
            if len(up) < 60:  # long lines are treated as content, not headers
                for sec, keys in self.headers.items():
                    if up.startswith(tuple(keys)):
                        matched_sec = sec
                        break
            if matched_sec is not None:
                current_sec = matched_sec
                continue
            if current_sec:
                sections[current_sec].append(clean)
        return sections

    def _refine_education(self, edu_lines):
        """Keep the education lines that look informative.

        A line of at least four characters is kept when it mentions a degree
        keyword, an institution keyword, a year (19xx/20xx) or is longer than
        30 characters.
        """
        degree_words = ('BACHELOR', 'B.SC', 'MASTER', 'M.SC', 'PHD', 'PH.D', 'DOCTOR')
        institution_words = ('UNIVERSITY', 'COLLEGE', 'INSTITUTE', 'SCHOOL', 'FACULTY')
        refined = []
        for raw in edu_lines:
            entry = raw.strip()
            if len(entry) < 4:
                continue
            up = raw.upper()
            if (any(w in up for w in degree_words)
                    or any(w in up for w in institution_words)
                    or re.search(r'\b(19|20)\d{2}\b', raw)
                    or len(entry) > 30):
                refined.append(entry)
        return refined

    def _refine_skills(self, skill_lines, full_text):
        """Build the sorted skill list.

        The skills section is analysed when it holds more than 20 characters
        in total; otherwise the whole CV text is used.  Every NOUN/PROPN token
        longer than two characters is kept (after removing characters other
        than letters, digits and ``+ # . -``), and each whitelist entry
        present in the text is added.

        Returns:
            Skills sorted case-insensitively, without duplicates.
        """
        text = " ".join(skill_lines) if skill_lines and sum(len(s) for s in skill_lines) > 20 else full_text
        doc = self.nlp(text)
        skills = set()
        for token in doc:
            if token.pos_ in ['PROPN', 'NOUN'] and len(token.text) > 2:
                cleaned = re.sub(r'[^A-Za-z0-9\+\#\.\-]', '', token.text)
                if len(cleaned) > 1:
                    skills.add(cleaned)
        for w in self._SKILL_WHITELIST:
            # ``\b`` cannot be used here: names ending in a non-word character
            # ("C++", "C#") have no word boundary after them, so they would
            # never match.  Look-arounds express "not glued to a word char".
            if re.search(r'(?<!\w)' + re.escape(w) + r'(?!\w)', text, flags=re.IGNORECASE):
                skills.add(w)
        cleaned_skills = [s.strip() for s in skills if len(s.strip()) > 1]
        return sorted(set(cleaned_skills), key=lambda x: x.lower())

# ======================
# Internship Matcher Class
# ======================
class InternshipMatcher:
    """Rank internship postings against a CV.

    Ranking blends the cosine similarity between sentence embeddings of the
    CV and of each posting with a rule-based ATS score of the CV.

    Args:
        model: Embedding model; ``model.encode(..., convert_to_tensor=True)``
            is called on posting texts and on the candidate text.
        df: DataFrame of postings.  The columns ``position``,
            ``required_skills``, ``description``, ``company``, ``location``
            and ``monthly_stipend`` are read.  The frame is copied, so the
            caller's object is not modified.
    """

    def __init__(self, model, df):
        self.model = model
        self.df = df.copy()
        # astype(str) keeps the concatenation working when a column holds
        # non-string cells (e.g. purely numeric values).
        self.df["combined_text"] = (
            self.df["position"].astype(str) + " " +
            self.df["required_skills"].astype(str) + " " +
            self.df["description"].astype(str)
        )
        print("[INFO] Encoding internship embeddings...")
        self.job_embeddings = self.model.encode(self.df["combined_text"].tolist(), convert_to_tensor=True)

    def calculate_ats_score(self, cv):
        """Score a CV out of 100 from four capped categories.

        Contact (max 15), Education (max 35), Experience (max 30) and
        Skills (max 20).

        Args:
            cv: dict shaped like the result of ``EliteCVExtractor.process``;
                the keys ``personal``, ``education``, ``experience`` and
                ``skills`` are read.

        Returns:
            ``(total, breakdown)`` where ``breakdown`` maps each category
            name to its score and ``total`` is their sum.
        """
        breakdown = {}

        # 5 points per contact field that was actually found.
        personal = cv["personal"]
        contact = 0
        if personal["name"] != "Unknown":
            contact += 5
        if personal["phone"] != "Not Found":
            contact += 5
        if personal["email"] != "Not Found":
            contact += 5
        breakdown["Contact"] = contact

        # Highest degree wins.  "PH.D" is checked as well because the
        # extractor accepts that spelling when it filters education lines.
        edu_text = " ".join(cv["education"]).upper()
        edu_score = 10
        if "PHD" in edu_text or "PH.D" in edu_text or "DOCTOR" in edu_text:
            edu_score += 25
        elif "MASTER" in edu_text or "M.SC" in edu_text:
            edu_score += 15
        elif "BACHELOR" in edu_text or "B.SC" in edu_text:
            edu_score += 10
        breakdown["Education"] = min(35, edu_score)

        # Seniority keywords outrank sheer volume of experience lines.
        exp_text = " ".join(cv["experience"]).upper()
        exp_score = 10
        if any(k in exp_text for k in ["PROFESSOR", "LECTURER", "MANAGER", "LEAD", "CHAIR"]):
            exp_score += 20
        elif len(cv["experience"]) > 15:
            exp_score += 15
        elif len(cv["experience"]) > 5:
            exp_score += 5
        breakdown["Experience"] = min(30, exp_score)

        breakdown["Skills"] = min(20, len(cv["skills"]) + 5)
        total = sum(breakdown.values())
        return total, breakdown

    def match(self, cv, ats_score):
        """Return every posting scored against ``cv``, best first.

        ``final_score`` is ``0.6 * semantic similarity (in %) + 0.4 * ats_score``.

        Args:
            cv: dict with the list-valued keys ``skills``, ``experience``
                and ``education``.
            ats_score: Numeric ATS score of the CV.

        Returns:
            List of dicts (``company``, ``position``, ``location``,
            ``stipend``, ``semantic_score``, ``final_score``,
            ``required_skills``, ``description``) sorted by ``final_score``
            in descending order.
        """
        candidate_text = " ".join(cv["skills"]) + " " + " ".join(cv["experience"]) + " " + " ".join(cv["education"])
        cand_embedding = self.model.encode(candidate_text, convert_to_tensor=True)
        similarities = util.cos_sim(cand_embedding, self.job_embeddings)[0]

        results = []
        for idx, sim in enumerate(similarities):
            row = self.df.iloc[idx]  # one positional lookup per posting
            semantic_pct = float(sim) * 100
            final_score = 0.6 * semantic_pct + 0.4 * ats_score

            # A missing, blank or non-numeric stipend counts as 0.
            try:
                stipend = float(row["monthly_stipend"])
            except (KeyError, TypeError, ValueError):
                stipend = 0.0

            results.append({
                "company": str(row["company"]),
                "position": str(row["position"]),
                "location": str(row["location"]),
                "stipend": stipend,
                "semantic_score": round(semantic_pct, 2),
                "final_score": round(final_score, 2),
                "required_skills": str(row["required_skills"]),
                "description": str(row["description"])
            })

        return sorted(results, key=lambda x: x["final_score"], reverse=True)

# ======================
# Dashboard Creation Function
# ======================
def create_dashboard(cv_data, ats_breakdown, matches, output_path):
    """Render the six-panel CV dashboard, save it as an image and display it.

    Panels: ATS donut, normalised radar of the ATS categories, up to ten
    skills, top five matches, stipend-vs-fit scatter and location pie (both
    over the top ten matches).

    Args:
        cv_data: dict; ``cv_data['personal']['name']`` and
            ``cv_data['skills']`` are read.
        ats_breakdown: Mapping of category name to score.  The radar
            normalises the values positionally against the maxima
            15, 35, 30 and 20, so the mapping order matters.
        matches: List of match dicts; ``company``, ``position``,
            ``location``, ``stipend`` and ``final_score`` are read.
        output_path: Where the image is written.

    Returns:
        True when the image was saved and shown; False (after printing the
        error) when any step raised.
    """
    try:
        sns.set_theme(style="whitegrid")

        fig = plt.figure(figsize=(20, 14))
        try:
            fig.suptitle(
                f"CV ATS & Internship Dashboard\n{cv_data['personal']['name']}",
                fontsize=22,
                fontweight="bold",
                y=0.97
            )

            categories = list(ats_breakdown.keys())
            values = list(ats_breakdown.values())

            # 1. ATS DONUT: a pie with a white disc on top and the total in the hole.
            ax1 = fig.add_subplot(231)
            ax1.pie(values, labels=categories, autopct="%1.1f%%", startangle=140)
            ax1.add_artist(plt.Circle((0,0),0.7,fc="white"))
            ax1.text(0,0,str(sum(values)),ha="center",va="center",fontsize=26,fontweight="bold")
            ax1.set_title("ATS Score Breakdown", fontweight="bold")

            # 2. RADAR: each category as a percentage of its maximum.
            ax2 = fig.add_subplot(232, polar=True)
            max_values = [15,35,30,20]
            values_norm = [(v/m)*100 for v,m in zip(values,max_values)]
            angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False)
            # Repeat the first point so the polygon is closed.
            angles = np.concatenate((angles, [angles[0]]))
            values_norm = np.concatenate((values_norm, [values_norm[0]]))

            ax2.plot(angles, values_norm, linewidth=2)
            ax2.fill(angles, values_norm, alpha=0.3)
            ax2.set_ylim(0,120)
            ax2.set_xticks([])
            ax2.set_title("Profile Strength", pad=35, fontweight="bold")

            # 3. Top skills (bar length is just the list position).
            ax3 = fig.add_subplot(233)
            top_skills = cv_data["skills"][:10]
            if top_skills:
                sns.barplot(x=list(range(len(top_skills))), y=top_skills, ax=ax3)
            ax3.set_title("Top Skills Detected", fontweight="bold")
            ax3.set_xlabel("Relevance")
            ax3.set_ylabel("Skill")

            # 4. Top 5 Matches
            ax4 = fig.add_subplot(234)
            top5 = matches[:5]
            if top5:
                labels = [f"{m['company']}\n{m['position']}" for m in top5]
                scores = [m["final_score"] for m in top5]
                sns.barplot(x=scores, y=labels, ax=ax4)
                ax4.set_xlim(0,100)
            ax4.set_title("Top 5 Internship Matches", fontweight="bold")
            ax4.set_xlabel("Fit Score (%)")

            # 5. Stipend vs Fit
            ax5 = fig.add_subplot(235)
            top10 = matches[:10]
            if top10:
                sns.scatterplot(x=[m["stipend"] for m in top10], y=[m["final_score"] for m in top10], s=300, ax=ax5)
            ax5.set_xlabel("Monthly Stipend")
            ax5.set_ylabel("Fit Score")
            ax5.set_title("Stipend vs Fit", fontweight="bold")

            # 6. Locations
            ax6 = fig.add_subplot(236)
            loc_counts = Counter(m["location"] for m in top10)
            if loc_counts:
                ax6.pie(list(loc_counts.values()), labels=list(loc_counts.keys()), autopct="%1.1f%%", startangle=140)
            ax6.set_title("Top Locations", fontweight="bold")

            fig.tight_layout()
            fig.savefig(output_path, dpi=300, bbox_inches='tight')
        finally:
            # Release the large figure even when drawing or saving failed.
            plt.close(fig)
        print(f"[INFO] Dashboard saved: {output_path}")

        # Display the dashboard in Colab by re-reading the saved image.
        preview = plt.figure(figsize=(12, 8))
        try:
            img = plt.imread(output_path)
            plt.imshow(img)
            plt.axis('off')
            plt.title("CV Analysis Dashboard", fontsize=16, fontweight='bold')
            plt.show()
        finally:
            # No-op if the backend already closed it after showing.
            plt.close(preview)

        return True
    except Exception as e:
        print(f"[ERROR] Failed to create dashboard: {e}")
        return False

# ======================
# Functions to save JSON files with specific format
# ======================
def save_education_json(education_data, output_path):
    """Write the education entries to ``output_path`` as compact JSON and echo them.

    A value that is not a list is replaced by an empty list.  The file is
    written as UTF-8 without spaces after separators, then read back and
    printed.

    Returns:
        True on success; False after printing the error when anything raised.
    """
    try:
        entries = [] if not isinstance(education_data, list) else education_data

        with open(output_path, "w", encoding="utf-8") as sink:
            # (',', ':') removes the default whitespace after separators.
            json.dump(entries, sink, ensure_ascii=False, separators=(',', ':'))
        print(f"[INFO] Education JSON saved: {output_path}")

        # Echo what actually landed on disk.
        print("\nEducation Data (JSON format):")
        print("-" * 50)
        with open(output_path, "r", encoding="utf-8") as source:
            written = source.read()
            print(written)
    except Exception as err:
        print(f"[ERROR] Failed to save education JSON: {err}")
        return False
    return True

def save_experience_json(experience_data, output_path):
    """Write the experience entries to ``output_path`` as compact JSON and echo them.

    A value that is not a list is replaced by an empty list.  The file is
    written as UTF-8 without spaces after separators, then read back and
    printed.

    Returns:
        True on success; False after printing the error when anything raised.
    """
    try:
        entries = [] if not isinstance(experience_data, list) else experience_data

        with open(output_path, "w", encoding="utf-8") as sink:
            # (',', ':') removes the default whitespace after separators.
            json.dump(entries, sink, ensure_ascii=False, separators=(',', ':'))
        print(f"[INFO] Experience JSON saved: {output_path}")

        # Echo what actually landed on disk.
        print("\nExperience Data (JSON format):")
        print("-" * 50)
        with open(output_path, "r", encoding="utf-8") as source:
            written = source.read()
            print(written)
    except Exception as err:
        print(f"[ERROR] Failed to save experience JSON: {err}")
        return False
    return True

# ======================
# Main Pipeline Function
# ======================
def run_pipeline(cv_file_path, csv_file_path=None):
    """Extract one CV and, when a CSV of internships is given, match against it.

    Extraction results are always written to ``OUTPUT_DIR`` as JSON.  When
    ``csv_file_path`` points to an existing file, the function additionally
    scores the CV, ranks the internships, writes the matches, a dashboard
    image and a summary, and interactively offers the files for download.

    Args:
        cv_file_path: Path to the CV PDF.
        csv_file_path: Optional path to the internship CSV.

    Returns:
        dict with ``success``.  On success it also holds ``matching``,
        ``applicant`` and ``files`` (plus ``ats_score``, ``total_matches``
        and ``top_matches`` when matching ran); on failure it holds
        ``error``.  Exceptions are caught, printed with a traceback and
        reported through the returned dict.
    """
    print("\n" + "="*50)
    print("STARTING CV PROCESSING PIPELINE")
    print("="*50)

    try:
        # ===== Extraction =====
        extractor = EliteCVExtractor(nlp)
        cv_data = extractor.process(cv_file_path)

        # Display extracted information
        print("\n" + "="*50)
        print("EXTRACTED CV INFORMATION")
        print("="*50)
        print(f"Name: {cv_data['personal']['name']}")
        print(f"Email: {cv_data['personal']['email']}")
        print(f"Phone: {cv_data['personal']['phone']}")
        print(f"Education items: {len(cv_data['education'])}")
        print(f"Experience items: {len(cv_data['experience'])}")
        print(f"Skills found: {len(cv_data['skills'])}")

        # Output files are named after the CV.  Only a trailing ".pdf"
        # extension (any letter case) is dropped; str.replace would also
        # remove ".pdf" from the middle of a name and would miss ".PDF".
        cv_file_name = os.path.basename(cv_file_path)
        stem, extension = os.path.splitext(cv_file_name)
        base_name = stem if extension.lower() == ".pdf" else cv_file_name

        # Save education in requested JSON format
        education_json_path = os.path.join(OUTPUT_DIR, f"{base_name}_education.json")
        save_education_json(cv_data["education"], education_json_path)

        # Save experience in requested JSON format
        experience_json_path = os.path.join(OUTPUT_DIR, f"{base_name}_experience.json")
        save_experience_json(cv_data["experience"], experience_json_path)

        # Save full extracted data
        out_json = os.path.join(OUTPUT_DIR, f"{base_name}_extracted.json")
        with open(out_json, "w", encoding="utf-8") as f:
            json.dump(cv_data, f, indent=4, ensure_ascii=False)
        print(f"\n[INFO] Full extracted data saved: {out_json}")

        # Display skills (first 20 only)
        print("\n" + "="*50)
        print("DETECTED SKILLS")
        print("="*50)
        for i, skill in enumerate(cv_data["skills"][:20], 1):
            print(f"{i}. {skill}")
        if len(cv_data["skills"]) > 20:
            print(f"... and {len(cv_data['skills']) - 20} more")

        # ===== Extraction-only mode =====
        if not (csv_file_path and os.path.exists(csv_file_path)):
            print("\n[INFO] No internship CSV provided or file not found.")
            print("[INFO] Skipping matching process.")

            return {
                "success": True,
                "matching": False,
                "applicant": cv_data["personal"],
                "files": [out_json, education_json_path, experience_json_path]
            }

        print(f"\n[INFO] Loading internship data from: {csv_file_path}")
        df = pd.read_csv(csv_file_path).fillna("")
        print(f"[INFO] Loaded {len(df)} internships")

        # Load SBERT model; it is saved under MODEL_DIR the first time so
        # later runs load the local copy instead.
        model_path = os.path.join(MODEL_DIR, "sbert_model")
        if not os.path.exists(model_path):
            print("[INFO] Downloading SBERT model...")
            sbert = SentenceTransformer("all-MiniLM-L6-v2")
            sbert.save(model_path)
        else:
            sbert = SentenceTransformer(model_path)
        print("[INFO] SBERT model ready")

        # ===== Matching =====
        matcher = InternshipMatcher(sbert, df)
        ats_score, ats_breakdown = matcher.calculate_ats_score(cv_data)
        matches = matcher.match(cv_data, ats_score)

        # Display ATS score
        print("\n" + "="*50)
        print("ATS SCORE ANALYSIS")
        print("="*50)
        print(f"Overall ATS Score: {ats_score}/100")
        for category, score in ats_breakdown.items():
            print(f"  {category}: {score}")

        # Save matching results
        matches_json = os.path.join(OUTPUT_DIR, f"{base_name}_matches.json")
        matching_results = {
            "ats_score": ats_score,
            "ats_breakdown": ats_breakdown,
            "total_matches": len(matches),
            "matches": matches[:50],
            "top_matches": matches[:10]
        }

        with open(matches_json, "w", encoding="utf-8") as f:
            json.dump(matching_results, f, indent=4, ensure_ascii=False)
        print(f"[INFO] Matching results saved: {matches_json}")

        # Display top matches
        print("\n" + "="*50)
        print("TOP 10 INTERNSHIP MATCHES")
        print("="*50)
        for i, match in enumerate(matches[:10], 1):
            print(f"{i}. {match['company']} - {match['position']}")
            print(f"   Fit Score: {match['final_score']:.1f}% | Stipend: ${match['stipend']:,.0f}")
            print(f"   Location: {match['location']}")
            print()

        # ===== Generate Dashboard =====
        dashboard_path = os.path.join(OUTPUT_DIR, f"{base_name}_dashboard.png")
        print("\n[INFO] Generating dashboard...")
        create_dashboard(cv_data, ats_breakdown, matches, dashboard_path)

        # ===== Save summary =====
        summary_json = os.path.join(OUTPUT_DIR, f"{base_name}_summary.json")
        summary = {
            "applicant_name": cv_data["personal"]["name"],
            "applicant_email": cv_data["personal"]["email"],
            "applicant_phone": cv_data["personal"]["phone"],
            "ats_score": ats_score,
            "total_matches": len(matches),
            "top_3_matches": [
                {
                    "company": match["company"],
                    "position": match["position"],
                    "score": match["final_score"],
                    "stipend": match["stipend"]
                }
                for match in matches[:3]
            ],
            "files": {
                "extracted": os.path.basename(out_json),
                "education": os.path.basename(education_json_path),
                "experience": os.path.basename(experience_json_path),
                "matches": os.path.basename(matches_json),
                "dashboard": os.path.basename(dashboard_path)
            }
        }

        with open(summary_json, "w", encoding="utf-8") as f:
            json.dump(summary, f, indent=4, ensure_ascii=False)
        print(f"[INFO] Summary saved: {summary_json}")

        # ===== Prepare download links =====
        print("\n" + "="*50)
        print("GENERATED FILES")
        print("="*50)
        files_to_download = [
            out_json,
            education_json_path,
            experience_json_path,
            matches_json,
            dashboard_path,
            summary_json
        ]

        for file_path in files_to_download:
            if os.path.exists(file_path):
                print(f"✓ {os.path.basename(file_path)}")

        # Ask user if they want to download files
        print("\n" + "="*50)
        print("DOWNLOAD OPTIONS")
        print("="*50)
        print("1. Download all generated files")
        print("2. Download specific files")
        print("3. Continue without downloading")

        choice = input("\nEnter your choice (1-3): ").strip()

        if choice == "1":
            # Create a zip file of all outputs
            import zipfile
            zip_path = os.path.join(OUTPUT_DIR, f"{base_name}_results.zip")
            with zipfile.ZipFile(zip_path, 'w') as zipf:
                for file_path in files_to_download:
                    if os.path.exists(file_path):
                        zipf.write(file_path, os.path.basename(file_path))

            print(f"\n[INFO] Zip file created: {zip_path}")
            files.download(zip_path)

        elif choice == "2":
            print("\nAvailable files:")
            for i, file_path in enumerate(files_to_download, 1):
                if os.path.exists(file_path):
                    print(f"{i}. {os.path.basename(file_path)}")

            file_choices = input("\nEnter file numbers to download (comma-separated): ").strip()
            # Only a parsing problem is "invalid input"; download problems
            # are reported separately so they are not mislabelled.
            try:
                indices = [int(idx.strip()) - 1 for idx in file_choices.split(",")]
            except ValueError:
                print("[ERROR] Invalid input")
                indices = []
            for idx in indices:
                if 0 <= idx < len(files_to_download):
                    file_path = files_to_download[idx]
                    if os.path.exists(file_path):
                        try:
                            files.download(file_path)
                        except Exception as download_error:
                            print(f"[ERROR] Download failed for {os.path.basename(file_path)}: {download_error}")

        return {
            "success": True,
            "matching": True,
            "applicant": cv_data["personal"],
            "ats_score": ats_score,
            "total_matches": len(matches),
            "top_matches": matches[:5],
            "files": files_to_download
        }

    except Exception as e:
        print(f"\n[ERROR] Pipeline failed: {e}")
        import traceback
        traceback.print_exc()

        return {
            "success": False,
            "error": str(e)
        }

# ======================
# Main execution for Colab
# ======================
def main():
    """Interactive Colab entry point.

    Asks where the CV PDF comes from (upload, Drive path or any path) and
    where the internship CSV comes from (upload, path or none), runs
    ``run_pipeline`` and prints a short final summary.  Returns ``None``;
    the function returns early, after printing an error, when no usable CV
    was selected or a menu choice is invalid.
    """
    print("\n" + "="*50)
    print("CV PROCESSING - INTERACTIVE MODE")
    print("="*50)

    # Ask user for input method
    print("\nChoose input method:")
    print("1. Upload PDF file")
    print("2. Use sample CV from Drive")
    print("3. Enter path to PDF file")

    choice = input("\nEnter your choice (1-3): ").strip()

    cv_file_path = None

    if choice == "1":
        # Upload PDF file
        print("\nPlease upload your CV (PDF file):")
        uploaded = files.upload()
        if not uploaded:
            # A cancelled upload yields no files; without this guard the
            # pipeline would be started with a path of None.
            print("[ERROR] No file uploaded")
            return
        file_name = list(uploaded.keys())[0]
        cv_file_path = os.path.join("/content", file_name)
        print(f"[INFO] File uploaded: {cv_file_path}")

    elif choice == "2":
        # Use file from Google Drive
        print("\nEnter the path to your CV in Google Drive")
        print("Example: /content/drive/MyDrive/CVs/my_cv.pdf")
        drive_path = input("Path: ").strip()
        if not os.path.exists(drive_path):
            print(f"[ERROR] File not found: {drive_path}")
            return
        cv_file_path = drive_path
        print(f"[INFO] Using file: {cv_file_path}")

    elif choice == "3":
        # Enter file path
        file_path = input("\nEnter the full path to your PDF file: ").strip()
        if not os.path.exists(file_path):
            print(f"[ERROR] File not found: {file_path}")
            return
        cv_file_path = file_path
        print(f"[INFO] Using file: {cv_file_path}")

    else:
        print("[ERROR] Invalid choice")
        return

    # Ask for internship CSV
    print("\n" + "="*50)
    print("INTERNSHIP MATCHING")
    print("="*50)

    print("\nChoose internship data source:")
    print("1. Upload CSV file")
    print("2. Use sample CSV from Drive")
    print("3. Skip matching (extract only)")

    csv_choice = input("\nEnter your choice (1-3): ").strip()

    # Stays None when matching is skipped or no CSV could be obtained; the
    # pipeline then runs in extraction-only mode.
    csv_file_path = None

    if csv_choice == "1":
        # Upload CSV file
        print("\nPlease upload your internship CSV file:")
        uploaded = files.upload()
        if uploaded:
            csv_file_name = list(uploaded.keys())[0]
            csv_file_path = os.path.join("/content", csv_file_name)
            print(f"[INFO] CSV uploaded: {csv_file_path}")

    elif csv_choice == "2":
        # Use CSV from Google Drive
        print("\nEnter the path to your internship CSV in Google Drive")
        print("Example: /content/palestinian_internships_200.csv")
        csv_drive_path = input("Path: ").strip()
        if os.path.exists(csv_drive_path):
            csv_file_path = csv_drive_path
            print(f"[INFO] Using CSV: {csv_file_path}")
        else:
            print(f"[WARNING] CSV file not found: {csv_drive_path}")
            print("[INFO] Continuing without matching...")

    elif csv_choice == "3":
        print("[INFO] Skipping matching process...")

    else:
        print("[ERROR] Invalid choice")
        return

    # Run the pipeline
    print("\n" + "="*50)
    print("STARTING PROCESSING")
    print("="*50)

    result = run_pipeline(cv_file_path, csv_file_path)

    # Display final summary
    print("\n" + "="*50)
    print("PROCESSING COMPLETE")
    print("="*50)

    if not result["success"]:
        print(f"✗ Processing failed: {result.get('error', 'Unknown error')}")
        return

    if result.get("matching", False):
        print("✓ CV processed successfully!")
        print(f"✓ Applicant: {result['applicant']['name']}")
        print(f"✓ ATS Score: {result['ats_score']}/100")
        print(f"✓ Matches found: {result['total_matches']}")
    else:
        print("✓ CV extracted successfully!")
        print(f"✓ Applicant: {result['applicant']['name']}")
        print("✓ Education data saved in JSON format")
        print("✓ Experience data saved in JSON format")

# ======================
# Run the main function
# ======================
# Start the interactive flow only when this code is the top-level program.
# In a notebook cell `__name__` is "__main__", so running the cell calls
# main() immediately.
if __name__ == "__main__":
    main()



CV PROCESSING PIPELINE FOR GOOGLE COLAB

[INFO] Installing required packages...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m120.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Mounted at /content/drive
[INFO] Base directory: /content/cv_processing
[INFO] Output directory: /content/cv_processing/output
[INFO] SpaCy model loaded successfully

CV PROCESSING - INTERACTIVE MODE

Choose input method:
1. Upload PDF file
2. Use sample CV from Drive
3. Enter path to PDF file

Enter your choice (1-3): 1

Please upload your CV (PDF file):


Saving Anas_CV.pdf to Anas_CV.pdf
[INFO] File uploaded: /content/Anas_CV.pdf

INTERNSHIP MATCHING

Choose internship data source:
1. Upload CSV file
2. Use sample CSV from Drive
3. Skip matching (extract only)

Enter your choice (1-3): 3
[INFO] Skipping matching process...

STARTING PROCESSING

STARTING CV PROCESSING PIPELINE
[INFO] Processing: /content/Anas_CV.pdf

EXTRACTED CV INFORMATION
Name: ANAS MELHEM
Email: a.melhem@ptuk.edu.ps
Phone: +970 599 320 207
Education items: 10
Experience items: 73
Skills found: 20
[INFO] Education JSON saved: /content/cv_processing/output/Anas_CV_education.json

Education Data (JSON format):
--------------------------------------------------
["• PhD in Computer Engineering","Eastern Mediterranean University, North Cyprus. September, 2021","– Thesis title: Analysis and Development of Ciphers Homomorphic on Addition and","– Advisor: Prof. Dr. Alexander Chefranov.","• Master’s in Electronics and Computer Engineering","Al-Quds University, Palestine. May,