In [9]:
# Install required packages
!pip install beautifulsoup4 requests pandas numpy scikit-learn nltk schedule

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import time
import schedule
import pickle
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Configuration
class Config:
    BASE_URL = "https://www.karkidi.com/Find-Jobs/{page}/all/India?search={query}"
    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    REQUEST_DELAY = 2  # seconds between requests
    CLUSTER_RANGE = range(2, 6)  # Range of clusters to try
    MODEL_FILE = 'job_clustering_model.pkl'
    DATA_FILE = 'job_postings.csv'

    # Sample user profiles (modify as needed)
    USER_PROFILES = [
        {
            'name': 'Data Scientist',
            'skills': 'python, machine learning, statistics, data analysis',
            'email': 'user1@example.com'
        },
        {
            'name': 'Web Developer',
            'skills': 'javascript, html, css, react',
            'email': 'user2@example.com'
        }
    ]

# Text Preprocessor
class TextPreprocessor:
    @staticmethod
    def preprocess(text):
        if not isinstance(text, str):
            return ""

        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        words = text.split()

        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]

        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

        return ' '.join(words)

# Job Scraper (based on your code)
class JobScraper:
    def __init__(self):
        self.headers = {'User-Agent': Config.USER_AGENT}

    def scrape_jobs(self, keyword="data science", pages=2):
        jobs_list = []

        for page in range(1, pages + 1):
            url = Config.BASE_URL.format(
                page=page,
                query=keyword.replace(' ', '%20')
            )

            try:
                print(f"Scraping page {page}...")
                response = requests.get(url, headers=self.headers, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, "html.parser")

                job_blocks = soup.find_all("div", class_="ads-details")

                for job in job_blocks:
                    try:
                        title = job.find("h4").get_text(strip=True)
                        company = job.find("a", href=lambda x: x and "Employer-Profile" in x).get_text(strip=True)
                        location = job.find("p").get_text(strip=True)
                        experience = job.find("p", class_="emp-exp").get_text(strip=True)
                        key_skills_tag = job.find("span", string="Key Skills")
                        skills = key_skills_tag.find_next("p").get_text(strip=True) if key_skills_tag else ""
                        summary_tag = job.find("span", string="Summary")
                        summary = summary_tag.find_next("p").get_text(strip=True) if summary_tag else ""

                        jobs_list.append({
                            "title": title,
                            "company": company,
                            "location": location,
                            "experience": experience,
                            "description": summary,
                            "skills": skills,
                            "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                        })
                    except Exception as e:
                        print(f"Error parsing job: {e}")
                        continue

                time.sleep(Config.REQUEST_DELAY)
            except Exception as e:
                print(f"Error scraping page {page}: {e}")
                continue

        return pd.DataFrame(jobs_list)

# Job Clustering System
class JobClusterer:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(max_features=1000)
        self.model = None
        self.cluster_descriptions = {}

    def train_model(self, job_data):
        # Preprocess skills
        job_data['processed_skills'] = job_data['skills'].apply(TextPreprocessor.preprocess)

        # Vectorize skills
        X = self.vectorizer.fit_transform(job_data['processed_skills'])

        # Find optimal number of clusters
        best_score = -1
        best_n_clusters = 2

        for n_clusters in Config.CLUSTER_RANGE:
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            cluster_labels = kmeans.fit_predict(X)
            score = silhouette_score(X, cluster_labels)

            if score > best_score:
                best_score = score
                best_n_clusters = n_clusters

        print(f"Optimal clusters: {best_n_clusters} (silhouette score: {best_score:.2f})")

        # Train final model
        self.model = KMeans(n_clusters=best_n_clusters, random_state=42)
        job_data['cluster'] = self.model.fit_predict(X)

        # Create cluster descriptions
        for cluster_num in range(best_n_clusters):
            cluster_data = job_data[job_data['cluster'] == cluster_num]
            all_skills = ' '.join(cluster_data['processed_skills']).split()
            top_skills = pd.Series(all_skills).value_counts().head(5).index.tolist()
            self.cluster_descriptions[cluster_num] = top_skills

        return job_data

    def classify_job(self, job_posting):
        processed_skills = TextPreprocessor.preprocess(job_posting['skills'])
        skills_vector = self.vectorizer.transform([processed_skills])
        cluster = self.model.predict(skills_vector)[0]
        return cluster, self.cluster_descriptions[cluster]

    def save_model(self):
        with open(Config.MODEL_FILE, 'wb') as f:
            pickle.dump({
                'vectorizer': self.vectorizer,
                'model': self.model,
                'cluster_descriptions': self.cluster_descriptions
            }, f)

    def load_model(self):
        try:
            with open(Config.MODEL_FILE, 'rb') as f:
                data = pickle.load(f)
                self.vectorizer = data['vectorizer']
                self.model = data['model']
                self.cluster_descriptions = data['cluster_descriptions']
            return True
        except Exception as e:
            print(f"Error loading model: {e}")
            return False

# Notification System (simplified for Colab)
class Notifier:
    @staticmethod
    def notify(user, matching_jobs):
        print(f"\nNotification for {user['name']} ({user['email']}):")
        print(f"Found {len(matching_jobs)} new jobs matching your skills:")
        for job in matching_jobs:
            print(f"- {job['title']} at {job['company']}")
            print(f"  Skills: {job['skills']}")
            print(f"  Posted: {job['scraped_at']}\n")

# Main Application
class JobMonitor:
    def __init__(self):
        self.scraper = JobScraper()
        self.clusterer = JobClusterer()
        self.job_data = pd.DataFrame()

        # Try to load existing model
        if not self.clusterer.load_model():
            print("No existing model found. Will train new model.")

    def run_daily_task(self):
        print(f"\n{datetime.now()}: Running daily job monitoring...")

        # Step 1: Scrape new jobs
        new_jobs = self.scraper.scrape_jobs()

        if new_jobs.empty:
            print("No new jobs scraped.")
            return

        # Step 2: Load existing data
        try:
            existing_data = pd.read_csv(Config.DATA_FILE)
            combined_data = pd.concat([existing_data, new_jobs]).drop_duplicates()
        except:
            combined_data = new_jobs

        # Step 3: Save all data
        combined_data.to_csv(Config.DATA_FILE, index=False)
        self.job_data = combined_data

        # Step 4: Train or update model
        if self.clusterer.model is None:
            print("Training new clustering model...")
            self.job_data = self.clusterer.train_model(self.job_data)
            self.clusterer.save_model()

        # Step 5: Check for user matches
        self.check_user_matches(new_jobs)

    def check_user_matches(self, new_jobs):
        for user in Config.USER_PROFILES:
            user_skills = TextPreprocessor.preprocess(user['skills'])
            user_vector = self.clusterer.vectorizer.transform([user_skills])
            user_cluster = self.clusterer.model.predict(user_vector)[0]

            matching_jobs = []
            for _, job in new_jobs.iterrows():
                job_cluster, _ = self.clusterer.classify_job(job)
                if job_cluster == user_cluster:
                    matching_jobs.append(job)

            if matching_jobs:
                Notifier.notify(user, matching_jobs)

# Run in Colab
if __name__ == "__main__":
    # Initialize the monitor
    monitor = JobMonitor()

    # Run initial collection and processing
    monitor.run_daily_task()

    # Show sample of clustered jobs
    if not monitor.job_data.empty and 'cluster' in monitor.job_data.columns:
        print("\nSample of clustered jobs:")
        print(monitor.job_data[['title', 'company', 'skills', 'cluster']].head())

    # For production, you would add scheduling:
    # schedule.every().day.at("09:00").do(monitor.run_daily_task)
    # while True:
    #     schedule.run_pending()
    #     time.sleep(60)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Error loading model: [Errno 2] No such file or directory: 'job_clustering_model.pkl'
No existing model found. Will train new model.

2025-05-24 10:34:18.830712: Running daily job monitoring...
Scraping page 1...
Scraping page 2...
Training new clustering model...
Optimal clusters: 5 (silhouette score: 0.51)

Notification for Data Scientist (user1@example.com):
Found 4 new jobs matching your skills:
- Machine Learning Physical Design Engineer at Google
  Skills: Aartificial intelligence,Algorithms,Data structuring,Design,Machine learning techniques
  Posted: 2025-05-24 10:34:26

- Senior Product Designer at Observe.AI
  Skills: Design,Leadership Skill,Machine learning techniques
  Posted: 2025-05-24 10:34:26

- Machine Learning Physical Design Engineer at Google
  Skills: Aartificial intelligence,Algorithms,Data structuring,Design,Machine learning techniques
  Posted: 2025-05-24 10:34:34

- Senior Product Designer at Observe.AI
  Skills: Design,Leadership Skill,Machine learning techniqu