In [None]:

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout, Concatenate, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
import json
import re
from typing import List, Dict, Tuple
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class JobMatchingSystem:
    def __init__(self, max_features=10000, max_len=200, embedding_dim=128):
        """Set hyper-parameters and create the (still unfitted) preprocessing objects.

        Args:
            max_features: Vocabulary size; used as the tokenizer's ``num_words``
                and as the embedding layer's ``input_dim``.
            max_len: Length every token sequence is padded/truncated to.
            embedding_dim: Width of the learned word embeddings.
        """
        # Hyper-parameters
        self.max_features, self.max_len, self.embedding_dim = (
            max_features,
            max_len,
            embedding_dim,
        )

        # Filled in later by prepare_data()/train() or load_model()
        self.model = None
        self.tokenizer = None

        # Label encoders, fitted in prepare_data()
        self.job_category_encoder = LabelEncoder()
        self.skill_encoder = MultiLabelBinarizer()

        # Skill vocabulary known to the application
        self.skill_categories = [
            'Animal Care',
            'Arts & Crafts',
            'Acting',
            'Childcare',
            'Communication',
            'Community Service',
            'Construction',
            'Cooking',
            'First Aid',
            'Fitness',
            'Fundraising',
            'Gaming',
            'Gardening',
            'Health Care',
            'Programming',
            'Public Speaking',
            'Reading',
            'Singing',
            'Social Media',
            'Sports',
            'Teaching',
            'Teamwork',
            'Technology',
            'Traveling',
            'Volunteering',
            'Writing',
        ]

    def create_synthetic_data(self, num_samples=5000):
        """Generate synthetic training data based on realistic job-skill mappings"""

        # Job categories and their typical skill requirements
        job_skill_mapping = {
            'Software Developer': ['Programming', 'Technology', 'Communication', 'Teamwork'],
            'UI/UX Designer': ['Arts & Crafts', 'Technology', 'Communication', 'Programming'],
            'Teacher': ['Teaching', 'Communication', 'Public Speaking', 'Childcare'],
            'Healthcare Worker': ['Health Care', 'First Aid', 'Communication', 'Teamwork'],
            'Marketing Specialist': ['Social Media', 'Communication', 'Writing', 'Arts & Crafts'],
            'Chef': ['Cooking', 'Teamwork', 'Communication', 'Arts & Crafts'],
            'Veterinarian': ['Animal Care', 'Health Care', 'Communication', 'First Aid'],
            'Social Worker': ['Communication', 'Community Service', 'Volunteering', 'Public Speaking'],
            'Personal Trainer': ['Fitness', 'Health Care', 'Communication', 'Teaching'],
            'Construction Manager': ['Construction', 'Teamwork', 'Communication', 'Technology'],
            'Writer': ['Writing', 'Communication', 'Reading', 'Arts & Crafts'],
            'Childcare Provider': ['Childcare', 'Communication', 'First Aid', 'Teaching'],
            'Event Coordinator': ['Communication', 'Fundraising', 'Public Speaking', 'Social Media'],
            'Landscaper': ['Gardening', 'Construction', 'Teamwork', 'Fitness'],
            'Travel Guide': ['Traveling', 'Communication', 'Public Speaking', 'Teaching'],
            'Actor': ['Acting', 'Communication', 'Arts & Crafts', 'Public Speaking'],
            'Singer': ['Singing', 'Arts & Crafts', 'Communication', 'Public Speaking'],
            'Game Developer': ['Gaming', 'Programming', 'Technology', 'Arts & Crafts'],
            'Sports Coach': ['Sports', 'Fitness', 'Teaching', 'Communication'],
            'Volunteer Coordinator': ['Volunteering', 'Community Service', 'Communication', 'Fundraising']
        }

        # Generate job descriptions templates
        job_descriptions = {
            'Software Developer': [
                "We are looking for a skilled software developer to join our team. You will be responsible for developing and maintaining web applications.",
                "Seeking an experienced programmer to work on mobile applications and web development projects.",
                "Join our tech team as a software engineer. Experience with modern programming languages required."
            ],
            'UI/UX Designer': [
                "We need a creative UI/UX designer for our mobile app development. At least 2 years experience with web/mobile design required.",
                "Looking for a talented designer to create user-friendly interfaces for our digital products.",
                "Join our design team to create beautiful and functional user experiences."
            ],
            'Teacher': [
                "Elementary school teacher position available. Experience working with children required.",
                "We are hiring passionate educators to join our school community.",
                "Teaching position open for dedicated professionals who love working with students."
            ],
            'Healthcare Worker': [
                "Healthcare professional needed for our medical facility. First aid certification preferred.",
                "Join our healthcare team to provide quality patient care.",
                "Medical assistant position available. Healthcare experience required."
            ],
            'Marketing Specialist': [
                "Digital marketing specialist needed. Social media and content creation skills required.",
                "Marketing professional wanted for brand promotion and social media management.",
                "Join our marketing team to develop creative campaigns and manage online presence."
            ]
        }

        data = []
        for _ in range(num_samples):
            # Randomly select a job category
            job_category = np.random.choice(list(job_skill_mapping.keys()))
            required_skills = job_skill_mapping[job_category].copy()  # Make a copy to avoid modifying original

            # Add some noise - sometimes add extra skills or remove some
            if np.random.random() > 0.3:
                # Add 1-2 random skills
                available_skills = [s for s in self.skill_categories if s not in required_skills]
                if available_skills:
                    num_extra = min(np.random.randint(1, 3), len(available_skills))
                    extra_skills = np.random.choice(
                        available_skills,
                        size=num_extra,
                        replace=False
                    ).tolist()
                    required_skills.extend(extra_skills)

            # Sometimes remove a skill
            if len(required_skills) > 2 and np.random.random() > 0.7:
                required_skills = required_skills[:-1]

            # Generate job description
            if job_category in job_descriptions:
                description = np.random.choice(job_descriptions[job_category])
            else:
                description = f"We are looking for a qualified {job_category.lower()} to join our team."

            # Add some variation to descriptions
            if np.random.random() > 0.5:
                description += f" Experience with {', '.join(required_skills[:2]).lower()} is preferred."

            data.append({
                'job_description': description,
                'job_category': job_category,
                'required_skills': required_skills,
                'match_score': np.random.uniform(0.6, 1.0)  # Simulated match score
            })

        return pd.DataFrame(data)

    def preprocess_text(self, text: str) -> str:
        """Clean and preprocess text data"""
        if not isinstance(text, str):
            return ""

        # Convert to lowercase
        text = text.lower()

        # Remove special characters and extra spaces
        text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def prepare_data(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Turn a raw sample frame into model-ready arrays, fitting all encoders.

        Side effects: adds a ``processed_description`` column to ``df`` and
        (re)fits ``self.tokenizer``, ``self.skill_encoder`` and
        ``self.job_category_encoder``.

        Returns:
            ``(X_text, X_skills, y_categories)`` - padded token-id sequences,
            the multi-hot skill matrix, and integer category labels.
        """
        # Clean the descriptions and keep the result on the frame.
        df['processed_description'] = df['job_description'].apply(self.preprocess_text)
        texts = df['processed_description']

        # Fit a fresh tokenizer on the cleaned text.
        self.tokenizer = Tokenizer(num_words=self.max_features, oov_token="<OOV>")
        self.tokenizer.fit_on_texts(texts)

        # Token ids, padded/truncated to a fixed length.
        token_ids = self.tokenizer.texts_to_sequences(texts)
        padded_text = pad_sequences(token_ids, maxlen=self.max_len)

        # Multi-hot skills and integer-coded categories.
        skill_matrix = self.skill_encoder.fit_transform(df['required_skills'])
        category_labels = self.job_category_encoder.fit_transform(df['job_category'])

        return padded_text, skill_matrix, category_labels

    def build_model(self, num_skills: int, num_categories: int):
        """Build and compile the hybrid CNN-LSTM model for job matching.

        The network has two inputs - ``text_input`` (token ids of length
        ``self.max_len``) and ``skills_input`` (a vector of ``num_skills``
        values) - and two outputs: ``category_output`` (softmax over
        ``num_categories``) and ``match_output`` (a single sigmoid unit).

        Args:
            num_skills: Width of the skill vector fed to ``skills_input``.
            num_categories: Number of job categories to classify.

        Returns:
            The compiled Keras ``Model``.
        """

        # Text input branch
        text_input = Input(shape=(self.max_len,), name='text_input')

        # Embedding layer. The sequence length is already fixed by the Input
        # above, so the `input_length` argument (deprecated in Keras 3) is
        # not passed.
        embedding = Embedding(
            input_dim=self.max_features,
            output_dim=self.embedding_dim,
            trainable=True
        )(text_input)

        # CNN branch for text: two conv/batch-norm/pool stages
        conv1 = Conv1D(64, 3, activation='relu', padding='same')(embedding)
        conv1 = BatchNormalization()(conv1)
        pool1 = MaxPooling1D(2)(conv1)

        conv2 = Conv1D(128, 3, activation='relu', padding='same')(pool1)
        conv2 = BatchNormalization()(conv2)
        pool2 = MaxPooling1D(2)(conv2)

        # LSTM branch for text; it reads the embeddings directly, in parallel
        # with the CNN branch rather than stacked on top of it.
        lstm = LSTM(64, return_sequences=True)(embedding)
        lstm = Dropout(0.3)(lstm)
        lstm_global = GlobalMaxPooling1D()(lstm)

        # Global pooling for CNN
        cnn_global = GlobalMaxPooling1D()(pool2)

        # Combine CNN and LSTM features
        text_features = Concatenate()([cnn_global, lstm_global])
        text_features = Dense(128, activation='relu')(text_features)
        text_features = BatchNormalization()(text_features)
        text_features = Dropout(0.3)(text_features)

        # Skills input branch
        skills_input = Input(shape=(num_skills,), name='skills_input')
        skills_features = Dense(64, activation='relu')(skills_input)
        skills_features = BatchNormalization()(skills_features)
        skills_features = Dropout(0.2)(skills_features)

        # Combine all features
        combined = Concatenate()([text_features, skills_features])
        combined = Dense(256, activation='relu')(combined)
        combined = BatchNormalization()(combined)
        combined = Dropout(0.4)(combined)

        combined = Dense(128, activation='relu')(combined)
        combined = BatchNormalization()(combined)
        combined = Dropout(0.3)(combined)

        # Output layers
        # Job category prediction
        category_output = Dense(num_categories, activation='softmax', name='category_output')(combined)

        # Match score prediction (sigmoid keeps it in [0, 1])
        match_output = Dense(1, activation='sigmoid', name='match_output')(combined)

        # Create model
        model = Model(
            inputs=[text_input, skills_input],
            outputs=[category_output, match_output],
            name='job_matching_model'
        )

        # Compile model: the total loss is a 0.7/0.3 weighted sum of the
        # classification loss and the match-score regression loss.
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
            loss={
                'category_output': 'sparse_categorical_crossentropy',
                'match_output': 'mse'
            },
            loss_weights={
                'category_output': 0.7,
                'match_output': 0.3
            },
            metrics={
                'category_output': 'accuracy',
                'match_output': 'mae'
            }
        )

        return model

    def train(self, save_path='job_matching_model'):
        """Train the complete model on synthetic data, evaluate it, and save it.

        Steps, in order:
          1. Generate 8000 synthetic samples with ``create_synthetic_data``.
          2. Encode them with ``prepare_data`` (fits tokenizer and encoders).
          3. Derive a match-score target per sample: the fraction of the
             category's base skills present in the sample's skills, plus
             Gaussian noise (std 0.1), clipped to [0, 1].
          4. Split 80/20 with a fixed ``random_state`` of 42.
          5. Build the model and fit it (30 epochs, batch size 64) with early
             stopping and learning-rate reduction on ``val_loss``. If fitting
             raises, one retry is made with 20 epochs and batch size 32.
          6. Log accuracy / MAE and a few sample predictions on the held-out
             split, then persist everything via ``save_model``.

        Args:
            save_path: Directory passed to ``save_model``.

        Returns:
            The object returned by ``self.model.fit``.
        """
        logger.info("Generating synthetic training data...")
        df = self.create_synthetic_data(num_samples=8000)

        # Job skill mapping for match score calculation
        # NOTE(review): this duplicates the mapping literal inside
        # create_synthetic_data(); the two must be kept in sync by hand.
        job_skill_mapping = {
            'Software Developer': ['Programming', 'Technology', 'Communication', 'Teamwork'],
            'UI/UX Designer': ['Arts & Crafts', 'Technology', 'Communication', 'Programming'],
            'Teacher': ['Teaching', 'Communication', 'Public Speaking', 'Childcare'],
            'Healthcare Worker': ['Health Care', 'First Aid', 'Communication', 'Teamwork'],
            'Marketing Specialist': ['Social Media', 'Communication', 'Writing', 'Arts & Crafts'],
            'Chef': ['Cooking', 'Teamwork', 'Communication', 'Arts & Crafts'],
            'Veterinarian': ['Animal Care', 'Health Care', 'Communication', 'First Aid'],
            'Social Worker': ['Communication', 'Community Service', 'Volunteering', 'Public Speaking'],
            'Personal Trainer': ['Fitness', 'Health Care', 'Communication', 'Teaching'],
            'Construction Manager': ['Construction', 'Teamwork', 'Communication', 'Technology'],
            'Writer': ['Writing', 'Communication', 'Reading', 'Arts & Crafts'],
            'Childcare Provider': ['Childcare', 'Communication', 'First Aid', 'Teaching'],
            'Event Coordinator': ['Communication', 'Fundraising', 'Public Speaking', 'Social Media'],
            'Landscaper': ['Gardening', 'Construction', 'Teamwork', 'Fitness'],
            'Travel Guide': ['Traveling', 'Communication', 'Public Speaking', 'Teaching'],
            'Actor': ['Acting', 'Communication', 'Arts & Crafts', 'Public Speaking'],
            'Singer': ['Singing', 'Arts & Crafts', 'Communication', 'Public Speaking'],
            'Game Developer': ['Gaming', 'Programming', 'Technology', 'Arts & Crafts'],
            'Sports Coach': ['Sports', 'Fitness', 'Teaching', 'Communication'],
            'Volunteer Coordinator': ['Volunteering', 'Community Service', 'Communication', 'Fundraising']
        }

        logger.info("Preparing data...")
        X_text, X_skills, y_categories = self.prepare_data(df)

        # Create match scores (based on skill overlap)
        # The 'match_score' column already present in df is not read here;
        # the regression target is recomputed from scratch below.
        match_scores = []
        for _, row in df.iterrows():
            # Calculate match score based on skill relevance to job category
            base_skills = job_skill_mapping.get(row['job_category'], [])
            user_skills = row['required_skills']

            # Calculate overlap
            overlap = len(set(base_skills) & set(user_skills))
            total_base = len(base_skills)

            if total_base > 0:
                match_score = overlap / total_base
                # Add some noise
                match_score += np.random.normal(0, 0.1)
                match_score = np.clip(match_score, 0.0, 1.0)
            else:
                # Category missing from the mapping: fall back to a mid-range random score
                match_score = np.random.uniform(0.3, 0.7)

            match_scores.append(match_score)

        y_match = np.array(match_scores)

        # Split data
        # Splitting an index array keeps all four arrays aligned row-for-row.
        indices = np.arange(len(X_text))
        train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)

        X_text_train, X_text_test = X_text[train_idx], X_text[test_idx]
        X_skills_train, X_skills_test = X_skills[train_idx], X_skills[test_idx]
        y_cat_train, y_cat_test = y_categories[train_idx], y_categories[test_idx]
        y_match_train, y_match_test = y_match[train_idx], y_match[test_idx]

        logger.info("Building model...")
        num_skills = X_skills.shape[1]
        num_categories = len(self.job_category_encoder.classes_)

        self.model = self.build_model(num_skills, num_categories)

        logger.info("Model architecture:")
        self.model.summary()

        # Callbacks
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True
        )

        reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_lr=1e-6
        )

        logger.info("Training model...")
        # NOTE(review): the held-out split is used both as validation_data
        # (driving early stopping / LR reduction) and for the final metrics
        # below, so those metrics are likely optimistic.
        try:
            history = self.model.fit(
                [X_text_train, X_skills_train],
                [y_cat_train, y_match_train],
                validation_data=([X_text_test, X_skills_test], [y_cat_test, y_match_test]),
                epochs=30,  # Reduced epochs
                batch_size=64,  # Larger batch size
                callbacks=[early_stopping, reduce_lr],
                verbose=1
            )
        except Exception as e:
            logger.error(f"Training failed: {e}")
            # Try with smaller batch size
            # NOTE(review): the retry reuses the same self.model instance, so it
            # presumably continues from whatever state the failed run left - confirm.
            logger.info("Retrying with smaller batch size...")
            history = self.model.fit(
                [X_text_train, X_skills_train],
                [y_cat_train, y_match_train],
                validation_data=([X_text_test, X_skills_test], [y_cat_test, y_match_test]),
                epochs=20,
                batch_size=32,
                callbacks=[early_stopping, reduce_lr],
                verbose=1
            )

        logger.info("Evaluating model...")
        # predictions[0] holds the category probabilities, predictions[1] the match scores
        predictions = self.model.predict([X_text_test, X_skills_test])
        cat_pred = np.argmax(predictions[0], axis=1)
        match_pred = predictions[1].flatten()

        cat_accuracy = accuracy_score(y_cat_test, cat_pred)
        match_mae = np.mean(np.abs(y_match_test - match_pred))

        logger.info(f"Category prediction accuracy: {cat_accuracy:.4f}")
        logger.info(f"Match score MAE: {match_mae:.4f}")

        # Print some example predictions
        logger.info("Sample predictions:")
        for i in range(min(5, len(y_cat_test))):
            true_cat = self.job_category_encoder.classes_[y_cat_test[i]]
            pred_cat = self.job_category_encoder.classes_[cat_pred[i]]
            logger.info(f"  True: {true_cat} | Pred: {pred_cat} | Match Score: {match_pred[i]:.3f} (True: {y_match_test[i]:.3f})")

        # Save model and components
        self.save_model(save_path)

        return history

    def predict(self, job_description: str, user_skills: List[str]) -> Dict:
        """Predict the job category of a description and its match score for a user.

        Args:
            job_description: Free-text job posting.
            user_skills: Skill names held by the user. Names the skill encoder
                does not know are ignored (a warning is logged).

        Returns:
            Dict with ``predicted_category`` (str), ``confidence`` (float,
            highest class probability), ``match_score`` (float) and
            ``category_probabilities`` (category name -> probability).

        Raises:
            ValueError: If no model/tokenizer has been trained or loaded.
        """
        if self.model is None or self.tokenizer is None:
            raise ValueError("Model not trained or loaded")

        # Preprocess input exactly as during training
        processed_desc = self.preprocess_text(job_description)
        sequence = self.tokenizer.texts_to_sequences([processed_desc])
        X_text = pad_sequences(sequence, maxlen=self.max_len)

        # Encode skills using the column layout the model was trained on.
        # The fitted MultiLabelBinarizer defines that layout through its
        # `classes_` attribute, which does not follow the order of
        # self.skill_categories; indexing by the latter puts skills in the
        # wrong columns. Fall back to skill_categories only when the encoder
        # has not been fitted.
        fitted_classes = getattr(self.skill_encoder, 'classes_', None)
        skill_order = list(self.skill_categories if fitted_classes is None else fitted_classes)
        column_of = {skill: col for col, skill in enumerate(skill_order)}

        skill_vector = np.zeros((1, len(skill_order)))
        unknown_skills = []
        for skill in user_skills:
            col = column_of.get(skill)
            if col is None:
                unknown_skills.append(skill)
            else:
                skill_vector[0, col] = 1
        if unknown_skills:
            logger.warning("Ignoring skills unseen during training: %s", unknown_skills)

        # Make prediction: output 0 is the category distribution, output 1 the match score
        predictions = self.model.predict([X_text, skill_vector])

        # Get results for the single sample in the batch
        category_probs = predictions[0][0]
        match_score = predictions[1][0][0]

        categories = self.job_category_encoder.classes_
        predicted_category = str(categories[np.argmax(category_probs)])
        confidence = np.max(category_probs)

        return {
            'predicted_category': predicted_category,
            'confidence': float(confidence),
            'match_score': float(match_score),
            'category_probabilities': {
                str(cat): float(prob)
                for cat, prob in zip(categories, category_probs)
            }
        }

    def save_model(self, save_path: str):
        """Write the model, tokenizer, encoders and config into ``save_path``.

        The directory is created when missing. Files written: ``model.h5``,
        ``tokenizer.json``, ``skill_encoder.pkl``,
        ``job_category_encoder.pkl`` and ``config.json``.
        """
        import os

        os.makedirs(save_path, exist_ok=True)

        # Keras network
        self.model.save(f"{save_path}/model.h5")

        # Tokenizer: its own JSON string, stored as a JSON value
        with open(f"{save_path}/tokenizer.json", "w") as tokenizer_file:
            json.dump(self.tokenizer.to_json(), tokenizer_file)

        # Fitted scikit-learn encoders
        encoder_targets = (
            (self.skill_encoder, f"{save_path}/skill_encoder.pkl"),
            (self.job_category_encoder, f"{save_path}/job_category_encoder.pkl"),
        )
        for encoder, target in encoder_targets:
            joblib.dump(encoder, target)

        # Hyper-parameters and skill vocabulary needed to rebuild the instance
        config = {
            key: getattr(self, key)
            for key in ('max_features', 'max_len', 'embedding_dim', 'skill_categories')
        }
        with open(f"{save_path}/config.json", "w") as config_file:
            json.dump(config, config_file, indent=2)

        logger.info(f"Model saved to {save_path}")

    def load_model(self, save_path: str):
        """Restore the model, tokenizer, encoders and config written by ``save_model``.

        Reads ``config.json``, ``model.h5``, ``tokenizer.json``,
        ``skill_encoder.pkl`` and ``job_category_encoder.pkl`` from
        ``save_path`` and overwrites the corresponding attributes.
        """
        # Configuration first: it carries the hyper-parameters and skill vocabulary
        with open(f"{save_path}/config.json", "r") as config_file:
            settings = json.load(config_file)
        for attr in ('max_features', 'max_len', 'embedding_dim', 'skill_categories'):
            setattr(self, attr, settings[attr])

        # Keras network
        self.model = tf.keras.models.load_model(f"{save_path}/model.h5")

        # Tokenizer: the file holds the tokenizer's JSON string as a JSON value
        with open(f"{save_path}/tokenizer.json", "r") as tokenizer_file:
            serialized_tokenizer = json.load(tokenizer_file)
        self.tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(serialized_tokenizer)

        # Fitted scikit-learn encoders
        self.skill_encoder = joblib.load(f"{save_path}/skill_encoder.pkl")
        self.job_category_encoder = joblib.load(f"{save_path}/job_category_encoder.pkl")

        logger.info(f"Model loaded from {save_path}")

# Example usage and testing
if __name__ == "__main__":
    # Demo run: generate a small sample, train the model, then score three
    # hand-written job postings.

    # Initialize the system
    job_matcher = JobMatchingSystem()

    # First, test data generation
    print("Testing data generation...")
    test_df = job_matcher.create_synthetic_data(num_samples=100)
    print(f"Generated {len(test_df)} samples")
    print("Sample data:")
    print(test_df.head())
    print(f"Job categories: {test_df['job_category'].unique()}")

    # Train the model
    print("\nTraining the job matching model...")
    try:
        history = job_matcher.train(save_path='job_matching_model')
        print("Training completed successfully!")
    except Exception as e:
        print(f"Training failed: {e}")
        import traceback
        traceback.print_exc()
        # `exit()` is an interactive-shell helper injected by the `site`
        # module; raise SystemExit directly so this also works where that
        # helper is absent.
        raise SystemExit(1)

    # Test predictions
    print("\nTesting predictions:")

    test_cases = [
        {
            'description': "We need a UI/UX designer with 2 years experience in mobile app design.",
            'skills': ['Arts & Crafts', 'Technology', 'Communication']
        },
        {
            'description': "Looking for a software developer to work on web applications.",
            'skills': ['Programming', 'Technology', 'Communication']
        },
        {
            'description': "Elementary teacher needed for our school.",
            'skills': ['Teaching', 'Childcare', 'Communication']
        }
    ]

    for case_number, test_case in enumerate(test_cases, start=1):
        result = job_matcher.predict(test_case['description'], test_case['skills'])
        print(f"\nTest Case {case_number}:")
        print(f"Description: {test_case['description']}")
        print(f"User Skills: {test_case['skills']}")
        print(f"Predicted Category: {result['predicted_category']}")
        print(f"Confidence: {result['confidence']:.3f}")
        print(f"Match Score: {result['match_score']:.3f}")

Testing data generation...
Generated 100 samples
Sample data:
                                     job_description           job_category  \
0  We are looking for a qualified construction ma...   Construction Manager   
1  We are looking for a qualified volunteer coord...  Volunteer Coordinator   
2  We are looking for a qualified landscaper to j...             Landscaper   
3  We are looking for a qualified singer to join ...                 Singer   
4  We are looking for a qualified travel guide to...           Travel Guide   

                                     required_skills  match_score  
0  [Construction, Teamwork, Communication, Techno...     0.903431  
1  [Volunteering, Community Service, Communicatio...     0.967833  
2  [Gardening, Construction, Teamwork, Fitness, A...     0.929384  
3  [Singing, Arts & Crafts, Communication, Public...     0.910177  
4  [Traveling, Communication, Public Speaking, Te...     0.626282  
Job categories: [np.str_('Construction Manager') np.str … [output truncated]



Epoch 1/30
100/100 ━━━━━━━━━━━━━━━━━━━━ 14s 26ms/step - category_output_accuracy: 0.6585 - category_output_loss: 1.2832 - loss: 0.9717 - match_output_loss: 0.2447 - match_output_mae: 0.4121 - val_category_output_accuracy: 0.0600 - val_category_output_loss: 3.4678 - val_loss: 2.4424 - val_match_output_loss: 0.0499 - val_match_output_mae: 0.2113 - learning_rate: 0.0010
Epoch 2/30
100/100 ━━━━━━━━━━━━━━━━━━━━ 5s 25ms/step - category_output_accuracy: 0.9993 - category_output_loss: 0.0360 - loss: 0.0592 - match_output_loss: 0.1131 - match_output_mae: 0.2697 - val_category_output_accuracy: 0.0625 - val_category_output_loss: 4.0032 - val_loss: 2.8083 - val_match_output_loss: 0.0202 - val_match_output_mae: 0.1308 - learning_rate: 0.0010
Epoch 3/30
100/100 ━━━━━━━━━━━━━━━━━━━━ 2s 22ms/step - category_output_accuracy: 0.9994 - category_output_loss: 0.0172 - loss: 0.0281 - match_output_loss: 0.05 … [output truncated]



Training completed successfully!

Testing predictions:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 331ms/step

Test Case 1:
Description: We need a UI/UX designer with 2 years experience in mobile app design.
User Skills: ['Arts & Crafts', 'Technology', 'Communication']
Predicted Category: UI/UX Designer
Confidence: 0.719
Match Score: 0.964
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step

Test Case 2:
Description: Looking for a software developer to work on web applications.
User Skills: ['Programming', 'Technology', 'Communication']
Predicted Category: Software Developer
Confidence: 0.770
Match Score: 0.954
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step

Test Case 3:
Description: Elementary teacher needed for our school.
User Skills: ['Teaching', 'Childcare', 'Communication']
Predicted Category: Teacher
Confidence: 0.824
Match Score: 0.984
