In [18]:
import pandas as pd 
import numpy as np
import json
import os 
from openai import OpenAI
import re
import time
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel

In [None]:
#pip install streamlit pypdf python-docx
#pip install google-generativeai
#pip install transformers

In [19]:
CSV_FILE_PATH = 'UpdatedResumeDataSet.csv'
OUTPUT_FILE = 'labeled_resume_data.json'
CHECKPOINT_FREQUENCY = 50

In [20]:
# Configuration - selecting the provider
API_PROVIDER = 'openai'

API_KEY = 'sk-4408764883724ccfa264c0b471b25cff'
BASE_URL = 'https://api.deepseek.com'
MODEL_NAME = "deepseek-chat"

In [5]:
SYSTEM_PROMPT = '''
You are an expert Psychometrician and HR Data Scientist.
Your job is to analyze resume text and infer the canditate's Big Five Personality Traits.
You must ouput your answer in strict JSON format.
Do not include any conversational text.
Output format:
{
    "openness": <float 0.0-1.0>,
    "conscientiousness": <float 0.0-1.0>,
    "extroversion": <float 0.0-1.0>,
    "agreeableness": <float 0.0-1.0>,
    "neuroticism": <float 0.0-1.0>
}
'''

In [6]:
class RealLLMLabeler:
    def __init__(self):
        self.client = OpenAI(api_key= API_KEY, base_url=BASE_URL)

    def analyze_resume(self, resume_text):
        """
        Sends the text to the LLM and parses the JSON response.
        """
        try:
            response = self.client.chat.completions.create(
                model = MODEL_NAME, 
                messages = [
                    {'role': 'system', 'content': SYSTEM_PROMPT},
                    {'role': 'user', 'content': f"Analyze this resume:\n\n{resume_text[:4000]}"}
                ],
                response_format = {'type': 'json_object'},
                temperature = 0.1 # low temp = more deterministic/consistent analysis
            )
            # Extract the text string from the response
            content = response.choices[0].message.content
            # Parse the string into a py dictionary
            return json.load(content)
        except Exception as e:
            print(f"API errpr: {e}")
            # Return neutral scores if API fails
            return {"openness": 0.5, "conscientiousness": 0.5, "extroversion": 0.5, "agreeableness": 0.5, "neuroticism": 0.5}

In [10]:
class DataPipeline:
    def __init__(self, csv_path):
        self.csv_path = csv_path
        self.labler = RealLLMLabeler()

    def process(self):
        try:
            df = pd.read_csv(self.csv_path)
            # sample only 5 rows, to avoid burning the api credits
            df = df.head(5)
            print(f"Loaded {len(df)}")
        except FileNotFoundError:
            print('CSV not found.')
            return
        labeled_dataset = []
    
        # Loop through resumes
        for index, row in df.iterrows():
            text = row.get('Resume', '') 
            print(f"Processing Resume #{index}...")
    
            # The real API call
            traits = self.labeler.analyze_resume(text)
            point(f"Dervied traits: {traits}")
    
            # Calculate Meta Features (Structural)
            meta = [len(text), text.count('‚Ä¢'), 0.1]
            
            labeled_dataset.append({
                "id": index,
                "text": text,
                "meta_features": meta,
                "labels": list(traits.values())
            })
            
            # Sleep to avoid hitting Rate Limits
            time.sleep(1)
        # Save
        with open('real_labeled_data.json', 'w') as f:
            json.dump(labeled_dataset, f, indent= 4)
        print(" Done! 'real_labeled_data.json' is ready for the RL Agent.")

if __name__ == '__main__':
    pipeline = DataPipeline('UpdatedResumeDataset.csv')
    pipeline.process



In [23]:
import pandas as pd
import json
import random
import re

# --- CONFIGURATION ---
INPUT_CSV = 'UpdatedResumeDataSet.csv'
OUTPUT_FILE = 'final_labeled_dataset.json'

class HeuristicLabeler:
    def __init__(self):
        # Psychological Keyword Dictionaries
        # These words strongly correlate with specific Big 5 traits
        self.keywords = {
            "openness": ["creative", "design", "art", "innovative", "research", "novel", "concept", "graphic"],
            "conscientiousness": ["organized", "managed", "delivered", "deadline", "plan", "budget", "schedule", "efficient"],
            "extroversion": ["team", "leadership", "presented", "spoke", "communication", "sales", "client", "negotiated"],
            "agreeableness": ["collaborated", "support", "helped", "assist", "volunteered", "care", "community", "mentor"],
            "neuroticism": ["critical", "issue", "problem", "stress", "pressure", "urgent", "fix", "error"]
        }

    def score_trait(self, text, trait):
        """
        Calculates a score (0.0 - 1.0) based on keyword density.
        """
        text = text.lower()
        word_count = len(text.split())
        if word_count == 0: return 0.5
        
        matches = 0
        for word in self.keywords[trait]:
            matches += len(re.findall(r'\b' + word + r'\b', text))
        
        # Logic: More keywords = Higher Score
        # We normalize it so it doesn't exceed 1.0
        # Base score is 0.3, max added is 0.7
        score = 0.3 + (matches * 0.15) 
        
        # Add some random noise to make it realistic (humans aren't perfect)
        noise = random.uniform(-0.05, 0.05)
        return min(max(score + noise, 0.1), 0.95)

    def process(self):
        print(f"üöÄ Starting Heuristic Labeling (Free Mode)...")
        
        try:
            df = pd.read_csv(INPUT_CSV)
        except FileNotFoundError:
            print(f"‚ùå Error: '{INPUT_CSV}' not found.")
            return

        labeled_data = []

        # Find the text column
        text_col = None
        for col in df.columns:
            if col in ['Resume', 'Resume_str', 'text', 'content']:
                text_col = col
                break
        
        if not text_col:
            print("‚ùå Could not find Resume column.")
            return

        print(f"üìÑ Processing {len(df)} resumes...")

        for index, row in df.iterrows():
            text = str(row[text_col])
            
            # 1. Calculate Scores using Heuristics
            traits = {
                "openness": round(self.score_trait(text, "openness"), 2),
                "conscientiousness": round(self.score_trait(text, "conscientiousness"), 2),
                "extroversion": round(self.score_trait(text, "extroversion"), 2),
                "agreeableness": round(self.score_trait(text, "agreeableness"), 2),
                "neuroticism": round(self.score_trait(text, "neuroticism"), 2),
            }
            
            # 2. Calculate Meta Features (Structure)
            meta = [len(text), text.count('‚Ä¢'), 0.1]

            labeled_data.append({
                "id": index,
                "text": text,
                "meta_features": meta,
                "labels": list(traits.values())
            })

        # 3. Save
        with open(OUTPUT_FILE, 'w') as f:
            json.dump(labeled_data, f, indent=4)
            
        print(f"‚úÖ Success! Generated {len(labeled_data)} labeled samples.")
        print(f"üìÇ Saved to: {OUTPUT_FILE}")
        print("üëâ You can now proceed to train the RL Agent.")

if __name__ == "__main__":
    labeler = HeuristicLabeler()
    labeler.process()

üöÄ Starting Heuristic Labeling (Free Mode)...
üìÑ Processing 962 resumes...
‚úÖ Success! Generated 962 labeled samples.
üìÇ Saved to: final_labeled_dataset.json
üëâ You can now proceed to train the RL Agent.


In [26]:
import pandas as pd
import json
import time
import os
import google.generativeai as genai
from tqdm import tqdm

# --- CONFIGURATION ---
INPUT_CSV = 'UpdatedResumeDataSet.csv'
OUTPUT_FILE = 'final_labeled_dataset.json'
CHECKPOINT_FREQUENCY = 20 # Save often

# --- GET FREE KEY AT: https://aistudio.google.com/app/apikey ---
GOOGLE_API_KEY = "AIzaSyBIna2HOvlgxEVY6SzfVae4iTYbjSvuJnw"

class GeminiPipeline:
    def __init__(self):
        genai.configure(api_key=GOOGLE_API_KEY)
        # Use the Flash model (Fast & Free Tier available)
        self.model = genai.GenerativeModel('gemini-1.5-flash')
        
    def find_resume_column(self, df):
        possible_names = ['Resume', 'Resume_str', 'text', 'content', 'resume_text', 'cv']
        for col in df.columns:
            if col in possible_names:
                return col
        return None

    def analyze_resume(self, text):
        prompt = f"""
        Act as a Psychometrician. Analyze this resume text and estimate the Big Five Personality traits.
        Return ONLY a JSON object. No markdown, no text explanation.
        Format: {{"openness": 0.0, "conscientiousness": 0.0, "extroversion": 0.0, "agreeableness": 0.0, "neuroticism": 0.0}}
        
        Resume Text:
        {text[:4000]}
        """
        
        try:
            # Generate content
            response = self.model.generate_content(prompt)
            
            # Extract text
            raw_text = response.text
            
            # Clean up potential markdown formatting (```json ... ```)
            clean_text = raw_text.replace("```json", "").replace("```", "").strip()
            
            return json.loads(clean_text)
            
        except Exception as e:
            # Frequent error if you hit the rate limit (15 req/min)
            if "429" in str(e):
                print("‚è≥ Hit Free Tier Rate Limit. Sleeping for 10 seconds...")
                time.sleep(10)
            else:
                print(f"‚ö†Ô∏è Error: {e}")
            return None

    def run(self):
        if not os.path.exists(INPUT_CSV):
            print("‚ùå CSV not found.")
            return

        df = pd.read_csv(INPUT_CSV)
        text_col = self.find_resume_column(df)
        
        if not text_col:
            print("‚ùå No Resume column found.")
            return

        print(f"üöÄ Starting Gemini Free Tier Labeling...")
        
        processed_data = []
        if os.path.exists(OUTPUT_FILE):
            try:
                with open(OUTPUT_FILE, 'r') as f:
                    processed_data = json.load(f)
                    print(f"üîÑ Resuming from {len(processed_data)}...")
            except: pass

        remaining_df = df.iloc[len(processed_data):]

        for index, row in tqdm(remaining_df.iterrows(), total=len(remaining_df)):
            resume_text = str(row[text_col])
            
            if len(resume_text) < 10: continue

            traits = self.analyze_resume(resume_text)
            
            if traits:
                meta = [len(resume_text), resume_text.count('‚Ä¢'), 0.1]
                processed_data.append({
                    "id": index,
                    "text": resume_text,
                    "meta_features": meta,
                    "labels": list(traits.values())
                })
            
            # CHECKPOINT
            if len(processed_data) % CHECKPOINT_FREQUENCY == 0:
                with open(OUTPUT_FILE, 'w') as f:
                    json.dump(processed_data, f, indent=4)
            
            # CRITICAL: SLEEP TO STAY IN FREE TIER
            # Limit is usually 15 RPM (Requests Per Minute)
            # So we sleep 4 seconds between calls (60s / 15 = 4s)
            time.sleep(4)

        with open(OUTPUT_FILE, 'w') as f:
            json.dump(processed_data, f, indent=4)
        print("‚úÖ Done!")

if __name__ == "__main__":
    pipeline = GeminiPipeline()
    pipeline.run()

üöÄ Starting Gemini Free Tier Labeling...
üîÑ Resuming from 962...


0it [00:00, ?it/s]

‚úÖ Done!





In [36]:
DATA_FILE = 'final_labeled_dataset.json'
BATCH_SIZE = 8
LEARNING_RATE = 0.001
EPISODES = 10

In [44]:
class BERTEncoder:
    '''
    The 'Eyes' of the AI. Reads text and converts to numbers.
    '''
    def __init__(self):
        print('Loading BERT Model')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertModel.from_pretrained('bert-base-uncased')
        self.model.eval()

    def encode(self, text):
        inputs = self.tokenizer(text, return_tensors='pt', max_length= 128, truncation=True, padding='max_length')
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Return [CLS] token embedding (size : 768)
        return outputs.last_hidden_state[:, 0, : ]

class HiringAgent(nn.Module):
    '''
    The 'Brain' of the AI.
    Inputs: BERT Vector (768) + Meta Features (3) = 771 Inputs
    Output: 5 Personality Scores
    '''
    def __init__(self):
        super(HiringAgent, self).__init__()
        self.fc1 = nn.Linear(771, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.output = nn.Linear(64, 5)
        self.sigmoid = nn.Sigmoid()

    def forward(self, bert_vec, meta_vec):
        combined = torch.cat((bert_vec, meta_vec), dim= 1)
        x = self.fc1(combined)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.output(x)
        return self.sigmoid(x)

class TrainingSession:
    def __init__(self):
        self.bert = BERTEncoder()
        self.agent = HiringAgent()
        self.optimizer = optim.Adam(self.agent.parameters(), lr= LEARNING_RATE)
        self.loss_fn = nn.MSELoss()

    def load_data(self):
        if not os.path.exists(DATA_FILE):
            print(f"Error: {DATA_FILE} not found")
            return []
        with open(DATA_FILE, 'r') as f:
            data = json.load(f)
        print(f"Loaded {len(data)} labeled resumes for training")
        return data

    def train(self):
        data = self.load_data()
        if not data: return
        print('Starting AI Training')

        for episode in range(EPISODES):
            random.shuffle(data)
            total_loss= 0

            # Simple Batch Processing
            for item in data[:50]:
                self.optimizer.zero_grad()

                # 1. Prepare Inputs
                text_vec = self.bert.encode(item['text'])

                # Normalize meta features [Length, Bullets, Caps]
                # We divide length by 1000 to keep numbers small (Neural Nets like small numbers)
                meta_raw = item['meta_features']
                meta_norm = [meta_raw[0]/1000.0, meta_raw[1]/10.0, meta_raw[2]]
                meta_vec = torch.tensor([meta_norm], dtype= torch.float32)

                # 2. AI Prediction
                prediction = self.agent(text_vec, meta_vec)

                # 3. The "Correct Answer" (from Gemini)
                target = torch.tensor([item['labels']], dtype=torch.float32)

                # 4. Learning (Backpropagation)
                loss = self.loss_fn(prediction, target)
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()

                print(f"Episode {episode+1}: Loss={total_loss:.4f}")
        print('Training Finished. The agent is ready')
    def predict_new_candidate(self, resume_text):
        '''
        Applying the trained model on a brand new resume
        '''
        print('Analyzing new candidate')
        t_vec = self.bert.encode(resume_text)
        meta = [len(resume_text)/1000.0, resume_text.count('‚Ä¢')/10.0, 0.1]
        m_vec = torch.tensor([meta], dtype=torch.float32)

        with torch.no_grad():
            scores = self.agent(t_vec, m_vec)[0]
        traits = ["Openness", "Conscientiousness", "Extroversion", "Agreeableness", "Neuroticism"]
        for t, s in zip(traits, scores):
            print(f"{t}: {s:.2f}")

if __name__ == '__main__':
    session = TrainingSession()
    session.train()
    sample_resume = "Highly organized software engineer. Led a team of 10. Loves public speaking."
    session.predict_new_candidate(sample_resume)
    torch.save(session.agent.state_dict(), 'hiring_agent_model.pth')
    print('Model saved to hiring_agent_model.pth')
        

Loading BERT Model
Loaded 962 labeled resumes for training
Starting AI Training
Episode 1: Loss=0.0411
Episode 1: Loss=0.0922
Episode 1: Loss=0.1144
Episode 1: Loss=0.1469
Episode 1: Loss=0.1845
Episode 1: Loss=0.2225
Episode 1: Loss=0.2393
Episode 1: Loss=0.2933
Episode 1: Loss=0.3660
Episode 1: Loss=0.4056
Episode 1: Loss=0.4480
Episode 1: Loss=0.4657
Episode 1: Loss=0.4904
Episode 1: Loss=0.5020
Episode 1: Loss=0.5370
Episode 1: Loss=0.5644
Episode 1: Loss=0.6702
Episode 1: Loss=0.7150
Episode 1: Loss=0.7268
Episode 1: Loss=0.7743
Episode 1: Loss=0.7914
Episode 1: Loss=0.8682
Episode 1: Loss=0.8848
Episode 1: Loss=0.9006
Episode 1: Loss=0.9709
Episode 1: Loss=1.1503
Episode 1: Loss=1.1888
Episode 1: Loss=1.2866
Episode 1: Loss=1.3149
Episode 1: Loss=1.3504
Episode 1: Loss=1.3627
Episode 1: Loss=1.3831
Episode 1: Loss=1.4928
Episode 1: Loss=1.5280
Episode 1: Loss=1.6534
Episode 1: Loss=1.6609
Episode 1: Loss=1.6922
Episode 1: Loss=1.7225
Episode 1: Loss=1.8253
Episode 1: Loss=1.8407


In [None]:
# To run : streamlit run app.py