In [1]:
!pip install flask pandas scikit-learn nltk



In [2]:
import os
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from flask import Flask, request, jsonify

In [3]:
nltk.download("punkt")
nltk.download("stopwords")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
stop_words = set(stopwords.words("english"))

print("✅ Stopwords loaded successfully!")

✅ Stopwords loaded successfully!


In [5]:
file_path = "Resume.csv"  # Update with your actual file path if needed
df = pd.read_csv(file_path)

In [6]:
if "Resume_str" not in df.columns or "Category" not in df.columns:
    raise ValueError("❌ Error: Dataset must contain 'Resume_str' and 'Category' columns!")

print("✅ Dataset loaded successfully!")



✅ Dataset loaded successfully!


In [7]:
def clean_text(text):
    """
    Function to clean text by removing numbers, special characters, stopwords, and short words.
    """
    if not isinstance(text, str):  
        return ""
    
    text = text.lower()
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = re.sub(r"[^\w\s]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
    
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words and len(word) > 2]  # Remove stopwords & short words

    return " ".join(words)


In [13]:
def rank_resumes(job_description, df, category=None):
    """
    Function to rank resumes based on a given job description using TF-IDF cosine similarity.
    """
    if category:
        df = df[df["Category"].str.lower() == category.lower()].copy()

    if df.empty:
        print("❌ No resumes found for the given category!")
        return pd.DataFrame()
    # Ensure no missing values in Resume_str column
    df = df.dropna(subset=["Resume_str"])

    # Preprocess job description
    job_description = clean_text(job_description)
    
    # Preprocess resumes
    df["cleaned_resume"] = df["Resume_str"].apply(clean_text)
    texts = [job_description] + df["cleaned_resume"].tolist()
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(texts)

    # Compute cosine similarity
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    df["similarity"] = similarities
        # Sort resumes by highest similarity score
    ranked_df = df.sort_values(by="similarity", ascending=False)
    
    return ranked_df[["ID", "Resume_str", "Category", "similarity"]]


In [15]:
test_job_desc = "Python developer with 5 years of experience in web development and data science."
test_category = "INFORMATION-TECHNOLOGY"

ranked_resumes = rank_resumes(test_job_desc, df, test_category)

# Show top-ranked resumes
print("\n🎯 Top-ranked resumes:\n", ranked_resumes.head())


🎯 Top-ranked resumes:
            ID                                         Resume_str  \
249  12635195      Objective    To obtain a position in the i...   
238  25207620           INFORMATION TECHNOLOGY CERTIFIED TECH...   
281  37242217           INFORMATION TECHNOLOGY CONSULTANT    ...   
229  36434348           INFORMATION TECHNOLOGY MANAGER       ...   
309  16186411           DATABASE PROGRAMMER/ANALYST (.NET DEV...   

                   Category  similarity  
249  INFORMATION-TECHNOLOGY    0.147876  
238  INFORMATION-TECHNOLOGY    0.144633  
281  INFORMATION-TECHNOLOGY    0.134053  
229  INFORMATION-TECHNOLOGY    0.131113  
309  INFORMATION-TECHNOLOGY    0.126709  


In [18]:

import threading
app = Flask(__name__)

@app.route("/rank", methods=["POST"])
def rank():
    data = request.json
    job_description = data.get("job_description", "")
    category = data.get("category", "")

    if not job_description or not category:
        return jsonify({"error": "Both job_description and category are required"}), 400

    ranked_resumes = rank_resumes(job_description, df, category)
    if ranked_resumes.empty:
        return jsonify({"message": "No matching resumes found"}), 200

    return jsonify(ranked_resumes.to_dict(orient="records"))

# Function to run Flask in a separate thread
def run_flask():
    app.run(host="0.0.0.0", port=5000, debug=False, use_reloader=False)

# Start Flask in the background
flask_thread = threading.Thread(target=run_flask)
flask_thread.start()

print("Flask API is running on http://127.0.0.1:5000")

Flask API is running on http://127.0.0.1:5000
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.100.17:5000
Press CTRL+C to quit
127.0.0.1 - - [22/Feb/2025 21:28:40] "POST /rank HTTP/1.1" 200 -
