In [1]:
import random
import pandas as pd

# Sample data
# NOTE: only the first NUM_RECORDS names are used; any extra entries are ignored.
names = [
    "Alice Johnson", "Bob Smith", "Charlie Brown", "David Lee", "Emma Davis", "Fiona Clark", "George Harris",
    "Hannah White", "Ian Martinez", "Julia Adams", "Kevin Lewis", "Liam Scott", "Mia Turner", "Nathan Hall",
    "Olivia Allen", "Paul Wright", "Quincy Young", "Rachel King", "Samuel Walker", "Tina Nelson",
    "Umar Khan", "Vera Brooks", "William Carter", "Xander Evans", "Yvonne Foster", "Zack Green", "Aaron Hughes",
    "Bella Irving", "Cameron Johnson", "Diana Kim", "Ethan Long", "Faith Moore", "Gabriel Nelson", "Hailey Owens",
    "Isaac Perez", "Jasmine Quinn", "Kyle Roberts", "Lara Simmons", "Mason Thompson", "Natalie Underwood",
    "Oscar Vasquez", "Penelope Watson", "Quentin Xiong", "Rebecca Young", "Sebastian Zimmerman", "Taylor Anderson",
    "Ulysses Bennett", "Valerie Cooper", "Walter Dixon", "Xenia Edwards", "Yosef Franklin", "Zara Grant", "Adam Hayes",
    "Brooke Ingram", "Connor James", "Delilah King", "Evan Lewis", "Felicity Morgan", "Gavin Norris", "Holly Owens",
    "Ian Parker", "Jessica Quinn", "Keith Russell", "Lillian Scott", "Matthew Turner", "Nicole Underwood", "Owen Vasquez",
    "Paige Walker", "Quinn Xavier", "Ryan Young", "Sophia Zimmerman", "Thomas Anderson", "Uma Bennett", "Victoria Clark",
    "Wesley Davis", "Xander Ellis", "Yara Franklin", "Zane Grant", "Amber Harris", "Brandon Ingram", "Catherine Johnson",
    "Derek Kim", "Eleanor Long", "Felix Moore", "Grace Nelson", "Henry Owens", "Isla Perez", "Jack Quinn", "Kylie Roberts",
    "Leo Simmons", "Madeline Thompson", "Nathan Underwood", "Olivia Vasquez", "Peter Watson", "Quincy Xiong", "Rose Young",
    "Samuel Zimmerman", "Tessa Anderson", "Umar Bennett", "Violet Cooper", "Wyatt Dixon", "Xenia Edwards", "Yusuf Franklin"
]
skills = ["Python", "Machine Learning", "Data Analysis", "React", "Java", "SQL", "Cloud Computing", "Cybersecurity", "Natural Language Processing", "TensorFlow"]
companies = ["Google", "Microsoft", "Amazon", "Facebook", "Tesla", "IBM", "Netflix", "Adobe", "Intel", "Twitter"]

# Configuration
SEED = 42          # fixed seed so Restart & Run All regenerates the same dataset
NUM_RECORDS = 100  # number of synthetic people to generate

if NUM_RECORDS > len(names):
    raise ValueError(f"NUM_RECORDS={NUM_RECORDS} exceeds the {len(names)} available names")

# Dedicated generator: reproducible without touching the global `random` state.
rng = random.Random(SEED)

# Sampling without replacement guarantees every phone number is unique
# (independent randint draws could repeat the same 4-digit suffix).
phone_numbers = [f"+1-202-555-{suffix}" for suffix in rng.sample(range(1000, 10000), NUM_RECORDS)]

# Generate dataset
data = []
for i in range(NUM_RECORDS):
    resume_skills = rng.sample(skills, k=rng.randint(3, 6))
    linkedin_skills = rng.sample(skills, k=rng.randint(3, 6))
    internship = rng.choice(companies)
    duration = rng.randint(1, 12)  # Internship duration in months
    data.append([names[i], phone_numbers[i], resume_skills, linkedin_skills, internship, duration])

# Convert to DataFrame
df = pd.DataFrame(data, columns=["Name", "Phone Number", "Resume Skills", "LinkedIn Skills", "Internship", "Internship Duration"])

# Save dataset
# The list-valued skill columns are written as their Python repr, so readers
# of the CSV must parse them back (e.g. with ast.literal_eval).
df.to_csv("dataset.csv", index=False)
print("Dataset generated and saved as dataset.csv")


Dataset generated and saved as dataset.csv


In [11]:
import pandas as pd
import spacy
from rapidfuzz import process, fuzz
import ast  # Safer than eval for parsing lists

# Load NLP model
nlp = spacy.load("en_core_web_sm")

# Load dataset
df = pd.read_csv("dataset.csv")

# Extract all unique skills from dataset (ensure correct parsing).
# Each cell holds the text repr of a Python list, so it is parsed with
# ast.literal_eval. The loop variable is deliberately NOT called `skills`,
# which would overwrite the master skill list defined by the generator cell.
all_skills = set()
for raw_skills in df["Resume Skills"].dropna().tolist() + df["LinkedIn Skills"].dropna().tolist():
    try:
        parsed_skills = ast.literal_eval(raw_skills)  # Safer parsing
    except (ValueError, SyntaxError, TypeError):
        continue  # Skip cells that are not valid Python literals
    if not isinstance(parsed_skills, (list, tuple, set)):
        continue  # A bare string/number would otherwise be iterated element-wise
    # Store lowercase for consistency; ignore non-string entries
    all_skills.update(skill.lower() for skill in parsed_skills if isinstance(skill, str))

# Function to extract skills from user input
def extract_skills(user_input):
    """Return the known skills mentioned in a free-text description.

    Matching happens in two passes against the module-level ``all_skills``
    set (lowercase skill names):

    1. Multi-word skills (e.g. "machine learning") are searched for as whole
       phrases in the lowercased text. A per-token scan can never find them,
       because no single token equals the full phrase.
    2. Every noun / proper-noun token is matched exactly, then fuzzily
       (``fuzz.ratio`` strictly above 85) as a fallback for typos.

    Args:
        user_input: Free-text description; ``None`` or empty yields ``[]``.

    Returns:
        Alphabetically sorted list of matched skill names (lowercase).
    """
    import re  # local import: only needed for the phrase pass

    # Nothing to match against, or nothing to match: avoid calling
    # process.extractOne with no choices (it returns None in that case).
    if not user_input or not all_skills:
        return []

    text = user_input.lower()  # Convert input to lowercase for matching
    extracted_skills = set()

    # Pass 1: whole-phrase match for multi-word skills
    for skill in all_skills:
        if " " in skill and re.search(rf"(?<!\w){re.escape(skill)}(?!\w)", text):
            extracted_skills.add(skill)

    # Pass 2: token-level match
    doc = nlp(text)
    for token in doc:
        if token.pos_ not in ("NOUN", "PROPN"):  # Consider only nouns and proper nouns
            continue

        token_text = token.text.strip()
        if not token_text:
            continue

        # Exact match first
        if token_text in all_skills:
            extracted_skills.add(token_text)
            continue

        # Use fuzzy matching as a fallback (threshold 85 for better accuracy)
        best = process.extractOne(token_text, all_skills, scorer=fuzz.ratio)
        if best is not None and best[1] > 85:
            extracted_skills.add(best[0])

    # Sorted so the output is stable across runs (set order is not)
    return sorted(extracted_skills)

# Prompt for a free-text project description and run it through the extractor
user_input = input("Describe your project and the skills needed: ")
identified_skills = extract_skills(user_input)

# Show which known skills were recognised in the description
print("\nIdentified Skills:", identified_skills)



Identified Skills: ['tensorflow']


In [16]:
import pandas as pd
import ast

# Load dataset
# When the file comes from the generator cell, the two skills columns were
# written as the text repr of Python lists, so they are read back here as
# plain strings and still need parsing.
df = pd.read_csv("dataset.csv")

# Ensure skills are properly parsed
def parse_skills(skill_str):
    """Convert one CSV cell into a list of lowercase skill names.

    Args:
        skill_str: Normally the text repr of a Python list, e.g.
            ``"['Python', 'SQL']"``. An already-parsed list/tuple/set is
            accepted as-is, so re-applying the function is harmless.

    Returns:
        List of lowercase skill strings. Returns ``[]`` when the value is
        missing, is not a valid Python literal, or does not evaluate to a
        list-like container. Non-string elements are dropped.
    """
    if isinstance(skill_str, (list, tuple, set)):
        skills = skill_str
    else:
        try:
            skills = ast.literal_eval(skill_str)  # Convert string to list safely
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return []  # Return empty list if parsing fails
        # A bare string literal would otherwise be split into characters
        if not isinstance(skills, (list, tuple, set)):
            return []
    # Convert to lowercase for matching
    return [skill.lower() for skill in skills if isinstance(skill, str)]

# Turn both stringified skill columns into real lists of lowercase names
for skill_column in ("Resume Skills", "LinkedIn Skills"):
    df[skill_column] = df[skill_column].apply(parse_skills)

# Function to find top 10 helpers with phone numbers
def find_top_helpers(identified_skills, top_n=10, candidates=None):
    """Return the people who list at least one requested skill, ranked by internship length.

    Args:
        identified_skills: Iterable of skill names (any case). A single
            string is treated as one skill rather than iterated per character.
        top_n: Maximum number of people to return (default 10). ``None``
            returns every match; zero or negative returns ``[]``.
        candidates: DataFrame with the columns "Name", "Phone Number",
            "Resume Skills", "LinkedIn Skills" (both holding iterables of
            lowercase skill names) and "Internship Duration". Defaults to the
            notebook-level ``df``.

    Returns:
        List of ``(name, phone, duration)`` tuples sorted by duration in
        descending order; ties keep their original row order.
    """
    if candidates is None:
        candidates = df  # fall back to the notebook-level frame

    if isinstance(identified_skills, str):
        identified_skills = [identified_skills]
    wanted = {skill.lower() for skill in identified_skills}  # Normalize case

    if not wanted or (top_n is not None and top_n <= 0):
        return []

    # Iterate plain Python columns instead of iterrows(): no per-row Series
    rows = zip(
        candidates["Name"].tolist(),
        candidates["Phone Number"].tolist(),
        candidates["Resume Skills"].tolist(),
        candidates["LinkedIn Skills"].tolist(),
        candidates["Internship Duration"].tolist(),
    )

    matching_candidates = []
    for name, phone, resume_skills, linkedin_skills, duration in rows:
        # Keep the person if any requested skill is in Resume or LinkedIn skills
        if not wanted.isdisjoint(resume_skills) or not wanted.isdisjoint(linkedin_skills):
            matching_candidates.append((name, phone, duration))

    # Sort by internship duration (descending, stable) and keep the top entries
    matching_candidates.sort(key=lambda candidate: candidate[2], reverse=True)
    return matching_candidates[:top_n]

# Demo: look up helpers for a fixed skill list
identified_skills = ['TensorFlow']  # Use the extracted skills from NLP step
print("\nExtracted Skills for Matching:", identified_skills)

helpers = find_top_helpers(identified_skills)

# Report the ranked matches, or say so when nobody qualifies
print("\nTop 10 People Who Can Help (sorted by experience):")
if helpers:
    for rank, (name, phone, duration) in enumerate(helpers, start=1):
        print(f"{rank}. {name} | Phone: {phone} | Internship Duration: {duration} months")
else:
    print("No matching candidates found!")



Extracted Skills for Matching: ['TensorFlow']

Top 10 People Who Can Help (sorted by experience):
1. David Lee | Phone: +91-202-555-9309 | Internship Duration: 12 months
2. Jessica Quinn | Phone: +91-202-555-3328 | Internship Duration: 12 months
3. Ryan Young | Phone: +91-202-555-8384 | Internship Duration: 12 months
4. Samuel Zimmerman | Phone: +91-202-555-4735 | Internship Duration: 12 months
5. Nathan Hall | Phone: +91-202-555-3704 | Internship Duration: 11 months
6. Olivia Allen | Phone: +91-202-555-4262 | Internship Duration: 11 months
7. Tina Nelson | Phone: +91-202-555-7869 | Internship Duration: 11 months
8. Xander Evans | Phone: +91-202-555-7205 | Internship Duration: 11 months
9. Hailey Owens | Phone: +91-202-555-8588 | Internship Duration: 11 months
10. Taylor Anderson | Phone: +91-202-555-4266 | Internship Duration: 11 months


In [1]:
import nltk
# Download the NLTK 'punkt' package into the local nltk_data directory.
# NOTE(review): no visible cell in this notebook uses nltk — confirm this is still needed.
nltk.download('punkt')


[nltk_data] Downloading package punkt to C:\Users\AAYUSH THE
[nltk_data]     GREAT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True