In [1]:
!pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/97.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/97.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m92.2/97.3 kB[0m [31m1.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-23.12.11-py3-none-any.whl (6.2 MB)
[2K     [90m━━━━━━━━━━━━━━━

In [2]:
!pip install PyPDF2
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import os
import PyPDF2
import re
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import defaultdict
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m6.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [3]:

# Mount Google Drive at /content/drive so later cells can read the resume PDFs
# and write results under "/content/drive/My Drive/data".
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
def extract_job_description(text, sentence_count=3):
    """Summarize ``text`` into a short description with sumy's LSA summarizer.

    Args:
        text: Plain text to summarize.
        sentence_count: Maximum number of sentences requested from the
            summarizer. Defaults to 3, the value that used to be hard-coded.

    Returns:
        The selected sentences joined by single spaces, or an empty string
        when ``text`` is ``None``, empty or whitespace-only.
    """
    if not text or not text.strip():
        # Nothing to summarize; skip building the parser and summarizer.
        return ""

    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentence_count)
    return ' '.join(str(sentence) for sentence in summary)

def process_resume(file_path, character_traits):
    """Score one PDF resume against every trait and summarize its text.

    Args:
        file_path: Path to the PDF file to read.
        character_traits: Mapping of trait name -> dict with a ``'keywords'``
            iterable of strings and a numeric ``'weight'``.

    Returns:
        A ``(trait_scores, job_description)`` tuple. ``trait_scores`` maps each
        trait name to ``min(matched_keyword_count * weight, 3)``;
        ``job_description`` is the result of ``extract_job_description`` on the
        full extracted text.
    """
    with open(file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # A page may yield no text (e.g. scanned images); treat that as empty
        # instead of failing on string concatenation.
        text = "".join(page.extract_text() or "" for page in reader.pages)

    lemmatizer = WordNetLemmatizer()

    # Tag and lemmatize once: the result does not depend on the trait, and
    # POS tagging is the expensive step. Only noun/verb tokens are kept.
    # NOTE(review): lemmatize() is called without a POS argument, so NLTK's
    # default part of speech applies to verbs as well; inflected verbs such as
    # "managed" may therefore not reduce to "manage". Confirm whether
    # verb-aware lemmatization is wanted before changing the scoring.
    lemmatized_text = " ".join(
        lemmatizer.lemmatize(word.lower())
        for word, tag in pos_tag(text.split())
        if tag.startswith(('V', 'N'))
    )

    trait_scores = {}
    for trait, trait_data in character_traits.items():
        keywords = trait_data['keywords']
        weight = trait_data['weight']

        # Lookarounds instead of \b: \b needs a word character on one side,
        # so it can never match after a keyword ending in punctuation ("C++").
        # For ordinary keywords the lookarounds behave exactly like \b.
        trait_count = sum(
            1
            for keyword in keywords
            if re.search(
                r'(?<!\w)' + re.escape(lemmatizer.lemmatize(keyword)) + r'(?!\w)',
                lemmatized_text,
                re.IGNORECASE,
            )
        )

        # Each keyword counts at most once; the weighted count is capped at 3.
        trait_scores[trait] = min(trait_count * weight, 3)

    job_description = extract_job_description(text)

    return trait_scores, job_description

def process_directory(directory, character_traits):
    """Walk ``directory`` and score every PDF resume that has a usable title.

    Args:
        directory: Root folder to walk recursively.
        character_traits: Trait definitions passed through to ``process_resume``.

    Returns:
        ``(all_trait_scores, all_job_descriptions)``. Both are keyed by
        ``(folder_name, job_title)``, where ``folder_name`` is the name of the
        folder containing the PDF. ``all_trait_scores`` is a
        ``defaultdict(list)`` holding one trait-score dict per resume;
        ``all_job_descriptions`` keeps the description of the last resume
        processed for each key.
    """
    all_trait_scores = defaultdict(list)
    all_job_descriptions = {}

    invalid_titles = ['owner', 'business owner', 'founder', 'co-founder', 'entrepreneur']

    def is_valid_title(title):
        """Return False when the title contains any excluded term (case-insensitive)."""
        return not any(keyword in title.lower() for keyword in invalid_titles)

    def extract_job_title(file_path):
        """Return the leading text of the first non-empty line on page 1, or None.

        The title is the text up to the first comma, opening parenthesis or
        end of line. None is returned when the PDF has no pages, no line
        matches, or the title is excluded by ``is_valid_title``.
        """
        with open(file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            if len(reader.pages) == 0:
                # An empty PDF has no first page to read a title from.
                return None
            # Guard against a page that yields no text at all.
            text = reader.pages[0].extract_text() or ""

        title_match = re.search(r'^(.+?)\s*(?:,|\(|$)', text, re.MULTILINE)
        if not title_match:
            return None
        title = title_match.group(1).strip()
        return title if is_valid_title(title) else None

    for root, dirs, files in os.walk(directory):
        # Sort in place so traversal (and therefore which description "wins"
        # for a repeated key) does not depend on filesystem ordering.
        dirs.sort()
        for filename in sorted(files):
            # Accept ".PDF" as well as ".pdf".
            if not filename.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(root, filename)
            job_title = extract_job_title(file_path)
            if job_title:
                trait_scores, job_description = process_resume(file_path, character_traits)
                folder_name = os.path.basename(root)
                all_trait_scores[(folder_name, job_title)].append(trait_scores)
                all_job_descriptions[(folder_name, job_title)] = job_description

    return all_trait_scores, all_job_descriptions

def calculate_average_scores(all_trait_scores, character_traits):
    """Average the per-resume trait scores of every (folder, title) group.

    Args:
        all_trait_scores: Mapping of ``(folder_name, job_title)`` to a list of
            per-resume score dicts.
        character_traits: Mapping whose keys are the trait names to report.

    Returns:
        Dict keyed by ``(folder_name, job_title)``; each value maps every
        trait to its mean score rounded half-up to an int. A trait missing
        from a resume's dict counts as 0 for that resume.
    """
    def rounded_mean(values):
        # Adding 0.5 before truncating rounds halves upward.
        return int(sum(values) / len(values) + 0.5)

    return {
        (domain, title): {
            trait_name: rounded_mean([entry.get(trait_name, 0) for entry in resume_scores])
            for trait_name in character_traits.keys()
        }
        for (domain, title), resume_scores in all_trait_scores.items()
    }

In [None]:
# Trait catalogue used to score resumes.
# Each key is a trait name; each value holds:
#   "keywords": words/phrases searched for in the resume text
#   "weight":   multiplier applied to the number of matched keywords
#               (process_resume caps the product at 3).
# Some keywords are listed under more than one trait (e.g. "manage" in
# Leadership and Time Management; "meticulous", "thorough" and "diligent" in
# Work Ethic and Attention to Detail), so one match can raise several scores.
character_traits = {
    "Leadership": {
        "keywords": ["lead", "manage", "supervise", "mentor", "guide", "inspire", "influence", "direct", "oversee", "coach", "motivate", "delegate"],
        "weight": 2
    },
    "Communication": {
        "keywords": ["communicate", "present", "write", "speak", "listen", "articulate", "convey", "interact", "correspond", "negotiate", "persuade", "clarify"],
        "weight": 1.5
    },
    "Teamwork": {
        "keywords": ["collaborate", "cooperate", "team player", "support", "coordinate", "contribute", "assist", "participate", "involve", "engage", "share", "partner"],
        "weight": 1.8
    },
    "Problem Solving": {
        "keywords": ["solve", "analyze", "critical thinking", "strategic", "resourceful", "troubleshoot", "innovate", "create", "develop", "design", "implement", "optimize"],
        "weight": 1.6
    },
    "Creativity": {
        "keywords": ["innovative", "creative", "original", "imaginative", "visionary", "inventive", "artistic", "expressive", "conceptual", "think outside the box", "brainstorm", "ideate"],
        "weight": 1.2
    },
    "Adaptability": {
        "keywords": ["adapt", "flexible", "versatile", "agile", "adjust", "resilient", "change", "modify", "alter", "vary", "transform", "evolve"],
        "weight": 1.4
    },
    "Work Ethic": {
        "keywords": ["dedicated", "hardworking", "committed", "reliable", "responsible", "diligent", "persistent", "tenacious", "conscientious", "meticulous", "thorough", "dependable"],
        "weight": 1.7
    },
    "Time Management": {
        "keywords": ["organize", "prioritize", "multitask", "efficient", "productive", "punctual", "schedule", "plan", "execute", "manage", "balance", "allocate"],
        "weight": 1.3
    },
    "Interpersonal Skills": {
        "keywords": ["empathy", "relationship building", "customer service", "conflict resolution", "negotiation", "diplomacy", "tact", "patience", "understanding", "compassionate", "friendly", "approachable"],
        "weight": 1.6
    },
    "Attention to Detail": {
        "keywords": ["meticulous", "thorough", "accurate", "precise", "attentive", "observant", "focused", "careful", "diligent", "scrupulous", "fastidious", "methodical"],
        "weight": 1.5
    },
    "Initiative": {
        "keywords": ["proactive", "self-starter", "motivated", "ambitious", "driven", "entrepreneurial", "eager", "energetic", "enthusiastic", "passionate", "determined", "persistent"],
        "weight": 1.4
    },
    "Analytical Thinking": {
        "keywords": ["logical", "analytical", "data-driven", "quantitative", "research", "evaluate", "assess", "examine", "investigate", "interpret", "deduce", "infer"],
        "weight": 1.3
    },
    "Emotional Intelligence": {
        "keywords": ["self-aware", "empathetic", "socially aware", "relationship management", "self-regulation", "motivation", "intuitive", "perceptive", "insightful", "reflective", "considerate", "thoughtful"],
        "weight": 1.7
    },
    "Integrity": {
        "keywords": ["honest", "ethical", "trustworthy", "transparent", "accountable", "principled", "moral", "sincere", "genuine", "reliable", "dependable", "consistent"],
        "weight": 1.9
    },
    "Resilience": {
        "keywords": ["resilient", "perseverance", "grit", "determination", "tenacity", "endurance", "stamina", "robust", "durable", "tough", "strong", "adaptable"],
        "weight": 1.6
    },
    "Cultural Awareness": {
        "keywords": ["diversity", "inclusive", "multicultural", "global", "international", "cross-cultural", "cultural sensitivity", "cultural competence", "cultural intelligence", "cultural fluency", "cultural agility", "cultural adaptability"],
        "weight": 1.4
    },
    "Programming Languages": {
        "keywords": ["Python", "Java", "C++", "JavaScript", "SQL"],
        "weight": 1.8
    },
    "Technical Skills": {
        "keywords": ["machine learning", "data analysis", "web development", "software engineering"],
        "weight": 1.6
    },
    "Office Tools": {
        "keywords": ["Microsoft Office", "Excel", "PowerPoint", "Word"],
        "weight": 1.2
    }
}

In [4]:
!pip install nltk

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')

directory = "/content/drive/My Drive/data"
all_trait_scores, all_job_descriptions = process_directory(directory, character_traits)
average_scores = calculate_average_scores(all_trait_scores, character_traits)



[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


NameError: name 'process_directory' is not defined

In [None]:
# One row per (folder, title) key, one integer column per trait.
df = (
    pd.DataFrame.from_dict(average_scores, orient='index')
    .fillna(0)
    .astype(int)
)
# Look up each row's description by its (folder, title) index key.
df['Job Description'] = df.index.map(all_job_descriptions.get)
# Name the two index levels, then turn them into ordinary columns.
df.index = df.index.set_names(['Domain', 'Job Title'])
df = df.reset_index()

In [None]:
# Preview up to the first 20 rows of the assembled trait table.
df.head(20)

Unnamed: 0,Domain,Job Title,Leadership,Communication,Teamwork,Problem Solving,Creativity,Adaptability,Work Ethic,Time Management,...,Initiative,Analytical Thinking,Emotional Intelligence,Integrity,Resilience,Cultural Awareness,Programming Languages,Technical Skills,Office Tools,Job Description
0,DIGITAL-MEDIA,DIRECTOR,2,0,3,3,0,1,1,1,...,0,0,0,0,0,0,0,1,0,Delivered marketing programs and sales tools e...
1,DIGITAL-MEDIA,MEDIA ACTIVITIES SPECIALIST,2,0,0,3,0,0,2,3,...,0,1,0,0,2,1,0,0,0,Summary Multi-Tasking Media Relations Results-...
2,DIGITAL-MEDIA,SENIOR MANAGER,3,0,0,3,0,0,2,3,...,0,0,0,0,0,1,0,0,0,", State Created Customer Lifecycle Program gen..."
3,DIGITAL-MEDIA,DIGITAL PROJECT MANAGER,3,0,1,3,1,1,1,3,...,0,2,0,0,1,1,0,1,0,Created metrics to evaluate: Who is visiting 6...
4,DIGITAL-MEDIA,MONITOR TECH,2,3,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,2,Ability to perform duties under critical deadl...
5,DIGITAL-MEDIA,COMMUNICATIONS SPECIALIST,2,0,2,2,1,0,0,3,...,0,3,0,0,0,1,0,0,3,"Skills Adobe Creative Suite 6, Microsoft Offic..."
6,DIGITAL-MEDIA,SOCIAL MEDIA MARKETING MANAGER,2,0,0,3,0,0,0,1,...,3,3,0,0,0,1,0,0,0,Summary Recent graduate with excellent researc...
7,DIGITAL-MEDIA,EDITORIAL ASSISTANT,0,0,2,0,0,0,2,0,...,0,0,0,0,2,1,0,0,0,Summary Focused Journalist successful in devel...
8,DIGITAL-MEDIA,SENIOR DIRECTOR,0,2,3,3,0,1,0,3,...,0,3,0,0,2,1,0,0,0,Crafted media strategy track for the company's...
9,DIGITAL-MEDIA,DIGITAL MARKETING ASSOCIATE,2,0,3,3,0,0,2,1,...,0,0,0,2,0,0,0,0,0,Summary Dynamic and highly enthusiastic indivi...


In [None]:
# Persist the trait table to Google Drive as CSV.
# The import/mount repeats the earlier mount cell; the recorded output
# ("Drive already mounted ...") shows it has no effect once Drive is mounted.
from google.colab import drive
drive.mount('/content/drive')

output_path = '/content/drive/My Drive/data/OFI.csv'
# index=False: do not write the DataFrame's row index as an extra column.
df.to_csv(output_path, index=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Features: every trait-score column (everything except the label/text columns).
X = df.drop(['Job Title', 'Domain', 'Job Description'], axis=1)
y_title = df['Job Title']
y_domain = df['Domain']

# One-hot encode both labels. An explicit float dtype keeps the targets numeric
# for Keras; newer pandas versions return boolean columns by default.
y_title_encoded = pd.get_dummies(y_title, dtype='float32')
y_domain_encoded = pd.get_dummies(y_domain, dtype='float32')
# Target layout: title columns first, then domain columns.
y = pd.concat([y_title_encoded, y_domain_encoded], axis=1)

# Split BEFORE scaling so the scaler's mean/variance are learned from the
# training rows only (fitting on all rows leaks test-set statistics).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept for backward compatibility: the full feature matrix in scaled form.
X_scaled = scaler.transform(X)

In [None]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialization, dropout
# and batch shuffling are repeatable between runs (same seed as the data split).
from tensorflow.keras.utils import set_random_seed
set_random_seed(42)

# Small fully connected network: trait scores in, one output per target column.
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
# NOTE(review): y_train concatenates two one-hot encodings (job title + domain),
# so each target row contains two 1s, while softmax + categorical_crossentropy
# model a single-label distribution. Confirm this is intended; a multi-label
# setup (sigmoid + binary_crossentropy) or two output heads may fit better.
model.add(Dense(y_train.shape[1], activation='softmax'))

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# 10% of the training rows are held out by Keras for per-epoch validation.
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.1)

test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

# Saved relative to the current working directory.
model.save('job_recommendation_model.h5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

  saving_api.save_model(


In [None]:
# Ask the user for a 0-3 self-rating on every feature column, in column order.
print("Please rate your character traits on a scale of 0 to 3:")
user_input = {}
num_questions = len(X.columns)
for i, trait in enumerate(X.columns, start=1):
    while True:
        try:
            rating = int(input(f"{trait}: "))
            if 0 <= rating <= 3:
                user_input[trait] = rating
                break
            else:
                print("Invalid input. Please enter a value between 0 and 3.")
        except ValueError:
            print("Invalid input. Please enter a valid integer.")
    print(f"{num_questions - i} questions remaining.\n")

# Scale the answers with the scaler fitted on the training features.
user_df = pd.DataFrame([user_input])
user_scaled = scaler.transform(user_df)

predictions = model.predict(user_scaled)
num_predictions = 5

# The model output is laid out as [title columns | domain columns]; take the
# single prediction row and split it into the two score vectors.
num_titles = len(y_title_encoded.columns)
title_scores = predictions[0, :num_titles]
domain_scores = predictions[0, num_titles:]

# argsort is ascending: reverse it, then keep the first k entries to get the
# k highest scores. (Slicing the 2-D (1, n) array selected rows instead of
# columns, so the loop below used to print the lowest-scoring titles.)
predicted_titles = y_title_encoded.columns[title_scores.argsort()[::-1][:num_predictions]]
predicted_domains = y_domain_encoded.columns[domain_scores.argsort()[::-1][:num_predictions]]
domain_score_by_name = dict(zip(y_domain_encoded.columns, domain_scores))

print("\nTop Predicted Jobs:")
for i, job_title in enumerate(predicted_titles):
    # A title may exist under several domains; report the one the model scores
    # highest. Pairing the i-th title with the i-th domain could pick a
    # combination absent from df (IndexError) or run out of domains.
    candidates = df[df['Job Title'] == job_title]
    best_row = candidates.loc[candidates['Domain'].map(domain_score_by_name).idxmax()]
    domain = best_row['Domain']
    job_description = best_row['Job Description']

    print(f"Predicted Job {i+1}: {job_title}")
    print(f"Predicted Domain {i+1}: {domain}")
    print(f"Job Description {i+1}: {job_description}\n")

Please rate your character traits on a scale of 0 to 3:
Leadership: 1
18 questions remaining.

Communication: 2
17 questions remaining.

Teamwork: 3
16 questions remaining.

Problem Solving: 2
15 questions remaining.

Creativity: 1
14 questions remaining.

Adaptability: 2
13 questions remaining.

Work Ethic: 3
12 questions remaining.

Time Management: 3
11 questions remaining.

Interpersonal Skills: 1
10 questions remaining.

Attention to Detail: 2
9 questions remaining.

Initiative: 3
8 questions remaining.

Analytical Thinking: 2
7 questions remaining.

Emotional Intelligence: 0
6 questions remaining.

Integrity: 0
5 questions remaining.

Resilience: 0
