In [None]:
import os
import joblib
import pandas as pd
import numpy as np
from tqdm import tqdm
import spacy, nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fetch the NLTK resources this notebook relies on (tokenizer models and the
# stop-word lists), in the same order as before.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

# spaCy pipeline used for named-entity extraction further down.
nlp = spacy.load("en_core_web_sm")

# English stop words as a set, so membership checks during token filtering are cheap.
english_stop_word_list = stopwords.words("english")
stop_words = set(english_stop_word_list)

In [None]:
# Load the postings and discard columns that are not used in this notebook.
df = pd.read_csv('./datasets/fake_job_postings.csv').drop(['job_id', 'salary_range', 'location'], axis=1)

# Fraction of missing values per column, measured once on the freshly loaded
# frame (i.e. before any rows are dropped below).
null_fractions = df.isna().mean()

for col, pct_null in null_fractions.items():
    if pct_null < .05:
        # Rarely-missing column: drop the few affected rows.
        df = df.dropna(subset=[col])
    else:
        # Frequently-missing column: keep the rows and use a placeholder.
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the in-place call on a column selection is chained assignment, which
        # warns on pandas 2.x and leaves `df` unchanged under Copy-on-Write.
        df[col] = df[col].fillna('not provided')

In [None]:
# Preview the first rows of the cleaned frame (bare expression so the notebook renders it).
df.head()

In [None]:
# Layout used to merge the structured fields and the free-text sections of a
# posting into a single document; placeholders are filled by str.format.
FULL_TEXT_TEMPLATE = """
# {title}

Employment Type: {employment_type}
Department: {department}
Industry: {industry}
Function: {function}
Required Experience: {required_experience}
Required Education: {required_education}

# Company Profile
{company_profile}

# Description
{description}

# Requirements
{requirements}

# Benefits
{benefits}
"""

def get_full_text_feature(row):
    """Render one posting as a single text document.

    Args:
        row: Object supporting ``row[name]`` lookups (for example a DataFrame
            row or a dict) for every field referenced by FULL_TEXT_TEMPLATE.

    Returns:
        str: FULL_TEXT_TEMPLATE with each placeholder replaced by the
        corresponding value from ``row``.
    """
    field_names = (
        "title", "employment_type", "department",
        "industry", "function", "required_experience",
        "required_education", "company_profile",
        "description", "requirements", "benefits",
    )
    values = {name: row[name] for name in field_names}
    return FULL_TEXT_TEMPLATE.format(**values)

In [None]:
# Build one combined text document per posting (row-wise apply).
df['full_text_feature'] = df.apply(get_full_text_feature, axis=1)

In [None]:
# Pull the assembled documents out of the frame as an array; the cells below
# all work on this array.
text_features = df['full_text_feature'].values
# print (rather than a bare expression) so the template's line breaks are rendered.
print(text_features[0])

In [None]:
# Named-entity extraction with spaCy on the first 10 documents only.
# Each record keeps the source text next to its (text, label) entity pairs.
text_entities = []

for feat in tqdm(text_features[:10]):
    doc = nlp(feat)
    text_entities.append(dict(
        text=feat,
        entities=[dict(text=ent.text, label=ent.label_) for ent in doc.ents],
    ))

In [None]:
# TF-IDF over unigrams, bigrams and trigrams with scikit-learn's built-in
# English stop-word list; fitted on every document.
tfidf = TfidfVectorizer(ngram_range=(1,3), stop_words='english')
X_tfidf = tfidf.fit_transform(text_features)

In [None]:
# Display the repr of the fitted TF-IDF matrix.
X_tfidf

In [None]:
# Factorise the TF-IDF matrix into 5 topics with NMF.
# random_state is pinned (to the same seed the LdaModel call in this notebook
# uses) so the factorisation, and therefore the topics printed below, are
# reproducible across kernel restarts.
nmf = NMF(n_components=5, random_state=42)
# Per-document topic weights.
doc_topics = nmf.fit_transform(X_tfidf)
# Per-topic term weights.
topic_words = nmf.components_

In [None]:
# Vocabulary terms of the fitted TF-IDF vectorizer; used to turn NMF term
# indices back into readable words.
feature_names = tfidf.get_feature_names_out()

In [None]:
# Print the five highest-weighted terms of every NMF topic (topics numbered from 1).
for idx, topic in enumerate(topic_words, start=1):
    # argsort is ascending, so reverse it and keep the leading five indices.
    ranked = topic.argsort()[::-1][:5]
    label = ' | '.join(feature_names[i] for i in ranked)
    print(f"Topic #{idx}: {label}")

In [None]:
# Index of the highest-weighted NMF topic for every document.
doc_top_topics = np.argmax(doc_topics, axis=1)

# Report the dominant topic of the first three documents.
# The previous version computed doc_top_topics but printed the whole weight
# vector doc_topics[i] under the "Topic" label; print the argmax instead.
# +1 so the number matches the 1-based "Topic #n" listing of the top words.
for i, top_topic in enumerate(doc_top_topics[:3]):
    print(f"\nDocument {i} (Topic {top_topic + 1})")

In [None]:
# One-off generation of the cached sentence embeddings, kept for reference.
# NOTE(review): SentenceTransformer is not among the imports visible in this
# notebook, so these lines need that import before they can be re-enabled.
# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# X_bert = embedding_model.encode(text_features, normalize_embeddings=True)
# joblib.dump(X_bert, './datasets/X_bert.joblib')

# Load the cached embeddings instead of recomputing them.
# NOTE(review): joblib.load unpickles the file, so only load a cache you produced yourself.
# NOTE(review): the cache is presumably row-aligned with text_features; confirm
# it was built after the same cleaning steps.
X_bert = joblib.load('./datasets/X_bert.joblib')

In [None]:
# Fit BERTopic with its default settings on the first 10 documents only.
# NOTE(review): the cached X_bert embeddings are not passed in here, so BERTopic
# presumably computes its own embeddings; confirm whether that is intended.
# NOTE(review): no seed is configured, so topic assignments may differ between runs.
bert_topic_model = BERTopic()
topics, probs = bert_topic_model.fit_transform(text_features[:10])

In [None]:
# Topic assigned to document 0 and the matching entry of the probability output.
topics[0], probs[0]

In [None]:
# Look up the topic that document 0 was assigned to.
bert_topic_model.get_topic(topics[0])

In [None]:
# Overview of the topics found by the fitted BERTopic model.
bert_topic_model.get_topic_info()

In [None]:
# Lower-case and tokenize every document with NLTK, keeping only purely
# alphabetic tokens that are not English stop words.
tokenized_sentences = [
    [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]
    for text in text_features
]
# Initialised here but not filled by this cell.
vectorized_sentences = []

In [None]:
# Map every distinct token to an integer id, then encode each tokenized
# document as a gensim bag-of-words.
dictionary = corpora.Dictionary(tokenized_sentences)
corpus = list(map(dictionary.doc2bow, tokenized_sentences))

In [None]:
# Inspect the bag-of-words encoding of the first document.
print(corpus[0])

In [None]:
# Train a 2-topic gensim LDA model on the bag-of-words corpus; random_state is
# fixed so repeated runs start from the same seed.
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, random_state=42)
# Bare last expression so the notebook displays the learned topics.
lda.print_topics()

In [None]:
# Topic mixture the LDA model infers for the first document.
lda[corpus[0]]

In [None]:
# Fit a Keras word-index tokenizer on the whole corpus, then encode every
# document as an integer sequence brought to a fixed length of 1000.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_features)
X_tok = pad_sequences(tokenizer.texts_to_sequences(text_features), maxlen=1000)