In [None]:
import os
import pandas as pd
import numpy as np

import spacy
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

import matplotlib.pyplot as plt
import umap
import seaborn as sns
%matplotlib inline

In [None]:
# Load the job postings. The relative path is resolved against the kernel's
# current working directory; later cells read the `description` column.
df = pd.read_csv(filepath_or_buffer='all_jobs.csv')

In [None]:
# Turn every job description into a spaCy document vector and project the
# vectors to 2-D with UMAP.
# The tagger, parser and NER components are disabled; only `.vector` of each
# processed document is read below.
nlp = spacy.load('en_core_web_lg', disable=["tagger", "parser", "ner"])
nlp.max_length = 5000000  # raise the pipeline's per-text length limit
raw_desc_texts = list(df.description.values)
# nlp.pipe streams the texts through the pipeline in batches instead of
# issuing one nlp(...) call per description.
processed_desc_texts = list(nlp.pipe(raw_desc_texts))
processed_desc_vectors = np.array([text.vector for text in processed_desc_texts])

# UMAP is stochastic: a fixed seed keeps the layout identical across
# "Restart Kernel -> Run All".
embedding = umap.UMAP(random_state=7).fit_transform(processed_desc_vectors)

# Explicit figure/axes with a title and labels so the plot stands on its own.
fig, ax = plt.subplots()
ax.scatter(embedding[:, 0], embedding[:, 1], s=0.1)
ax.set(title="UMAP projection of job description vectors",
       xlabel="UMAP 1", ylabel="UMAP 2");

In [None]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

# Start from NLTK's English stop-word list and append corpus-specific filler
# words plus a handful of short tokens that look like HTML tag names
# ('li', 'br', 'span', 'strong', 'ul'). `stop` stays a plain list.
stop = stopwords.words('english')
stop += [
    'degree', 'experience', 'work', 'field', 'related', 'from', 'subject', 're', 'edu',
    'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do',
    'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy',
    'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right',
    'line', 'even', 'also', 'may', 'take', 'come', 'li', 'br', 'datum', 'use', 'span',
    'strong', 'tool', 'ul',
]

# Shared lemmatizer instance used by clean().
lemma = WordNetLemmatizer()
# Set of ASCII punctuation characters.
exclude = set(string.punctuation)
def clean(doc):
    """Lower-case a document, drop stop words and lemmatize what remains.

    Parameters
    ----------
    doc : str
        Raw document text. Anything that is not a string (for example the
        None/NaN placeholder of a missing value) is treated as an empty
        document instead of raising AttributeError on ``.lower()``.

    Returns
    -------
    str
        The surviving tokens, each passed through the module-level ``lemma``
        lemmatizer, joined by single spaces. Tokens are produced by
        whitespace splitting only; punctuation is left untouched.
    """
    if not isinstance(doc, str):
        return ""
    # `stop` is a list; a set makes each membership test constant time.
    stop_words = set(stop)
    # One pass: filter and lemmatize without the intermediate join/split.
    return " ".join(
        lemma.lemmatize(token)
        for token in doc.lower().split()
        if token not in stop_words
    )

# Raw description strings, one per row of `df`.
doc_complete = df["description"].tolist()
# Each description cleaned by clean() and split back into a list of tokens.
data_words = [cleaned.split() for cleaned in map(clean, doc_complete)]

In [None]:
import gensim
from gensim.utils import lemmatize, simple_preprocess
from gensim import corpora

# Phrase detection: the bigram model is fitted on the tokenised documents and
# the trigram model on the bigram-transformed stream.
# A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Wrap both fitted models in Phraser objects, which process_words() applies.
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(phrase_model) for phrase_model in (bigram, trigram)
)

def process_words(texts, stop_words=stop, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'], nlp=None):
    """Remove stop words, form bigrams/trigrams and lemmatize.

    Parameters
    ----------
    texts : iterable
        Documents to process. Each document is passed through ``str()`` and
        ``simple_preprocess`` so both raw strings and token lists are accepted.
    stop_words : iterable of str
        Words removed both before phrase detection and again after
        lemmatization. Defaults to the module-level ``stop`` list.
    allowed_postags : iterable of str
        Only tokens whose spaCy ``pos_`` is in this collection are kept.
    nlp : spaCy pipeline, optional
        An already loaded pipeline to reuse. When omitted, the ``'en'`` model
        is loaded with the parser and NER disabled, as before.

    Returns
    -------
    list of list of str
        One list of lemmas per input document.
    """
    # Build lookup sets once; the defaults are lists and are never mutated.
    stop_set = set(stop_words)
    allowed = set(allowed_postags)

    def _tokenize(doc):
        # Tokenize with simple_preprocess and drop stop words.
        return [word for word in simple_preprocess(str(doc)) if word not in stop_set]

    # trigram_mod was fitted on bigram-transformed text, so a single
    # bigram -> trigram pass is sufficient; the original ran bigram_mod twice.
    phrased = [trigram_mod[bigram_mod[_tokenize(doc)]] for doc in texts]

    if nlp is None:
        nlp = spacy.load('en', disable=['parser', 'ner'])

    texts_out = []
    # nlp.pipe processes the documents in batches rather than one call each.
    for parsed in nlp.pipe(" ".join(tokens) for tokens in phrased):
        lemmas = [token.lemma_ for token in parsed if token.pos_ in allowed]
        # Remove stop words once more after lemmatization.
        texts_out.append(_tokenize(lemmas))
    return texts_out

# Final token lists: one list of lemmas per document.
data_ready = process_words(texts=data_words)

In [None]:
# Term dictionary of the corpus: every unique term is assigned an integer id.
id2word = corpora.Dictionary(data_ready)

# Bag-of-words form of each document (the document-term matrix), built with
# the dictionary above.
corpus = list(map(id2word.doc2bow, data_ready))

# Alias for the gensim LDA model class.
Lda = gensim.models.ldamodel.LdaModel

# Train the LDA model on the document-term matrix.
ldamodel = Lda(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    random_state=7,
    alpha='symmetric',
    iterations=100,
    per_word_topics=True,
)

print(ldamodel.print_topics())

In [None]:
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook

# Dense (n_documents x n_topics) matrix of per-document topic weights.
# Each row of ldamodel[corpus] is indexed with [0] to get the list of
# (topic_id, weight) pairs. gensim can leave low-probability topics out of
# that list, so every weight is written into the column of its topic id
# rather than its position in the list; otherwise columns would be
# misaligned and argmax would pick the wrong dominant topic.
n_topics = ldamodel.num_topics
arr = np.zeros((len(corpus), n_topics))
for doc_idx, row_list in enumerate(ldamodel[corpus]):
    for topic_id, weight in row_list[0]:
        arr[doc_idx, topic_id] = weight

# Kept as a list of rows for any later cell that reads `topic_weights`.
topic_weights = arr.tolist()

# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# UMAP is stochastic: a fixed seed keeps the layout reproducible across runs.
umap_lda = umap.UMAP(random_state=7).fit_transform(arr)

import seaborn as sns
import matplotlib.colors as mcolors
cols = list(mcolors.TABLEAU_COLORS.values())

# Plot the Topic Clusters using Bokeh
output_notebook()
mycolors = np.array(cols)
plot = figure(title="UMAP Clustering of {} LDA Topics".format(n_topics),
              plot_width=700, plot_height=500)
# The modulo keeps the colour lookup in range if there are more topics than
# palette entries.
plot.scatter(x=umap_lda[:, 0], y=umap_lda[:, 1],
             color=mycolors[topic_num % len(mycolors)])
show(plot)

In [None]:
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

cols = list(mcolors.TABLEAU_COLORS.values())  # more colors: 'mcolors.XKCD_COLORS'


def _fixed_color_func(color):
    """Return a WordCloud color_func that paints every word in `color`."""
    def color_func(*args, **kwargs):
        return color
    return color_func


# (topic_id, [(word, weight), ...]) entries, unformatted.
topics = ldamodel.show_topics(formatted=False)

fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True)

for topic_idx, ax in enumerate(axes.flatten()):
    ax.axis('off')
    if topic_idx >= len(topics):
        # Fewer topics than panels: leave the spare panel blank instead of
        # raising IndexError.
        continue

    # The colour is bound when the cloud is built, so it no longer depends on
    # the value of a global loop variable at draw time.
    cloud = WordCloud(stopwords=stop,
                      background_color='white',
                      width=2500,
                      height=1800,
                      max_words=10,
                      colormap='tab10',
                      color_func=_fixed_color_func(cols[topic_idx % len(cols)]),
                      prefer_horizontal=1.0)

    topic_words = dict(topics[topic_idx][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)

    # Draw on the explicit axes rather than through plt.gca().
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(topic_idx), fontdict=dict(size=16))
    ax.margins(x=0, y=0)


fig.subplots_adjust(wspace=0, hspace=0)
fig.tight_layout()
plt.show()