In [275]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [276]:
events = pd.read_csv("./filestore/events/fact_events.csv").drop('Unnamed: 0', axis=1)

In [277]:
events.head()

Unnamed: 0,description,duration,headcount,event_id,maybe_rsvp_count,name,rating,rsvp_limit,status,time,updated,utc_offset,venue_id,visibility,waitlist_count,yes_rsvp_count
0,These meetups are very informal. I won't be st...,9000000.0,12,147478282,0,PyLadies Dublin Inaugural meetup - bring laptop!,,,past,1384799400000,1384853013000,0,16176442,public,0,22
1,"Our second meetup will be at Engine Yard, a bi...",,0,152107272,0,Second PyLadies Dublin Meetup - Let's get coding!,,,past,1387218600000,1387230236000,0,13054852,public,0,12
2,Happy New Year! Hope you all had a good Christ...,10800000.0,0,159368332,0,Our first PyLadies Dublin meetup of 2014,,,past,1390240800000,1390470097000,0,17757332,public,0,11
3,Bring your laptops along. If you want some foo...,10800000.0,0,162851382,0,PyLadies Dublin Feb meetup,,,past,1392660000000,1392672314000,0,18096492,public,0,9
4,!!!CHANGE OF VENUE UPDATE!!! &gt;&gt; More inf...,10800000.0,0,166955082,0,PyLadies Dublin Meetup,,,past,1395165600000,1395219566000,0,18950322,public,0,11


In [278]:
events.dtypes

description          object
duration            float64
headcount             int64
event_id             object
maybe_rsvp_count      int64
name                 object
rating              float64
rsvp_limit          float64
status               object
time                  int64
updated               int64
utc_offset            int64
venue_id              int64
visibility           object
waitlist_count        int64
yes_rsvp_count        int64
dtype: object

# Let's look at the event descriptions

In [279]:
desc = events['description'].tolist()

In [280]:
desc[47]



## Cleaning up the description

### Unicode, URLS, smiley faces

In [281]:
special_dict = {
    'smile' : r'[:;=]-[)D]?',
    'uni' : r'\xa0',
    'url' : r'(?:https?|ftp|file)://\S+',
    'uls_chars' :  r'(?:&[gla][tm]p?)+',
    'dupe_space' : r'\s{2,}|\s+\Z'
}

def remove_special(s):
    for k, regex in special_dict.items():
        if k == 'uni':
            s = re.sub(regex, ' ', s)
        elif k == 'dupe_space':
            s = re.sub(regex, ' ', s)
        else:
            s = re.sub(regex, '', s)
    return s

clean_special = [remove_special(s) for s in desc]

In [282]:
clean_special[47]



### Punctuation and emojis
There are some emoji characters and unwanted punctuation

In [283]:
def find_unwanted_chars(s):
#     pattern = r"[^a-zA-Z0-9\s.\-/':!?&@€$_+Éáéóć%]"
    pattern = r"[^a-zA-Z0-9\s/@€$_+Éáéóć%]"
    return set(re.findall(pattern, s))

unwanted = set(char for e in desc for char in find_unwanted_chars(e))
clean_punct = []

for sent in clean_special:
    for punct in unwanted:
        sent = sent.replace(punct, "")
    clean_punct.append(sent)

In [284]:
clean_punct[47]

'Workday will be hosting us for our July meetup Food and refreshments will also be provided We have two speakers from Workday  Amanda Galligan  Principal Network Engineer will do a talk on Ansible A network engineers best friend  Alan Kennedy  Principal Software Development Engineer Infra Services will also do a talk on Writing network services using python coroutines  Naomi OReilly  QA Engineer Grid Cloud Master will be conducting a talk on BDD in Python  An introduction to behaviour driven development in Python with a focus on automated acceptance testing Remember to bring your laptop You will have a chance to deep dive with speakers pair programme on a tutorial a project could even be your own ask a question dont be shy we are here to help If you have announcements events projects questions feel free to add them to  ROUGH RUNNING ORDER 1830 Guests arrive  food  beverages1900 Welcome  Announcements by Vicky1905 Quick word from Workday representative1910 Lightning talk 11925 Lightning

# Removing words that have digits

Reason for this is that after tokenizing, I found that there are tokens which consist of digits (perhaps meetup start/end times)

In [285]:
def remove_digits(s):
    pattern = re.compile(r'\b(?:\d+\S+|\S+\d+)\b')
    return re.sub(pattern, '', s)

clean_digits = [remove_digits(s) for s in clean_punct]

In [286]:
clean_digits[47]

'Workday will be hosting us for our July meetup Food and refreshments will also be provided We have two speakers from Workday  Amanda Galligan  Principal Network Engineer will do a talk on Ansible A network engineers best friend  Alan Kennedy  Principal Software Development Engineer Infra Services will also do a talk on Writing network services using python coroutines  Naomi OReilly  QA Engineer Grid Cloud Master will be conducting a talk on BDD in Python  An introduction to behaviour driven development in Python with a focus on automated acceptance testing Remember to bring your laptop You will have a chance to deep dive with speakers pair programme on a tutorial a project could even be your own ask a question dont be shy we are here to help If you have announcements events projects questions feel free to add them to  ROUGH RUNNING ORDER  Guests arrive  food   Welcome  Announcements by  Quick word from Workday  Lightning talk  Lightning talk  Lightning talk   Deep dive with speakers s

# Cleaning duplicate spaces

In [287]:
regex = r'\s{2,}|\s+\Z'
event_corpus = [re.sub(regex, ' ', s) for s in clean_digits]

In [288]:
event_corpus[47]

'Workday will be hosting us for our July meetup Food and refreshments will also be provided We have two speakers from Workday Amanda Galligan Principal Network Engineer will do a talk on Ansible A network engineers best friend Alan Kennedy Principal Software Development Engineer Infra Services will also do a talk on Writing network services using python coroutines Naomi OReilly QA Engineer Grid Cloud Master will be conducting a talk on BDD in Python An introduction to behaviour driven development in Python with a focus on automated acceptance testing Remember to bring your laptop You will have a chance to deep dive with speakers pair programme on a tutorial a project could even be your own ask a question dont be shy we are here to help If you have announcements events projects questions feel free to add them to ROUGH RUNNING ORDER Guests arrive food Welcome Announcements by Quick word from Workday Lightning talk Lightning talk Lightning talk Deep dive with speakers selfdriven tutorials

# NLP to clean the text

In [289]:
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

nlp = spacy.load('en')

In [290]:
nlp_corpus = [nlp(sent) for sent in event_corpus]

In [291]:
nlp_corpus[47]

Workday will be hosting us for our July meetup Food and refreshments will also be provided We have two speakers from Workday Amanda Galligan Principal Network Engineer will do a talk on Ansible A network engineers best friend Alan Kennedy Principal Software Development Engineer Infra Services will also do a talk on Writing network services using python coroutines Naomi OReilly QA Engineer Grid Cloud Master will be conducting a talk on BDD in Python An introduction to behaviour driven development in Python with a focus on automated acceptance testing Remember to bring your laptop You will have a chance to deep dive with speakers pair programme on a tutorial a project could even be your own ask a question dont be shy we are here to help If you have announcements events projects questions feel free to add them to ROUGH RUNNING ORDER Guests arrive food Welcome Announcements by Quick word from Workday Lightning talk Lightning talk Lightning talk Deep dive with speakers selfdriven tutorials 

In [292]:
len(nlp_corpus)

75

In [293]:
entity_dict = {}
for entry in nlp_corpus:
    for entity in entry.ents:
        try:
            entity_dict[entity.label_].append(entity.text)
        except:
            entity_dict[entity.label_] = [entity.text]

In [294]:
entity_dict.keys()

dict_keys(['PERSON', 'ORG', 'TIME', 'ORDINAL', 'GPE', 'EVENT', 'DATE', 'NORP', 'PRODUCT', 'LOC', 'MONEY', 'CARDINAL', 'WORK_OF_ART', 'FAC', 'LAW', 'LANGUAGE'])

In [295]:
{p for p in entity_dict['PERSON']}

{'AGENDA',
 'AOBFeel',
 'Accenture Centre',
 'Aimi',
 'Aimi Forgan',
 'Alan Kennedy',
 'Allen Thomas Varghese Agnes',
 'Andrea Fagan',
 'Andrea Magnorsky',
 'Annie Lowney',
 'Announcements Talk',
 'Ariane',
 'Ariane Description',
 'Based',
 'CPython',
 'Carlos Amaral Daire Selfdriven',
 'Cheryl',
 'Chris Docherty',
 'Claire Hough',
 'Clavis Insight',
 'Cloud Native',
 'Cork Max',
 'Cormac McGuireDublin Scala Users',
 'Craft Night',
 'Deadline',
 'Deepali',
 'Deirdre Lee Talk Title',
 'Delighted',
 'Demonware',
 'Depending',
 'Description Build',
 'Description Deepali',
 'Details TBA',
 'Django',
 'Django Girls',
 'Django Girls Dublin',
 'Dogpatch Labs',
 'Down',
 'EMEA Comp',
 'Eamon',
 'Engineer',
 'Fa',
 'Feb',
 'Feb Please',
 'Feel',
 'Fionnuala Gibney',
 'Flexi Room',
 'Floor Turn',
 'Following Lias',
 'Fortune',
 'Frances Morgan',
 'Georges Dock',
 'Girls Dublin',
 'GitOps',
 'Grace Django',
 'Grid Cloud Master',
 'Groupon',
 'Guys',
 'Hacker Devalyst',
 'Heres',
 'Hey Flask',
 'H

In [296]:
lemmatized_corpus = [ ' '.join(word.lemma_.lower() if word.lemma_ != '-PRON-' else word.text for word in sent) for sent in nlp_corpus]

In [297]:
lemmatized_corpus[47]

'workday will be host us for our july meetup food and refreshment will also be provide We have two speaker from workday amanda galligan principal network engineer will do a talk on ansible a network engineer good friend alan kennedy principal software development engineer infra services will also do a talk on writing network service use python coroutine naomi oreilly qa engineer grid cloud master will be conduct a talk on bdd in python an introduction to behaviour drive development in python with a focus on automate acceptance testing remember to bring your laptop You will have a chance to deep dive with speaker pair programme on a tutorial a project could even be your own ask a question do not be shy we be here to help if you have announcement event project question feel free to add them to rough running order guest arrive food welcome announcements by quick word from workday lightning talk lightning talk lightning talk deep dive with speaker selfdriven tutorial own project pair progr

## Tokenizing and removing stop words

In [298]:
event_text = lemmatized_corpus

In [299]:
en_stopwords = stopwords.words('english')

In [320]:
extra_stopwords = [
    'python', 'speaker', 'speakers', 'dublin', 'ireland', 'pyladies',
    'talk', 'talks', 'irish', 'james', 'julie', 'leticia', 'charlie',
    'michael', 'marjai', 'atmasked', 'masked', 'isabella', 'annie',
    'lowney', 'daire', 'amaral', 'carlos', 'campbell', 'chris', 
    'docherty', 'louise', 'deepali', 'andrea', 'diarmuid', 'sorcha',
    'jonathan', 'eamon', 'shane', 'stella', 'mclennan', 'ingrid',
    'aimi', 'niamh', 'forgan', 'jans', 'sabine', 'vicky', 'ariane',
    'kats', 'bourke', 'georges', 'frances', 'heres', 'donnelly', 
    'lia', 'thanks', 'per', 'simon', 'maybe', 'else', 'joanna', 
    'guy', 'piotr', 'milian', 'mccluskey', 'irelands', 'twomey',
    'máté', 'serena', 'actually', 'mick', 'mansura', 'mihai', 
    'atmaske', 'mark', 'hiroki', 'claire', 'yvonne', 'franciscos',
    'mask', 'annie', 'lowney'
]
en_stopwords = en_stopwords + extra_stopwords

Vectorizer splits our documents into a distribution of words.
 
X is a term document matrix, where each document is a column and words are rows. The value associated to each cell is the TF-IDF

In [321]:
tfidf_vectorizer = TfidfVectorizer(stop_words = set(en_stopwords))
X_tfidf = tfidf_vectorizer.fit_transform(event_text)

The vectorizer we got above is used as input for LDA or NMF to build the model

# Exploring K-means Clustering

In [322]:
# from sklearn.cluster import KMeans
# from sklearn import metrics
# from scipy.spatial.distance import cdist 

Since we don't have an idea of how many topics can there be, let's use the silhouette score as a measure of how many clusters we should have.

In [323]:
# clusters = range(2,30)
# distortions = []
# silhouette_coeffs = []

# for k in clusters:
#     km = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10,verbose=0)
#     km.fit(X_tfidf)
    
#     distortions.append(km.inertia_)
#     silhouette_coeffs.append(metrics.silhouette_score(X_tfidf, km.labels_))

In [324]:
# plt.style.use('seaborn-darkgrid')
# fig, ax = plt.subplots(figsize=(8,6))

# ax.plot(clusters, distortions, marker='o', color='b')
# plt.show()

In [325]:
# plt.style.use('seaborn-darkgrid')
# fig, ax = plt.subplots(figsize=(8,6))

# ax.bar(x=clusters, height=silhouette_coeffs, color='g')
# plt.show()

It seems that the preprocessing done was not enough and the k-means algorithm is being too sensitive to the data. It could be worthwhile trying to extract the event descriptions manually as there are only 70 ish events...

Below a sample clustering for k=15, doesn't seem good.

In [326]:
# order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# terms = tfidf_vectorizer.get_feature_names()

# for i in range(15):
#     print("Cluster %d:" % i, end='')
#     for ind in order_centroids[i, :10]:
#         print(' %s' % terms[ind], end='')
#     print()

# Exploring LDA

In [327]:
count_vectorizer = CountVectorizer(stop_words = set(en_stopwords))
X_count = count_vectorizer.fit_transform(event_text)

In [328]:
count_df = pd.DataFrame(X_count.toarray(), columns= count_vectorizer.get_feature_names())

In [329]:
agg_counts = pd.DataFrame({
    'word' : count_vectorizer.get_feature_names(),
    'count' : count_df.T.apply(np.sum, axis=1)
}).reset_index(drop=True)

In [330]:
feats = count_vectorizer.get_feature_names()
digits = re.compile(r'^\d+')

print(f"There are {len(feats)} feature names.")
print(f"Of which {len([re.match(digits, f) for f in feats if re.match(digits, f)])} \
start with digits and might be noisy values")

There are 1972 feature names.
Of which 0 start with digits and might be noisy values


In [331]:
# agg_counts.sort_values('count', ascending=False).iloc[:50,:]

From looking at the counts above, the text quality doesn't look good. I will apply some manual cleansing to the description to get a more accurate representation of the Event descriptions

In [332]:
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [333]:
# Helper function
def print_topics(model, vectorizer, n_top_words):
    words = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

First, an attempt with just the CountVectorizer

In [334]:
# Tweak the two parameters below
number_topics = 15
number_words = 10
jobs = -1
max_iter = 25

alpha = None
eta = None

In [335]:
# Create and fit the LDA model
tfifd_lda = LDA(doc_topic_prior = alpha, topic_word_prior = eta, n_components=number_topics, n_jobs = jobs, max_iter=max_iter)
tfifd_lda.fit(X_tfidf)

# Print the topics found by the LDA model
print("Topics found via LDA with tfidf vectorizer:")
print_topics(tfifd_lda, tfidf_vectorizer, number_words)

Topics found via LDA with tfidf vectorizer:

Topic #0:
ics lala rsvp ict society parking demonware cannot collaborative long

Topic #1:
infuse whoop raspberry pi bench passata teazza tisane similar tearoom

Topic #2:
apply instal wifi structure log bitbuzz stick upstairs depend mean

Topic #3:
kdb october traffic yard engine aol github thing username pythonic

Topic #4:
kx contribute radio accessibility jupyter lot two recently defined kdb

Topic #5:
rte depend park lane staff meantime biccies etsy step via

Topic #6:
pytorch rest coder sure different organiser info right building liquor

Topic #7:
dbs career school tue graduate study basic since remind pleased

Topic #8:
cryptoparty anything pepper twisted aon tool makers maker come cheryl

Topic #9:
meetup event work project bring free please workshop laptop food

Topic #10:
women night metricfire group metric via flask hosted graphite order

Topic #11:
well pythonrelated mailing pythony discussion list qualtrics indeedcom thing octo

To do:
* Review LDA model to understand how to fine tune alpha and eta
* Vizualize results to see if they make sense
* NMF (?)

In [336]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [337]:
pyLDAvis.sklearn.prepare(tfifd_lda, X_tfidf, tfidf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
