In [183]:
import pandas as pd
import numpy as np
import re

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
events = pd.read_csv("./filestore/events/fact_events.csv").drop('Unnamed: 0', axis=1)

In [12]:
events.head()

Unnamed: 0,description,duration,headcount,event_id,maybe_rsvp_count,name,rating,rsvp_limit,status,time,updated,utc_offset,venue_id,visibility,waitlist_count,yes_rsvp_count
0,These meetups are very informal. I won't be st...,9000000.0,12,147478282,0,PyLadies Dublin Inaugural meetup - bring laptop!,,,past,1384799400000,1384853013000,0,16176442,public,0,22
1,"Our second meetup will be at Engine Yard, a bi...",,0,152107272,0,Second PyLadies Dublin Meetup - Let's get coding!,,,past,1387218600000,1387230236000,0,13054852,public,0,12
2,Happy New Year! Hope you all had a good Christ...,10800000.0,0,159368332,0,Our first PyLadies Dublin meetup of 2014,,,past,1390240800000,1390470097000,0,17757332,public,0,11
3,Bring your laptops along. If you want some foo...,10800000.0,0,162851382,0,PyLadies Dublin Feb meetup,,,past,1392660000000,1392672314000,0,18096492,public,0,9
4,!!!CHANGE OF VENUE UPDATE!!! &gt;&gt; More inf...,10800000.0,0,166955082,0,PyLadies Dublin Meetup,,,past,1395165600000,1395219566000,0,18950322,public,0,11


In [13]:
events.dtypes

description          object
duration            float64
headcount             int64
event_id             object
maybe_rsvp_count      int64
name                 object
rating              float64
rsvp_limit          float64
status               object
time                  int64
updated               int64
utc_offset            int64
venue_id              int64
visibility           object
waitlist_count        int64
yes_rsvp_count        int64
dtype: object

# Let's look at the event descriptions

In [14]:
desc = events['description'].tolist()

In [15]:
desc

["These meetups are very informal. I won't be structuring until we get a feel on what we want/like to do. It all depends on who shows up. :-) So bring your laptop and please have Python installed, and we will do a quick introduction and get stuck in why we use Python, and start from there. They have free wi-fi (via bitbuzz) but that means we have to log in every 30 minutes. And we will be upstairs.",
 "Our second meetup will be at Engine Yard, a big thanks to Eamon and Jonathan for helping us out. So a few things:- • The meetup will be from 6:30PM to 9PM. •\xa0It's free to attend. • Bring your laptop. • If you have an idea and/or project, problems, bring them along. • We will be running through what source control is about, and for those who haven't set up their github account, please do so and email [masked] you username so I can add you to PyLadies Dublin github team. • Guys are welcome as well. :-D Venue thanks to Engine Yard:  Food thanks to Python Ireland: ",
 'Happy New Year! Hop

In [53]:
desc[4]

"!!!CHANGE OF VENUE UPDATE!!! &gt;&gt; More info on how to get there:&lt;&lt; • This is the building (google street view):\xa0http://bit.ly/Nr5DVk • Lab is in the annexe building – 1st\xa0floor keep right – through the double doors and its on the right A115 •\xa0Ask the porter in the lobby when you get here – its in the newer building beside the one in the photo…where the canteen is. AGENDA •\xa0Make sure those who want to work on PyLadies Dublin projects are on its github working group. •\xa0PyLadies Dublin first project will be the website. So we will be working on the Initial designs on PyLadies Dublin website. •\xa0Then decide who wants to work on what (you don't have to be committed to this, you can change to work on different section). •\xa0People can also work on personal projects, and others can also join in to help. • If you want to learn Python, come along, we have plenty of Pythonistas to help you. • Since venue change, not bringing tea/instant coffee, and I'm not sure about

## Cleaning up the description

### Punctuation and emojis
There are some emoji characters and unwanted punctuation

In [162]:
def find_unwanted_chars(s):
    pattern = r"[^a-zA-Z0-9\s.\-/':!?&@€$_+Éáéóć%]"
    return set(re.findall(pattern, s))

In [163]:
unwanted = set(char for e in desc for char in find_unwanted_chars(e))

In [164]:
clean_punct = []
for sent in desc:
    for punct in unwanted:
        sent = sent.replace(punct, "")
    clean_punct.append(sent)

In [165]:
clean_punct

["These meetups are very informal. I won't be structuring until we get a feel on what we want/like to do. It all depends on who shows up. :- So bring your laptop and please have Python installed and we will do a quick introduction and get stuck in why we use Python and start from there. They have free wi-fi via bitbuzz but that means we have to log in every 30 minutes. And we will be upstairs.",
 "Our second meetup will be at Engine Yard a big thanks to Eamon and Jonathan for helping us out. So a few things:-  The meetup will be from 6:30PM to 9PM. \xa0It's free to attend.  Bring your laptop.  If you have an idea and/or project problems bring them along.  We will be running through what source control is about and for those who haven't set up their github account please do so and email masked you username so I can add you to PyLadies Dublin github team.  Guys are welcome as well. :-D Venue thanks to Engine Yard:  Food thanks to Python Ireland: ",
 "Happy New Year! Hope you all had a go

### Unicode, URLS, smiley faces

In [170]:
special_dict = {
    'smile' : r'[:;=]-[)D]?',
    'uni' : r'\xa0',
    'dupe_space' : r'\s{2,}|\s\Z',
    'uls_chars' :  r'(?:&[gla][tm]p?)+',
    'url' : r'(?:https|ftp|file)://\S+'
}


def remove_special(s):
    for k, regex in special_dict.items():
        s = re.sub(regex, "", s)
    return s

In [171]:
clean_special = [remove_special(s).lower() for s in clean_punct]

## Tokenizing and removing stop words

In [172]:
en_stopwords = stopwords.words('english')
en_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [186]:
vectorizer = TfidfVectorizer(stop_words = set(en_stopwords))
vectorizer.fit_transform(clean_special)

<75x2461 sparse matrix of type '<class 'numpy.float64'>'
	with 7582 stored elements in Compressed Sparse Row format>

In [188]:
vectorizer.get_feature_names()

['00',
 '00lightning',
 '00meetup',
 '00pm',
 '05',
 '09',
 '10',
 '100',
 '101',
 '11',
 '119',
 '12',
 '120',
 '125000',
 '13',
 '14',
 '15',
 '150',
 '15mins',
 '16',
 '16call',
 '16th',
 '17',
 '17th',
 '18',
 '18th',
 '19',
 '1967',
 '1993',
 '1fu8zrk',
 '1qyluok',
 '1ug31a4',
 '20',
 '2003',
 '2004',
 '2010',
 '2013',
 '2017',
 '20mins',
 '21',
 '219',
 '21st',
 '23000',
 '23rd',
 '24',
 '25',
 '25s',
 '2nd',
 '30',
 '30mins',
 '30pm',
 '30speaker',
 '32',
 '320',
 '32c3',
 '35',
 '350',
 '360',
 '3d',
 '3rd',
 '3until',
 '40',
 '4000',
 '420',
 '45',
 '45mins',
 '50',
 '50000',
 '55',
 '55000',
 '5pm',
 '7pm',
 '7th',
 '800sqf',
 '83',
 '8500',
 '99',
 '9pm',
 'a115',
 'aa',
 'ability',
 'able',
 'aboutmáté',
 'aboutpython',
 'academic',
 'academy',
 'acceleration',
 'accelerator',
 'accenture',
 'acceptance',
 'accepting',
 'access',
 'accessibility',
 'accessible',
 'account',
 'achieving',
 'acia',
 'across',
 'act',
 'action',
 'activities',
 'activity',
 'activity21',
 'act

The vectorizer we got above is used as input for LDA or NMF to build the model