### The following notebook is a compilation of code from "LTD_EDA_ali.ipnyb" which will be used in the final documentation notebook.

In [1]:
import nltk
import pandas as pd
import re
import numpy as np

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [None]:
df = pd.read_csv('LTD_rawevents.csv')

In [4]:
events_df = events_df.rename({'WHAT TO DO?' : 'Alex_targets'}, axis='columns')
events_df_lab = events_df.dropna(subset=['Alex_targets'])
events_df_lab.Alex_targets = events_df_lab.Alex_targets.apply(lambda x: x.split(',')[0])
events_df_lab.Alex_targets.unique()

NameError: name 'events_df' is not defined

In [None]:
# Processing event descriptions with Count Vectorizer/Bag of words method 
count_vec4 = CountVectorizer(ngram_range=(1,2), min_df=7, max_df=.9, max_features=5000)
X_train_count4 = count_vec4.fit_transform(events_df_lab['Lem_words'])
X_train_count4.shape

In [None]:
# Running Naive Bayes classifier on bag of words, training on all events but the first 20
multiNB = MultinomialNB()
cntvecMNB = multiNB.fit(X_train_count4[20:,], events_df_lab.Alex_targets[20:])

In [None]:
# Testing NB classifier on first 20 events, viewing the predicted label output
new_docs = X_train_count4[:20]
cnt_predicted = cntvecMNB.predict(new_docs)
cnt_predicted

In [None]:
# The actual labels Alex assigned for the first 20 events, for comparison
events_df_lab.Alex_targets[:20]

In [None]:
# Alternative word processing method to plain count vectorizer: TfIdf (penalizing frequent words)
from sklearn.feature_extraction.text import TfidfVectorizer

# only includes words that appear more than 7x, and in less than 90% of events, and the top 5000 of those words
tf_idf = TfidfVectorizer(ngram_range=(1,2), min_df=7, max_df=.9, max_features=5000) 
X_train_tf = tf_idf.fit_transform(events_df_lab['Lem_words'])

In [None]:
# Fitting NB classifier on new TfIdf word data
tf_idfMNB = multiNB.fit(X_train_tf[20:], events_df_lab.Alex_targets[20:])

In [None]:
# Showing predictions for first 20 events
new_docs_tf = X_train_tf[:20]
tfidf_predicted = tf_idfMNB.predict(new_docs_tf)
tfidf_predicted

In [None]:
# Printing metric reports for both word count methods
from sklearn import metrics
print('The report for CountVectorizer word embedding through a Multinomial model:')
print(metrics.classification_report(events_df_lab.Alex_targets[:20], cnt_predicted, target_names=events_df_lab.Alex_targets.unique() ))

In [None]:
print('The report for TF-IDF Vectorizer word embedding through a Multinomial model:')
print(metrics.classification_report(events_df_lab.Alex_targets[:20], tfidf_predicted, target_names= events_df_lab.Alex_targets.unique()))

## LDA (Latent Dirichlet Allocation)

In [None]:
import gensim

processed_docs  = []

for doc in events_df_lab['Lem_words'][1:]:
    doc = doc.split()
    processed_docs.append(doc)
    
processed_docs

In [None]:
# Creating bag of words with indices
dictionary = gensim.corpora.Dictionary(processed_docs)

In [None]:
# Keep words that appear 7 or more times, but in less than 20% of events
dictionary.filter_extremes(no_below=7, no_above=0.2, keep_n=50000)
len(dictionary.keys())

In [None]:
# Shows the first 10 dictionary entries of words in dataset
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

In [None]:
# Shows how many time each word appears in specified document
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[0]

In [None]:
lda_model_tfidf2 = gensim.models.LdaMulticore(tfidf_corpus, num_topics=6, id2word=dictionary, passes=2, eta=.01)
pyLDAvis.gensim.prepare(lda_model_tfidf2, bow_corpus, dictionary)

In [None]:
# topic 1 - outdoor fairs, family / children friendly
# topic 2 - locations /"world", artsy expos, comedy, wine/craft
# topic 3 - local/ethnic culture
# topic 4 - physical and mental wellness - active, but also comedy, literacy
# topic 5 - food parties
# topic 6 - southern cultural

## 1 and 2 both have art as overlap