In [1]:
import pandas as pd
import json
# Read deardata.csv (path is relative to the current working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='deardata.csv')
def parse_text(row):
    """Concatenate the 'content' strings stored as JSON in a row's text field.

    The text field is expected to hold a JSON array of objects, each with a
    'content' key; the 'content' values are joined in order with no separator.

    Parameters
    ----------
    row : pandas.Series or sequence
        For a Series carrying a 'text' label, that entry is used. Anything
        else (a list, a tuple, a Series without a 'text' label) falls back to
        ``row[5]``, which is how the field was originally addressed.

    Returns
    -------
    str
        The joined 'content' values; '' when the JSON array is empty.

    Raises
    ------
    json.JSONDecodeError
        If the text field is not valid JSON.
    KeyError
        If an item in the array has no 'content' key.
    """
    # Look the field up by label: an integer key on a string-labelled Series
    # relies on positional fallback and silently reads the wrong column if the
    # column order ever changes.
    if isinstance(row, pd.Series) and 'text' in row.index:
        raw = row['text']
    else:
        raw = row[5]
    return ''.join(item['content'] for item in json.loads(raw))

# Run parse_text over every row to build the 'doc' column, then preview the first rows.
df['doc'] = df.apply(parse_text, axis='columns')
df.head(5)

Unnamed: 0,week,title,author,visual_img,legend_img,text,doc
0,1,Week 01: A week of clocks,Giorgia,01_Giorgia_DearData_01_Front.jpg,01_Giorgia_DearData_01_Back.jpg,"[{""header"": ""The topic:"", ""content"": "" This wa...",This was the first week of Dear Data – I was ...
1,1,Week 01: A week of clocks,Stefanie,01_Stefanie_DearData_01+front.jpg,01_Stefanie_DearData_01+back.jpg,"[{""header"": ""Data-gathering:"", ""content"": "" Or...",Originally Giorgia and I thought that we woul...
2,2,Week 02: A week of public transportation,Giorgia,02_Giorgia_DearData_02_Front.jpg,02_Giorgia_DearData_02_Back.jpg,"[{""header"": ""Data gathering:"", ""content"": "" Th...",This week I collected data on my walks and tr...
3,2,Week 02: A week of public transportation,Stefanie,02_Stefanie_DearData_02+front.jpg,02_Stefanie_DearData_02+back.jpg,"[{""header"": ""Data-gathering:"", ""content"": "" La...",Last week’s intensive data gathering means th...
4,3,Week 03: A week of thank yous,Giorgia,03_Giorgia_DearData_03_Front.jpg,03_Giorgia_DearData_03_Back.jpg,"[{""header"": ""The topic:"", ""content"": "" This we...","This week we wanted to see how kind we are, a..."


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

no_features = 1000


def _vocabulary_terms(vectorizer):
    """Return the terms of a fitted vectorizer as a list.

    Newer scikit-learn releases expose the vocabulary through
    ``get_feature_names_out()`` and no longer provide ``get_feature_names()``;
    older releases only have the latter. Use whichever one is available so the
    cell runs on both.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        # get_feature_names_out() returns an array; convert so callers keep
        # receiving a plain list as before.
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()


# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(list(df['doc']))
tfidf_feature_names = _vocabulary_terms(tfidf_vectorizer)

# LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(list(df['doc']))
tf_feature_names = _vocabulary_terms(tf_vectorizer)

In [17]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 10

# Run NMF
# The single `alpha` regularisation keyword was replaced by `alpha_W` / `alpha_H`
# in newer scikit-learn, where passing `alpha` raises TypeError. Keep using
# `alpha` wherever the installed version still accepts it, so results there are
# unchanged.
nmf_alpha = .1
nmf_kwargs = dict(n_components=no_topics, random_state=1, l1_ratio=.5, init='nndsvd')
if 'alpha' in NMF().get_params():
    nmf_kwargs['alpha'] = nmf_alpha
else:
    # Per the documented objective, the new penalties are multiplied by
    # n_features (for W) and n_samples (for H). Dividing by those sizes is
    # intended to reproduce the old unscaled penalty.
    # NOTE(review): confirm against the NMF docs of the installed version.
    n_docs, n_terms = tfidf.shape
    nmf_kwargs['alpha_W'] = nmf_alpha / n_terms
    nmf_kwargs['alpha_H'] = nmf_alpha / n_docs
nmf = NMF(**nmf_kwargs).fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)


In [19]:
def display_topics(model, feature_names, no_top_words):
    """Print each topic's highest-weighted terms.

    For every row of ``model.components_`` this prints a ``Topic <n>:`` header
    followed by one line holding the ``no_top_words`` feature names with the
    largest weights in that row, highest first, separated by spaces.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterated row by row; each row must support ``argsort()``.
    feature_names : sequence of str
        Indexed by the positions that ``argsort()`` returns.
    no_top_words : int
        How many terms to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending: reverse it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[pos] for pos in ranked]
        print("Topic %d:" % topic_idx)
        print(" ".join(top_terms))

no_top_words = 10

# Print a banner and then the top terms for each fitted model, NMF first.
for banner, fitted_model, vocabulary in (
    ('NMF-----------------------------', nmf, tfidf_feature_names),
    ('LDA-----------------------------', lda, tf_feature_names),
):
    print(banner)
    display_topics(fitted_model, vocabulary, no_top_words)

NMF-----------------------------
Topic 0:
time stefanie dear postcard new people kind topic boyfriend postcards
Topic 1:
giorgia card gathering drawing like ve husband think just quite
Topic 2:
sounds sound birds subway london hear home background reminds hour
Topic 3:
envy feelings negative positive envious feeling thoughts negativity festival language
Topic 4:
clothes wardrobe clothing wear dresses cycling worn organise dress changes
Topic 5:
books bookshelf book read selection apartment survey italy room old
Topic 6:
swear words swearing word speak use language italian used situation
Topic 7:
met festival people eyeo minneapolis talk meeting new time drinks
Topic 8:
music songs listen tracks old listening past years older driven
Topic 9:
foods food eat dinner suggested chocolate boyfriend love dataset uk
LDA-----------------------------
Topic 0:
makes words thoughts eventually stopped houses end situations second laugh
Topic 1:
positive drawing time giorgia card little like work lif