In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim
import numpy as np
from datetime import datetime, date, time
import csv
import sys
import importlib
import pandas as pd

In [2]:
LANGUAGE = '[English]'

In [3]:
def tf_vctorize(n_max_features, text):
    #n_features = 1000
    tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                max_features=n_max_features,
                                stop_words='english',
                                max_df = 0.95,
                                min_df = 2)
    tf = tf_vectorizer.fit_transform(text)
    tf_feature_names = tf_vectorizer.get_feature_names()
    return tf, tf_vectorizer, tf_feature_names
    

def LDA_analyze(tf_vector, n_topics):
    lda = LatentDirichletAllocation(n_topics=n_topics,doc_topic_prior =1/(n_topics*10), max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                evaluate_every = 1,
                                random_state=0)
    lda.fit(tf_vector)
    print(lda)
    return lda

def tf_idf(n_max_features, text):
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                       min_df=2,
                                       max_features=n_max_features,
                                       stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(text)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    return tfidf, tfidf_feature_names


In [14]:
data = pd.read_csv('by_months/to_string/Jul_combine_strings.csv')

In [15]:
print(data['string'].head(1))

0    Property,House,Cottage,Building,Roof,Home,Farm...
Name: string, dtype: object


In [16]:
data['string'] = data['string'].replace(np.nan, '', regex=True)

In [17]:
data['string'] = data['string'].apply(str)

In [18]:
print(data['string'])

0      Property,House,Cottage,Building,Roof,Home,Farm...
1      Building,Estate,Property,Mansion,Stately home,...
2      Nature reserve,Land lot,Cottage,Rural area,Roo...
3      Castle,Medieval architecture,Building,Property...
4      Landmark,Architecture,Building,Sky,Steeple,Pla...
5                                                       
6      Building,Landmark,Architecture,Estate,Classica...
7      Road,Lane,Vehicle,Building,Car,Infrastructure,...
8      Natural landscape,Natural environment,Vegetati...
9      Landmark,Architecture,Sky,Building,Town,City,F...
10                                                      
11     Landmark,Steeple,Architecture,Building,Tree,To...
12     Water,Water resources,Fountain,Watercourse,Wat...
13     Sky,Landmark,Blue,Natural landscape,Estate,Bui...
14     Landmark,Architecture,Building,Daytime,City,Sk...
15     Garden,Botanical garden,Estate,Botany,Park,Tre...
16     Water,Botany,Tree,Garden,Botanical garden,Park...
17     Water,Reflection,Pond,Ba

In [19]:
text_all = (data['string']).tolist()

In [20]:
print(len(text_all))

492


In [21]:
n_max_features=3000
n_topics = 10
n_top_words = 10

tf_vector, tf_vectorizer, tf_feature_names = tf_vctorize(n_max_features, text_all)
lda = LDA_analyze(tf_vector, n_topics)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=0.01,
             evaluate_every=1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=50, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=10, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)


In [None]:
#tfidf, tfidf_feature_names = tf_idf(n_max_features, text_all)
pyLDAvis.enable_notebook()
lda_prepared = pyLDAvis.sklearn.prepare(lda, tf_vector, tf_vectorizer,R=20)
pyLDAvis.show(lda_prepared)


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [29/May/2019 19:45:02] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [29/May/2019 19:45:02] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [29/May/2019 19:45:02] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [29/May/2019 19:45:02] "GET /LDAvis.js HTTP/1.1" 200 -


In [52]:
print(lda_prepared)

PreparedData(topic_coordinates=            Freq  cluster  topics         x         y
topic                                                
0      57.775999        1       1  0.304212 -0.118978
9      21.997965        1       2 -0.188753 -0.005915
6      13.942636        1       3 -0.178576 -0.198191
3       5.019783        1       4 -0.002614  0.281333
7       0.845409        1       5  0.057550  0.006807
8       0.083642        1       6 -0.001767  0.008760
5       0.083642        1       7  0.000712  0.006478
4       0.083642        1       8  0.003076  0.006541
2       0.083642        1       9  0.008455  0.004700
1       0.083642        1      10 -0.002293  0.008465, topic_info=     Category        Freq          Term       Total  loglift  logprob
term                                                                 
14    Default  114.000000      building  114.000000  30.0000  30.0000
3     Default  181.000000  architecture  181.000000  29.0000  29.0000
53    Default   86.000000    

In [59]:
N=10
for i in lda_prepared[0:1]:
    #i.shape()
    print(i)
    N+=1
    #i.to_csv("output"+ str(N) +'.csv')

            Freq  cluster  topics         x         y
topic                                                
0      57.775999        1       1  0.304212 -0.118978
9      21.997965        1       2 -0.188753 -0.005915
6      13.942636        1       3 -0.178576 -0.198191
3       5.019783        1       4 -0.002614  0.281333
7       0.845409        1       5  0.057550  0.006807
8       0.083642        1       6 -0.001767  0.008760
5       0.083642        1       7  0.000712  0.006478
4       0.083642        1       8  0.003076  0.006541
2       0.083642        1       9  0.008455  0.004700
1       0.083642        1      10 -0.002293  0.008465


In [237]:
for topic_idx, topic in enumerate(lda.components_):
    print("Topic #%d:" % topic_idx)
    #print(topic)
    print(topic.argsort()[-10:-1])
    #print(topic.argsort()[:-n_top_words - 1:-1])
    #print(tf_feature_names)
    

Topic #0:
[279 126 122 258  73 257 174 182  36]
Topic #1:
[ 30 277 105 210 193 149  52 201 197]
Topic #2:
[169  76  68  13  52  12 194  16  17]
Topic #3:
[ 39  80 283  57 205  59 197 111 179]
Topic #4:
[216 187 262 209  55  19  42 133 129]
Topic #5:
[279 126 122 258  73 257 174 182  36]
Topic #6:
[ 30 277 105 210 193 149  52 201 197]
Topic #7:
[ 57 133 137  42 274  35 197 179  24]
Topic #8:
[237 163  36 122 241 124 257  10 184]
Topic #9:
[135  19  24  66 297 163 279  55  58]


In [183]:
print(tf_feature_names)

['00', '000', '10', '1000', '1080', '1085', '10th', '11', '1100', '1145', '11th', '12', '12th', '14th', '15', '15th', '17th', '19th', '1st', '20', '20th', '30', '3pm', '900', 'able', 'absolute', 'absolutely', 'access', 'accessible', 'according', 'acoustics', 'action', 'activities', 'actually', 'addition', 'adjacent', 'admire', 'admission', 'adults', 'afternoon', 'age', 'ages', 'ago', 'air', 'alike', 'alive', 'allow', 'allowed', 'altar', 'alter', 'amazing', 'amazingly', 'ambience', 'ancient', 'anytime', 'app', 'appreciate', 'april', 'arch', 'architectural', 'architecture', 'area', 'areas', 'array', 'arrived', 'art', 'artifacts', 'ask', 'asked', 'aspects', 'assembled', 'astrological', 'astronomic', 'astronomical', 'atmosphere', 'attached', 'attended', 'attraction', 'attractions', 'attractive', 'august', 'aula', 'austere', 'authentic', 'autumn', 'available', 'away', 'awe', 'awesome', 'baby', 'bar', 'basement', 'basic', 'basis', 'bathrooms', 'beatiful', 'beautiful', 'beautifull', 'beautifu

In [239]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-30 - 1:-1]]))
    print()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
lund buildings old museum sweden different swedish history houses visit time place interesting kulturen day trip really cathedral people outside went house exhibits times life culture small visited lived exhibitions
Topic #1:
kids place playground children liked peaceful really fun various bit enjoyed exhibits denmark great area going especially love caf outdoor old set fantastic perfect took season beauty summer amazing way
Topic #2:
exhibition art areas people april children architectural dating early medieval religious noon different lund day crypt visited huge beautiful visit nice center church old fascinating cathedral interesting museum small century
Topic #3:
park nice good place close pond city walk enjoy cafe recommended people lund children little birds relax students coffee located miss day town outdoor play weather stadsparken look easily sunny
Topic #4:
clock impressive inside cathedral astronomical church read time outside religious giant town basement visit wal