In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim
import numpy as np
from datetime import datetime, date, time
import csv
import sys
import importlib
import pandas as pd

In [2]:
LANGUAGE = '[English]'

In [3]:
def tf_vctorize(n_max_features, text):
    #n_features = 1000
    tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                max_features=n_max_features,
                                stop_words='english',
                                max_df = 0.95,
                                min_df = 2)
    tf = tf_vectorizer.fit_transform(text)
    tf_feature_names = tf_vectorizer.get_feature_names()
    return tf, tf_vectorizer, tf_feature_names
    

def LDA_analyze(tf_vector, n_topics):
    lda = LatentDirichletAllocation(n_topics=n_topics,doc_topic_prior =1/(n_topics*10), max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                evaluate_every = 1,
                                random_state=0)
    lda.fit(tf_vector)
    print(lda)
    return lda

def tf_idf(n_max_features, text):
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                       min_df=2,
                                       max_features=n_max_features,
                                       stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(text)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    return tfidf, tfidf_feature_names


In [29]:
data = pd.read_csv('"city vitality".csv', names = 'abcdef')

In [30]:
data.head()

Unnamed: 0,a,b,c,d,e,f
0,1949,1,Correlation and integration: A study in city b...,"AJ Dickson - New Zealand Engineering, 1949 - s...","… They contained slums or advanced decadence, ...",
1,1957,1,The Reconstruction of Warsaw,"S Dziewulski, S Jankowski - Town …, 1957 - onl...",… Copyright (c) Liverpool University Press Pag...,Cited by 4 Related articles All 3 versions
2,1962,2,Panning the Planners,PA Pfretzschner - 1962 - JSTOR,"… Why, through plan- ning, of course. The sobe...",Cited by 2 Related articles
3,1962,2,The amplitude of design,EW Strong - Journal of the American Institute ...,… metropolitan areas. Today we are not compete...,Cited by 3 Related articles Library Search
4,1964,1,[PDF][PDF] The relationship of traffic attract...,PH Wright - 1964 - smartech.gatech.edu,"Page 1. ""In presenting the dissertation as a p...",Cited by 2 Related articles All 2 versions ...


In [31]:
#data_language = data['e']
#text_all = data_language.tolist()
#print(text_all[0:2])

In [32]:
data_new = data['e'].replace(np.nan, '', regex=True)
data_new = data_new.apply(str)

In [33]:
text_all = data_new.tolist()
print(text_all[0:3])

["… They contained slums or advanced decadence, and over wide areas obsolescence was rife.\nOn main traffic routes congestion frequently was so great as to sap the city's vitality by strangulation.\nAuthorities seemed to be fighting a losing battle with dis- integrating forces\xa0… \n", '… Copyright (c) Liverpool University Press Page 2. 210 THE RECONSTRUCTION OF WARSAW\nOCTOBI R of the former population remained. However, the city\'s vitality gave impetus to\nreconstruction. The National Assembly of 165"9 appointed a commission\xa0… \n', '… Why, through plan- ning, of course. The sober truth is that Jane Jacobs is a city planner after all.\nOnly she would do it differently. "Consider," she asks us, "the kind of goals at which city planning\nmust begin to aim, if the object is to plan for city vitality." What are the goals\xa0… \n']


In [34]:
n_max_features=3000
n_topics = 10
n_top_words = 10

tf_vector, tf_vectorizer, tf_feature_names = tf_vctorize(n_max_features, text_all)
lda = LDA_analyze(tf_vector, n_topics)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=0.01,
             evaluate_every=1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=50, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=10, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)


In [None]:
#tfidf, tfidf_feature_names = tf_idf(n_max_features, text_all)
pyLDAvis.enable_notebook()
lda_prepared = pyLDAvis.sklearn.prepare(lda, tf_vector, tf_vectorizer, R=20)
pyLDAvis.show(lda_prepared)


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [14/Jul/2019 11:02:48] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jul/2019 11:02:48] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [14/Jul/2019 11:02:48] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [14/Jul/2019 11:02:48] "GET /LDAvis.js HTTP/1.1" 200 -


In [240]:
print(lda_prepared)

PreparedData(topic_coordinates=            Freq  cluster  topics         x         y
topic                                                
9      34.416216        1       1  0.107060  0.036155
5      23.640975        1       2  0.141462 -0.072641
0      14.211574        1       3  0.141462 -0.072641
7      12.682552        1       4  0.015045  0.172638
3       6.937734        1       5 -0.097264  0.237869
8       4.700521        1       6  0.173647 -0.107607
6       1.688222        1       7 -0.231094 -0.121963
1       1.270062        1       8 -0.231094 -0.121963
2       0.360067        1       9 -0.019476  0.024162
4       0.092077        1      10  0.000250  0.025989, topic_info=     Category        Freq          Term       Total  loglift  logprob
term                                                                 
197   Default  205.000000         place  205.000000  30.0000  30.0000
174   Default  185.000000        museum  185.000000  29.0000  29.0000
139   Default   60.000000    

In [253]:
N=10
for i in lda_prepared[3:]:
    #i.shape()
    print(i)
    N+=1
    #i.to_csv("output"+ str(N) +'.csv')

30
0.01
{'xlab': 'PC1', 'ylab': 'PC2'}
[10, 6, 1, 8, 4, 9, 7, 2, 3, 5]


In [237]:
for topic_idx, topic in enumerate(lda.components_):
    print("Topic #%d:" % topic_idx)
    #print(topic)
    print(topic.argsort()[-10:-1])
    #print(topic.argsort()[:-n_top_words - 1:-1])
    #print(tf_feature_names)
    

Topic #0:
[279 126 122 258  73 257 174 182  36]
Topic #1:
[ 30 277 105 210 193 149  52 201 197]
Topic #2:
[169  76  68  13  52  12 194  16  17]
Topic #3:
[ 39  80 283  57 205  59 197 111 179]
Topic #4:
[216 187 262 209  55  19  42 133 129]
Topic #5:
[279 126 122 258  73 257 174 182  36]
Topic #6:
[ 30 277 105 210 193 149  52 201 197]
Topic #7:
[ 57 133 137  42 274  35 197 179  24]
Topic #8:
[237 163  36 122 241 124 257  10 184]
Topic #9:
[135  19  24  66 297 163 279  55  58]


In [183]:
print(tf_feature_names)

['00', '000', '10', '1000', '1080', '1085', '10th', '11', '1100', '1145', '11th', '12', '12th', '14th', '15', '15th', '17th', '19th', '1st', '20', '20th', '30', '3pm', '900', 'able', 'absolute', 'absolutely', 'access', 'accessible', 'according', 'acoustics', 'action', 'activities', 'actually', 'addition', 'adjacent', 'admire', 'admission', 'adults', 'afternoon', 'age', 'ages', 'ago', 'air', 'alike', 'alive', 'allow', 'allowed', 'altar', 'alter', 'amazing', 'amazingly', 'ambience', 'ancient', 'anytime', 'app', 'appreciate', 'april', 'arch', 'architectural', 'architecture', 'area', 'areas', 'array', 'arrived', 'art', 'artifacts', 'ask', 'asked', 'aspects', 'assembled', 'astrological', 'astronomic', 'astronomical', 'atmosphere', 'attached', 'attended', 'attraction', 'attractions', 'attractive', 'august', 'aula', 'austere', 'authentic', 'autumn', 'available', 'away', 'awe', 'awesome', 'baby', 'bar', 'basement', 'basic', 'basis', 'bathrooms', 'beatiful', 'beautiful', 'beautifull', 'beautifu

In [239]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-30 - 1:-1]]))
    print()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
lund buildings old museum sweden different swedish history houses visit time place interesting kulturen day trip really cathedral people outside went house exhibits times life culture small visited lived exhibitions
Topic #1:
kids place playground children liked peaceful really fun various bit enjoyed exhibits denmark great area going especially love caf outdoor old set fantastic perfect took season beauty summer amazing way
Topic #2:
exhibition art areas people april children architectural dating early medieval religious noon different lund day crypt visited huge beautiful visit nice center church old fascinating cathedral interesting museum small century
Topic #3:
park nice good place close pond city walk enjoy cafe recommended people lund children little birds relax students coffee located miss day town outdoor play weather stadsparken look easily sunny
Topic #4:
clock impressive inside cathedral astronomical church read time outside religious giant town basement visit wal