In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim
import numpy as np
from datetime import datetime, date, time
import csv
import sys
import importlib
import pandas as pd

In [2]:
LANGUAGE = '[English]'

In [3]:
def tf_vctorize(n_max_features, text):
    #n_features = 1000
    tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                max_features=n_max_features,
                                stop_words='english',
                                max_df = 0.95,
                                min_df = 2)
    tf = tf_vectorizer.fit_transform(text)
    tf_feature_names = tf_vectorizer.get_feature_names()
    return tf, tf_vectorizer, tf_feature_names
    

def LDA_analyze(tf_vector, n_topics):
    lda = LatentDirichletAllocation(n_topics=n_topics,doc_topic_prior =1/(n_topics*10), max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                evaluate_every = 1,
                                random_state=0)
    lda.fit(tf_vector)
    print(lda)
    return lda

def tf_idf(n_max_features, text):
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                       min_df=2,
                                       max_features=n_max_features,
                                       stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(text)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    return tfidf, tfidf_feature_names


In [4]:
data = pd.read_csv('all_184_rebasic_senti_analysis.csv')

In [8]:
print(data['eng_text'].head(5))

0    Really enjoyed the lamb on a skewer, the meat ...
1    The restaurant is ok and the food was also goo...
2    All meze and main dishes were great, in partic...
3    We were taken here by our cousin who lives up ...
4    A lovely selection of meze dishes. 6 or 7 for ...
Name: eng_text, dtype: object


In [5]:
print(data['string'].head(1))

0    Building,Classical architecture,Landmark,Archi...
Name: string, dtype: object


In [6]:
data['string'] = data['string'].replace(np.nan, '', regex=True)

In [7]:
data['string'] = data['string'].apply(str)

In [8]:
print(data['string'])

0      Building,Classical architecture,Landmark,Archi...
1      Fountain,Reflecting pool,Water,Garden,Botanica...
2      Crowd,Community,Competition event,Spring,Event...
3      Landmark,Architecture,Building,Classical archi...
4      Neighbourhood,Building,City,Town,Human settlem...
5      Landmark,Sky,Architecture,Building,City,Town,D...
6      Natural landscape,Nature,Vegetation,Bank,River...
7      Architecture,Building,Landmark,Medieval archit...
8      Property,Estate,House,Building,Home,Architectu...
9      Building,Landmark,Architecture,Town,Stately ho...
10     Landmark,Building,Estate,Mansion,Architecture,...
11     Landmark,Architecture,Palace,Building,Mansion,...
12     Building,Property,Landmark,Architecture,Estate...
13     Landmark,Architecture,Building,Church,Sky,Stee...
14     Property,House,Estate,Natural landscape,Tree,B...
15     Landmark,Medieval architecture,Architecture,Bu...
16     Estate,Building,Mansion,Palace,Château,Officia...
17     Mansion,Estate,Palace,Bu

In [9]:
text_all = (data['eng_text']).tolist()

In [10]:
print(len(text_all))

9858


In [None]:
n_max_features=3000
n_topics = 10
n_top_words = 10

tf_vector, tf_vectorizer, tf_feature_names = tf_vctorize(n_max_features, text_all)
lda = LDA_analyze(tf_vector, n_topics)



In [None]:
#tfidf, tfidf_feature_names = tf_idf(n_max_features, text_all)
pyLDAvis.enable_notebook()
lda_prepared = pyLDAvis.sklearn.prepare(lda, tf_vector, tf_vectorizer)
pyLDAvis.show(lda_prepared)

In [None]:
#print(lda_prepared)

In [253]:
N=10
for i in lda_prepared[3:]:
    #i.shape()
    print(i)
    N+=1
    #i.to_csv("output"+ str(N) +'.csv')

30
0.01
{'xlab': 'PC1', 'ylab': 'PC2'}
[10, 6, 1, 8, 4, 9, 7, 2, 3, 5]


In [237]:
for topic_idx, topic in enumerate(lda.components_):
    print("Topic #%d:" % topic_idx)
    #print(topic)
    print(topic.argsort()[-10:-1])
    #print(topic.argsort()[:-n_top_words - 1:-1])
    #print(tf_feature_names)
    

Topic #0:
[279 126 122 258  73 257 174 182  36]
Topic #1:
[ 30 277 105 210 193 149  52 201 197]
Topic #2:
[169  76  68  13  52  12 194  16  17]
Topic #3:
[ 39  80 283  57 205  59 197 111 179]
Topic #4:
[216 187 262 209  55  19  42 133 129]
Topic #5:
[279 126 122 258  73 257 174 182  36]
Topic #6:
[ 30 277 105 210 193 149  52 201 197]
Topic #7:
[ 57 133 137  42 274  35 197 179  24]
Topic #8:
[237 163  36 122 241 124 257  10 184]
Topic #9:
[135  19  24  66 297 163 279  55  58]


In [183]:
print(tf_feature_names)

['00', '000', '10', '1000', '1080', '1085', '10th', '11', '1100', '1145', '11th', '12', '12th', '14th', '15', '15th', '17th', '19th', '1st', '20', '20th', '30', '3pm', '900', 'able', 'absolute', 'absolutely', 'access', 'accessible', 'according', 'acoustics', 'action', 'activities', 'actually', 'addition', 'adjacent', 'admire', 'admission', 'adults', 'afternoon', 'age', 'ages', 'ago', 'air', 'alike', 'alive', 'allow', 'allowed', 'altar', 'alter', 'amazing', 'amazingly', 'ambience', 'ancient', 'anytime', 'app', 'appreciate', 'april', 'arch', 'architectural', 'architecture', 'area', 'areas', 'array', 'arrived', 'art', 'artifacts', 'ask', 'asked', 'aspects', 'assembled', 'astrological', 'astronomic', 'astronomical', 'atmosphere', 'attached', 'attended', 'attraction', 'attractions', 'attractive', 'august', 'aula', 'austere', 'authentic', 'autumn', 'available', 'away', 'awe', 'awesome', 'baby', 'bar', 'basement', 'basic', 'basis', 'bathrooms', 'beatiful', 'beautiful', 'beautifull', 'beautifu

In [239]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-30 - 1:-1]]))
    print()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
lund buildings old museum sweden different swedish history houses visit time place interesting kulturen day trip really cathedral people outside went house exhibits times life culture small visited lived exhibitions
Topic #1:
kids place playground children liked peaceful really fun various bit enjoyed exhibits denmark great area going especially love caf outdoor old set fantastic perfect took season beauty summer amazing way
Topic #2:
exhibition art areas people april children architectural dating early medieval religious noon different lund day crypt visited huge beautiful visit nice center church old fascinating cathedral interesting museum small century
Topic #3:
park nice good place close pond city walk enjoy cafe recommended people lund children little birds relax students coffee located miss day town outdoor play weather stadsparken look easily sunny
Topic #4:
clock impressive inside cathedral astronomical church read time outside religious giant town basement visit wal