In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle
import numpy as np

# Module-level vectorizer.
# NOTE(review): appears unused at this point — LDA_sklearn._cv builds its own
# CountVectorizer and later cells rebind `cv`; confirm before removing.
cv = CountVectorizer()
# Path of the pickled corpus handed to LDA_sklearn (and re-loaded by later cells).
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
_dir = "/Users/shinbo/Desktop/metting/LDA/0. data/20news-bydate/newsgroup_preprocessed_corpus.pickle"


class LDA_sklearn:
    """Fit scikit-learn's LatentDirichletAllocation on a random subset of a pickled corpus.

    The pickle at ``path_data`` must hold a sized, iterable collection of
    documents, each document being a list of string tokens (they are joined
    with spaces before vectorizing).

    Parameters
    ----------
    path_data : str
        Path of the pickled corpus.
    alpha : float
        Forwarded to ``LatentDirichletAllocation`` as ``doc_topic_prior``.
    eta : float
        Forwarded to ``LatentDirichletAllocation`` as ``topic_word_prior``.
    K : int
        Number of topics (``n_components``).
    n_docs : int, optional
        How many documents to sample without replacement. Defaults to 1000
        (the previously hard-coded value) and is capped at the corpus size,
        so small corpora no longer raise ``ValueError``.
    seed : int, optional
        Seed applied to NumPy's global RNG before sampling (default 0, the
        previously hard-coded value).

    Attributes
    ----------
    tokens : list of list of str
        The sampled documents as token lists; never overwritten.
    data : list
        The sampled documents: token lists after ``__init__``, space-joined
        strings once ``_make_vocab`` has run.
    vocab, w2idx, idx2w, doc2idx
        Vocabulary structures built by ``_make_vocab``.
    cv, df
        Fitted ``CountVectorizer`` and its document-term matrix (``_cv``).
    """

    def __init__(self, path_data, alpha, eta, K, n_docs=1000, seed=0):
        # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
        with open(path_data, 'rb') as fh:
            corpus = pickle.load(fh)

        # Seeds NumPy's *global* RNG exactly as the former np.random.seed(0)
        # did, so the default sample is unchanged.
        # NOTE(review): _train passes no random_state to the LDA estimator, so
        # the fit presumably draws from this same global stream — confirm
        # before switching to a local generator.
        np.random.seed(seed)
        n_sample = min(n_docs, len(corpus))
        idx = np.random.choice(len(corpus), n_sample, replace=False)

        # Set membership is O(1); `i in ndarray` was a linear scan per document.
        chosen = set(idx.tolist())
        self.tokens = [doc for i, doc in enumerate(corpus) if i in chosen]
        self.data = self.tokens
        self.K = K
        self.alpha = alpha
        self.eta = eta

    def _make_vocab(self):
        """Build vocabulary lookups and the space-joined documents.

        Always reads from ``self.tokens`` so the method is idempotent; the
        previous version consumed ``self.data`` and then replaced it with
        strings, which corrupted the vocabulary on a second call.
        """
        vocab = set()
        for doc in self.tokens:
            vocab.update(doc)
        self.vocab = sorted(vocab)
        self.w2idx = {word: i for i, word in enumerate(self.vocab)}
        self.idx2w = {i: word for word, i in self.w2idx.items()}
        self.doc2idx = [[self.w2idx[word] for word in doc] for doc in self.tokens]
        self.data = [' '.join(doc) for doc in self.tokens]

    def _cv(self):
        """Vectorize the joined documents into ``self.df`` using a fresh ``CountVectorizer``.

        NOTE(review): CountVectorizer applies its own default tokenization, so
        its feature set is not guaranteed to match ``self.vocab`` — confirm if
        the two are ever compared.
        """
        self._make_vocab()
        self.cv = CountVectorizer()
        self.df = self.cv.fit_transform(self.data)

    def _train(self):
        """Vectorize the sample, fit batch LDA on it and return the fitted estimator."""
        # _cv() already rebuilds the vocabulary; the former bare
        # `self._make_vocab` statement here was a no-op (missing call parentheses).
        self._cv()
        lda = LatentDirichletAllocation(n_components=self.K,
                                        doc_topic_prior=self.alpha, topic_word_prior=self.eta,
                                        learning_method='batch', max_iter=1000)
        lda.fit(self.df)
        return lda

In [4]:
# Build the sampler/trainer on the corpus at `_dir` and fit the sklearn model.
lda = LDA_sklearn(path_data=_dir, alpha=5, eta=0.1, K=10)
result = lda._train()

### Result with sklearn lda

In [5]:
# One weight vector per topic. Iterating the 2-D `components_` array yields its
# rows, so the topic count is no longer hard-coded to 10 here.
lda_lam = list(result.components_)

def print_top_words(lam, feature_names, n_top_words):
    """Print, for each topic, its ``n_top_words`` largest-weight features.

    lam            -- iterable of 1-D weight arrays, one per topic
    feature_names  -- sequence mapping a column index to its term string
    n_top_words    -- how many features to list per topic

    Each topic is printed as a header line followed by one line of
    ``"<term> <weight rounded to 2 decimals> | "`` entries, highest weight first.
    """
    for topic_no, weights in enumerate(lam, start=1):
        print('\nTopic Nr.%d:' % int(topic_no))
        # Column indices ordered from highest to lowest weight, cut to the top n.
        ranked = weights.argsort()[::-1][:n_top_words]
        entries = []
        for col in ranked:
            entries.append(feature_names[col] + ' ' + str(round(weights[col], 2)) + ' | ')
        print(''.join(entries))
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() whenever the installed version has it.
print_top_words(
    lda_lam,
    list(lda.cv.get_feature_names_out() if hasattr(lda.cv, 'get_feature_names_out')
         else lda.cv.get_feature_names()),
    10,
)


Topic Nr.1:
right 245.64 | state 161.44 | well 141.33 | ca 138.66 | people 124.58 | israel 102.1 | second 89.7 | father 87.1 | israeli 84.1 | use 82.95 | 

Topic Nr.2:
game 161.1 | university 128.27 | de 77.78 | year 64.98 | new 61.01 | anyone 59.18 | go 58.27 | please 56.13 | canada 52.95 | last 51.8 | 

Topic Nr.3:
key 159.65 | time 128.4 | gun 111.18 | think 102.61 | use 85.0 | system 83.31 | attack 79.3 | risk 64.1 | death 62.45 | like 60.54 | 

Topic Nr.4:
go 377.8 | come 138.62 | get 130.5 | know 130.05 | look 93.52 | take 82.07 | car 79.02 | well 77.29 | still 75.45 | want 74.71 | 

Topic Nr.5:
get 147.92 | space 125.47 | science 96.2 | use 79.17 | high 73.8 | know 68.83 | make 63.45 | technology 61.47 | new 60.95 | also 60.93 | 

Topic Nr.6:
year 141.71 | university 134.33 | good 106.19 | team 104.37 | get 93.07 | win 91.84 | player 88.27 | computer 85.5 | distribution 78.42 | well 69.65 | 

Topic Nr.7:
window 281.28 | use 275.61 | mail 184.0 | get 166.6 | system 163.33 | prog

In [12]:
# Show the `bound_` attribute of the fitted sklearn model (documented by
# scikit-learn as the final perplexity score on the training set).
result.bound_

5430.2962585206815

In [14]:
# Score the fitted model on the same document-term matrix it was trained on
# (scikit-learn documents `score` as an approximate log-likelihood).
result.score(lda.df)

-1124795.5669155763

### Result with my lda

In [11]:
import pickle
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
# Context managers close the files promptly (the bare open() calls leaked handles).
with open('../../model_lda/DMM_result.pkl', 'rb') as fh:
    my_model = pickle.load(fh)

with open('preprocessed_review.pickle', 'rb') as fh:
    data = pickle.load(fh)
# Each document is a token sequence; CountVectorizer expects plain strings.
data_join = [' '.join(doc) for doc in data]
cv = CountVectorizer()
# Dense document-term matrix (kept dense to preserve what `X` was before).
X = cv.fit_transform(data_join).toarray()

# First ten rows of the model's topic-word matrix, one vector per topic.
lda_lam = [my_model.components_[k,:] for k in range(10)]
def print_top_words(lam, feature_names, n_top_words):
    """Print, for each topic, its ``n_top_words`` largest-weight features.

    lam            -- iterable of 1-D weight arrays, one per topic
    feature_names  -- sequence mapping a column index to its term string
    n_top_words    -- how many features to list per topic

    Each topic is printed as a header line followed by one line of
    ``"<term> <weight rounded to 2 decimals> | "`` entries, highest weight first.
    """
    for topic_no, weights in enumerate(lam, start=1):
        print('\nTopic Nr.%d:' % int(topic_no))
        # Column indices ordered from highest to lowest weight, cut to the top n.
        ranked = weights.argsort()[::-1][:n_top_words]
        entries = []
        for col in ranked:
            entries.append(feature_names[col] + ' ' + str(round(weights[col], 2)) + ' | ')
        print(''.join(entries))
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() whenever the installed version has it.
print_top_words(
    lda_lam,
    list(cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
         else cv.get_feature_names()),
    10,
)


Topic Nr.1:
room 1201.1 | hotel 1130.31 | stay 960.16 | great 594.12 | staff 489.66 | clean 374.92 | location 338.84 | one 314.09 | service 309.45 | beach 308.85 | 

Topic Nr.2:
room 2899.14 | hotel 2509.11 | stay 1685.79 | beach 1605.94 | view 1365.41 | great 1236.5 | waikiki 1080.26 | nice 910.01 | get 767.98 | pool 763.47 | 

Topic Nr.3:
hotel 4344.5 | great 3830.73 | room 2871.35 | location 2616.8 | stay 2458.15 | beach 2190.62 | staff 2053.06 | good 1810.13 | nice 1328.09 | waikiki 1277.36 | 

Topic Nr.4:
room 829.66 | stay 544.93 | staff 333.44 | view 293.14 | hotel 292.53 | get 256.11 | great 248.55 | location 225.47 | beach 205.38 | time 198.97 | 

Topic Nr.5:
hotel 1599.98 | stay 1160.94 | room 867.11 | staff 829.19 | time 663.48 | great 449.11 | check 431.48 | get 422.04 | go 413.63 | make 381.85 | 

Topic Nr.6:
hotel 1138.91 | room 991.07 | great 791.95 | staff 742.16 | beach 723.18 | stay 720.35 | waikiki 569.31 | location 556.7 | get 438.43 | friendly 356.59 | 

Topic Nr.

In [10]:
# Inspect the `gamma` attribute of the unpickled model.
# NOTE(review): the model class is not defined in this notebook, so what
# `gamma` represents cannot be confirmed here.
my_model.gamma

array([1.53320008e+04, 5.00012733e+00, 5.00012733e+00, 5.00012733e+00,
       6.99903117e+00, 5.00012733e+00, 5.00012733e+00, 5.00012733e+00,
       5.99931584e+00, 5.00012733e+00])

### Result with clda

In [56]:
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
# Context managers close the files promptly (the bare open() calls leaked handles).
with open('/Users/shinbo/PycharmProjects/model/clda_newsgroup.pickle', 'rb') as fh:
    model = pickle.load(fh)

with open(_dir, 'rb') as fh:
    data = pickle.load(fh)
# Each document is a token sequence; CountVectorizer expects plain strings.
data_join = [' '.join(doc) for doc in data]
cv = CountVectorizer()
# Kept dense because later cells index `X` with NumPy (X[0, :], np.nonzero).
# NOTE(review): a dense matrix of the full corpus is memory-hungry.
X = cv.fit_transform(data_join).toarray()

# First seven rows of the model's topic-word matrix, one vector per topic.
lda_lam = [model.components_[k,:] for k in range(7)]
def print_top_words(lam, feature_names, n_top_words):
    """Print, for each topic, its ``n_top_words`` largest-weight features.

    lam            -- iterable of 1-D weight arrays, one per topic
    feature_names  -- sequence mapping a column index to its term string
    n_top_words    -- how many features to list per topic

    Each topic is printed as a header line followed by one line of
    ``"<term> <weight rounded to 2 decimals> | "`` entries, highest weight first.
    """
    for topic_no, weights in enumerate(lam, start=1):
        print('\nTopic Nr.%d:' % int(topic_no))
        # Column indices ordered from highest to lowest weight, cut to the top n.
        ranked = weights.argsort()[::-1][:n_top_words]
        entries = []
        for col in ranked:
            entries.append(feature_names[col] + ' ' + str(round(weights[col], 2)) + ' | ')
        print(''.join(entries))
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() whenever the installed version has it.
print_top_words(
    lda_lam,
    list(cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
         else cv.get_feature_names()),
    10,
)


Topic Nr.1:
know 641.62 | think 555.67 | time 442.18 | well 402.65 | come 398.39 | see 318.61 | want 314.08 | right 301.04 | good 300.9 | also 299.48 | 

Topic Nr.2:
know 403.55 | think 349.01 | time 315.98 | well 301.96 | see 291.1 | new 272.48 | good 271.17 | also 248.17 | right 226.62 | want 219.54 | 

Topic Nr.3:
know 420.68 | think 388.19 | time 345.76 | well 320.28 | good 300.33 | db 288.96 | also 285.56 | see 266.77 | new 252.79 | system 239.3 | 

Topic Nr.4:
know 399.28 | program 380.86 | system 361.73 | time 348.1 | also 347.11 | use 337.8 | think 327.47 | see 325.15 | good 303.81 | well 302.47 | 

Topic Nr.5:
ax 9912.39 | max 698.08 | know 345.28 | think 316.79 | time 272.68 | di 271.5 | well 243.85 | good 239.87 | ei 236.36 | tm 230.21 | 

Topic Nr.6:
know 382.9 | think 381.88 | time 305.24 | drive 304.21 | well 287.2 | system 286.87 | good 276.76 | also 267.32 | new 255.27 | see 254.51 | 

Topic Nr.7:
know 404.52 | system 385.76 | also 360.93 | think 351.25 | time 348.9 | 

In [12]:
# Display the full topic-word weight matrix of the unpickled model.
model.components_

array([[89.52925916, 95.40932824, 18.34532888, ...,  1.09184158,
         0.10767849,  0.10080052],
       [44.09064436, 41.7864441 , 13.74226963, ...,  0.10143394,
         2.08582988,  0.59588468],
       [60.37497639, 32.13482869, 22.83849665, ...,  0.10144952,
         0.10118269,  0.10057967],
       ...,
       [13.91398321,  6.12219929,  1.23121052, ...,  0.10153522,
         0.10111832,  0.10060761],
       [75.47577938, 75.47124745, 18.84751955, ...,  0.10169393,
         0.10139479,  0.10081048],
       [20.30630819, 16.55334175,  9.21009231, ...,  0.1010206 ,
         0.10118058,  0.10067582]])

In [13]:
# Display the model's seed words: a dict mapping each group name to a list of
# integer ids. NOTE(review): the ids are presumably vocabulary column indices — confirm.
model.seed_words

{'alt': [4814,
  50728,
  39434,
  4185,
  39430,
  19919,
  19820,
  4813,
  20056,
  30395,
  56244],
 'comp': [35932,
  11526,
  54423,
  24553,
  26230,
  66698,
  36993,
  53839,
  28432,
  5346,
  7106,
  58537,
  60089,
  5207,
  3656,
  64728,
  37067,
  47772,
  21259,
  17178],
 'misc': [19942,
  36553,
  13082,
  29790,
  18502,
  42965,
  12747,
  47631,
  45104,
  54869,
  12087,
  34932,
  54225,
  53024,
  45063,
  6688,
  9743,
  63878,
  43751,
  4534,
  43157,
  7683,
  54168,
  22124,
  25680],
 'rec': [46404,
  34095,
  66678,
  16723,
  5527,
  26907,
  50029,
  46409,
  53713,
  6921,
  9298,
  57571,
  53953,
  59584,
  33849],
 'sci': [56443,
  53658,
  10417,
  29272,
  18768,
  16465,
  14326,
  26585,
  42415,
  32343,
  11950,
  11109],
 'soc': [36319,
  34748,
  37033,
  6444,
  67125,
  61802,
  23891,
  10556,
  24147,
  21332,
  26199,
  63894,
  50064,
  33954,
  61767,
  19767,
  10539,
  59249,
  34693,
  23620,
  45376,
  60298,
  65995,
  20545,
  3

In [10]:
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file (the bare open() call leaked the handle).
with open('/Users/shinbo/Desktop/metting/LDA/0. data/20news-bydate/topics_top_words.pickle', 'rb') as fh:
    seed_words = pickle.load(fh)
# Show the loaded mapping of group name -> list of seed-word strings.
seed_words

{'alt': ['atheist',
  'religion',
  'morality',
  'argument',
  'moral',
  'example',
  'evidence',
  'atheism',
  'exist',
  'islam',
  'something'],
 'comp': ['mac',
  'color',
  'set',
  'graphic',
  'help',
  'window',
  'max',
  'scsi',
  'image',
  'ax',
  'bit',
  'support',
  'thanks',
  'available',
  'anyone',
  'version',
  'mb',
  'problem',
  'file',
  'driver'],
 'misc': ['excellent',
  'manual',
  'cover',
  'interested',
  'email',
  'offer',
  'copy',
  'price',
  'pc',
  'shipping',
  'condition',
  'list',
  'send',
  'sale',
  'pay',
  'best',
  'cd',
  'usa',
  'original',
  'ask',
  'old',
  'book',
  'sell',
  'forsale',
  'hard'],
 'rec': ['play',
  'league',
  'win',
  'dod',
  'back',
  'hockey',
  'really',
  'player',
  'score',
  'bike',
  'car',
  'still',
  'season',
  'team',
  'last'],
 'sci': ['space',
  'science',
  'chip',
  'information',
  'encryption',
  'distribution',
  'data',
  'high',
  'number',
  'key',
  'computer',
  'clipper'],
 'soc': [

## 전처리 파일 확인 (Inspecting the preprocessed files)

In [14]:
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file (the bare open() call leaked the handle).
with open('/Users/shinbo/Desktop/metting/LDA/0. data/20news-bydate/newsgroup_preprocessed_corpus.pickle', 'rb') as fh:
    newsgroup = pickle.load(fh)

In [17]:
# Length of the first entry of the loaded corpus.
len(newsgroup[0])

40

In [21]:
# NOTE(review): `len()` without an argument raises TypeError, so the saved
# output (35) must come from an earlier version of this cell. The intended
# argument is not recoverable from the notebook — restore it or delete the cell.
len()

35

In [26]:
# Sanity check: the matrix width must equal the number of vocabulary terms.
# `vocabulary_` (term -> column index) is used instead of get_feature_names(),
# which was deprecated in scikit-learn 1.0 and removed in 1.2.
X.shape, len(cv.vocabulary_)

((11314, 70094), 70094)

In [28]:
# Reverse lookup: column index -> term (inverse of the vectorizer's term -> index map).
vocab_inverse = {
    column: term
    for term, column in cv.vocabulary_.items()
}

In [34]:
# Flatten every group's seed words into one list, in dict order.
a = [word for words in seed_words.values() for word in words]

# For each term present in the first document (non-zero columns of row 0),
# report it together with its column index if it is one of the seed words.
for i in np.nonzero(X[0, :])[0]:
    term = vocab_inverse[i]
    if term in a:
        print(term, i)

first 21414
give 23891
god 24170
something 56244


In [32]:
# Display `a`; per the saved output this is the flattened seed-word list.
a

['atheist',
 'religion',
 'morality',
 'argument',
 'moral',
 'example',
 'evidence',
 'atheism',
 'exist',
 'islam',
 'something',
 'mac',
 'color',
 'set',
 'graphic',
 'help',
 'window',
 'max',
 'scsi',
 'image',
 'ax',
 'bit',
 'support',
 'thanks',
 'available',
 'anyone',
 'version',
 'mb',
 'problem',
 'file',
 'driver',
 'excellent',
 'manual',
 'cover',
 'interested',
 'email',
 'offer',
 'copy',
 'price',
 'pc',
 'shipping',
 'condition',
 'list',
 'send',
 'sale',
 'pay',
 'best',
 'cd',
 'usa',
 'original',
 'ask',
 'old',
 'book',
 'sell',
 'forsale',
 'hard',
 'play',
 'league',
 'win',
 'dod',
 'back',
 'hockey',
 'really',
 'player',
 'score',
 'bike',
 'car',
 'still',
 'season',
 'team',
 'last',
 'space',
 'science',
 'chip',
 'information',
 'encryption',
 'distribution',
 'data',
 'high',
 'number',
 'key',
 'computer',
 'clipper',
 'make',
 'like',
 'may',
 'believe',
 'work',
 'truth',
 'give',
 'christianity',
 'go',
 'find',
 'hell',
 'use',
 'reason',
 'law',

In [36]:
# Scratch check of fancy-index assignment: ten ones, positions 1-3 set to 1000.
temp1 = np.ones(10)
temp1[[1, 2, 3]] = 1000
temp1

array([   1., 1000., 1000., 1000.,    1.,    1.,    1.,    1.,    1.,
          1.])

In [37]:
# Demonstration: np.array([]) defaults to a float dtype, and float arrays are
# rejected as indices. The error is caught and shown so the notebook still
# survives "Run All"; np.array([], dtype=int) would be a valid empty index.
try:
    temp1[np.array([])]
except IndexError as err:
    print(f"IndexError: {err}")

IndexError: arrays used as indices must be of integer (or boolean) type

In [44]:
# Scratch values for checking exp(a*b + c) against its factored form.
a, c = -10, 3
b = 0.8

np.exp(a * b + c)

0.006737946999085467

In [45]:
# Same quantity as exp(a*b + c), written as exp(c) * exp(b)**a.
# Relies on a, b and c still being bound from an earlier cell.
np.exp(c) * np.exp(b)**a

0.006737946999085458

In [49]:
# c raised to the power 0.2, using whichever `c` is currently bound in the kernel.
c**(0.2)

1.2457309396155174

In [55]:
# Scratch check: exp(a) raised to two different exponents.
a, b, c = 2, 0.8, 0.1
exp_a = np.exp(a)
exp_a**b, exp_a**c

(4.953032424395115, 1.2214027581601699)