In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle
import numpy as np

# Module-level vectorizer instance.
# NOTE(review): LDA_sklearn._cv() builds its own CountVectorizer, so this
# object looks unused in the visible cells — confirm before removing.
cv = CountVectorizer()
# Absolute, machine-specific path of the pickled corpus handed to LDA_sklearn
# in a later cell; it must be edited to run the notebook elsewhere.
_dir = "/Users/shinbo/PycharmProjects/paper/LDA/preprocessed_review.pickle"


class LDA_sklearn:
    """Fit scikit-learn's LatentDirichletAllocation on a sample of a pickled corpus.

    The pickle at ``path_data`` must hold a sequence of documents, each of
    them a list of token strings (``_make_vocab`` joins every document's
    tokens with single spaces before vectorizing).

    Attributes set by ``__init__``
    ------------------------------
    docs : list of list of str
        The sampled documents as token lists; never overwritten.
    data : list
        Starts as ``docs``; replaced by the space-joined documents once
        ``_make_vocab`` has run.
    K, alpha, eta
        Number of topics and the two Dirichlet priors, stored as given.
    """

    def __init__(self, path_data, alpha, eta, K, n_samples=1000, seed=0):
        """Load the corpus and keep a reproducible random subset of it.

        Parameters
        ----------
        path_data : str
            Path of the pickled corpus.
        alpha : float
            Forwarded to LatentDirichletAllocation as ``doc_topic_prior``.
        eta : float
            Forwarded to LatentDirichletAllocation as ``topic_word_prior``.
        K : int
            Number of topics (``n_components``).
        n_samples : int, default 1000
            Maximum number of documents to keep. A corpus with fewer
            documents is kept whole instead of raising.
        seed : int, default 0
            Seed applied to NumPy's global random generator.
        """
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(path_data, 'rb') as corpus_file:
            corpus = pickle.load(corpus_file)

        # The *global* generator is seeded on purpose: the estimator built in
        # _train() is created without random_state, and scikit-learn then
        # falls back to the global NumPy generator, so this seed is what
        # makes the fit repeatable.
        np.random.seed(seed)
        n_keep = min(n_samples, len(corpus))
        idx = np.random.choice(len(corpus), n_keep, replace=False)

        # Walking the sorted indices keeps the documents in corpus order
        # without scanning the index array once per document.
        self.docs = [corpus[i] for i in sorted(idx)]
        self.data = self.docs
        self.K = K
        self.alpha = alpha
        self.eta = eta

    def _make_vocab(self):
        """Build the vocabulary, both index maps and the joined documents.

        Everything is derived from ``self.docs`` (the token lists), so the
        method can be called repeatedly: ``self.data`` is only ever written,
        never read back as input.
        """
        docs = self.docs
        self.vocab = sorted({word for doc in docs for word in doc})
        self.w2idx = {word: i for i, word in enumerate(self.vocab)}
        self.idx2w = dict(enumerate(self.vocab))
        self.doc2idx = [[self.w2idx[word] for word in doc] for doc in docs]
        self.data = [' '.join(doc) for doc in docs]

    def _cv(self):
        """Build the vocabulary and the document-term count matrix ``self.df``."""
        self._make_vocab()
        # NOTE(review): CountVectorizer re-tokenizes the joined strings with
        # its own defaults (lowercasing, tokens of two or more characters), so
        # its vocabulary may differ from self.vocab — confirm this is intended.
        self.cv = CountVectorizer()
        self.df = self.cv.fit_transform(self.data)

    def _train(self):
        """Vectorize the sampled corpus, fit batch LDA on it and return the model."""
        # _cv() already calls _make_vocab(); no separate call is needed here.
        self._cv()
        lda = LatentDirichletAllocation(n_components=self.K,
                                        doc_topic_prior=self.alpha, topic_word_prior=self.eta,
                                        learning_method='batch', max_iter=1000)
        lda.fit(self.df)
        return lda

In [5]:
# Fit a 5-topic model on the sampled reviews
# (document-topic prior 5, topic-word prior 0.1).
lda = LDA_sklearn(path_data=_dir, alpha=5, eta=0.1, K=5)
result = lda._train()

In [7]:
# One row of topic-word weights per topic. The topic count is read from the
# fitted model instead of being hard-coded, so it cannot drift from the K
# used at training time.
lda_lam = [result.components_[i, :] for i in range(result.components_.shape[0])]

def print_top_words(lam, feature_names, n_top_words):
    """Print the highest-weighted words of every topic.

    Parameters
    ----------
    lam : iterable of 1-D arrays
        One weight vector per topic; each must support ``argsort``.
    feature_names : sequence of str
        Word for every column index of the weight vectors.
    n_top_words : int
        How many words to show per topic.

    For each topic a blank line and a 1-based ``Topic Nr.`` header are
    printed, followed by one line of ``word weight | `` entries in order of
    decreasing weight (weights rounded to two decimals).
    """
    for number, weights in enumerate(lam, start=1):
        print('\nTopic Nr.%d:' % number)
        # Reversed tail of the ascending argsort == indices of the largest weights.
        best_first = weights.argsort()[:-n_top_words - 1:-1]
        entries = []
        for col in best_first:
            entries.append(feature_names[col] + ' ' + str(round(weights[col], 2)) + ' | ')
        print(''.join(entries))
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2;
# use get_feature_names_out() when present and fall back on older releases.
_vectorizer = lda.cv
_feature_names = (_vectorizer.get_feature_names_out()
                  if hasattr(_vectorizer, 'get_feature_names_out')
                  else _vectorizer.get_feature_names())
print_top_words(lda_lam, list(_feature_names), 10)


Topic Nr.1:
great 684.1 | staff 508.1 | location 479.1 | waikiki 333.83 | service 249.1 | friendly 237.1 | helpful 170.1 | excellent 156.1 | close 153.1 | hotel 147.76 | 

Topic Nr.2:
room 503.52 | one 222.1 | day 196.1 | check 160.1 | go 159.1 | get 139.25 | back 118.1 | front 118.1 | desk 100.1 | hotel 98.66 | 

Topic Nr.3:
stay 721.22 | hotel 383.96 | place 150.46 | time 149.5 | make 148.84 | amazing 113.1 | beautiful 112.1 | experience 107.1 | best 101.1 | hawaii 99.1 | 

Topic Nr.4:
hotel 496.02 | room 440.68 | good 327.1 | nice 300.1 | clean 284.1 | bed 139.1 | area 134.1 | well 126.32 | comfortable 121.1 | small 110.1 | 

Topic Nr.5:
beach 555.1 | view 309.1 | walk 201.1 | restaurant 187.23 | resort 150.1 | really 147.1 | waikiki 131.37 | ocean 130.1 | shop 113.1 | right 112.1 | 


In [8]:
# NOTE: unpickling can execute arbitrary code; only load model files you trust.
# The context manager closes the file handle as soon as the model is loaded.
with open('/Users/shinbo/PycharmProjects/model/lda_modified.pickle', 'rb') as model_file:
    model = pickle.load(model_file)
# One row of topic-word weights per topic; the count is taken from the model
# rather than hard-coded (assumes components_ is a 2-D array, as the [k, :]
# indexing implies).
lda_lam = [model.components_[k, :] for k in range(model.components_.shape[0])]
# NOTE(review): this repeats the print_top_words definition from the earlier
# cell; the later definition silently replaces the earlier one.
def print_top_words(lam, feature_names, n_top_words):
    """Print each topic's top ``n_top_words`` words with their weights.

    ``lam`` yields one 1-D weight array per topic and ``feature_names`` maps
    a column index to its word. Output per topic: a blank line, a 1-based
    ``Topic Nr.`` header, then a single line of ``word weight | `` entries
    sorted by decreasing weight, weights rounded to two decimals.
    """
    topic_number = 0
    for row in lam:
        topic_number += 1
        print('\nTopic Nr.%d:' % topic_number)
        # Take the tail of the ascending argsort backwards: largest weights first.
        order = row.argsort()[:-n_top_words - 1:-1]
        line = ''
        for pos in order:
            line += feature_names[pos] + ' ' + str(round(row[pos], 2)) + ' | '
        print(line)
# get_feature_names() was removed from scikit-learn's vectorizers in 1.2;
# use get_feature_names_out() when the object offers it, else fall back.
# NOTE(review): assumes model.cv is a CountVectorizer-like object — confirm.
_model_vectorizer = model.cv
_model_feature_names = (_model_vectorizer.get_feature_names_out()
                        if hasattr(_model_vectorizer, 'get_feature_names_out')
                        else _model_vectorizer.get_feature_names())
print_top_words(lda_lam, list(_model_feature_names), 10)


Topic Nr.1:
room 10343.97 | one 2923.41 | hotel 2547.3 | night 2510.1 | service 2508.51 | day 2435.1 | check 2270.1 | bed 2215.1 | front 2069.1 | desk 1724.1 | 

Topic Nr.2:
view 5256.1 | nice 4504.1 | pool 3981.1 | room 3976.23 | beach 3920.4 | breakfast 2560.15 | ocean 2423.1 | area 2163.1 | small 1855.1 | also 1540.02 | 

Topic Nr.3:
hotel 11755.64 | great 10458.1 | location 6861.1 | waikiki 4742.81 | good 4717.1 | beach 4393.8 | staff 3952.15 | friendly 3705.1 | clean 3418.85 | walk 3320.1 | 

Topic Nr.4:
stay 10881.66 | staff 3353.78 | place 2866.67 | love 2084.1 | time 2011.06 | best 1890.1 | family 1854.1 | amazing 1767.1 | will 1762.33 | make 1740.18 | 

Topic Nr.5:
hotel 2997.36 | get 2969.19 | go 2412.29 | like 2221.1 | just 2120.89 | need 1930.62 | look 1533.1 | time 1481.14 | even 1319.1 | much 1239.1 | 


In [29]:
# Print the stored training diagnostics of the loaded model, one per line.
for diagnostic in ('perplexity', '_ELBO_history'):
    print(getattr(model, diagnostic))

[1382.4321584106922, 1237.2992001199618, 1175.5975361337626, 1146.0636211074727, 1132.7570026778992, 1126.3016270968562, 1122.8174190841885, 1120.602928670254, 1119.0812926619994, 1118.0833220047298, 1117.3696354836397, 1116.7974136920539, 1116.422025693084, 1116.1269160838808, 1115.868116363026, 1115.6734022311464, 1115.520403349518, 1115.3763112056065, 1115.2604845292974, 1115.1543845807296, 1115.0463408856212, 1114.980782929488, 1114.9230604625484, 1114.8596822140541, 1114.806268448976, 1114.7704737874196, 1114.7267401238857, 1114.6893544842633, 1114.6427384857132, 1114.6209901038835, 1114.5970078975927, 1114.5790633149463, 1114.558751302624, 1114.530759577534, 1114.5123468843947, 1114.497707726814, 1114.471999473783, 1114.4495501197298, 1114.429381124068, 1114.4086962080735, 1114.389673992671, 1114.372919297995, 1114.3574142518714, 1114.343945704983, 1114.3328424843075, 1114.325022847135, 1114.3143392391235, 1114.3085888448818, 1114.3034498763998, 1114.2912634930017, 1114.283051697

## 전처리 파일 확인

In [9]:
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the file handle once the corpus is loaded.
with open('/Users/shinbo/Desktop/metting/LDA/0. data/20news-bydate/newsgroup_preprocessed_corpus.pickle', 'rb') as corpus_file:
    newsgroup = pickle.load(corpus_file)

In [12]:
# Preview only the first two documents; echoing the whole corpus floods the
# cell output and bloats the saved notebook.
newsgroup[:2]

[['koresh',
  'god',
  'kupajava',
  'east',
  'krakatoa',
  'mailer',
  'psilink',
  'late',
  'news',
  'seem',
  'koresh',
  'give',
  'finish',
  'write',
  'sequel',
  'bible',
  'mathew',
  'write',
  'seven',
  'seal',
  'something',
  'along',
  'line',
  'already',
  'write',
  'first',
  'seven',
  'around',
  'page',
  'hand',
  'assistant',
  'proofread',
  'expect',
  'decent',
  'messiah',
  'build',
  'spellchecker',
  'maybe',
  'koresh',
  'come'],
 ['note',
  'bobby',
  'tampere',
  'university',
  'technology',
  'compute',
  'centre',
  'distribution',
  'sfnet',
  'mozumder',
  'writes',
  'insult',
  'atheistic',
  'genocide',
  'totally',
  'unintentional',
  'atheism',
  'anything',
  'happen',
  'good',
  'bad',
  'include',
  'genocide',
  'know',
  'conveniently',
  '_defined_',
  'theist',
  'someone',
  'wrong',
  '_defined_',
  'people',
  'wrong',
  'atheist',
  'statement',
  'circular',
  'mention',
  'bigoting',
  'value',
  'sami',
  'aario',
  'see',

In [13]:
def gen_even_slices(n, n_packs, *, n_samples=None):
    """Yield ``n_packs`` consecutive slices that together cover ``range(n)``.

    The slices are as equal in length as possible: each spans
    ``n // n_packs`` items and the first ``n % n_packs`` slices span one
    more. Packs that would be empty (``n_packs > n``) are skipped.

    Parameters
    ----------
    n : int
        Upper bound the slices go up to.
    n_packs : int
        Number of slices to generate; must be at least 1.
    n_samples : int or None (default = None)
        When given, every slice end is clipped to this value. Useful for
        sparse-matrix indexing, where slicing past the end raises instead of
        being tolerated as it is for NumPy arrays.

    Yields
    ------
    slice

    Raises
    ------
    ValueError
        If ``n_packs`` is smaller than 1 (raised when iteration starts).

    Examples
    --------
    >>> list(gen_even_slices(10, 1))
    [slice(0, 10, None)]
    >>> list(gen_even_slices(10, 3))
    [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]
    """
    if n_packs < 1:
        raise ValueError("gen_even_slices got n_packs=%s, must be >=1"
                         % n_packs)

    base, remainder = divmod(n, n_packs)
    lower = 0
    for pack in range(n_packs):
        # The first `remainder` packs absorb the leftover items, one each.
        width = base + 1 if pack < remainder else base
        if width <= 0:
            continue
        upper = lower + width
        if n_samples is not None:
            upper = min(n_samples, upper)
        yield slice(lower, upper, None)
        lower = upper

In [15]:
# A single pack covers the whole range.
[*gen_even_slices(10, 1)]

[slice(0, 10, None)]

In [17]:
# As many packs as items: every slice holds exactly one element.
[*gen_even_slices(10, 10)]

[slice(0, 1, None),
 slice(1, 2, None),
 slice(2, 3, None),
 slice(3, 4, None),
 slice(4, 5, None),
 slice(5, 6, None),
 slice(6, 7, None),
 slice(7, 8, None),
 slice(8, 9, None),
 slice(9, 10, None)]

In [22]:
# 10 items split evenly into 5 packs of two.
[*gen_even_slices(10, 5)]

[slice(0, 2, None),
 slice(2, 4, None),
 slice(4, 6, None),
 slice(6, 8, None),
 slice(8, 10, None)]

In [26]:
# 20 items in 6 packs: the first two packs receive the extra element
# (20 % 6 == 2), the remaining four hold three items each.
a = range(20)

for sl in gen_even_slices(len(a), 6):
    print(a[sl])

range(0, 4)
range(4, 8)
range(8, 11)
range(11, 14)
range(14, 17)
range(17, 20)


In [None]:
X : Document-Term matrix whose dimension is D*V

    components_ : Word-topic distribution for corpus denoted
        by lambda in the literature

    expElogbeta : Exponential of expectation of log beta.

    alpha : Prior for document topic distribution
        denoted by alpha in the literature

    maxIter : Maximum number of iterations for individual document loop.

    threshold : Threshold for individual document loop

    random_state : Integer
        Random number of initialization of gamma parameters

    Returns