# 필요 패키지 다운로드

* nltk : 영어 형태소 분석 등에 필요한 패키지
* gensim: topic 모델링에 필요한 패키지

In [None]:
!pip install nltk

In [None]:
!pip install gensim

In [None]:
import nltk

# Download every NLTK resource this notebook uses, not only the stop-word list:
#   - stopwords: the English stop-word list
#   - wordnet / omw-1.4: data read by WordNetLemmatizer
#   - averaged_perceptron_tagger(_eng): model used by nltk.pos_tag
# NOTE(review): the tagger resource name differs between NLTK releases, so both
# ids are requested; an id unknown to the installed release is expected to be
# reported by the downloader rather than raise -- confirm on your NLTK version.
for resource in (
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

# import packages

In [None]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from pprint import pprint

# prepare data

* open data: 20 category news data

In [None]:
from sklearn.datasets import fetch_20newsgroups
# Load the 20 Newsgroups dataset; subset='all' asks for every available post
# rather than a single split. The returned object exposes .data, .target and
# .target_names, which are read when the DataFrame is built.
# NOTE: the name `data` is rebound to a plain list of strings in the
# preprocessing section further down.
data = fetch_20newsgroups(subset='all')

In [None]:
# One row per post: raw text, integer label, and the human-readable label.
label_lookup = dict(enumerate(data.target_names))
df = pd.DataFrame({'content': data.data, 'target': data.target})
df['target_names'] = df['target'].map(label_lookup)

In [None]:
df.head()

# prepare stopwords

In [None]:
# NLTK's English stop words plus a few extra tokens to drop from this corpus
from nltk.corpus import stopwords
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [None]:
stop_words

# Preprocessing

* remove emails and newline characters
* tokenize words and clean-up texts

## remove emails

In [None]:
# Work on a plain Python list of the raw post strings.
data = df.content.values.tolist()

# Patterns are raw strings: '\S' and '\s' inside a normal string literal are
# invalid escape sequences and trigger warnings on recent Python versions.
# Compiling once also hoists the pattern lookup out of the per-document loops.
email_pattern = re.compile(r'\S*@\S*\s?')
whitespace_pattern = re.compile(r'\s+')

# Remove e-mail addresses (any whitespace-free run containing '@')
data = [email_pattern.sub('', sent) for sent in data]

# Collapse newlines and other whitespace runs into a single space
data = [whitespace_pattern.sub(' ', sent) for sent in data]

# Remove distracting single quotes (a plain substring replace needs no regex)
data = [sent.replace("'", "") for sent in data]

In [None]:
data[0] #전처리 이후

In [None]:
df['content'].loc[0] #전처리 전

## Tokenize words and clean up

In [None]:
def sent_to_words(sentences):
    """Lazily yield one token list per document in *sentences*.

    Each document is coerced to ``str`` and passed to gensim's
    ``simple_preprocess``, which does the tokenizing.
    """
    tokenize = gensim.utils.simple_preprocess
    for document in sentences:
        # deacc=True asks gensim to strip accent marks from the tokens
        yield tokenize(str(document), deacc=True)

data_words = list(sent_to_words(data))


In [None]:
data_words[0]

# Creating Bigram

* Michael, Jackson -> Michael_Jackson; front, bumper -> front_bumper

In [None]:
# Build the bigram model (only a bigram model is built here; no trigram model is created)
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
# Wrap the trained model in a Phraser; `bigram_mod[tokens]` is what gets applied to token lists below
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [None]:
bigram_mod[data_words[0]]

# Remove stopwords and lemmatize

* remove stopwords
* lemmatize

## remove stopwords

In [None]:
def remove_stopwords(texts):
    """Return *texts* with every stop word removed from each document.

    Each document is converted with ``str`` and re-tokenized by
    ``simple_preprocess`` before filtering, so it may be either a raw string
    or an already tokenized list.

    Args:
        texts: iterable of documents.

    Returns:
        A list with one list of kept tokens per document.
    """
    # `stop_words` is a list; testing membership against it for every token is
    # a linear scan. Build a set once per call for constant-time lookups.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Apply the trained phrase model ``bigram_mod`` to each document in *texts*."""
    phrase_model = bigram_mod
    return [phrase_model[tokens] for tokens in texts]

In [None]:
# Remove stop words from every tokenized document
data_words_nostops = remove_stopwords(data_words)

# Apply the bigram phrase model to the stop-word-free token lists
data_words_bigrams = make_bigrams(data_words_nostops)

In [None]:
data_words_nostops[0]

In [None]:
data_words_bigrams[0]

# lemmatization

* pos tagging and select only adj, verb, noun, adv

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
class LemmatizationWithPOSTagger(object):
    """POS-tag tokenized documents and lemmatize their content words.

    Only tokens whose Penn Treebank tag starts with J, V, N or R (mapped below
    to WordNet adjective, verb, noun and adverb) are kept; every other token
    is dropped from the output.
    """

    # Treebank tag prefixes of the tokens that survive filtering.
    _KEPT_TAG_PREFIXES = ('J', 'V', 'N', 'R')

    def __init__(self, lemmatizer=None):
        """Create the tagger.

        Args:
            lemmatizer: object exposing ``lemmatize(word, pos)``. When omitted,
                a new ``WordNetLemmatizer`` is created, so the class no longer
                relies on a module-level ``lemmatizer`` being defined before
                ``pos_tag`` is called.
        """
        self.lemmatizer = WordNetLemmatizer() if lemmatizer is None else lemmatizer

    def get_wordnet_pos(self, treebank_tag):
        """Map a Treebank POS tag to the WordNet POS constant (a, n, r, v).

        Args:
            treebank_tag: tag string as produced by ``nltk.pos_tag``.

        Returns:
            ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
            ``wordnet.ADV`` depending on the tag's first letter; any other tag
            falls back to ``wordnet.NOUN``.
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        if treebank_tag.startswith('V'):
            return wordnet.VERB
        if treebank_tag.startswith('N'):
            return wordnet.NOUN
        if treebank_tag.startswith('R'):
            return wordnet.ADV
        # Noun is the fallback POS for everything else.
        return wordnet.NOUN

    def pos_tag(self, tokens):
        """Tag and lemmatize every document.

        Args:
            tokens: iterable of documents, each a list of token strings.

        Returns:
            A list with one list of lemmatized tokens per document. Only the
            lemmas are returned (not the original words or tags), and tokens
            whose tag does not start with J, V, N or R are omitted.
        """
        kept_prefixes = self._KEPT_TAG_PREFIXES
        lemmatized_docs = []
        for doc in tokens:
            # nltk.pos_tag yields (word, tag) pairs, e.g. ('can', 'MD').
            tagged = nltk.pos_tag(doc)
            lemmatized_docs.append([
                self.lemmatizer.lemmatize(word, self.get_wordnet_pos(tag))
                for word, tag in tagged
                # str.startswith accepts a tuple: one call instead of four.
                if tag.startswith(kept_prefixes)
            ])
        return lemmatized_docs

    
# Module-level WordNet lemmatizer instance
lemmatizer = WordNetLemmatizer()
lemmatization_using_pos_tagger = LemmatizationWithPOSTagger()
# POS-tag and lemmatize every bigram-merged document
data_lemmatized = lemmatization_using_pos_tagger.pos_tag(data_words_bigrams)

In [None]:
nltk.pos_tag(['I', 'went', 'to', 'nice', 'school', '.'])

In [None]:
data_lemmatized[0]

# preprocessing done

# Preprocessing for modeling

* create dictionary
* create corpus (word_id, word_frequency)

In [None]:
# Create Dictionary: built from the lemmatized token lists; maps token ids to tokens
id2word = corpora.Dictionary(data_lemmatized)

# Documents to convert (same object as the lemmatized token lists)
texts = data_lemmatized

# Bag-of-words corpus: one list of (token_id, frequency) pairs per document
corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
id2word[0]

In [None]:
corpus[0]

In [None]:
# Example: the first document's bag-of-words with token ids resolved to words
[[(id2word[token_id], count) for token_id, count in corpus[0]]]

# Build topic model and fit

In [None]:
# Fit an LDA topic model on the bag-of-words corpus.
# The parameter notes below follow the gensim LdaModel documentation.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,  # id -> token mapping, used when printing topics
                                           num_topics=20,  # presumably chosen to match the 20 newsgroup categories
                                           random_state=100,  # fixed seed for reproducible runs
                                           update_every=1,  # online learning: update the model after every chunk
                                           chunksize=100,  # number of documents per training chunk
                                           passes=10,  # number of passes over the whole corpus
                                           alpha='auto',  # learn the document-topic prior from the corpus
                                           per_word_topics=True)  # also compute per-word topic assignments

In [None]:
pprint(lda_model.print_topics())

# Topic distribution across documents

In [None]:
# Per-document topic distribution, queried from the fitted model with each
# bag-of-words vector (BOW format).
# https://radimrehurek.com/gensim/models/ldamodel.html#gensim.models.ldamodel.LdaModel
final = [lda_model.get_document_topics(bow) for bow in corpus]

# One single-column frame per document, indexed by topic id.
per_doc_frames = [
    pd.DataFrame(doc_topics, columns=['topics', 'probability']).set_index('topics')
    for doc_topics in final
]

# Align the frames on topic id, flip to one row per document, and fill topics
# absent from a document's result with 0.
wide = pd.concat(per_doc_frames, axis=1).T
data_f = wide.fillna(0).reset_index(drop=True)

In [None]:
data_f