# 測試Lemmate和stem

In [1]:
import os
import numpy as np
import pandas as pd
from enum import Enum
from pprint import pprint

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [36]:
class ContentType(Enum):
    TIT = 'title'
    ABS = 'abstract'
    AUT = 'author'
    SEC = 'section'
    
def get_contents(content_type):
    all_contents = []
    dataset_path = '../dataset'
    for file in os.listdir(dataset_path):
        file_path = os.path.join(dataset_path, file)
        if os.path.isfile(file_path):
            with open(file_path) as f:  
                line = f.readlines()
                if content_type == ContentType.AUT:
                    line = line[1]
                elif content_type == ContentType.SEC:
                    line = line[2]
                elif content_type == ContentType.ABS:
                    line = line[3]
                else:
                    line = line[0]
                line = line.strip()
                all_contents.append(line)
        else:
            print(file_path + ' does not exist.')
    return all_contents


def get_all_titles():
    return get_contents(ContentType.TIT)

def get_all_authors():        
    return get_contents(ContentType.AUT)

def get_all_sections():
    return get_contents(ContentType.SEC)

def get_all_abstracts():
    return get_contents(ContentType.ABS)

wordnet_lemmatizer = WordNetLemmatizer()
porter_stemmer = PorterStemmer()
#snowball_stemmer = SnowballStemmer('english')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            token2 = wordnet_lemmatizer.lemmatize(token, pos='v')
            token2 = wordnet_lemmatizer.lemmatize(token2, pos='n')
            token3 = porter_stemmer.stem(token2)
            result.append((token,token2,token3))
    return result

def get_preprocess_data(doc_id):
    doc = contents[doc_id]
    data = preprocess(doc)
    df = pd.DataFrame(data,columns=['preprocessed','lemmatized','stemmed'])
    return df

In [37]:
contents = get_all_abstracts()

documents = pd.DataFrame(data=contents,columns=['abstract'])
documents['index'] = documents.index

print('共',len(contents),'篇論文\n')

共 1343 篇論文



In [38]:
doc_id = 1001
contents[doc_id]

'In real-world applications of natural language generation, there are often constraints on the target sentences in addition to fluency and naturalness requirements. Existing language generation techniques are usually based on recurrent neural networks (RNNs). However, it is non-trivial to impose constraints on RNNs while maintaining generation quality, since RNNs generate sentences sequentially (or with beam search) from the first word to the last. In this paper, we propose CGMH, a novel approach using Metropolis-Hastings sampling for constrained sentence generation. CGMH allows complicated constraints such as the occurrence of multiple keywords in the target sentences, which cannot be handled in traditional RNN-based approaches. Moreover, CGMH works in the inference stage, and does not require parallel corpora for training. We evaluate our method on a variety of tasks, including keywords-to-sentence generation, unsupervised sentence paraphrasing, and unsupervised sentence error correc

In [39]:
df = get_preprocess_data(doc_id)
df.iloc[6:22]

Unnamed: 0,preprocessed,lemmatized,stemmed
6,constraints,constraint,constraint
7,target,target,target
8,sentences,sentence,sentenc
9,addition,addition,addit
10,fluency,fluency,fluenci
11,naturalness,naturalness,natur
12,requirements,requirement,requir
13,existing,exist,exist
14,language,language,languag
15,generation,generation,gener
