# **웹 크롤링**
Web Mining
## **1 Natural Language Processing**
자연어 분석을 위한 크롤링

In [5]:
# pip install beautifulsoup4
import os
import numpy as np
from bs4 import BeautifulSoup
moviehtmldir, moviedict = './data/Movie/', {}

for filename in [f for f in os.listdir(moviehtmldir) if f[0]!='.']:
    id = filename.split('.')[0]
    f  = open(moviehtmldir + '/' + filename, encoding="ISO-8859-1")
    parsed_html   = BeautifulSoup(f.read(), "lxml")
    try:    title = parsed_html.body.h1.text       
    except: title = 'none'
    moviedict[id] = title

In [11]:
import nltk
from nltk.corpus   import stopwords
from nltk.tokenize import WordPunctTokenizer

# nltk.download('stopwords')
stoplist = stopwords.words('english')
tknzr    = WordPunctTokenizer()
print(len(stoplist))
stoplist[::17]

def ListDocs(dirname):
    docs = []
    titles = []
    for filename in [f for f in os.listdir(dirname) if str(f)[0]!='.']:
        f = open(dirname+'/'+filename,'r')
        id = filename.split('.')[0].split('_')[1]
        titles.append(moviedict[id])
        docs.append(f.read())
    return docs,titles

dir_ = './data/txt_sentoken/'
pos_textreviews, pos_titles = ListDocs(dir_ + 'pos/')
neg_textreviews, neg_titles = ListDocs(dir_ + 'neg/')
tot_textreviews = pos_textreviews + neg_textreviews
tot_titles      = pos_titles + neg_titles

179


KeyError: '11962'

In [10]:
#test tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def PreprocessTfidf(texts,stoplist=[],stem=False):
    newtexts = []
    for text in texts:
        if stem:
            tmp = [w for w in tknzr.tokenize(text) if w not in stoplist]
        else:
            tmp = [stemmer.stem(w) for w in [w for w in tknzr.tokenize(text) if w not in stoplist]]
        newtexts.append(' '.join(tmp))
    return newtexts

vectorizer = TfidfVectorizer(min_df=1)
processed_reviews = PreprocessTfidf(tot_textreviews,stoplist,True)
mod_tfidf = vectorizer.fit(processed_reviews)
vec_tfidf = mod_tfidf.transform(processed_reviews)
tfidf = dict(zip(vectorizer.get_feature_names(),vectorizer.idf_))

In [None]:
#dump tf-idf into file
import cPickle as pickle
#print mod_tfidf.get_feature_names()
print len(processed_reviews),'--',len(mod_tfidf.get_feature_names())
v= mod_tfidf.transform(processed_reviews)
#print v
with open('vectorizer.pk', 'wb') as fin:
      pickle.dump(mod_tfidf, fin)
file = open("vectorizer.pk",'r')
load_tfidf =  pickle.load(file)
        
print load_tfidf.transform(PreprocessTfidf([' '.join(['drama'])],stoplist,True))

In [None]:
#test LSA
import gensim
from gensim import models
class GenSimCorpus(object):
    def __init__(self, texts, stoplist=[],stem=False):
        self.texts = texts
        self.stoplist = stoplist
        self.stem = stem
        self.dictionary = gensim.corpora.Dictionary(self.iter_docs(texts, stoplist))


    def __len__(self):
        return len(self.texts)
    def __iter__(self):
        for tokens in self.iter_docs(self.texts, self.stoplist):
            yield self.dictionary.doc2bow(tokens)
    def iter_docs(self,texts, stoplist):
        for text in texts:
            if self.stem:
                yield (stemmer.stem(w) for w in [x for x in tknzr.tokenize(text) if x not in stoplist])
            else:
                yield (x for x in tknzr.tokenize(text) if x not in stoplist)

In [None]:
corpus = GenSimCorpus(tot_textreviews,stoplist,True)
dict_corpus = corpus.dictionary
ntopics = 10
lsi =  models.LsiModel(corpus, num_topics=ntopics, id2word=dict_corpus)

In [None]:
U = lsi.projection.u
Sigma = np.eye(ntopics)*lsi.projection.s
#calculate V
V = gensim.matutils.corpus2dense(lsi[corpus], len(lsi.projection.s)).T / lsi.projection.s
dict_words = {}
for i in range(len(dict_corpus)):
    dict_words[dict_corpus[i]] = i

In [None]:
from collections import namedtuple

def PreprocessDoc2Vec(text,stop=[],stem=False):
    words = tknzr.tokenize(text)
    if stem:
        words_clean = [stemmer.stem(w) for w in [i.lower() for i in words if i not in stop]]
    else:
        words_clean = [i.lower() for i in words if i not in stop]
    return words_clean

In [None]:
Review = namedtuple('Review','words tags')
dir = './review_polarity/txt_sentoken/'
do2vecstem = False
reviews_pos = []
cnt = 0

In [None]:
for filename in [f for f in os.listdir(dir+'pos/') if str(f)[0]!='.']:
    f = open(dir+'pos/'+filename,'r')
    reviews_pos.append(Review(PreprocessDoc2Vec(f.read(),stoplist,do2vecstem),['pos_'+str(cnt)]))
    cnt+=1

In [None]:
reviews_neg = []
cnt= 0
for filename in [f for f in os.listdir(dir+'neg/') if str(f)[0]!='.']:
    f = open(dir+'neg/'+filename,'r')
    reviews_neg.append(Review(PreprocessDoc2Vec(f.read(),stoplist,do2vecstem),['neg_'+str(cnt)]))
    cnt+=1
tot_reviews = reviews_pos + reviews_neg

In [None]:
#define doc2vec
from gensim.models import Doc2Vec
import multiprocessing

In [None]:
cores = multiprocessing.cpu_count()
vec_size = 500
model_d2v = Doc2Vec(dm=1, dm_concat=0, size=vec_size, window=10, negative=0, hs=0, min_count=1, workers=cores)

In [None]:
#build vocab
model_d2v.build_vocab(tot_reviews)
#train
numepochs= 20

In [None]:
for epoch in range(numepochs):
    try:
        print 'epoch %d' % (epoch)
        model_d2v.train(tot_reviews)
        model_d2v.alpha *= 0.99
        model_d2v.min_alpha = model_d2v.alpha
    except (KeyboardInterrupt, SystemExit):
        break

In [None]:
#query
query = ['science','future','action']
#similar tfidf
#sparse matrix so the metrics transform into regular vectors before computing cosine
from sklearn.metrics.pairwise import cosine_similarity
query_vec = mod_tfidf.transform(PreprocessTfidf([' '.join(query)],stoplist,True))
sims= cosine_similarity(query_vec,vec_tfidf)[0]
indxs_sims = sims.argsort()[::-1]

In [None]:
for d in list(indxs_sims)[:5]:
    print 'sim:',sims[d],' title:',tot_titles[d]

In [None]:
#LSA query
def TransformWordsListtoQueryVec(wordslist,dict_words,stem=False):
    q = np.zeros(len(dict_words.keys()))
    for w in wordslist:
        if stem:
            q[dict_words[stemmer.stem(w)]]=1.
        else:
            q[dict_words[w]] = 1.
    return q

In [None]:
q = TransformWordsListtoQueryVec(query,dict_words,True)
qk =   np.dot(np.dot(q,U),Sigma)
sims = np.zeros(len(tot_textreviews))
for d in range(len(V)):
    sims[d]=np.dot(qk,V[d])

In [None]:
indxs_sims = np.argsort(sims)[::-1]  
for d in list(indxs_sims)[:5]:
    print 'sim:',sims[d],' doc:',tot_titles[d]

In [None]:
#doc2vec query
#force inference to get the same result
model_d2v.random = np.random.RandomState(1)
query_docvec = model_d2v.infer_vector(PreprocessDoc2Vec(' '.join(query),stoplist,do2vecstem))

In [None]:
#model_d2v.docvecs.most_similar([query_docvec], topn=3)
reviews_related = model_d2v.docvecs.most_similar([query_docvec], topn=5)
for review in reviews_related:
    print 'relevance:',review[1],'  title:',tot_titles[review[0]]

## 2. Post Processing (데이터 전처리)

In [None]:
import os
import numpy as np

In [None]:
#get titles
from BeautifulSoup import BeautifulSoup
moviehtmldir = './movie/'
moviedict = {}

In [None]:
for filename in [f for f in os.listdir(moviehtmldir) if f[0]!='.']:
    id = filename.split('.')[0]
    f = open(moviehtmldir+'/'+filename)
    parsed_html = BeautifulSoup(f.read())
    try:
        title = parsed_html.body.h1.text
    except:
        title = 'none'
    moviedict[id] = title

In [None]:
def ListDocs(dirname):
    docs = []
    titles = []
    for filename in [f for f in os.listdir(dirname) if str(f)[0]!='.']:
        f = open(dirname+'/'+filename,'r')
        id = filename.split('.')[0].split('_')[1]
        titles.append(moviedict[id])
        docs.append(f.read())
    return docs,titles

In [None]:
dir = './review_polarity/txt_sentoken/'
pos_textreviews,pos_titles = ListDocs(dir+'pos/')
neg_textreviews,neg_titles = ListDocs(dir+'neg/')
tot_textreviews = pos_textreviews+neg_textreviews
tot_titles = pos_titles+neg_titles

In [None]:
#LDA
import gensim.models
from gensim import models
from nltk.tokenize import RegexpTokenizer
tknzr = RegexpTokenizer(r'((?<=[^\w\s])\w(?=[^\w\s])|(\W))+', gaps=True)

In [None]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

In [None]:
class GenSimCorpus(object):
           def __init__(self, texts, stoplist=[],bestwords=[],stem=False):
               self.texts = texts
               self.stoplist = stoplist
               self.stem = stem
               self.bestwords = bestwords
               self.dictionary = gensim.corpora.Dictionary(self.iter_docs(texts, stoplist))
            
           def __len__(self):
               return len(self.texts)
           def __iter__(self):
               for tokens in self.iter_docs(self.texts, self.stoplist):
                   yield self.dictionary.doc2bow(tokens)
           def iter_docs(self,texts, stoplist):
               for text in texts:
                   if self.stem:
                      yield (stemmer.stem(w) for w in [x for x in tknzr.tokenize(text) if x not in stoplist])
                   else:
                      if len(self.bestwords)>0:
                         yield (x for x in tknzr.tokenize(text) if x in self.bestwords)
                      else:
                         yield (x for x in tknzr.tokenize(text) if x not in stoplist)                      

In [None]:
num_topics = 10
corpus = GenSimCorpus(tot_textreviews, stoplist,[],False)
dict_lda = corpus.dictionary
lda = models.LdaModel(corpus, num_topics=num_topics, id2word=dict_lda,passes=10, iterations=50)
print lda.show_topics(num_topics=num_topics)

In [None]:
import copy
#filter out very common words like mobie and film or very unfrequent terms
out_ids = [tokenid for tokenid, docfreq in dict_lda.dfs.iteritems() if docfreq > 1000 or docfreq < 3 ]
dict_lfq = copy.deepcopy(dict_lda)
dict_lfq.filter_tokens(out_ids)
dict_lfq.compactify()
corpus = [dict_lfq.doc2bow(tknzr.tokenize(text)) for text in tot_textreviews]

In [None]:
lda_lfq = models.LdaModel(corpus, num_topics=num_topics, id2word=dict_lfq,passes=10, iterations=50,alpha=0.01,eta=0.01)
for t in range(num_topics):
    print 'topic ',t,'  words: ',lda_lfq.print_topic(t,topn=10)
    print

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
tknzr = WordPunctTokenizer()

In [None]:
from nltk.tokenize import RegexpTokenizer
tknzr = RegexpTokenizer(r'((?<=[^\w\s])\w(?=[^\w\s])|(\W))+', gaps=True)

In [None]:
nltk.download('stopwords')
stoplist = stopwords.words('english')
from nltk.stem.porter import PorterStemmer
from collections import namedtuple
stemmer = PorterStemmer()

In [None]:
def PreprocessReviews(text,stop=[],stem=False):
    #print profile
    words = tknzr.tokenize(text)
    if stem:
        words_clean = [stemmer.stem(w) for w in [i.lower() for i in words if i not in stop]]
    else:
        words_clean = [i.lower() for i in words if i not in stop]
    return words_clean

In [None]:
Review = namedtuple('Review','words title tags')
dir = './review_polarity/txt_sentoken/'
do2vecstem = True
reviews_pos = []
cnt = 0

In [None]:
for filename in [f for f in os.listdir(dir+'pos/') if str(f)[0]!='.']:
    f = open(dir+'pos/'+filename,'r')
    id = filename.split('.')[0].split('_')[1]
    reviews_pos.append(Review(PreprocessReviews(f.read(),stoplist,do2vecstem),moviedict[id],['pos_'+str(cnt)]))
    cnt+=1

In [None]:
reviews_neg = []
cnt= 0
for filename in [f for f in os.listdir(dir+'neg/') if str(f)[0]!='.']:
    f = open(dir+'neg/'+filename,'r')
    id = filename.split('.')[0].split('_')[1]
    reviews_neg.append(Review(PreprocessReviews(f.read(),stoplist,do2vecstem),moviedict[id],['neg_'+str(cnt)]))
    cnt+=1

In [None]:
tot_reviews = reviews_pos + reviews_neg
#split in test training sets
def word_features(words):
    return dict([(word, True) for word in words])

In [None]:
negfeatures = [(word_features(r.words), 'neg') for r in reviews_neg]
posfeatures = [(word_features(r.words), 'pos') for r in reviews_pos]
portionpos = int(len(posfeatures)*0.8)
portionneg = int(len(negfeatures)*0.8)

In [None]:
print portionpos,'-',portionneg
trainfeatures = negfeatures[:portionneg] + posfeatures[:portionpos]
print len(trainfeatures)
testfeatures = negfeatures[portionneg:] + posfeatures[portionpos:]
#shuffle(testfeatures)

In [None]:
from nltk.classify import NaiveBayesClassifier
#training naive bayes 
classifier = NaiveBayesClassifier.train(trainfeatures)
##testing
err = 0
print 'test on: ',len(testfeatures)

In [None]:
for r in testfeatures:
    sent = classifier.classify(r[0])
    if sent != r[1]:
       err +=1.
print 'error rate: ',err/float(len(testfeatures))

In [None]:
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from random import shuffle

In [None]:
#train bigram:
def bigrams_words_features(words, nbigrams=200,measure=BigramAssocMeasures.chi_sq):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(measure, nbigrams)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])

In [None]:
negfeatures = [(bigrams_words_features(r.words,500), 'neg') for r in reviews_neg]
posfeatures = [(bigrams_words_features(r.words,500), 'pos') for r in reviews_pos]
portionpos = int(len(posfeatures)*0.8)
portionneg = int(len(negfeatures)*0.8)

In [None]:
print portionpos,'-',portionneg
trainfeatures = negfeatures[:portionpos] + posfeatures[:portionneg]
print len(trainfeatures)
classifier = NaiveBayesClassifier.train(trainfeatures)

In [None]:
##test bigram
testfeatures = negfeatures[portionneg:] + posfeatures[portionpos:]
shuffle(testfeatures)
err = 0
print 'test on: ',len(testfeatures)

In [None]:
for r in testfeatures:
    sent = classifier.classify(r[0])
    #print r[1],'-pred: ',sent
    if sent != r[1]:
       err +=1.
print 'error rate: ',err/float(len(testfeatures))

In [None]:
import nltk.classify.util, nltk.metrics
from nltk.probability import FreqDist, ConditionalFreqDist
tot_poswords = [val for l in [r.words for r in reviews_pos] for val in l]
tot_negwords = [val for l in [r.words for r in reviews_neg] for val in l]
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

In [None]:
for word in tot_poswords:
    word_fd[word.lower()] +=1
    label_word_fd['pos'][word.lower()] +=1

In [None]:
for word in tot_negwords:
    word_fd[word.lower()] +=1
    label_word_fd['neg'][word.lower()] +=1

In [None]:
pos_words = len(tot_poswords)
neg_words = len(tot_negwords)
tot_words = pos_words + neg_words
#select the best words in terms of information contained in the two classes pos and neg
word_scores = {}

In [None]:
for word, freq in word_fd.iteritems():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                (freq, pos_words), tot_words)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                (freq, neg_words), tot_words)
    word_scores[word] = pos_score + neg_score

In [None]:
print 'total: ',len(word_scores)
best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:10000]
bestwords = set([w for w, s in best])

In [None]:
#training naive bayes with chi square feature selection of best words
def best_words_features(words):
    return dict([(word, True) for word in words if word in bestwords])

In [None]:
negfeatures = [(best_words_features(r.words), 'neg') for r in reviews_neg]
posfeatures = [(best_words_features(r.words), 'pos') for r in reviews_pos]
portionpos = int(len(posfeatures)*0.8)
portionneg = int(len(negfeatures)*0.8)
print portionpos,'-',portionneg

In [None]:
trainfeatures = negfeatures[:portionpos] + posfeatures[:portionneg]
print len(trainfeatures)

In [None]:
classifier = NaiveBayesClassifier.train(trainfeatures)
##test with feature chi square selection
testfeatures = negfeatures[portionneg:] + posfeatures[portionpos:]
shuffle(testfeatures)
err = 0
print 'test on: ',len(testfeatures)

In [None]:
for r in testfeatures:
    sent = classifier.classify(r[0])
    #print r[1],'-pred: ',sent
    if sent != r[1]:
        err +=1.
print 'error rate: ',err/float(len(testfeatures))

In [None]:
from gensim.models import Doc2Vec
import multiprocessing

shuffle(tot_reviews)
cores = multiprocessing.cpu_count()
vec_size = 500
model_d2v = Doc2Vec(dm=1, dm_concat=0, size=vec_size, window=5, negative=0, hs=0, min_count=1, workers=cores)

In [None]:
#build vocab
model_d2v.build_vocab(tot_reviews)
#train
numepochs= 20
for epoch in range(numepochs):
    try:
        print 'epoch %d' % (epoch)
        model_d2v.train(tot_reviews)
        model_d2v.alpha *= 0.99
        model_d2v.min_alpha = model_d2v.alpha
    except (KeyboardInterrupt, SystemExit):
        break

In [None]:
#split train,test sets
trainingsize = 2*int(len(reviews_pos)*0.8)
train_d2v = np.zeros((trainingsize, vec_size))
train_labels = np.zeros(trainingsize)
test_size = len(tot_reviews)-trainingsize
test_d2v = np.zeros((test_size, vec_size))
test_labels = np.zeros(test_size)

In [None]:
cnt_train = 0
cnt_test = 0
for r in reviews_pos:
    name_pos = r.tags[0]
    if int(name_pos.split('_')[1])>= int(trainingsize/2.):
        test_d2v[cnt_test] = model_d2v.docvecs[name_pos]
        test_labels[cnt_test] = 1
        cnt_test +=1
    else:
        train_d2v[cnt_train] = model_d2v.docvecs[name_pos]
        train_labels[cnt_train] = 1
        cnt_train +=1

In [None]:
for r in reviews_neg:
    name_neg = r.tags[0]
    if int(name_neg.split('_')[1])>= int(trainingsize/2.):
        test_d2v[cnt_test] = model_d2v.docvecs[name_neg]
        test_labels[cnt_test] = 0
        cnt_test +=1
    else:
        train_d2v[cnt_train] = model_d2v.docvecs[name_neg]       
        train_labels[cnt_train] = 0
        cnt_train +=1

In [None]:
#train log regre
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(train_d2v, train_labels)
print 'accuracy:',classifier.score(test_d2v,test_labels)

In [None]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(train_d2v, train_labels)
print 'accuracy:',clf.score(test_d2v,test_labels)

In [None]:
#svm linear
clf = SVC(kernel='linear')
clf.fit(train_d2v, train_labels)
print clf.score(test_d2v,test_labels)