In [25]:
# KaggleWord2VecUtility.py
import re
import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

class KaggleWord2VecUtility(object):
    '''
    Utility class for turning raw HTML text into word lists and
    sentence lists for further learning.

    Both helpers are static methods, so no instance is required.
    '''

    @staticmethod
    def review_to_wordlist(review, remove_stopwords=False):
        '''
        Convert a document to a sequence of lower-case words.

        review           -- raw text, possibly containing HTML markup.
        remove_stopwords -- when true, drop words found in NLTK's English
                            stop-word list (false by default).

        Returns a list of words.
        '''
        # 1. Remove HTML markup, keeping only the text content.
        review_text = BeautifulSoup(review, 'lxml').get_text()

        # 2. Replace every character that is not an ASCII letter with a space.
        review_text = re.sub(r'[^a-zA-Z]', ' ', review_text)

        # 3. Convert to lower case and split on whitespace.
        words = review_text.lower().split()

        # 4. Optionally remove stop words.
        if remove_stopwords:
            # A set gives constant-time membership tests in the filter below.
            stops = set(stopwords.words('english'))
            words = [w for w in words if w not in stops]

        # 5. Return the list of words.
        return words

    @staticmethod
    def review_to_sentences(review, tokenizer, remove_stopwords=False):
        '''
        Split a review into parsed sentences.

        review           -- the review text, either bytes (decoded as UTF-8)
                            or an already-decoded string.
        tokenizer        -- object whose tokenize(text) method returns the
                            raw sentences of the text.
        remove_stopwords -- forwarded to review_to_wordlist.

        Returns a list of sentences, where each sentence is a list of words
        (i.e. a list of lists).
        '''
        # Only byte strings need decoding; a text string has no decode()
        # method on Python 3, so calling it unconditionally would fail.
        if isinstance(review, bytes):
            review = review.decode('utf8')

        # 1. Use the tokenizer to split the paragraph into sentences.
        raw_sentences = tokenizer.tokenize(review.strip())

        # 2. Convert every non-empty sentence to a word list. The result list
        #    is built here; it was previously used without being created.
        return [
            KaggleWord2VecUtility.review_to_wordlist(raw_sentence, remove_stopwords)
            for raw_sentence in raw_sentences
            if len(raw_sentence) > 0
        ]

In [26]:
# Check that the class is usable. The HTML handling here is not general-purpose
# and needs to be adapted for different websites.
import requests
# Fetch a blog page over the network to use as sample input.
blog = requests.get('https://www.cnblogs.com/zoe233/p/8628566.html')
# Convert the page's decoded text into a list of words (stop words are kept).
a = KaggleWord2VecUtility.review_to_wordlist(blog.text)
print(a)

['zoe', 'var', 'currentblogapp', 'zoe', 'cb', 'enable', 'mathjax', 'false', 'var', 'islogined', 'false', 'zoe', 'posts', 'comments', 'trackbacks', 'a', 'row', 'col', 'for', 'row', 'in', 'a', 'for', 'col', 'in', 'range', 'len', 'a', 'list', 'map', 'list', 'zip', 'a', 'a', 'len', 'a', 'a', 'row', 'col', 'i', 'j', 'j', 'i', 'zip', 'map', 'zip', 'map', 'python', 'python', 'zip', 'zip', 'tuple', 'zip', 'zip', 'a', 'a', 'a', 'a', 'a', 'b', 'c', 'd', 'zip', 'zip', 'a', 'a', 'a', 'print', 'zip', 'zip', 'object', 'at', 'x', 'f', 'a', 'c', 'for', 'i', 'in', 'zip', 'print', 'i', 'zip', 'zip', 'a', 'a', 'a', 'print', 'zip', 'zip', 'object', 'at', 'x', 'f', 'a', 'd', 'for', 'j', 'in', 'zip', 'print', 'j', 'a', 'b', 'c', 'zip', 'zip', 'a', 'print', 'zip', 'zip', 'object', 'at', 'x', 'f', 'a', 'd', 'for', 'i', 'in', 'zip', 'print', 'i', 'a', 'b', 'c', 'd', 'zip', 'zip', 'a', 'print', 'zip', 'zip', 'object', 'at', 'x', 'f', 'a', 'f', 'for', 'j', 'in', 'zip', 'print', 'j', 'a', 'b', 'c', 'd', 'a', 'b',

In [28]:
# BagOfWords.py
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

if __name__ =='__main__':
    train = ''