In [18]:
import json 
import pandas as pd 

from tqdm.autonotebook import tqdm

# 对Yelps数据的处理

## 查看原始数据

In [15]:
with open("../data/review.json", "r", encoding="utf8") as f:
    for line in f:
        text = json.loads(line)
        #print(text)
        temp = pd.DataFrame(text, index=[0])
        print(temp.head())
        break

                review_id                 user_id             business_id  \
0  Q1sbwvVQXV2734tPgoKj4Q  hG7b0MtEbXx5QzbzE6C_VA  ujmEBvifdJM6h6RLv4wQIg   

   stars  useful  funny  cool  \
0    1.0       6      1     0   

                                                text                 date  
0  Total bill for this horrible service? Over $8G...  2013-05-07 04:34:36  


In [17]:
temp.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,Q1sbwvVQXV2734tPgoKj4Q,hG7b0MtEbXx5QzbzE6C_VA,ujmEBvifdJM6h6RLv4wQIg,1.0,6,1,0,Total bill for this horrible service? Over $8G...,2013-05-07 04:34:36


## 对数据进行转换

In [19]:
with open("../data/review.json", "r", encoding="utf8") as f:
    for i, line in tqdm(enumerate(f)):
        record = json.loads(line)
        record = pd.DataFrame(record, index=[0])
        if i == 0: 
            record.to_csv("../data/review.csv", header=True, encoding="utf-8", index=False)
        else:
            record.to_csv("../data/review.csv", header=False, encoding="utf-8", mode="a", index=False)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 874 (char 873)

In [20]:
data = pd.read_csv("../data/review.csv", nrows=10000)

In [21]:
data.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,Q1sbwvVQXV2734tPgoKj4Q,hG7b0MtEbXx5QzbzE6C_VA,ujmEBvifdJM6h6RLv4wQIg,1.0,6,1,0,Total bill for this horrible service? Over $8G...,2013-05-07 04:34:36
1,GJXCdrto3ASJOqKeVWPi6Q,yXQM5uF2jS6es16SJzNHfg,NZnhc2sEQy3RmzKTZnqtwQ,5.0,0,0,0,I *adore* Travis at the Hard Rock's new Kelly ...,2017-01-14 21:30:33
2,2TzJjDVDEuAW6MR5Vuc1ug,n6-Gk65cPZL6Uz8qRm3NYw,WTqjgwHlXbSFevF32_DJVw,5.0,3,0,0,I have to say that this office really has it t...,2016-11-09 20:09:03
3,yi0R0Ugj_xUx_Nek0-_Qig,dacAIZ6fTM6mqwW5uxkskg,ikCg8xy5JIg_NGPx-MSIDA,5.0,0,0,0,Went in for a lunch. Steak sandwich was delici...,2018-01-09 20:56:38
4,11a8sVPMUFtaC7_ABRkmtw,ssoyf2_x0EQMed6fgHeMyQ,b1b1eb3uo-w561D0ZfCEiQ,1.0,7,0,0,Today was my second out of three sessions I ha...,2018-01-30 23:07:38


## 将数据集转换成训练需要的格式

In [None]:
# 对标签进行处理，大小3的表示pos，小于等于3的表示neg
data['emotion'] = data['stars'].apply(lambda x: int(x>3))

In [None]:
import nltk
from nltk import tokenize
from nltk.corpus import stopwords
import spacy
import re 
from nltk.stem import WordNetLemmatizer

# 划分训练集和验证集
from sklearn.model_selection import train_test_split

stoplists = stopwords.words("english")

wnl = WordNetLemmatizer()

In [None]:
# 添加需要去除的标点符号集，问号和感叹号除外
puncts = [',', '.', '"', ':', ')', '(', '-', '?', '!', '|', ';', "'", '$', '&', '/', 
          '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', '·', '_', 
          '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', 
          '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', 
          '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥',
          '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', 
          '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
          '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', 
          '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

# 定义一些常见的缩写
contraction_patterns = [ (r'won\'t', 'will not'), (r'can\'t', 'cannot'), (r'i\'m', 'i am'),
                        (r'isn\'t', 'is not'), (r'(\w+)\'ll', '\g<1> will'), 
                        (r'(\w+)n\'t', '\g<1> not'),(r'(\w+)\'ve', '\g<1> have'), 
                        (r'(\w+)\'s', '\g<1> is'), (r'(\w+)\'re', '\g<1> are'), 
                        (r'(\w+)\'d', '\g<1> would'), (r'&', 'and'), (r'dammit', 'damn it'),
                        (r'dont', 'do not'), (r'wont', 'will not') ]

def clean_text(text):
    # 去除对情感分类没有用的数字
    text = re.sub("[0-9]+", "", text)
    # 对重复出现的标点进行天魂
    text = re.sub(r"(\!)\1+", "multiExclamation", text)
    text = re.sub(r"(\?)\1+", "multiQuestion", text)
    text = re.sub(r"(\.)\1+", "multiStop", text)
    
    # 对缩写进行替换
    patterns = [(re.compile(regex), repl) for (regex, repl) in contraction_patterns]
    for (pattern, repl) in patterns:
        (text, count) = re.subn(pattern, repl, text)
    
    # 在标点前面加上空格
    for punct in puncts:
        text = text.replace(punct, f" {punct} ")
    
    #print(text)
    # 对文本进行分词
    text_split = tokenize.word_tokenize(text)
    text = [word for word in text_split if word not in stoplists]
    text = [wnl.lemmatize(word) for word in text]
    
    return " ".join(text)

In [None]:
data['text'] = data['text'].apply(clean_text）

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data['text'].values, data['emotion'].values,
                                                   test_size=0.3, random_state=2019, shuffle=True,
                                                   stratify=data['emotion'].values)

In [None]:
# 将训练集和测试集写入文件中
with open("../data/yelps_train_fasttext.txt", "w", encoding="utf8") as f: 
    train = zip(X_train, y_train)
    for X, y in tqdm(train):
        label = "pos" if y > 0 else "neg"
        record = X + "\t__label__"+label+"\n"
        f.write(record)

In [None]:
# 将测试集写入文件中
with open("../data/yelps_test_fasttext.txt", "w", encoding="utf-8") as f: 
    test = zip(X_test, y_test)
    for X, y in tqdm(test):
        label = "pos" if y>0 else "neg"
        record = X+"\t__label__"+label+"\n"
        f.write(record)

# 对IMDB review的处理

In [2]:
import pandas as pd 
from bs4 import BeautifulSoup

In [9]:
unlabel = pd.read_csv("../data/rawData/unlabeledTrainData.tsv", sep="\t", lineterminator="\n", 
                     error_bad_lines=False)

b'Skipping line 43043: expected 2 fields, saw 3\n'


In [10]:
unlabel.head()

Unnamed: 0,id,review
0,9999_0,"Watching Time Chasers, it obvious that it was ..."
1,45057_0,I saw this film about 20 years ago and remembe...
2,15561_0,"Minor Spoilers<br /><br />In New York, Joan Ba..."
3,7161_0,I went to see this film with a great deal of e...
4,43971_0,"Yes, I agree with everyone on this site this m..."


In [11]:
label = pd.read_csv("../data/rawData/labeledTrainData.tsv", sep="\t", lineterminator="\n",
                   error_bad_lines=False)

In [12]:
label.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [15]:
def get_rate(string):
    string = string.split("_")
    return int(string[1])

In [16]:
label["rate"] = label["id"].apply(get_rate)

In [17]:
label.head()

Unnamed: 0,id,sentiment,review,rate
0,5814_8,1,With all this stuff going down at the moment w...,8
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",9
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,3
3,3630_4,0,It must be assumed that those who praised this...,4
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,8


In [19]:
def clean_text(text):
    # 添加需要去除的标点符号集，问号和感叹号除外
    puncts = [',', '.', '"', ':', ')', '(', '-', '?', '!', '|', ';', "'", '$', '&', '/', 
              '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', '·', '_', 
              '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', 
              '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', 
              '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥',
              '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', 
              '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
              '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', 
              '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

    # 定义一些常见的缩写
    contraction_patterns = [ (r'won\'t', 'will not'), (r'can\'t', 'cannot'), (r'i\'m', 'i am'),
                            (r'isn\'t', 'is not'), (r'(\w+)\'ll', '\g<1> will'), 
                            (r'(\w+)n\'t', '\g<1> not'),(r'(\w+)\'ve', '\g<1> have'), 
                            (r'(\w+)\'s', '\g<1> is'), (r'(\w+)\'re', '\g<1> are'), 
                            (r'(\w+)\'d', '\g<1> would'), (r'&', 'and'), (r'dammit', 'damn it'),
                            (r'dont', 'do not'), (r'wont', 'will not') ]    
    # 去除对情感分类没有用的数字
    text = re.sub("[0-9]+", "", text)
    # 对重复出现的标点进行天魂
    text = re.sub(r"(\!)\1+", "multiExclamation", text)
    text = re.sub(r"(\?)\1+", "multiQuestion", text)
    text = re.sub(r"(\.)\1+", "multiStop", text)
    
    # 对缩写进行替换
    patterns = [(re.compile(regex), repl) for (regex, repl) in contraction_patterns]
    for (pattern, repl) in patterns:
        (text, count) = re.subn(pattern, repl, text)
    
    # 在标点前面加上空格
    for punct in puncts:
        text = text.replace(punct, f" {punct} ")
    
    #print(text)
    # 对文本进行分词
    text_split = tokenize.word_tokenize(text)
    text = [word for word in text_split if word not in stoplists]
    text = [wnl.lemmatize(word) for word in text]
    
    return " ".join(text)

In [20]:
unlabel.shape

(49998, 2)

In [21]:
label.shape

(25000, 4)

In [22]:
unlabel['review'] = unlabel['review'].apply(clean_text)

In [24]:
label['review'] = label['review'].apply(clean_text)

In [25]:
label.head()

Unnamed: 0,id,sentiment,review,rate
0,5814_8,1,With stuff going moment MJ started listening m...,8
1,2381_9,1,\ The Classic War Worlds \ `` Timothy Hines en...,9
2,7759_3,0,The film start manager ( Nicholas Bell ) givin...,3
3,3630_4,0,It must assumed praised film ( \ greatest film...,4
4,9495_8,1,Superbly trashy wondrously unpretentious ' exp...,8


In [26]:
def cleanReview(string):
    beau = BeautifulSoup(string)
    newstring = beau.get_text()
    newstring = newstring.strip().lower()
    return newstring

In [27]:
unlabel['review'] = unlabel["review"].apply(cleanReview)

In [28]:
label['review'] = label['review'].apply(cleanReview)

In [29]:
unlabel.head()

Unnamed: 0,id,review
0,9999_0,"watching time chasers , obvious made bunch fri..."
1,45057_0,i saw film year ago remember particularly nast...
2,15561_0,"minor spoilers < br / > < br / > in new york ,..."
3,7161_0,i went see film great deal excitement i school...
4,43971_0,"yes , i agree everyone site movie very very ba..."


In [30]:
label.head()

Unnamed: 0,id,sentiment,review,rate
0,5814_8,1,with stuff going moment mj started listening m...,8
1,2381_9,1,\ the classic war worlds \ `` timothy hines en...,9
2,7759_3,0,the film start manager ( nicholas bell ) givin...,3
3,3630_4,0,it must assumed praised film ( \ greatest film...,4
4,9495_8,1,superbly trashy wondrously unpretentious ' exp...,8


In [34]:
# 过滤HTML中的标签
# 将HTML中标签等信息去掉
# @param htmlstr HTML字符串
def filter_tags(htmlstr):
    # 先过滤CDATA
    # re_cdata = re.compile('<!\[CDATA\[[^>]*//\]\]>', re.I)  # 匹配CDATA
    re_cdata = re.compile('(<!\[CDATA\[)[\s\S]*?(//\]\]>)', re.I)  # 匹配CDATA

    # re_script = re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)  # Script
    re_script = re.compile(
        '(<\s*script\s*[^>]*>)[\s\S]*?(</\s*script\s*>)', re.I)  # Script

    re_style = re.compile(
        '<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)  # style
    re_br = re.compile('<br\s*?/?>')  # 处理换行
    re_h = re.compile('</?\w+[^>]*>')  # HTML标签
    re_comment = re.compile('<!--[^>]*-->')  # HTML注释
    re_htmlhead = re.compile('<!DOCTYPE html[^>]*>')  # HTML头部

    s = re_cdata.sub('', htmlstr)  # 去掉CDATA
    s = re_script.sub('', s)  # 去掉SCRIPT
    s = re_style.sub('', s)  # 去掉style
    s = re_br.sub('\n', s)  # 将br转换为换行
    s = re_h.sub('', s)  # 去掉HTML 标签
    s = re_comment.sub('', s)  # 去掉HTML注释

    s = re_htmlhead.sub('', s)  # HTML头部

    # 过滤连续空格
    space_line = re.compile(' +')
    s = space_line.sub(' ', s)

    # 去掉连续的空行
    blank_line = re.compile('\n+')
    s = blank_line.sub('\n', s)

    # 过滤只有空格的行
    line = re.compile(' \n')
    s = line.sub('', s)

    s = replaceCharEntity(s)  # 替换实体
    
    s = re.sub(r"</? \w+[^>]*>", "", s)
    
    
    return s


# 替换常用HTML字符实体.
# 使用正常的字符替换HTML中特殊的字符实体.
# 你可以添加新的实体字符到CHAR_ENTITIES中,处理更多HTML字符实体.
# @param htmlstr HTML字符串.
def replaceCharEntity(htmlstr):
    CHAR_ENTITIES = {'nbsp': ' ', '160': ' ',
                     'lt': '<', '60': '<',
                     'gt': '>', '62': '>',
                     'amp': '&', '38': '&',
                     'quot': '"', '34': '"', }

    re_charEntity = re.compile(r'&#?(?P<name>\w+);')
    sz = re_charEntity.search(htmlstr)
    while sz:
        # entity = sz.group()  # entity全称，如&gt;
        key = sz.group('name')  # 去除&;后entity,如&gt;为gt
        try:
            htmlstr = re_charEntity.sub(CHAR_ENTITIES[key], htmlstr, 1)
            sz = re_charEntity.search(htmlstr)
        except KeyError:
            # 以空串代替
            htmlstr = re_charEntity.sub('', htmlstr, 1)
            sz = re_charEntity.search(htmlstr)
    return htmlstr


def repalce(s, re_exp, repl_string):
    return re_exp.sub(repl_string, s)

In [35]:
unlabel['review'] = unlabel['review'].apply(filter_tags)

In [36]:
unlabel.head()

Unnamed: 0,id,review
0,9999_0,"watching time chasers , obvious made bunch fri..."
1,45057_0,i saw film year ago remember particularly nast...
2,15561_0,"minor spoilers in new york , joan barnard ( ..."
3,7161_0,i went see film great deal excitement i school...
4,43971_0,"yes , i agree everyone site movie very very ba..."


In [37]:
label["review"] = label['review'].apply(filter_tags)

In [38]:
# 将所有的文本输出用于训练词向量
new_df = pd.concat([unlabel['review'], label['review']], axis=0)

In [39]:
new_df.to_csv("../data/preprocess/word2vec.txt", index=False)

  """Entry point for launching an IPython kernel.


In [40]:
# 将有标签的处理之后的数据保留
label = label[["review", "sentiment", "rate"]]
label.to_csv("../data/preprocess/labeldTrain.csv", index=False)

In [41]:
label.head()

Unnamed: 0,review,sentiment,rate
0,with stuff going moment mj started listening m...,1,8
1,\ the classic war worlds \ `` timothy hines en...,1,9
2,the film start manager ( nicholas bell ) givin...,0,3
3,it must assumed praised film ( \ greatest film...,0,4
4,superbly trashy wondrously unpretentious ' exp...,1,8


In [42]:
label['sentiment'] = label['sentiment'].apply(lambda x: "__label__pos" if x > 0 else "__label__neg")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [43]:
label.head()

Unnamed: 0,review,sentiment,rate
0,with stuff going moment mj started listening m...,__label__pos,8
1,\ the classic war worlds \ `` timothy hines en...,__label__pos,9
2,the film start manager ( nicholas bell ) givin...,__label__neg,3
3,it must assumed praised film ( \ greatest film...,__label__neg,4
4,superbly trashy wondrously unpretentious ' exp...,__label__pos,8


In [45]:
X_train, X_test, y_train, y_test = train_test_split(label['review'].values, label['sentiment'].values,
                                                   test_size=0.25, stratify=label['sentiment'].values,
                                                   random_state=2019, shuffle=True)

In [47]:
with open("../data/preprocess/imdb_fasttext_train.txt", "w", encoding="utf8") as f: 
    train = zip(X_train, y_train)
    for x, y in tqdm(train):
        f.write(x + '\t' + y + '\n')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [48]:
with open("../data/preprocess/imdb_fasttext_test.txt", "w", encoding="utf-8") as f: 
    test = zip(X_test, y_test)
    for x, y in tqdm(test):
        f.write(x+"\t"+y+"\n")

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


